xref: /illumos-gate/usr/src/uts/common/io/ib/clients/ibd/ibd.c (revision d3d50737)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * An implementation of the IPoIB standard based on PSARC 2001/289.
29  */
30 
31 #include <sys/types.h>
32 #include <sys/conf.h>
33 #include <sys/ddi.h>
34 #include <sys/sunddi.h>
35 #include <sys/modctl.h>
36 #include <sys/stropts.h>
37 #include <sys/stream.h>
38 #include <sys/strsun.h>
39 #include <sys/strsubr.h>
40 #include <sys/dlpi.h>
41 #include <sys/mac_provider.h>
42 
43 #include <sys/pattr.h>		/* for HCK_FULLCKSUM */
44 #include <sys/sysmacros.h>	/* for offsetof */
45 #include <sys/disp.h>		/* for async thread pri */
46 #include <sys/atomic.h>		/* for atomic_add*() */
47 #include <sys/ethernet.h>	/* for ETHERTYPE_IPV6 */
48 #include <netinet/in.h>		/* for netinet/ip.h below */
49 #include <netinet/ip.h>		/* for struct ip */
50 #include <netinet/udp.h>	/* for struct udphdr */
51 #include <inet/common.h>	/* for inet/ip.h below */
52 #include <inet/ip.h>		/* for ipha_t */
53 #include <inet/ip6.h>		/* for ip6_t */
54 #include <inet/tcp.h>		/* for tcph_t */
55 #include <netinet/icmp6.h>	/* for icmp6_t */
56 #include <sys/callb.h>
57 #include <sys/modhash.h>
58 
59 #include <sys/ib/clients/ibd/ibd.h>
60 #include <sys/ib/mgt/sm_attr.h>	/* for SM_INIT_TYPE_* */
61 #include <sys/note.h>
62 #include <sys/multidata.h>
63 
64 #include <sys/ib/mgt/ibmf/ibmf.h>	/* for ibd_get_portspeed */
65 
66 /*
67  * Per-interface tunables (for developers)
68  *
69  * ibd_tx_copy_thresh
70  *     This sets the threshold at which ibd will attempt to do a bcopy of the
71  *     outgoing data into a pre-mapped buffer. The IPoIB driver's send behavior
72  *     is restricted by various parameters, so setting of this value must be
73  *     made after careful considerations only.  For instance, IB HCAs currently
74  *     impose a relatively small limit (when compared to ethernet NICs) on the
75  *     length of the SGL for transmit. On the other hand, the ip stack could
76  *     send down mp chains that are quite long when LSO is enabled.
77  *
78  * ibd_num_swqe
79  *     Number of "send WQE" elements that will be allocated and used by ibd.
80  *     When tuning this parameter, the size of pre-allocated, pre-mapped copy
81  *     buffer in each of these send wqes must be taken into account. This
82  *     copy buffer size is determined by the value of IBD_TX_BUF_SZ (this is
83  *     currently set to the same value of ibd_tx_copy_thresh, but may be
84  *     changed independently if needed).
85  *
86  * ibd_num_rwqe
87  *     Number of "receive WQE" elements that will be allocated and used by
88  *     ibd. This parameter is limited by the maximum channel size of the HCA.
89  *     Each buffer in the receive wqe will be of MTU size.
90  *
91  * ibd_num_lso_bufs
92  *     Number of "larger-than-MTU" copy buffers to use for cases when the
93  *     outgoing mblk chain is too fragmented to be used with ibt_map_mem_iov()
94  *     and too large to be used with regular MTU-sized copy buffers. It is
95  *     not recommended to tune this variable without understanding the
96  *     application environment and/or memory resources. The size of each of
97  *     these lso buffers is determined by the value of IBD_LSO_BUFSZ.
98  *
99  * ibd_num_ah
100  *     Number of AH cache entries to allocate
101  *
102  * ibd_hash_size
103  *     Hash table size for the active AH list
104  *
105  * ibd_tx_softintr
106  * ibd_rx_softintr
107  *     The softintr mechanism allows ibd to avoid event queue overflows if
108  *     the receive/completion handlers are to be expensive. These are enabled
109  *     by default.
110  *
111  * ibd_log_sz
112  *     This specifies the size of the ibd log buffer in bytes. The buffer is
113  *     allocated and logging is enabled only when IBD_LOGGING is defined.
114  *
115  */
116 uint_t ibd_tx_copy_thresh = 0x1000;
117 uint_t ibd_num_swqe = 4000;
118 uint_t ibd_num_rwqe = 4000;
119 uint_t ibd_num_lso_bufs = 0x400;
120 uint_t ibd_num_ah = 64;
121 uint_t ibd_hash_size = 32;
122 uint_t ibd_rx_softintr = 1;
123 uint_t ibd_tx_softintr = 1;
124 uint_t ibd_create_broadcast_group = 1;
125 #ifdef IBD_LOGGING
126 uint_t ibd_log_sz = 0x20000;
127 #endif
128 
129 #define	IBD_TX_COPY_THRESH		ibd_tx_copy_thresh
130 #define	IBD_TX_BUF_SZ			ibd_tx_copy_thresh
131 #define	IBD_NUM_SWQE			ibd_num_swqe
132 #define	IBD_NUM_RWQE			ibd_num_rwqe
133 #define	IBD_NUM_LSO_BUFS		ibd_num_lso_bufs
134 #define	IBD_NUM_AH			ibd_num_ah
135 #define	IBD_HASH_SIZE			ibd_hash_size
136 #ifdef IBD_LOGGING
137 #define	IBD_LOG_SZ			ibd_log_sz
138 #endif
139 
140 /*
141  * Receive CQ moderation parameters: tunable (for developers)
142  */
143 uint_t ibd_rxcomp_count = 4;
144 uint_t ibd_rxcomp_usec = 10;
145 
146 /*
147  * Send CQ moderation parameters: tunable (for developers)
148  */
149 uint_t ibd_txcomp_count = 16;
150 uint_t ibd_txcomp_usec = 300;
151 
152 /*
153  * Thresholds
154  *
155  * When waiting for resources (swqes or lso buffers) to become available,
156  * the first two thresholds below determine how long to wait before informing
157  * the network layer to start sending packets again. The IBD_TX_POLL_THRESH
158  * determines how low the available swqes should go before we start polling
159  * the completion queue.
160  */
161 #define	IBD_FREE_LSOS_THRESH		8
162 #define	IBD_FREE_SWQES_THRESH		20
163 #define	IBD_TX_POLL_THRESH		80
164 
165 /*
166  * When doing multiple-send-wr, this value determines how many to do at
167  * a time (in a single ibt_post_send).
168  */
169 #define	IBD_MAX_TX_POST_MULTIPLE	4
170 
171 /* Post IBD_RX_POST_CNT receive work requests at a time. */
172 #define	IBD_RX_POST_CNT			16
173 
174 /* Hash into 1 << IBD_LOG_RX_POST number of rx post queues */
175 #define	IBD_LOG_RX_POST			3
176 
177 /* Minimum number of receive work requests driver needs to always have */
178 #define	IBD_RWQE_MIN	((IBD_RX_POST_CNT << IBD_LOG_RX_POST) * 4)
179 
180 /*
181  * Maximum length for returning chained mps back to crossbow.
182  * Also used as the maximum number of rx wc's polled at a time.
183  */
184 #define	IBD_MAX_RX_MP_LEN		16
185 
186 /*
187  * LSO parameters
188  */
189 #define	IBD_LSO_MAXLEN			65536
190 #define	IBD_LSO_BUFSZ			8192
191 #define	IBD_PROP_LSO_POLICY		"lso-policy"
192 
193 /*
194  * Completion queue polling control
195  */
196 #define	IBD_CQ_POLLING			0x1
197 #define	IBD_REDO_CQ_POLLING		0x2
198 
199 /*
200  * Flag bits for resources to reap
201  */
202 #define	IBD_RSRC_SWQE			0x1
203 #define	IBD_RSRC_LSOBUF			0x2
204 
205 /*
206  * Async operation types
207  */
208 #define	IBD_ASYNC_GETAH			1
209 #define	IBD_ASYNC_JOIN			2
210 #define	IBD_ASYNC_LEAVE			3
211 #define	IBD_ASYNC_PROMON		4
212 #define	IBD_ASYNC_PROMOFF		5
213 #define	IBD_ASYNC_REAP			6
214 #define	IBD_ASYNC_TRAP			7
215 #define	IBD_ASYNC_SCHED			8
216 #define	IBD_ASYNC_LINK			9
217 #define	IBD_ASYNC_EXIT			10
218 
219 /*
220  * Async operation states
221  */
222 #define	IBD_OP_NOTSTARTED		0
223 #define	IBD_OP_ONGOING			1
224 #define	IBD_OP_COMPLETED		2
225 #define	IBD_OP_ERRORED			3
226 #define	IBD_OP_ROUTERED			4
227 
228 /*
229  * State of IBD driver initialization during attach/m_start
230  */
231 #define	IBD_DRV_STATE_INITIALIZED	0x00001
232 #define	IBD_DRV_RXINTR_ADDED		0x00002
233 #define	IBD_DRV_TXINTR_ADDED		0x00004
234 #define	IBD_DRV_IBTL_ATTACH_DONE	0x00008
235 #define	IBD_DRV_HCA_OPENED		0x00010
236 #define	IBD_DRV_PD_ALLOCD		0x00020
237 #define	IBD_DRV_MAC_REGISTERED		0x00040
238 #define	IBD_DRV_PORT_DETAILS_OBTAINED	0x00080
239 #define	IBD_DRV_BCAST_GROUP_FOUND	0x00100
240 #define	IBD_DRV_ACACHE_INITIALIZED	0x00200
241 #define	IBD_DRV_CQS_ALLOCD		0x00400
242 #define	IBD_DRV_UD_CHANNEL_SETUP	0x00800
243 #define	IBD_DRV_TXLIST_ALLOCD		0x01000
244 #define	IBD_DRV_SCQ_NOTIFY_ENABLED	0x02000
245 #define	IBD_DRV_RXLIST_ALLOCD		0x04000
246 #define	IBD_DRV_BCAST_GROUP_JOINED	0x08000
247 #define	IBD_DRV_ASYNC_THR_CREATED	0x10000
248 #define	IBD_DRV_RCQ_NOTIFY_ENABLED	0x20000
249 #define	IBD_DRV_SM_NOTICES_REGISTERED	0x40000
250 #define	IBD_DRV_STARTED			0x80000
251 
252 /*
253  * Start/stop in-progress flags; note that restart must always remain
254  * the OR of start and stop flag values.
255  */
256 #define	IBD_DRV_START_IN_PROGRESS	0x10000000
257 #define	IBD_DRV_STOP_IN_PROGRESS	0x20000000
258 #define	IBD_DRV_RESTART_IN_PROGRESS	0x30000000
259 
260 /*
261  * Miscellaneous constants
262  */
263 #define	IBD_SEND			0
264 #define	IBD_RECV			1
265 #define	IB_MGID_IPV4_LOWGRP_MASK	0xFFFFFFFF
266 #define	IBD_DEF_MAX_SDU			2044
267 #define	IBD_DEFAULT_QKEY		0xB1B
268 #ifdef IBD_LOGGING
269 #define	IBD_DMAX_LINE			100
270 #endif
271 
272 /*
273  * Enumerations for link states
274  */
275 typedef enum {
276 	IBD_LINK_DOWN,
277 	IBD_LINK_UP,
278 	IBD_LINK_UP_ABSENT
279 } ibd_link_op_t;
280 
281 /*
282  * Driver State Pointer
283  */
284 void *ibd_list;
285 
286 /*
287  * Logging
288  */
289 #ifdef IBD_LOGGING
290 kmutex_t ibd_lbuf_lock;
291 uint8_t *ibd_lbuf;
292 uint32_t ibd_lbuf_ndx;
293 #endif
294 
295 /*
296  * Required system entry points
297  */
298 static int ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd);
299 static int ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd);
300 
301 /*
302  * Required driver entry points for GLDv3
303  */
304 static int ibd_m_stat(void *, uint_t, uint64_t *);
305 static int ibd_m_start(void *);
306 static void ibd_m_stop(void *);
307 static int ibd_m_promisc(void *, boolean_t);
308 static int ibd_m_multicst(void *, boolean_t, const uint8_t *);
309 static int ibd_m_unicst(void *, const uint8_t *);
310 static mblk_t *ibd_m_tx(void *, mblk_t *);
311 static boolean_t ibd_m_getcapab(void *, mac_capab_t, void *);
312 
313 /*
314  * Private driver entry points for GLDv3
315  */
316 
317 /*
318  * Initialization
319  */
320 static int ibd_state_init(ibd_state_t *, dev_info_t *);
321 static int ibd_init_txlist(ibd_state_t *);
322 static int ibd_init_rxlist(ibd_state_t *);
323 static int ibd_acache_init(ibd_state_t *);
324 #ifdef IBD_LOGGING
325 static void ibd_log_init(void);
326 #endif
327 
328 /*
329  * Termination/cleanup
330  */
331 static void ibd_state_fini(ibd_state_t *);
332 static void ibd_fini_txlist(ibd_state_t *);
333 static void ibd_fini_rxlist(ibd_state_t *);
334 static void ibd_tx_cleanup(ibd_state_t *, ibd_swqe_t *);
335 static void ibd_tx_cleanup_list(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *);
336 static void ibd_acache_fini(ibd_state_t *);
337 #ifdef IBD_LOGGING
338 static void ibd_log_fini(void);
339 #endif
340 
341 /*
342  * Allocation/acquire/map routines
343  */
344 static int ibd_alloc_tx_copybufs(ibd_state_t *);
345 static int ibd_alloc_rx_copybufs(ibd_state_t *);
346 static int ibd_alloc_tx_lsobufs(ibd_state_t *);
347 static ibd_swqe_t *ibd_acquire_swqe(ibd_state_t *);
348 static int ibd_acquire_lsobufs(ibd_state_t *, uint_t, ibt_wr_ds_t *,
349     uint32_t *);
350 
351 /*
352  * Free/release/unmap routines
353  */
354 static void ibd_free_rwqe(ibd_state_t *, ibd_rwqe_t *);
355 static void ibd_free_tx_copybufs(ibd_state_t *);
356 static void ibd_free_rx_copybufs(ibd_state_t *);
357 static void ibd_free_tx_lsobufs(ibd_state_t *);
358 static void ibd_release_swqe(ibd_state_t *, ibd_swqe_t *, ibd_swqe_t *, int);
359 static void ibd_release_lsobufs(ibd_state_t *, ibt_wr_ds_t *, uint32_t);
360 static void ibd_free_lsohdr(ibd_swqe_t *, mblk_t *);
361 static void ibd_unmap_mem(ibd_state_t *, ibd_swqe_t *);
362 
363 /*
364  * Handlers/callback routines
365  */
366 static uint_t ibd_intr(caddr_t);
367 static uint_t ibd_tx_recycle(caddr_t);
368 static void ibd_rcq_handler(ibt_cq_hdl_t, void *);
369 static void ibd_scq_handler(ibt_cq_hdl_t, void *);
370 static void ibd_poll_rcq(ibd_state_t *, ibt_cq_hdl_t);
371 static void ibd_poll_scq(ibd_state_t *, ibt_cq_hdl_t);
372 static void ibd_drain_rcq(ibd_state_t *, ibt_cq_hdl_t);
373 static void ibd_drain_scq(ibd_state_t *, ibt_cq_hdl_t);
374 static void ibd_freemsg_cb(char *);
375 static void ibd_async_handler(void *, ibt_hca_hdl_t, ibt_async_code_t,
376     ibt_async_event_t *);
377 static void ibd_snet_notices_handler(void *, ib_gid_t,
378     ibt_subnet_event_code_t, ibt_subnet_event_t *);
379 
380 /*
381  * Send/receive routines
382  */
383 static boolean_t ibd_send(ibd_state_t *, mblk_t *);
384 static void ibd_post_send(ibd_state_t *, ibd_swqe_t *);
385 static void ibd_post_recv(ibd_state_t *, ibd_rwqe_t *);
386 static mblk_t *ibd_process_rx(ibd_state_t *, ibd_rwqe_t *, ibt_wc_t *);
387 
388 /*
389  * Threads
390  */
391 static void ibd_async_work(ibd_state_t *);
392 
393 /*
394  * Async tasks
395  */
396 static void ibd_async_acache(ibd_state_t *, ipoib_mac_t *);
397 static void ibd_async_multicast(ibd_state_t *, ib_gid_t, int);
398 static void ibd_async_setprom(ibd_state_t *);
399 static void ibd_async_unsetprom(ibd_state_t *);
400 static void ibd_async_reap_group(ibd_state_t *, ibd_mce_t *, ib_gid_t, uint8_t);
401 static void ibd_async_trap(ibd_state_t *, ibd_req_t *);
402 static void ibd_async_txsched(ibd_state_t *);
403 static void ibd_async_link(ibd_state_t *, ibd_req_t *);
404 
405 /*
406  * Async task helpers
407  */
408 static ibd_mce_t *ibd_async_mcache(ibd_state_t *, ipoib_mac_t *, boolean_t *);
409 static ibd_mce_t *ibd_join_group(ibd_state_t *, ib_gid_t, uint8_t);
410 static ibd_mce_t *ibd_mcache_find(ib_gid_t, struct list *);
411 static boolean_t ibd_get_allroutergroup(ibd_state_t *,
412     ipoib_mac_t *, ipoib_mac_t *);
413 static void ibd_leave_group(ibd_state_t *, ib_gid_t, uint8_t);
414 static void ibd_reacquire_group(ibd_state_t *, ibd_mce_t *);
415 static ibt_status_t ibd_iba_join(ibd_state_t *, ib_gid_t, ibd_mce_t *);
416 static ibt_status_t ibd_find_bgroup(ibd_state_t *);
417 static void ibd_n2h_gid(ipoib_mac_t *, ib_gid_t *);
418 static void ibd_h2n_mac(ipoib_mac_t *, ib_qpn_t, ib_sn_prefix_t, ib_guid_t);
419 static uint64_t ibd_get_portspeed(ibd_state_t *);
420 static boolean_t ibd_async_safe(ibd_state_t *);
421 static void ibd_async_done(ibd_state_t *);
422 static ibd_ace_t *ibd_acache_find(ibd_state_t *, ipoib_mac_t *, boolean_t, int);
423 static ibd_ace_t *ibd_acache_lookup(ibd_state_t *, ipoib_mac_t *, int *, int);
424 static ibd_ace_t *ibd_acache_get_unref(ibd_state_t *);
425 static boolean_t ibd_acache_recycle(ibd_state_t *, ipoib_mac_t *, boolean_t);
426 static void ibd_dec_ref_ace(ibd_state_t *, ibd_ace_t *);
427 static void ibd_link_mod(ibd_state_t *, ibt_async_code_t);
428 static int ibd_locate_pkey(ib_pkey_t *, uint16_t, ib_pkey_t, uint16_t *);
429 
430 /*
431  * Helpers for attach/start routines
432  */
433 static int ibd_register_mac(ibd_state_t *, dev_info_t *);
434 static int ibd_record_capab(ibd_state_t *, dev_info_t *);
435 static int ibd_unattach(ibd_state_t *, dev_info_t *);
436 static int ibd_get_port_details(ibd_state_t *);
437 static int ibd_alloc_cqs(ibd_state_t *);
438 static int ibd_setup_ud_channel(ibd_state_t *);
439 static int ibd_start(ibd_state_t *);
440 static int ibd_undo_start(ibd_state_t *, link_state_t);
441 static void ibd_set_mac_progress(ibd_state_t *, uint_t);
442 static void ibd_clr_mac_progress(ibd_state_t *, uint_t);
443 
444 
445 /*
446  * Miscellaneous helpers
447  */
448 static int ibd_sched_poll(ibd_state_t *, int, int);
449 static void ibd_queue_work_slot(ibd_state_t *, ibd_req_t *, int);
450 static void ibd_resume_transmission(ibd_state_t *);
451 static int ibd_setup_lso(ibd_swqe_t *, mblk_t *, uint32_t, ibt_ud_dest_hdl_t);
452 static int ibd_prepare_sgl(ibd_state_t *, mblk_t *, ibd_swqe_t *, uint_t);
453 static void *list_get_head(list_t *);
454 static int ibd_hash_key_cmp(mod_hash_key_t, mod_hash_key_t);
455 static uint_t ibd_hash_by_id(void *, mod_hash_key_t);
456 static void ibd_print_warn(ibd_state_t *, char *, ...);
457 #ifdef IBD_LOGGING
458 static void ibd_log(const char *, ...);
459 #endif
460 
461 DDI_DEFINE_STREAM_OPS(ibd_dev_ops, nulldev, nulldev, ibd_attach, ibd_detach,
462     nodev, NULL, D_MP, NULL, ddi_quiesce_not_needed);
463 
464 /* Module Driver Info */
465 static struct modldrv ibd_modldrv = {
466 	&mod_driverops,			/* This one is a driver */
467 	"InfiniBand GLDv3 Driver",	/* short description */
468 	&ibd_dev_ops			/* driver specific ops */
469 };
470 
471 /* Module Linkage */
472 static struct modlinkage ibd_modlinkage = {
473 	MODREV_1, (void *)&ibd_modldrv, NULL
474 };
475 
476 /*
477  * Module (static) info passed to IBTL during ibt_attach
478  */
479 static struct ibt_clnt_modinfo_s ibd_clnt_modinfo = {
480 	IBTI_V_CURR,
481 	IBT_NETWORK,
482 	ibd_async_handler,
483 	NULL,
484 	"IPIB"
485 };
486 
487 /*
488  * GLDv3 entry points
489  */
490 #define	IBD_M_CALLBACK_FLAGS	(MC_GETCAPAB)
491 static mac_callbacks_t ibd_m_callbacks = {
492 	IBD_M_CALLBACK_FLAGS,
493 	ibd_m_stat,
494 	ibd_m_start,
495 	ibd_m_stop,
496 	ibd_m_promisc,
497 	ibd_m_multicst,
498 	ibd_m_unicst,
499 	ibd_m_tx,
500 	NULL,
501 	ibd_m_getcapab
502 };
503 
504 /*
505  * Fill/clear <scope> and <p_key> in multicast/broadcast address
506  */
507 #define	IBD_FILL_SCOPE_PKEY(maddr, scope, pkey)		\
508 {							\
509 	*(uint32_t *)((char *)(maddr) + 4) |=		\
510 	    htonl((uint32_t)(scope) << 16);		\
511 	*(uint32_t *)((char *)(maddr) + 8) |=		\
512 	    htonl((uint32_t)(pkey) << 16);		\
513 }
514 
515 #define	IBD_CLEAR_SCOPE_PKEY(maddr)			\
516 {							\
517 	*(uint32_t *)((char *)(maddr) + 4) &=		\
518 	    htonl(~((uint32_t)0xF << 16));		\
519 	*(uint32_t *)((char *)(maddr) + 8) &=		\
520 	    htonl(~((uint32_t)0xFFFF << 16));		\
521 }
522 
523 /*
524  * Rudimentary debugging support
525  */
526 #ifdef DEBUG
527 int ibd_debuglevel = 100;
528 static void
529 debug_print(int l, char *fmt, ...)
530 {
531 	va_list ap;
532 
533 	if (l < ibd_debuglevel)
534 		return;
535 	va_start(ap, fmt);
536 	vcmn_err(CE_CONT, fmt, ap);
537 	va_end(ap);
538 }
539 #define	DPRINT		debug_print
540 #else
541 #define	DPRINT		0 &&
542 #endif
543 
544 /*
545  * Common routine to print warning messages; adds in hca guid, port number
546  * and pkey to be able to identify the IBA interface.
547  */
548 static void
549 ibd_print_warn(ibd_state_t *state, char *fmt, ...)
550 {
551 	ib_guid_t hca_guid;
552 	char ibd_print_buf[256];
553 	int len;
554 	va_list ap;
555 
556 	hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, state->id_dip,
557 	    0, "hca-guid", 0);
558 	len = snprintf(ibd_print_buf, sizeof (ibd_print_buf),
559 	    "%s%d: HCA GUID %016llx port %d PKEY %02x ",
560 	    ddi_driver_name(state->id_dip), ddi_get_instance(state->id_dip),
561 	    (u_longlong_t)hca_guid, state->id_port, state->id_pkey);
562 	va_start(ap, fmt);
563 	(void) vsnprintf(ibd_print_buf + len, sizeof (ibd_print_buf) - len,
564 	    fmt, ap);
565 	cmn_err(CE_NOTE, "!%s", ibd_print_buf);
566 	va_end(ap);
567 }
568 
569 /*
570  * Warlock directives
571  */
572 
573 /*
574  * id_lso_lock
575  *
576  * state->id_lso->bkt_nfree may be accessed without a lock to
577  * determine the threshold at which we have to ask the nw layer
578  * to resume transmission (see ibd_resume_transmission()).
579  */
580 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_lso_lock,
581     ibd_state_t::id_lso))
582 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_lso))
583 _NOTE(SCHEME_PROTECTS_DATA("init", ibd_state_t::id_lso_policy))
584 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_lsobkt_t::bkt_nfree))
585 
586 /*
587  * id_scq_poll_lock
588  */
589 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_scq_poll_lock,
590     ibd_state_t::id_scq_poll_busy))
591 
592 /*
593  * id_txpost_lock
594  */
595 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
596     ibd_state_t::id_tx_head))
597 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_txpost_lock,
598     ibd_state_t::id_tx_busy))
599 
600 /*
601  * id_acache_req_lock
602  */
603 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
604     ibd_state_t::id_acache_req_cv))
605 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_acache_req_lock,
606     ibd_state_t::id_req_list))
607 _NOTE(SCHEME_PROTECTS_DATA("atomic",
608     ibd_acache_s::ac_ref))
609 
610 /*
611  * id_ac_mutex
612  *
613  * This mutex is actually supposed to protect id_ah_op as well,
614  * but this path of the code isn't clean (see update of id_ah_op
615  * in ibd_async_acache(), immediately after the call to
616  * ibd_async_mcache()). For now, we'll skip this check by
617  * declaring that id_ah_op is protected by some internal scheme
618  * that warlock isn't aware of.
619  */
620 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
621     ibd_state_t::id_ah_active))
622 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
623     ibd_state_t::id_ah_free))
624 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
625     ibd_state_t::id_ah_addr))
626 _NOTE(SCHEME_PROTECTS_DATA("ac mutex should protect this",
627     ibd_state_t::id_ah_op))
628 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
629     ibd_state_t::id_ah_error))
630 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_ac_mutex,
631     ibd_state_t::id_ac_hot_ace))
632 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_ah_error))
633 
634 /*
635  * id_mc_mutex
636  */
637 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
638     ibd_state_t::id_mc_full))
639 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_mc_mutex,
640     ibd_state_t::id_mc_non))
641 
642 /*
643  * id_trap_lock
644  */
645 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
646     ibd_state_t::id_trap_cv))
647 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
648     ibd_state_t::id_trap_stop))
649 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_trap_lock,
650     ibd_state_t::id_trap_inprog))
651 
652 /*
653  * id_prom_op
654  */
655 _NOTE(SCHEME_PROTECTS_DATA("only by async thread",
656     ibd_state_t::id_prom_op))
657 
658 /*
659  * id_sched_lock
660  */
661 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_sched_lock,
662     ibd_state_t::id_sched_needed))
663 
664 /*
665  * id_link_mutex
666  */
667 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_link_mutex,
668     ibd_state_t::id_link_state))
669 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_link_state))
670 _NOTE(SCHEME_PROTECTS_DATA("only async thr and ibd_m_start",
671     ibd_state_t::id_link_speed))
672 _NOTE(DATA_READABLE_WITHOUT_LOCK(ibd_state_t::id_sgid))
673 
674 /*
675  * id_tx_list.dl_mutex
676  */
677 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
678     ibd_state_t::id_tx_list.dl_head))
679 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
680     ibd_state_t::id_tx_list.dl_pending_sends))
681 _NOTE(MUTEX_PROTECTS_DATA(ibd_state_t::id_tx_list.dl_mutex,
682     ibd_state_t::id_tx_list.dl_cnt))
683 
684 /*
685  * id_rx_list.dl_mutex
686  */
687 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
688     ibd_state_t::id_rx_list.dl_bufs_outstanding))
689 _NOTE(SCHEME_PROTECTS_DATA("atomic or dl mutex or single thr",
690     ibd_state_t::id_rx_list.dl_cnt))
691 
692 
693 /*
694  * Items protected by atomic updates
695  */
696 _NOTE(SCHEME_PROTECTS_DATA("atomic update only",
697     ibd_state_s::id_brd_rcv
698     ibd_state_s::id_brd_xmt
699     ibd_state_s::id_multi_rcv
700     ibd_state_s::id_multi_xmt
701     ibd_state_s::id_num_intrs
702     ibd_state_s::id_rcv_bytes
703     ibd_state_s::id_rcv_pkt
704     ibd_state_s::id_tx_short
705     ibd_state_s::id_xmt_bytes
706     ibd_state_s::id_xmt_pkt))
707 
708 /*
709  * Non-mutex protection schemes for data elements. Almost all of
710  * these are non-shared items.
711  */
712 _NOTE(SCHEME_PROTECTS_DATA("unshared or single-threaded",
713     callb_cpr
714     ib_gid_s
715     ib_header_info
716     ibd_acache_rq
717     ibd_acache_s::ac_mce
718     ibd_mcache::mc_fullreap
719     ibd_mcache::mc_jstate
720     ibd_mcache::mc_req
721     ibd_rwqe_s
722     ibd_swqe_s
723     ibd_wqe_s
724     ibt_wr_ds_s::ds_va
725     ibt_wr_lso_s
726     ipoib_mac::ipoib_qpn
727     mac_capab_lso_s
728     msgb::b_next
729     msgb::b_rptr
730     msgb::b_wptr
731     ibd_state_s::id_bgroup_created
732     ibd_state_s::id_mac_state
733     ibd_state_s::id_mtu
734     ibd_state_s::id_num_rwqe
735     ibd_state_s::id_num_swqe
736     ibd_state_s::id_qpnum
737     ibd_state_s::id_rcq_hdl
738     ibd_state_s::id_rx_buf_sz
739     ibd_state_s::id_rx_bufs
740     ibd_state_s::id_rx_mr_hdl
741     ibd_state_s::id_rx_wqes
742     ibd_state_s::id_rxwcs
743     ibd_state_s::id_rxwcs_size
744     ibd_state_s::id_rx_nqueues
745     ibd_state_s::id_rx_queues
746     ibd_state_s::id_scope
747     ibd_state_s::id_scq_hdl
748     ibd_state_s::id_tx_buf_sz
749     ibd_state_s::id_tx_bufs
750     ibd_state_s::id_tx_mr_hdl
751     ibd_state_s::id_tx_rel_list.dl_cnt
752     ibd_state_s::id_tx_wqes
753     ibd_state_s::id_txwcs
754     ibd_state_s::id_txwcs_size))
755 
756 int
757 _init()
758 {
759 	int status;
760 
761 	status = ddi_soft_state_init(&ibd_list, max(sizeof (ibd_state_t),
762 	    PAGESIZE), 0);
763 	if (status != 0) {
764 		DPRINT(10, "_init:failed in ddi_soft_state_init()");
765 		return (status);
766 	}
767 
768 	mac_init_ops(&ibd_dev_ops, "ibd");
769 	status = mod_install(&ibd_modlinkage);
770 	if (status != 0) {
771 		DPRINT(10, "_init:failed in mod_install()");
772 		ddi_soft_state_fini(&ibd_list);
773 		mac_fini_ops(&ibd_dev_ops);
774 		return (status);
775 	}
776 
777 #ifdef IBD_LOGGING
778 	ibd_log_init();
779 #endif
780 	return (0);
781 }
782 
783 int
784 _info(struct modinfo *modinfop)
785 {
786 	return (mod_info(&ibd_modlinkage, modinfop));
787 }
788 
789 int
790 _fini()
791 {
792 	int status;
793 
794 	status = mod_remove(&ibd_modlinkage);
795 	if (status != 0)
796 		return (status);
797 
798 	mac_fini_ops(&ibd_dev_ops);
799 	ddi_soft_state_fini(&ibd_list);
800 #ifdef IBD_LOGGING
801 	ibd_log_fini();
802 #endif
803 	return (0);
804 }
805 
806 /*
807  * Convert the GID part of the mac address from network byte order
808  * to host order.
809  */
810 static void
811 ibd_n2h_gid(ipoib_mac_t *mac, ib_gid_t *dgid)
812 {
813 	ib_sn_prefix_t nbopref;
814 	ib_guid_t nboguid;
815 
816 	bcopy(mac->ipoib_gidpref, &nbopref, sizeof (ib_sn_prefix_t));
817 	bcopy(mac->ipoib_gidsuff, &nboguid, sizeof (ib_guid_t));
818 	dgid->gid_prefix = b2h64(nbopref);
819 	dgid->gid_guid = b2h64(nboguid);
820 }
821 
822 /*
823  * Create the IPoIB address in network byte order from host order inputs.
824  */
825 static void
826 ibd_h2n_mac(ipoib_mac_t *mac, ib_qpn_t qpn, ib_sn_prefix_t prefix,
827     ib_guid_t guid)
828 {
829 	ib_sn_prefix_t nbopref;
830 	ib_guid_t nboguid;
831 
832 	mac->ipoib_qpn = htonl(qpn);
833 	nbopref = h2b64(prefix);
834 	nboguid = h2b64(guid);
835 	bcopy(&nbopref, mac->ipoib_gidpref, sizeof (ib_sn_prefix_t));
836 	bcopy(&nboguid, mac->ipoib_gidsuff, sizeof (ib_guid_t));
837 }
838 
839 /*
840  * Send to the appropriate all-routers group when the IBA multicast group
841  * does not exist, based on whether the target group is v4 or v6.
842  */
843 static boolean_t
844 ibd_get_allroutergroup(ibd_state_t *state, ipoib_mac_t *mcmac,
845     ipoib_mac_t *rmac)
846 {
847 	boolean_t retval = B_TRUE;
848 	uint32_t adjscope = state->id_scope << 16;
849 	uint32_t topword;
850 
851 	/*
852 	 * Copy the first 4 bytes in without assuming any alignment of
853 	 * input mac address; this will have IPoIB signature, flags and
854 	 * scope bits.
855 	 */
856 	bcopy(mcmac->ipoib_gidpref, &topword, sizeof (uint32_t));
857 	topword = ntohl(topword);
858 
859 	/*
860 	 * Generate proper address for IPv4/v6, adding in the Pkey properly.
861 	 */
862 	if ((topword == (IB_MCGID_IPV4_PREFIX | adjscope)) ||
863 	    (topword == (IB_MCGID_IPV6_PREFIX | adjscope)))
864 		ibd_h2n_mac(rmac, IB_MC_QPN, (((uint64_t)topword << 32) |
865 		    ((uint32_t)(state->id_pkey << 16))),
866 		    (INADDR_ALLRTRS_GROUP - INADDR_UNSPEC_GROUP));
867 	else
868 		/*
869 		 * Does not have proper bits in the mgid address.
870 		 */
871 		retval = B_FALSE;
872 
873 	return (retval);
874 }
875 
876 /*
877  * Padding for nd6 Neighbor Solicitation and Advertisement needs to be at
878  * front of optional src/tgt link layer address. Right now Solaris inserts
879  * padding by default at the end. The routine which is doing is nce_xmit()
880  * in ip_ndp.c. It copies the nd_lla_addr after the nd_opt_hdr_t. So when
881  * the packet comes down from IP layer to the IBD driver, it is in the
882  * following format: [IPoIB_PTXHDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T]
883  * This size is 2 bytes followed by [22 bytes of ipoib_machdr]. As a result
884  * machdr is not 4 byte aligned and had 2 bytes of padding at the end.
885  *
886  * The send routine at IBD driver changes this packet as follows:
887  * [IPoIB_HDR_T][INET6 packet][ICMP6][OPT_ND_HDR_T + 2 bytes of padding]
888  * followed by [22 bytes of ipoib_machdr] resulting in machdr 4 byte
889  * aligned.
890  *
891  * At the receiving side again ibd_process_rx takes the above packet and
892  * removes the two bytes of front padding and inserts it at the end. This
893  * is since the IP layer does not understand padding at the front.
894  */
895 #define	IBD_PAD_NSNA(ip6h, len, type) {					\
896 	uchar_t 	*nd_lla_ptr;					\
897 	icmp6_t 	*icmp6;						\
898 	nd_opt_hdr_t	*opt;						\
899 	int 		i;						\
900 									\
901 	icmp6 = (icmp6_t *)&ip6h[1];					\
902 	len -= sizeof (nd_neighbor_advert_t);				\
903 	if (((icmp6->icmp6_type == ND_NEIGHBOR_SOLICIT) ||		\
904 	    (icmp6->icmp6_type == ND_NEIGHBOR_ADVERT)) &&		\
905 	    (len != 0)) {						\
906 		opt = (nd_opt_hdr_t *)((uint8_t *)ip6h			\
907 		    + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t));	\
908 		ASSERT(opt != NULL);					\
909 		nd_lla_ptr = (uchar_t *)&opt[1];			\
910 		if (type == IBD_SEND) {					\
911 			for (i = IPOIB_ADDRL; i > 0; i--)		\
912 				*(nd_lla_ptr + i + 1) =			\
913 				    *(nd_lla_ptr + i - 1);		\
914 		} else {						\
915 			for (i = 0; i < IPOIB_ADDRL; i++)		\
916 				*(nd_lla_ptr + i) =			\
917 				    *(nd_lla_ptr + i + 2);		\
918 		}							\
919 		*(nd_lla_ptr + i) = 0;					\
920 		*(nd_lla_ptr + i + 1) = 0;				\
921 	}								\
922 }
923 
924 /*
925  * Address handle entries maintained by the driver are kept in the
926  * free and active lists. Each entry starts out in the free list;
927  * it migrates to the active list when primed using ibt_get_paths()
928  * and ibt_modify_ud_dest() for transmission to a specific destination.
929  * In the active list, the entry has a reference count indicating the
930  * number of ongoing/uncompleted transmits that reference it. The
931  * entry is left in the active list even after the reference count
932  * goes to 0, since successive transmits can find it there and do
933  * not need to set up another entry (ie the path information is
934  * cached using the active list). Entries on the active list are
935  * also hashed using the destination link address as a key for faster
936  * lookups during transmits.
937  *
938  * For any destination address (unicast or multicast, whatever the
939  * join states), there will be at most one entry in the active list.
940  * Entries with a 0 reference count on the active list can be reused
941  * for a transmit to a new destination, if the free list is empty.
942  *
943  * The AH free list insertion/deletion is protected with the id_ac_mutex,
944  * since the async thread and Tx callback handlers insert/delete. The
945  * active list does not need a lock (all operations are done by the
946  * async thread) but updates to the reference count are atomically
947  * done (increments done by Tx path, decrements by the Tx callback handler).
948  */
949 #define	IBD_ACACHE_INSERT_FREE(state, ce) \
950 	list_insert_head(&state->id_ah_free, ce)
951 #define	IBD_ACACHE_GET_FREE(state) \
952 	list_get_head(&state->id_ah_free)
953 #define	IBD_ACACHE_INSERT_ACTIVE(state, ce) {			\
954 	int _ret_;						\
955 	list_insert_head(&state->id_ah_active, ce);		\
956 	_ret_ = mod_hash_insert(state->id_ah_active_hash,	\
957 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
958 	ASSERT(_ret_ == 0);					\
959 	state->id_ac_hot_ace = ce;				\
960 }
961 #define	IBD_ACACHE_PULLOUT_ACTIVE(state, ce) {			\
962 	list_remove(&state->id_ah_active, ce);			\
963 	if (state->id_ac_hot_ace == ce)				\
964 		state->id_ac_hot_ace = NULL;			\
965 	(void) mod_hash_remove(state->id_ah_active_hash,	\
966 	    (mod_hash_key_t)&ce->ac_mac, (mod_hash_val_t)ce);	\
967 }
968 #define	IBD_ACACHE_GET_ACTIVE(state) \
969 	list_get_head(&state->id_ah_active)
970 
971 /*
972  * Membership states for different mcg's are tracked by two lists:
973  * the "non" list is used for promiscuous mode, when all mcg traffic
974  * needs to be inspected. This type of membership is never used for
975  * transmission, so there can not be an AH in the active list
976  * corresponding to a member in this list. This list does not need
977  * any protection, since all operations are performed by the async
978  * thread.
979  *
980  * "Full" and "SendOnly" membership is tracked using a single list,
981  * the "full" list. This is because this single list can then be
982  * searched during transmit to a multicast group (if an AH for the
983  * mcg is not found in the active list), since at least one type
984  * of membership must be present before initiating the transmit.
985  * This list is also emptied during driver detach, since sendonly
986  * membership acquired during transmit is dropped at detach time
987  * along with ipv4 broadcast full membership. Insert/deletes to
988  * this list are done only by the async thread, but it is also
989  * searched in program context (see multicast disable case), thus
990  * the id_mc_mutex protects the list. The driver detach path also
991  * deconstructs the "full" list, but it ensures that the async
992  * thread will not be accessing the list (by blocking out mcg
993  * trap handling and making sure no more Tx reaping will happen).
994  *
995  * Currently, an IBA attach is done in the SendOnly case too,
996  * although this is not required.
997  */
998 #define	IBD_MCACHE_INSERT_FULL(state, mce) \
999 	list_insert_head(&state->id_mc_full, mce)
1000 #define	IBD_MCACHE_INSERT_NON(state, mce) \
1001 	list_insert_head(&state->id_mc_non, mce)
1002 #define	IBD_MCACHE_FIND_FULL(state, mgid) \
1003 	ibd_mcache_find(mgid, &state->id_mc_full)
1004 #define	IBD_MCACHE_FIND_NON(state, mgid) \
1005 	ibd_mcache_find(mgid, &state->id_mc_non)
1006 #define	IBD_MCACHE_PULLOUT_FULL(state, mce) \
1007 	list_remove(&state->id_mc_full, mce)
1008 #define	IBD_MCACHE_PULLOUT_NON(state, mce) \
1009 	list_remove(&state->id_mc_non, mce)
1010 
1011 /*
1012  * AH and MCE active list manipulation:
1013  *
1014  * Multicast disable requests and MCG delete traps are two cases
1015  * where the active AH entry for the mcg (if any unreferenced one exists)
1016  * will be moved to the free list (to force the next Tx to the mcg to
1017  * join the MCG in SendOnly mode). Port up handling will also move AHs
1018  * from active to free list.
1019  *
1020  * In the case when some transmits are still pending on an entry
1021  * for an mcg, but a multicast disable has already been issued on the
1022  * mcg, there are some options to consider to preserve the join state
1023  * to ensure the emitted packet is properly routed on the IBA fabric.
1024  * For the AH, we can
1025  * 1. take out of active list at multicast disable time.
1026  * 2. take out of active list only when last pending Tx completes.
1027  * For the MCE, we can
1028  * 3. take out of active list at multicast disable time.
1029  * 4. take out of active list only when last pending Tx completes.
1030  * 5. move from active list to stale list at multicast disable time.
1031  * We choose to use 2,4. We use option 4 so that if a multicast enable
1032  * is tried before the pending Tx completes, the enable code finds the
1033  * mce in the active list and just has to make sure it will not be reaped
1034  * (ie the mcg leave done) when the pending Tx does complete. Alternatively,
1035  * a stale list (#5) that would be checked in the enable code would need
1036  * to be implemented. Option 2 is used, because otherwise, a Tx attempt
1037  * after the multicast disable would try to put an AH in the active list,
1038  * and associate the mce it finds in the active list to this new AH,
1039  * whereas the mce is already associated with the previous AH (taken off
1040  * the active list), and will be removed once the pending Tx's complete
1041  * (unless a reference count on mce's is implemented). One implication of
1042  * using 2,4 is that new Tx's posted before the pending Tx's complete will
1043  * grab new references on the AH, further delaying the leave.
1044  *
1045  * In the case of mcg delete (or create) trap when the port is sendonly
1046  * joined, the AH and MCE handling is different: the AH and MCE has to be
1047  * immediately taken off the active lists (forcing a join and path lookup
1048  * at the next Tx is the only guaranteed means of ensuring a proper Tx
1049  * to an mcg as it is repeatedly created and deleted and goes thru
1050  * reincarnations).
1051  *
1052  * When a port is already sendonly joined, and a multicast enable is
1053  * attempted, the same mce structure is promoted; this ensures only a
1054  * single mce on the active list tracks the most powerful join state.
1055  *
1056  * In the case of port up event handling, the MCE for sendonly membership
1057  * is freed up, and the ACE is put into the free list as soon as possible
1058  * (depending on whether posted Tx's have completed). For fullmembership
1059  * MCE's though, the ACE is similarly handled; but the MCE is kept around
1060  * (a re-JOIN is attempted) only if the DLPI leave has not already been
1061  * done; else the mce is deconstructed (mc_fullreap case).
1062  *
1063  * MCG creation and deletion trap handling:
1064  *
1065  * These traps are unreliable (meaning sometimes the trap might never
1066  * be delivered to the subscribed nodes) and may arrive out-of-order
1067  * since they use UD transport. An alternative to relying on these
1068  * unreliable traps is to poll for mcg presence every so often, but
1069  * instead of doing that, we try to be as conservative as possible
1070  * while handling the traps, and hope that the traps do arrive at
1071  * the subscribed nodes soon. Note that if a node is fullmember
1072  * joined to an mcg, it can not possibly receive a mcg create/delete
1073  * trap for that mcg (by fullmember definition); if it does, it is
1074  * an old trap from a previous incarnation of the mcg.
1075  *
1076  * Whenever a trap is received, the driver cleans up its sendonly
1077  * membership to the group; we choose to do a sendonly leave even
1078  * on a creation trap to handle the case of a prior deletion of the mcg
1079  * having gone unnoticed. Consider an example scenario:
1080  * T1: MCG M is deleted, and fires off deletion trap D1.
1081  * T2: MCG M is recreated, fires off creation trap C1, which is lost.
1082  * T3: Node N tries to transmit to M, joining in sendonly mode.
1083  * T4: MCG M is deleted, and fires off deletion trap D2.
1084  * T5: N receives a deletion trap, but can not distinguish D1 from D2.
1085  *     If the trap is D2, then a LEAVE is not required, since the mcg
1086  *     is already deleted; but if it is D1, a LEAVE is required. A safe
1087  *     approach is to always LEAVE, but the SM may be confused if it
1088  *     receives a LEAVE without a prior JOIN.
1089  *
1090  * Management of the non-membership to an mcg is similar to the above,
1091  * except that if the interface is in promiscuous mode, it is required
1092  * to attempt to re-join the mcg after receiving a trap. Unfortunately,
1093  * if the re-join attempt fails (in which case a warning message needs
1094  * to be printed), it is not clear whether it failed due to the mcg not
1095  * existing, or some fabric/hca issues, due to the delayed nature of
1096  * trap delivery. Querying the SA to establish presence/absence of the
1097  * mcg is also racy at best. Thus, the driver just prints a warning
1098  * message when it can not rejoin after receiving a create trap, although
1099  * this might be (on rare occasions) a mis-warning if the create trap is
1100  * received after the mcg was deleted.
1101  */
1102 
1103 /*
1104  * Implementation of atomic "recycle" bits and reference count
1105  * on address handles. This utilizes the fact that max reference
1106  * count on any handle is limited by number of send wqes, thus
1107  * high bits in the ac_ref field can be used as the recycle bits,
1108  * and only the low bits hold the number of pending Tx requests.
1109  * This atomic AH reference counting allows the Tx completion
1110  * handler not to acquire the id_ac_mutex to process every completion,
1111  * thus reducing lock contention problems between completion and
1112  * the Tx path.
1113  */
1114 #define	CYCLEVAL		0x80000
1115 #define	CLEAR_REFCYCLE(ace)	(ace)->ac_ref = 0
1116 #define	CYCLE_SET(ace)		(((ace)->ac_ref & CYCLEVAL) == CYCLEVAL)
1117 #define	GET_REF(ace)		((ace)->ac_ref)
1118 #define	GET_REF_CYCLE(ace) (				\
1119 	/*						\
1120 	 * Make sure "cycle" bit is set.		\
1121 	 */						\
1122 	ASSERT(CYCLE_SET(ace)),				\
1123 	((ace)->ac_ref & ~(CYCLEVAL))			\
1124 )
1125 #define	INC_REF(ace, num) {				\
1126 	atomic_add_32(&(ace)->ac_ref, num);		\
1127 }
1128 #define	SET_CYCLE_IF_REF(ace) (				\
1129 	CYCLE_SET(ace) ? B_TRUE :			\
1130 	    atomic_add_32_nv(&ace->ac_ref, CYCLEVAL) ==	\
1131 		CYCLEVAL ?				\
1132 		/*					\
1133 		 * Clear the "cycle" bit we just set;	\
1134 		 * ref count known to be 0 from above.	\
1135 		 */					\
1136 		CLEAR_REFCYCLE(ace), B_FALSE :		\
1137 		/*					\
1138 		 * We set "cycle" bit; let caller know.	\
1139 		 */					\
1140 		B_TRUE					\
1141 )
1142 #define	DEC_REF_DO_CYCLE(ace) (				\
1143 	atomic_add_32_nv(&ace->ac_ref, -1) ==		\
1144 	    CYCLEVAL ?					\
1145 		/*					\
1146 		 * Ref count known to be 0 from above.	\
1147 		 */					\
1148 		B_TRUE :				\
1149 		B_FALSE					\
1150 )
1151 
1152 static void *
1153 list_get_head(list_t *list)
1154 {
1155 	list_node_t *lhead = list_head(list);
1156 
1157 	if (lhead != NULL)
1158 		list_remove(list, lhead);
1159 	return (lhead);
1160 }
1161 
1162 /*
1163  * This is always guaranteed to be able to queue the work.
1164  */
1165 static void
1166 ibd_queue_work_slot(ibd_state_t *state, ibd_req_t *ptr, int op)
1167 {
1168 	/* Initialize request */
1169 	DPRINT(1, "ibd_queue_work_slot : op: %d \n", op);
1170 	ptr->rq_op = op;
1171 
1172 	/*
1173 	 * Queue provided slot onto request pool.
1174 	 */
1175 	mutex_enter(&state->id_acache_req_lock);
1176 	list_insert_tail(&state->id_req_list, ptr);
1177 
1178 	/* Go, fetch, async thread */
1179 	cv_signal(&state->id_acache_req_cv);
1180 	mutex_exit(&state->id_acache_req_lock);
1181 }
1182 
1183 /*
1184  * Main body of the per interface async thread.
1185  */
1186 static void
1187 ibd_async_work(ibd_state_t *state)
1188 {
1189 	ibd_req_t *ptr;
1190 	callb_cpr_t cprinfo;
1191 
1192 	mutex_enter(&state->id_acache_req_lock);
1193 	CALLB_CPR_INIT(&cprinfo, &state->id_acache_req_lock,
1194 	    callb_generic_cpr, "ibd_async_work");
1195 
1196 	for (;;) {
1197 		ptr = list_get_head(&state->id_req_list);
1198 		if (ptr != NULL) {
1199 			mutex_exit(&state->id_acache_req_lock);
1200 
1201 			/*
1202 			 * Once we have done the operation, there is no
1203 			 * guarantee the request slot is going to be valid,
1204 			 * it might be freed up (as in IBD_ASYNC_LEAVE, REAP,
1205 			 * TRAP).
1206 			 *
1207 			 * Perform the request.
1208 			 */
1209 			switch (ptr->rq_op) {
1210 				case IBD_ASYNC_GETAH:
1211 					ibd_async_acache(state, &ptr->rq_mac);
1212 					break;
1213 				case IBD_ASYNC_JOIN:
1214 				case IBD_ASYNC_LEAVE:
1215 					ibd_async_multicast(state,
1216 					    ptr->rq_gid, ptr->rq_op);
1217 					break;
1218 				case IBD_ASYNC_PROMON:
1219 					ibd_async_setprom(state);
1220 					break;
1221 				case IBD_ASYNC_PROMOFF:
1222 					ibd_async_unsetprom(state);
1223 					break;
1224 				case IBD_ASYNC_REAP:
1225 					ibd_async_reap_group(state,
1226 					    ptr->rq_ptr, ptr->rq_gid,
1227 					    IB_MC_JSTATE_FULL);
1228 					/*
1229 					 * the req buf contains in mce
1230 					 * structure, so we do not need
1231 					 * to free it here.
1232 					 */
1233 					ptr = NULL;
1234 					break;
1235 				case IBD_ASYNC_TRAP:
1236 					ibd_async_trap(state, ptr);
1237 					break;
1238 				case IBD_ASYNC_SCHED:
1239 					ibd_async_txsched(state);
1240 					break;
1241 				case IBD_ASYNC_LINK:
1242 					ibd_async_link(state, ptr);
1243 					break;
1244 				case IBD_ASYNC_EXIT:
1245 					mutex_enter(&state->id_acache_req_lock);
1246 #ifndef __lock_lint
1247 					CALLB_CPR_EXIT(&cprinfo);
1248 #else
1249 					mutex_exit(&state->id_acache_req_lock);
1250 #endif
1251 					return;
1252 			}
1253 			if (ptr != NULL)
1254 				kmem_cache_free(state->id_req_kmc, ptr);
1255 
1256 			mutex_enter(&state->id_acache_req_lock);
1257 		} else {
1258 #ifndef __lock_lint
1259 			/*
1260 			 * Nothing to do: wait till new request arrives.
1261 			 */
1262 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
1263 			cv_wait(&state->id_acache_req_cv,
1264 			    &state->id_acache_req_lock);
1265 			CALLB_CPR_SAFE_END(&cprinfo,
1266 			    &state->id_acache_req_lock);
1267 #endif
1268 		}
1269 	}
1270 
1271 	/*NOTREACHED*/
1272 	_NOTE(NOT_REACHED)
1273 }
1274 
1275 /*
1276  * Return when it is safe to queue requests to the async daemon; primarily
1277  * for subnet trap and async event handling. Disallow requests before the
1278  * daemon is created, and when interface deinitilization starts.
1279  */
1280 static boolean_t
1281 ibd_async_safe(ibd_state_t *state)
1282 {
1283 	mutex_enter(&state->id_trap_lock);
1284 	if (state->id_trap_stop) {
1285 		mutex_exit(&state->id_trap_lock);
1286 		return (B_FALSE);
1287 	}
1288 	state->id_trap_inprog++;
1289 	mutex_exit(&state->id_trap_lock);
1290 	return (B_TRUE);
1291 }
1292 
1293 /*
1294  * Wake up ibd_m_stop() if the unplumb code is waiting for pending subnet
1295  * trap or event handling to complete to kill the async thread and deconstruct
1296  * the mcg/ace list.
1297  */
1298 static void
1299 ibd_async_done(ibd_state_t *state)
1300 {
1301 	mutex_enter(&state->id_trap_lock);
1302 	if (--state->id_trap_inprog == 0)
1303 		cv_signal(&state->id_trap_cv);
1304 	mutex_exit(&state->id_trap_lock);
1305 }
1306 
1307 /*
1308  * Hash functions:
1309  * ibd_hash_by_id: Returns the qpn as the hash entry into bucket.
1310  * ibd_hash_key_cmp: Compares two keys, return 0 on success or else 1.
1311  * These operate on mac addresses input into ibd_send, but there is no
1312  * guarantee on the alignment of the ipoib_mac_t structure.
1313  */
1314 /*ARGSUSED*/
1315 static uint_t
1316 ibd_hash_by_id(void *hash_data, mod_hash_key_t key)
1317 {
1318 	ulong_t ptraddr = (ulong_t)key;
1319 	uint_t hval;
1320 
1321 	/*
1322 	 * If the input address is 4 byte aligned, we can just dereference
1323 	 * it. This is most common, since IP will send in a 4 byte aligned
1324 	 * IP header, which implies the 24 byte IPoIB psuedo header will be
1325 	 * 4 byte aligned too.
1326 	 */
1327 	if ((ptraddr & 3) == 0)
1328 		return ((uint_t)((ipoib_mac_t *)key)->ipoib_qpn);
1329 
1330 	bcopy(&(((ipoib_mac_t *)key)->ipoib_qpn), &hval, sizeof (uint_t));
1331 	return (hval);
1332 }
1333 
1334 static int
1335 ibd_hash_key_cmp(mod_hash_key_t key1, mod_hash_key_t key2)
1336 {
1337 	if (bcmp((char *)key1, (char *)key2, IPOIB_ADDRL) == 0)
1338 		return (0);
1339 	else
1340 		return (1);
1341 }
1342 
1343 /*
1344  * Initialize all the per interface caches and lists; AH cache,
1345  * MCG list etc.
1346  */
1347 static int
1348 ibd_acache_init(ibd_state_t *state)
1349 {
1350 	ibd_ace_t *ce;
1351 	int i;
1352 
1353 	mutex_init(&state->id_acache_req_lock, NULL, MUTEX_DRIVER, NULL);
1354 	cv_init(&state->id_acache_req_cv, NULL, CV_DEFAULT, NULL);
1355 
1356 	mutex_init(&state->id_ac_mutex, NULL, MUTEX_DRIVER, NULL);
1357 	mutex_init(&state->id_mc_mutex, NULL, MUTEX_DRIVER, NULL);
1358 	mutex_enter(&state->id_ac_mutex);
1359 	list_create(&state->id_ah_free, sizeof (ibd_ace_t),
1360 	    offsetof(ibd_ace_t, ac_list));
1361 	list_create(&state->id_ah_active, sizeof (ibd_ace_t),
1362 	    offsetof(ibd_ace_t, ac_list));
1363 	state->id_ah_active_hash = mod_hash_create_extended("IBD AH hash",
1364 	    IBD_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor,
1365 	    ibd_hash_by_id, NULL, ibd_hash_key_cmp, KM_SLEEP);
1366 	list_create(&state->id_mc_full, sizeof (ibd_mce_t),
1367 	    offsetof(ibd_mce_t, mc_list));
1368 	list_create(&state->id_mc_non, sizeof (ibd_mce_t),
1369 	    offsetof(ibd_mce_t, mc_list));
1370 	list_create(&state->id_req_list, sizeof (ibd_req_t),
1371 	    offsetof(ibd_req_t, rq_list));
1372 	state->id_ac_hot_ace = NULL;
1373 
1374 	state->id_ac_list = ce = (ibd_ace_t *)kmem_zalloc(sizeof (ibd_ace_t) *
1375 	    IBD_NUM_AH, KM_SLEEP);
1376 	for (i = 0; i < IBD_NUM_AH; i++, ce++) {
1377 		if (ibt_alloc_ud_dest(state->id_hca_hdl, IBT_UD_DEST_NO_FLAGS,
1378 		    state->id_pd_hdl, &ce->ac_dest) != IBT_SUCCESS) {
1379 			mutex_exit(&state->id_ac_mutex);
1380 			ibd_acache_fini(state);
1381 			return (DDI_FAILURE);
1382 		} else {
1383 			CLEAR_REFCYCLE(ce);
1384 			ce->ac_mce = NULL;
1385 			IBD_ACACHE_INSERT_FREE(state, ce);
1386 		}
1387 	}
1388 	mutex_exit(&state->id_ac_mutex);
1389 	return (DDI_SUCCESS);
1390 }
1391 
1392 static void
1393 ibd_acache_fini(ibd_state_t *state)
1394 {
1395 	ibd_ace_t *ptr;
1396 
1397 	mutex_enter(&state->id_ac_mutex);
1398 
1399 	while ((ptr = IBD_ACACHE_GET_ACTIVE(state)) != NULL) {
1400 		ASSERT(GET_REF(ptr) == 0);
1401 		(void) ibt_free_ud_dest(ptr->ac_dest);
1402 	}
1403 
1404 	while ((ptr = IBD_ACACHE_GET_FREE(state)) != NULL) {
1405 		ASSERT(GET_REF(ptr) == 0);
1406 		(void) ibt_free_ud_dest(ptr->ac_dest);
1407 	}
1408 
1409 	list_destroy(&state->id_ah_free);
1410 	list_destroy(&state->id_ah_active);
1411 	list_destroy(&state->id_mc_full);
1412 	list_destroy(&state->id_mc_non);
1413 	list_destroy(&state->id_req_list);
1414 	kmem_free(state->id_ac_list, sizeof (ibd_ace_t) * IBD_NUM_AH);
1415 	mutex_exit(&state->id_ac_mutex);
1416 	mutex_destroy(&state->id_ac_mutex);
1417 	mutex_destroy(&state->id_mc_mutex);
1418 	mutex_destroy(&state->id_acache_req_lock);
1419 	cv_destroy(&state->id_acache_req_cv);
1420 }
1421 
1422 /*
1423  * Search AH active hash list for a cached path to input destination.
1424  * If we are "just looking", hold == F. When we are in the Tx path,
1425  * we set hold == T to grab a reference on the AH so that it can not
1426  * be recycled to a new destination while the Tx request is posted.
1427  */
1428 static ibd_ace_t *
1429 ibd_acache_find(ibd_state_t *state, ipoib_mac_t *mac, boolean_t hold, int num)
1430 {
1431 	ibd_ace_t *ptr;
1432 
1433 	ASSERT(mutex_owned(&state->id_ac_mutex));
1434 
1435 	/*
1436 	 * Do hash search.
1437 	 */
1438 	if (mod_hash_find(state->id_ah_active_hash,
1439 	    (mod_hash_key_t)mac, (mod_hash_val_t)&ptr) == 0) {
1440 		if (hold)
1441 			INC_REF(ptr, num);
1442 		return (ptr);
1443 	}
1444 	return (NULL);
1445 }
1446 
1447 /*
1448  * This is called by the tx side; if an initialized AH is found in
1449  * the active list, it is locked down and can be used; if no entry
1450  * is found, an async request is queued to do path resolution.
1451  */
1452 static ibd_ace_t *
1453 ibd_acache_lookup(ibd_state_t *state, ipoib_mac_t *mac, int *err, int numwqe)
1454 {
1455 	ibd_ace_t *ptr;
1456 	ibd_req_t *req;
1457 
1458 	/*
1459 	 * Only attempt to print when we can; in the mdt pattr case, the
1460 	 * address is not aligned properly.
1461 	 */
1462 	if (((ulong_t)mac & 3) == 0) {
1463 		DPRINT(4,
1464 		    "ibd_acache_lookup : lookup for %08X:%08X:%08X:%08X:%08X",
1465 		    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1466 		    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1467 		    htonl(mac->ipoib_gidsuff[1]));
1468 	}
1469 
1470 	mutex_enter(&state->id_ac_mutex);
1471 
1472 	if (((ptr = state->id_ac_hot_ace) != NULL) &&
1473 	    (memcmp(&ptr->ac_mac, mac, sizeof (*mac)) == 0)) {
1474 		INC_REF(ptr, numwqe);
1475 		mutex_exit(&state->id_ac_mutex);
1476 		return (ptr);
1477 	}
1478 	if (((ptr = ibd_acache_find(state, mac, B_TRUE, numwqe)) != NULL)) {
1479 		state->id_ac_hot_ace = ptr;
1480 		mutex_exit(&state->id_ac_mutex);
1481 		return (ptr);
1482 	}
1483 
1484 	/*
1485 	 * Implementation of a single outstanding async request; if
1486 	 * the operation is not started yet, queue a request and move
1487 	 * to ongoing state. Remember in id_ah_addr for which address
1488 	 * we are queueing the request, in case we need to flag an error;
1489 	 * Any further requests, for the same or different address, until
1490 	 * the operation completes, is sent back to GLDv3 to be retried.
1491 	 * The async thread will update id_ah_op with an error indication
1492 	 * or will set it to indicate the next look up can start; either
1493 	 * way, it will mac_tx_update() so that all blocked requests come
1494 	 * back here.
1495 	 */
1496 	*err = EAGAIN;
1497 	if (state->id_ah_op == IBD_OP_NOTSTARTED) {
1498 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
1499 		if (req != NULL) {
1500 			/*
1501 			 * We did not even find the entry; queue a request
1502 			 * for it.
1503 			 */
1504 			bcopy(mac, &(req->rq_mac), IPOIB_ADDRL);
1505 			ibd_queue_work_slot(state, req, IBD_ASYNC_GETAH);
1506 			state->id_ah_op = IBD_OP_ONGOING;
1507 			bcopy(mac, &state->id_ah_addr, IPOIB_ADDRL);
1508 		}
1509 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1510 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) == 0)) {
1511 		/*
1512 		 * Check the status of the pathrecord lookup request
1513 		 * we had queued before.
1514 		 */
1515 		if (state->id_ah_op == IBD_OP_ERRORED) {
1516 			*err = EFAULT;
1517 			state->id_ah_error++;
1518 		} else {
1519 			/*
1520 			 * IBD_OP_ROUTERED case: We need to send to the
1521 			 * all-router MCG. If we can find the AH for
1522 			 * the mcg, the Tx will be attempted. If we
1523 			 * do not find the AH, we return NORESOURCES
1524 			 * to retry.
1525 			 */
1526 			ipoib_mac_t routermac;
1527 
1528 			(void) ibd_get_allroutergroup(state, mac, &routermac);
1529 			ptr = ibd_acache_find(state, &routermac, B_TRUE,
1530 			    numwqe);
1531 		}
1532 		state->id_ah_op = IBD_OP_NOTSTARTED;
1533 	} else if ((state->id_ah_op != IBD_OP_ONGOING) &&
1534 	    (bcmp(&state->id_ah_addr, mac, IPOIB_ADDRL) != 0)) {
1535 		/*
1536 		 * This case can happen when we get a higher band
1537 		 * packet. The easiest way is to reset the state machine
1538 		 * to accommodate the higher priority packet.
1539 		 */
1540 		state->id_ah_op = IBD_OP_NOTSTARTED;
1541 	}
1542 	mutex_exit(&state->id_ac_mutex);
1543 
1544 	return (ptr);
1545 }
1546 
1547 /*
1548  * Grab a not-currently-in-use AH/PathRecord from the active
1549  * list to recycle to a new destination. Only the async thread
1550  * executes this code.
1551  */
1552 static ibd_ace_t *
1553 ibd_acache_get_unref(ibd_state_t *state)
1554 {
1555 	ibd_ace_t *ptr = list_head(&state->id_ah_active);
1556 
1557 	ASSERT(mutex_owned(&state->id_ac_mutex));
1558 
1559 	/*
1560 	 * Do plain linear search.
1561 	 */
1562 	while (ptr != NULL) {
1563 		/*
1564 		 * Note that it is possible that the "cycle" bit
1565 		 * is set on the AH w/o any reference count. The
1566 		 * mcg must have been deleted, and the tx cleanup
1567 		 * just decremented the reference count to 0, but
1568 		 * hasn't gotten around to grabbing the id_ac_mutex
1569 		 * to move the AH into the free list.
1570 		 */
1571 		if (GET_REF(ptr) == 0) {
1572 			IBD_ACACHE_PULLOUT_ACTIVE(state, ptr);
1573 			break;
1574 		}
1575 		ptr = list_next(&state->id_ah_active, ptr);
1576 	}
1577 	return (ptr);
1578 }
1579 
1580 /*
1581  * Invoked to clean up AH from active list in case of multicast
1582  * disable and to handle sendonly memberships during mcg traps.
1583  * And for port up processing for multicast and unicast AHs.
1584  * Normally, the AH is taken off the active list, and put into
1585  * the free list to be recycled for a new destination. In case
1586  * Tx requests on the AH have not completed yet, the AH is marked
1587  * for reaping (which will put the AH on the free list) once the Tx's
1588  * complete; in this case, depending on the "force" input, we take
1589  * out the AH from the active list right now, or leave it also for
1590  * the reap operation. Returns TRUE if the AH is taken off the active
1591  * list (and either put into the free list right now, or arranged for
1592  * later), FALSE otherwise.
1593  */
1594 static boolean_t
1595 ibd_acache_recycle(ibd_state_t *state, ipoib_mac_t *acmac, boolean_t force)
1596 {
1597 	ibd_ace_t *acactive;
1598 	boolean_t ret = B_TRUE;
1599 
1600 	ASSERT(mutex_owned(&state->id_ac_mutex));
1601 
1602 	if ((acactive = ibd_acache_find(state, acmac, B_FALSE, 0)) != NULL) {
1603 
1604 		/*
1605 		 * Note that the AH might already have the cycle bit set
1606 		 * on it; this might happen if sequences of multicast
1607 		 * enables and disables are coming so fast, that posted
1608 		 * Tx's to the mcg have not completed yet, and the cycle
1609 		 * bit is set successively by each multicast disable.
1610 		 */
1611 		if (SET_CYCLE_IF_REF(acactive)) {
1612 			if (!force) {
1613 				/*
1614 				 * The ace is kept on the active list, further
1615 				 * Tx's can still grab a reference on it; the
1616 				 * ace is reaped when all pending Tx's
1617 				 * referencing the AH complete.
1618 				 */
1619 				ret = B_FALSE;
1620 			} else {
1621 				/*
1622 				 * In the mcg trap case, we always pull the
1623 				 * AH from the active list. And also the port
1624 				 * up multi/unicast case.
1625 				 */
1626 				IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1627 				acactive->ac_mce = NULL;
1628 			}
1629 		} else {
1630 			/*
1631 			 * Determined the ref count is 0, thus reclaim
1632 			 * immediately after pulling out the ace from
1633 			 * the active list.
1634 			 */
1635 			IBD_ACACHE_PULLOUT_ACTIVE(state, acactive);
1636 			acactive->ac_mce = NULL;
1637 			IBD_ACACHE_INSERT_FREE(state, acactive);
1638 		}
1639 
1640 	}
1641 	return (ret);
1642 }
1643 
1644 /*
1645  * Helper function for async path record lookup. If we are trying to
1646  * Tx to a MCG, check our membership, possibly trying to join the
1647  * group if required. If that fails, try to send the packet to the
1648  * all router group (indicated by the redirect output), pointing
1649  * the input mac address to the router mcg address.
1650  */
1651 static ibd_mce_t *
1652 ibd_async_mcache(ibd_state_t *state, ipoib_mac_t *mac, boolean_t *redirect)
1653 {
1654 	ib_gid_t mgid;
1655 	ibd_mce_t *mce;
1656 	ipoib_mac_t routermac;
1657 
1658 	*redirect = B_FALSE;
1659 	ibd_n2h_gid(mac, &mgid);
1660 
1661 	/*
1662 	 * Check the FullMember+SendOnlyNonMember list.
1663 	 * Since we are the only one who manipulates the
1664 	 * id_mc_full list, no locks are needed.
1665 	 */
1666 	mce = IBD_MCACHE_FIND_FULL(state, mgid);
1667 	if (mce != NULL) {
1668 		DPRINT(4, "ibd_async_mcache : already joined to group");
1669 		return (mce);
1670 	}
1671 
1672 	/*
1673 	 * Not found; try to join(SendOnlyNonMember) and attach.
1674 	 */
1675 	DPRINT(4, "ibd_async_mcache : not joined to group");
1676 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1677 	    NULL) {
1678 		DPRINT(4, "ibd_async_mcache : nonmem joined to group");
1679 		return (mce);
1680 	}
1681 
1682 	/*
1683 	 * MCGroup not present; try to join the all-router group. If
1684 	 * any of the following steps succeed, we will be redirecting
1685 	 * to the all router group.
1686 	 */
1687 	DPRINT(4, "ibd_async_mcache : nonmem join failed");
1688 	if (!ibd_get_allroutergroup(state, mac, &routermac))
1689 		return (NULL);
1690 	*redirect = B_TRUE;
1691 	ibd_n2h_gid(&routermac, &mgid);
1692 	bcopy(&routermac, mac, IPOIB_ADDRL);
1693 	DPRINT(4, "ibd_async_mcache : router mgid : %016llx:%016llx\n",
1694 	    mgid.gid_prefix, mgid.gid_guid);
1695 
1696 	/*
1697 	 * Are we already joined to the router group?
1698 	 */
1699 	if ((mce = IBD_MCACHE_FIND_FULL(state, mgid)) != NULL) {
1700 		DPRINT(4, "ibd_async_mcache : using already joined router"
1701 		    "group\n");
1702 		return (mce);
1703 	}
1704 
1705 	/*
1706 	 * Can we join(SendOnlyNonMember) the router group?
1707 	 */
1708 	DPRINT(4, "ibd_async_mcache : attempting join to router grp");
1709 	if ((mce = ibd_join_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON)) !=
1710 	    NULL) {
1711 		DPRINT(4, "ibd_async_mcache : joined to router grp");
1712 		return (mce);
1713 	}
1714 
1715 	return (NULL);
1716 }
1717 
1718 /*
1719  * Async path record lookup code.
1720  */
1721 static void
1722 ibd_async_acache(ibd_state_t *state, ipoib_mac_t *mac)
1723 {
1724 	ibd_ace_t *ce;
1725 	ibd_mce_t *mce = NULL;
1726 	ibt_path_attr_t path_attr;
1727 	ibt_path_info_t path_info;
1728 	ib_gid_t destgid;
1729 	char ret = IBD_OP_NOTSTARTED;
1730 
1731 	DPRINT(4, "ibd_async_acache :  %08X:%08X:%08X:%08X:%08X",
1732 	    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1733 	    htonl(mac->ipoib_gidpref[1]), htonl(mac->ipoib_gidsuff[0]),
1734 	    htonl(mac->ipoib_gidsuff[1]));
1735 
1736 	/*
1737 	 * Check whether we are trying to transmit to a MCG.
1738 	 * In that case, we need to make sure we are a member of
1739 	 * the MCG.
1740 	 */
1741 	if (mac->ipoib_qpn == htonl(IB_MC_QPN)) {
1742 		boolean_t redirected;
1743 
1744 		/*
1745 		 * If we can not find or join the group or even
1746 		 * redirect, error out.
1747 		 */
1748 		if ((mce = ibd_async_mcache(state, mac, &redirected)) ==
1749 		    NULL) {
1750 			state->id_ah_op = IBD_OP_ERRORED;
1751 			return;
1752 		}
1753 
1754 		/*
1755 		 * If we got redirected, we need to determine whether
1756 		 * the AH for the new mcg is in the cache already, and
1757 		 * not pull it in then; otherwise proceed to get the
1758 		 * path for the new mcg. There is no guarantee that
1759 		 * if the AH is currently in the cache, it will still be
1760 		 * there when we look in ibd_acache_lookup(), but that's
1761 		 * okay, we will come back here.
1762 		 */
1763 		if (redirected) {
1764 			ret = IBD_OP_ROUTERED;
1765 			DPRINT(4, "ibd_async_acache :  redirected to "
1766 			    "%08X:%08X:%08X:%08X:%08X",
1767 			    htonl(mac->ipoib_qpn), htonl(mac->ipoib_gidpref[0]),
1768 			    htonl(mac->ipoib_gidpref[1]),
1769 			    htonl(mac->ipoib_gidsuff[0]),
1770 			    htonl(mac->ipoib_gidsuff[1]));
1771 
1772 			mutex_enter(&state->id_ac_mutex);
1773 			if (ibd_acache_find(state, mac, B_FALSE, 0) != NULL) {
1774 				state->id_ah_op = IBD_OP_ROUTERED;
1775 				mutex_exit(&state->id_ac_mutex);
1776 				DPRINT(4, "ibd_async_acache : router AH found");
1777 				return;
1778 			}
1779 			mutex_exit(&state->id_ac_mutex);
1780 		}
1781 	}
1782 
1783 	/*
1784 	 * Get an AH from the free list.
1785 	 */
1786 	mutex_enter(&state->id_ac_mutex);
1787 	if ((ce = IBD_ACACHE_GET_FREE(state)) == NULL) {
1788 		/*
1789 		 * No free ones; try to grab an unreferenced active
1790 		 * one. Maybe we need to make the active list LRU,
1791 		 * but that will create more work for Tx callbacks.
1792 		 * Is there a way of not having to pull out the
1793 		 * entry from the active list, but just indicate it
1794 		 * is being recycled? Yes, but that creates one more
1795 		 * check in the fast lookup path.
1796 		 */
1797 		if ((ce = ibd_acache_get_unref(state)) == NULL) {
1798 			/*
1799 			 * Pretty serious shortage now.
1800 			 */
1801 			state->id_ah_op = IBD_OP_NOTSTARTED;
1802 			mutex_exit(&state->id_ac_mutex);
1803 			DPRINT(10, "ibd_async_acache : failed to find AH "
1804 			    "slot\n");
1805 			return;
1806 		}
1807 		/*
1808 		 * We could check whether ac_mce points to a SendOnly
1809 		 * member and drop that membership now. Or do it lazily
1810 		 * at detach time.
1811 		 */
1812 		ce->ac_mce = NULL;
1813 	}
1814 	mutex_exit(&state->id_ac_mutex);
1815 	ASSERT(ce->ac_mce == NULL);
1816 
1817 	/*
1818 	 * Update the entry.
1819 	 */
1820 	bcopy((char *)mac, &ce->ac_mac, IPOIB_ADDRL);
1821 
1822 	bzero(&path_info, sizeof (path_info));
1823 	bzero(&path_attr, sizeof (ibt_path_attr_t));
1824 	path_attr.pa_sgid = state->id_sgid;
1825 	path_attr.pa_num_dgids = 1;
1826 	ibd_n2h_gid(&ce->ac_mac, &destgid);
1827 	path_attr.pa_dgids = &destgid;
1828 	path_attr.pa_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
1829 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
1830 	    &path_attr, 1, &path_info, NULL) != IBT_SUCCESS) {
1831 		DPRINT(10, "ibd_async_acache : failed in ibt_get_paths");
1832 		goto error;
1833 	}
1834 	if (ibt_modify_ud_dest(ce->ac_dest, state->id_mcinfo->mc_qkey,
1835 	    ntohl(ce->ac_mac.ipoib_qpn),
1836 	    &path_info.pi_prim_cep_path.cep_adds_vect) != IBT_SUCCESS) {
1837 		DPRINT(10, "ibd_async_acache : failed in ibt_modify_ud_dest");
1838 		goto error;
1839 	}
1840 
1841 	/*
1842 	 * mce is set whenever an AH is being associated with a
1843 	 * MCG; this will come in handy when we leave the MCG. The
1844 	 * lock protects Tx fastpath from scanning the active list.
1845 	 */
1846 	if (mce != NULL)
1847 		ce->ac_mce = mce;
1848 	mutex_enter(&state->id_ac_mutex);
1849 	IBD_ACACHE_INSERT_ACTIVE(state, ce);
1850 	state->id_ah_op = ret;
1851 	mutex_exit(&state->id_ac_mutex);
1852 	return;
1853 error:
1854 	/*
1855 	 * We might want to drop SendOnly membership here if we
1856 	 * joined above. The lock protects Tx callbacks inserting
1857 	 * into the free list.
1858 	 */
1859 	mutex_enter(&state->id_ac_mutex);
1860 	state->id_ah_op = IBD_OP_ERRORED;
1861 	IBD_ACACHE_INSERT_FREE(state, ce);
1862 	mutex_exit(&state->id_ac_mutex);
1863 }
1864 
1865 /*
1866  * While restoring port's presence on the subnet on a port up, it is possible
1867  * that the port goes down again.
1868  */
1869 static void
1870 ibd_async_link(ibd_state_t *state, ibd_req_t *req)
1871 {
1872 	ibd_link_op_t opcode = (ibd_link_op_t)req->rq_ptr;
1873 	link_state_t lstate = (opcode == IBD_LINK_DOWN) ? LINK_STATE_DOWN :
1874 	    LINK_STATE_UP;
1875 	ibd_mce_t *mce, *pmce;
1876 	ibd_ace_t *ace, *pace;
1877 
1878 	DPRINT(10, "ibd_async_link(): %d", opcode);
1879 
1880 	/*
1881 	 * On a link up, revalidate the link speed/width. No point doing
1882 	 * this on a link down, since we will be unable to do SA operations,
1883 	 * defaulting to the lowest speed. Also notice that we update our
1884 	 * notion of speed before calling mac_link_update(), which will do
1885 	 * necessary higher level notifications for speed changes.
1886 	 */
1887 	if ((opcode == IBD_LINK_UP_ABSENT) || (opcode == IBD_LINK_UP)) {
1888 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
1889 		state->id_link_speed = ibd_get_portspeed(state);
1890 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
1891 	}
1892 
1893 	/*
1894 	 * Do all the work required to establish our presence on
1895 	 * the subnet.
1896 	 */
1897 	if (opcode == IBD_LINK_UP_ABSENT) {
1898 		/*
1899 		 * If in promiscuous mode ...
1900 		 */
1901 		if (state->id_prom_op == IBD_OP_COMPLETED) {
1902 			/*
1903 			 * Drop all nonmembership.
1904 			 */
1905 			ibd_async_unsetprom(state);
1906 
1907 			/*
1908 			 * Then, try to regain nonmembership to all mcg's.
1909 			 */
1910 			ibd_async_setprom(state);
1911 
1912 		}
1913 
1914 		/*
1915 		 * Drop all sendonly membership (which also gets rid of the
1916 		 * AHs); try to reacquire all full membership.
1917 		 */
1918 		mce = list_head(&state->id_mc_full);
1919 		while ((pmce = mce) != NULL) {
1920 			mce = list_next(&state->id_mc_full, mce);
1921 			if (pmce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON)
1922 				ibd_leave_group(state,
1923 				    pmce->mc_info.mc_adds_vect.av_dgid,
1924 				    IB_MC_JSTATE_SEND_ONLY_NON);
1925 			else
1926 				ibd_reacquire_group(state, pmce);
1927 		}
1928 
1929 		/*
1930 		 * Recycle all active AHs to free list (and if there are
1931 		 * pending posts, make sure they will go into the free list
1932 		 * once the Tx's complete). Grab the lock to prevent
1933 		 * concurrent Tx's as well as Tx cleanups.
1934 		 */
1935 		mutex_enter(&state->id_ac_mutex);
1936 		ace = list_head(&state->id_ah_active);
1937 		while ((pace = ace) != NULL) {
1938 			boolean_t cycled;
1939 
1940 			ace = list_next(&state->id_ah_active, ace);
1941 			mce = pace->ac_mce;
1942 			cycled = ibd_acache_recycle(state, &pace->ac_mac,
1943 			    B_TRUE);
1944 			/*
1945 			 * If this is for an mcg, it must be for a fullmember,
1946 			 * since we got rid of send-only members above when
1947 			 * processing the mce list.
1948 			 */
1949 			ASSERT(cycled && ((mce == NULL) || (mce->mc_jstate ==
1950 			    IB_MC_JSTATE_FULL)));
1951 
1952 			/*
1953 			 * Check if the fullmember mce needs to be torn down,
1954 			 * ie whether the DLPI disable has already been done.
1955 			 * If so, do some of the work of tx_cleanup, namely
1956 			 * causing leave (which will fail), detach and
1957 			 * mce-freeing. tx_cleanup will put the AH into free
1958 			 * list. The reason to duplicate some of this
1959 			 * tx_cleanup work is because we want to delete the
1960 			 * AH right now instead of waiting for tx_cleanup, to
1961 			 * force subsequent Tx's to reacquire an AH.
1962 			 */
1963 			if ((mce != NULL) && (mce->mc_fullreap))
1964 				ibd_async_reap_group(state, mce,
1965 				    mce->mc_info.mc_adds_vect.av_dgid,
1966 				    mce->mc_jstate);
1967 		}
1968 		mutex_exit(&state->id_ac_mutex);
1969 	}
1970 
1971 	/*
1972 	 * mac handle is guaranteed to exist since driver does ibt_close_hca()
1973 	 * (which stops further events from being delivered) before
1974 	 * mac_unregister(). At this point, it is guaranteed that mac_register
1975 	 * has already been done.
1976 	 */
1977 	mutex_enter(&state->id_link_mutex);
1978 	state->id_link_state = lstate;
1979 	mac_link_update(state->id_mh, lstate);
1980 	mutex_exit(&state->id_link_mutex);
1981 
1982 	ibd_async_done(state);
1983 }
1984 
1985 /*
1986  * Check the pkey table to see if we can find the pkey we're looking for.
1987  * Set the pkey index in 'pkix' if found. Return 0 on success and -1 on
1988  * failure.
1989  */
1990 static int
1991 ibd_locate_pkey(ib_pkey_t *pkey_tbl, uint16_t pkey_tbl_sz, ib_pkey_t pkey,
1992     uint16_t *pkix)
1993 {
1994 	uint16_t ndx;
1995 
1996 	ASSERT(pkix != NULL);
1997 
1998 	for (ndx = 0; ndx < pkey_tbl_sz; ndx++) {
1999 		if (pkey_tbl[ndx] == pkey) {
2000 			*pkix = ndx;
2001 			return (0);
2002 		}
2003 	}
2004 	return (-1);
2005 }
2006 
2007 /*
2008  * When the link is notified up, we need to do a few things, based
2009  * on the port's current p_init_type_reply claiming a reinit has been
2010  * done or not. The reinit steps are:
2011  * 1. If in InitTypeReply, NoLoadReply == PreserveContentReply == 0, verify
2012  *    the old Pkey and GID0 are correct.
2013  * 2. Register for mcg traps (already done by ibmf).
2014  * 3. If PreservePresenceReply indicates the SM has restored port's presence
2015  *    in subnet, nothing more to do. Else go to next steps (on async daemon).
2016  * 4. Give up all sendonly memberships.
2017  * 5. Acquire all full memberships.
2018  * 6. In promiscuous mode, acquire all non memberships.
2019  * 7. Recycle all AHs to free list.
2020  */
2021 static void
2022 ibd_link_mod(ibd_state_t *state, ibt_async_code_t code)
2023 {
2024 	ibt_hca_portinfo_t *port_infop = NULL;
2025 	ibt_status_t ibt_status;
2026 	uint_t psize, port_infosz;
2027 	ibd_link_op_t opcode;
2028 	ibd_req_t *req;
2029 	link_state_t new_link_state = LINK_STATE_UP;
2030 	uint8_t itreply;
2031 	uint16_t pkix;
2032 	int ret;
2033 
2034 	/*
2035 	 * Let's not race with a plumb or an unplumb; if we detect a
2036 	 * pkey relocation event later on here, we may have to restart.
2037 	 */
2038 	ibd_set_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2039 
2040 	mutex_enter(&state->id_link_mutex);
2041 
2042 	/*
2043 	 * If the init code in ibd_m_start hasn't yet set up the
2044 	 * pkey/gid, nothing to do; that code will set the link state.
2045 	 */
2046 	if (state->id_link_state == LINK_STATE_UNKNOWN) {
2047 		mutex_exit(&state->id_link_mutex);
2048 		goto link_mod_return;
2049 	}
2050 
2051 	/*
2052 	 * If this routine was called in response to a port down event,
2053 	 * we just need to see if this should be informed.
2054 	 */
2055 	if (code == IBT_ERROR_PORT_DOWN) {
2056 		new_link_state = LINK_STATE_DOWN;
2057 		goto update_link_state;
2058 	}
2059 
2060 	/*
2061 	 * If it's not a port down event we've received, try to get the port
2062 	 * attributes first. If we fail here, the port is as good as down.
2063 	 * Otherwise, if the link went down by the time the handler gets
2064 	 * here, give up - we cannot even validate the pkey/gid since those
2065 	 * are not valid and this is as bad as a port down anyway.
2066 	 */
2067 	ibt_status = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
2068 	    &port_infop, &psize, &port_infosz);
2069 	if ((ibt_status != IBT_SUCCESS) || (psize != 1) ||
2070 	    (port_infop->p_linkstate != IBT_PORT_ACTIVE)) {
2071 		new_link_state = LINK_STATE_DOWN;
2072 		goto update_link_state;
2073 	}
2074 
2075 	/*
2076 	 * Check the SM InitTypeReply flags. If both NoLoadReply and
2077 	 * PreserveContentReply are 0, we don't know anything about the
2078 	 * data loaded into the port attributes, so we need to verify
2079 	 * if gid0 and pkey are still valid.
2080 	 */
2081 	itreply = port_infop->p_init_type_reply;
2082 	if (((itreply & SM_INIT_TYPE_REPLY_NO_LOAD_REPLY) == 0) &&
2083 	    ((itreply & SM_INIT_TYPE_PRESERVE_CONTENT_REPLY) == 0)) {
2084 		/*
2085 		 * Check to see if the subnet part of GID0 has changed. If
2086 		 * not, check the simple case first to see if the pkey
2087 		 * index is the same as before; finally check to see if the
2088 		 * pkey has been relocated to a different index in the table.
2089 		 */
2090 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
2091 		if (bcmp(port_infop->p_sgid_tbl,
2092 		    &state->id_sgid, sizeof (ib_gid_t)) != 0) {
2093 
2094 			new_link_state = LINK_STATE_DOWN;
2095 
2096 		} else if (port_infop->p_pkey_tbl[state->id_pkix] ==
2097 		    state->id_pkey) {
2098 
2099 			new_link_state = LINK_STATE_UP;
2100 
2101 		} else if (ibd_locate_pkey(port_infop->p_pkey_tbl,
2102 		    port_infop->p_pkey_tbl_sz, state->id_pkey, &pkix) == 0) {
2103 
2104 			ibt_free_portinfo(port_infop, port_infosz);
2105 			mutex_exit(&state->id_link_mutex);
2106 
2107 			/*
2108 			 * Currently a restart is required if our pkey has moved
2109 			 * in the pkey table. If we get the ibt_recycle_ud() to
2110 			 * work as documented (expected), we may be able to
2111 			 * avoid a complete restart.  Note that we've already
2112 			 * marked both the start and stop 'in-progress' flags,
2113 			 * so it is ok to go ahead and do this restart.
2114 			 */
2115 			(void) ibd_undo_start(state, LINK_STATE_DOWN);
2116 			if ((ret = ibd_start(state)) != 0) {
2117 				DPRINT(10, "ibd_restart: cannot restart, "
2118 				    "ret=%d", ret);
2119 			}
2120 
2121 			goto link_mod_return;
2122 		} else {
2123 			new_link_state = LINK_STATE_DOWN;
2124 		}
2125 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
2126 	}
2127 
2128 update_link_state:
2129 	if (port_infop) {
2130 		ibt_free_portinfo(port_infop, port_infosz);
2131 	}
2132 
2133 	/*
2134 	 * If the old state is the same as the new state, nothing to do
2135 	 */
2136 	if (state->id_link_state == new_link_state) {
2137 		mutex_exit(&state->id_link_mutex);
2138 		goto link_mod_return;
2139 	}
2140 
2141 	/*
2142 	 * Ok, so there was a link state change; see if it's safe to ask
2143 	 * the async thread to do the work
2144 	 */
2145 	if (!ibd_async_safe(state)) {
2146 		state->id_link_state = new_link_state;
2147 		mutex_exit(&state->id_link_mutex);
2148 		goto link_mod_return;
2149 	}
2150 
2151 	mutex_exit(&state->id_link_mutex);
2152 
2153 	/*
2154 	 * If we're reporting a link up, check InitTypeReply to see if
2155 	 * the SM has ensured that the port's presence in mcg, traps,
2156 	 * etc. is intact.
2157 	 */
2158 	if (new_link_state == LINK_STATE_DOWN) {
2159 		opcode = IBD_LINK_DOWN;
2160 	} else {
2161 		if ((itreply & SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) ==
2162 		    SM_INIT_TYPE_PRESERVE_PRESENCE_REPLY) {
2163 			opcode = IBD_LINK_UP;
2164 		} else {
2165 			opcode = IBD_LINK_UP_ABSENT;
2166 		}
2167 	}
2168 
2169 	/*
2170 	 * Queue up a request for ibd_async_link() to handle this link
2171 	 * state change event
2172 	 */
2173 	req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
2174 	req->rq_ptr = (void *)opcode;
2175 	ibd_queue_work_slot(state, req, IBD_ASYNC_LINK);
2176 
2177 link_mod_return:
2178 	ibd_clr_mac_progress(state, IBD_DRV_RESTART_IN_PROGRESS);
2179 }
2180 
2181 /*
2182  * For the port up/down events, IBTL guarantees there will not be concurrent
2183  * invocations of the handler. IBTL might coalesce link transition events,
2184  * and not invoke the handler for _each_ up/down transition, but it will
2185  * invoke the handler with last known state
2186  */
2187 static void
2188 ibd_async_handler(void *clnt_private, ibt_hca_hdl_t hca_hdl,
2189     ibt_async_code_t code, ibt_async_event_t *event)
2190 {
2191 	ibd_state_t *state = (ibd_state_t *)clnt_private;
2192 
2193 	switch (code) {
2194 	case IBT_ERROR_CATASTROPHIC_CHAN:
2195 		ibd_print_warn(state, "catastrophic channel error");
2196 		break;
2197 	case IBT_ERROR_CQ:
2198 		ibd_print_warn(state, "completion queue error");
2199 		break;
2200 	case IBT_PORT_CHANGE_EVENT:
2201 		/*
2202 		 * Events will be delivered to all instances that have
2203 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2204 		 * Only need to do work for our port; IBTF will deliver
2205 		 * events for other ports on the hca we have ibt_open_hca'ed
2206 		 * too. Note that id_port is initialized in ibd_attach()
2207 		 * before we do an ibt_open_hca() in ibd_attach().
2208 		 */
2209 		ASSERT(state->id_hca_hdl == hca_hdl);
2210 		if (state->id_port != event->ev_port)
2211 			break;
2212 
2213 		if ((event->ev_port_flags & IBT_PORT_CHANGE_PKEY) ==
2214 		    IBT_PORT_CHANGE_PKEY) {
2215 			ibd_link_mod(state, code);
2216 		}
2217 		break;
2218 	case IBT_ERROR_PORT_DOWN:
2219 	case IBT_CLNT_REREG_EVENT:
2220 	case IBT_EVENT_PORT_UP:
2221 		/*
2222 		 * Events will be delivered to all instances that have
2223 		 * done ibt_open_hca() but not yet done ibt_close_hca().
2224 		 * Only need to do work for our port; IBTF will deliver
2225 		 * events for other ports on the hca we have ibt_open_hca'ed
2226 		 * too. Note that id_port is initialized in ibd_attach()
2227 		 * before we do an ibt_open_hca() in ibd_attach().
2228 		 */
2229 		ASSERT(state->id_hca_hdl == hca_hdl);
2230 		if (state->id_port != event->ev_port)
2231 			break;
2232 
2233 		ibd_link_mod(state, code);
2234 		break;
2235 
2236 	case IBT_HCA_ATTACH_EVENT:
2237 	case IBT_HCA_DETACH_EVENT:
2238 		/*
2239 		 * When a new card is plugged to the system, attach_event is
2240 		 * invoked. Additionally, a cfgadm needs to be run to make the
2241 		 * card known to the system, and an ifconfig needs to be run to
2242 		 * plumb up any ibd interfaces on the card. In the case of card
2243 		 * unplug, a cfgadm is run that will trigger any RCM scripts to
2244 		 * unplumb the ibd interfaces on the card; when the card is
2245 		 * actually unplugged, the detach_event is invoked;
2246 		 * additionally, if any ibd instances are still active on the
2247 		 * card (eg there were no associated RCM scripts), driver's
2248 		 * detach routine is invoked.
2249 		 */
2250 		break;
2251 	default:
2252 		break;
2253 	}
2254 }
2255 
2256 static int
2257 ibd_register_mac(ibd_state_t *state, dev_info_t *dip)
2258 {
2259 	mac_register_t *macp;
2260 	int ret;
2261 
2262 	if ((macp = mac_alloc(MAC_VERSION)) == NULL) {
2263 		DPRINT(10, "ibd_register_mac: mac_alloc() failed");
2264 		return (DDI_FAILURE);
2265 	}
2266 
2267 	/*
2268 	 * Note that when we register with mac during attach, we don't
2269 	 * have the id_macaddr yet, so we'll simply be registering a
2270 	 * zero macaddr that we'll overwrite later during plumb (in
2271 	 * ibd_m_start()). Similar is the case with id_mtu - we'll
2272 	 * update the mac layer with the correct mtu during plumb.
2273 	 */
2274 	macp->m_type_ident = MAC_PLUGIN_IDENT_IB;
2275 	macp->m_driver = state;
2276 	macp->m_dip = dip;
2277 	macp->m_src_addr = (uint8_t *)&state->id_macaddr;
2278 	macp->m_callbacks = &ibd_m_callbacks;
2279 	macp->m_min_sdu = 0;
2280 	macp->m_max_sdu = IBD_DEF_MAX_SDU;
2281 
2282 	/*
2283 	 *  Register ourselves with the GLDv3 interface
2284 	 */
2285 	if ((ret = mac_register(macp, &state->id_mh)) != 0) {
2286 		mac_free(macp);
2287 		DPRINT(10,
2288 		    "ibd_register_mac: mac_register() failed, ret=%d", ret);
2289 		return (DDI_FAILURE);
2290 	}
2291 
2292 	mac_free(macp);
2293 	return (DDI_SUCCESS);
2294 }
2295 
2296 static int
2297 ibd_record_capab(ibd_state_t *state, dev_info_t *dip)
2298 {
2299 	ibt_hca_attr_t hca_attrs;
2300 	ibt_status_t ibt_status;
2301 
2302 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*state))
2303 
2304 	/*
2305 	 * Query the HCA and fetch its attributes
2306 	 */
2307 	ibt_status = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
2308 	ASSERT(ibt_status == IBT_SUCCESS);
2309 
2310 	/*
2311 	 * 1. Set the Hardware Checksum capability. Currently we only consider
2312 	 *    full checksum offload.
2313 	 */
2314 	if ((hca_attrs.hca_flags & IBT_HCA_CKSUM_FULL) == IBT_HCA_CKSUM_FULL) {
2315 		state->id_hwcksum_capab = IBT_HCA_CKSUM_FULL;
2316 	}
2317 
2318 	/*
2319 	 * 2. Set LSO policy, capability and maximum length
2320 	 */
2321 	if (ddi_prop_get_int(DDI_DEV_T_ANY, dip,
2322 	    DDI_PROP_DONTPASS | DDI_PROP_NOTPROM, IBD_PROP_LSO_POLICY, 1)) {
2323 		state->id_lso_policy = B_TRUE;
2324 	} else {
2325 		state->id_lso_policy = B_FALSE;
2326 	}
2327 
2328 	if (hca_attrs.hca_max_lso_size > 0) {
2329 		state->id_lso_capable = B_TRUE;
2330 		if (hca_attrs.hca_max_lso_size > IBD_LSO_MAXLEN)
2331 			state->id_lso_maxlen = IBD_LSO_MAXLEN;
2332 		else
2333 			state->id_lso_maxlen = hca_attrs.hca_max_lso_size;
2334 	} else {
2335 		state->id_lso_capable = B_FALSE;
2336 		state->id_lso_maxlen = 0;
2337 	}
2338 
2339 	/*
2340 	 * 3. Set Reserved L_Key capability
2341 	 */
2342 	if ((hca_attrs.hca_flags2 & IBT_HCA2_RES_LKEY) == IBT_HCA2_RES_LKEY) {
2343 		state->id_hca_res_lkey_capab = 1;
2344 		state->id_res_lkey = hca_attrs.hca_reserved_lkey;
2345 	}
2346 
2347 	/*
2348 	 * 4. Set maximum sqseg value after checking to see if extended sgl
2349 	 *    size information is provided by the hca
2350 	 */
2351 	if (hca_attrs.hca_flags & IBT_HCA_WQE_SIZE_INFO) {
2352 		state->id_max_sqseg = hca_attrs.hca_ud_send_sgl_sz;
2353 	} else {
2354 		state->id_max_sqseg = hca_attrs.hca_max_sgl;
2355 	}
2356 	if (state->id_max_sqseg > IBD_MAX_SQSEG) {
2357 		state->id_max_sqseg = IBD_MAX_SQSEG;
2358 	} else if (state->id_max_sqseg < IBD_MAX_SQSEG) {
2359 		ibd_print_warn(state, "Set #sgl = %d instead of default %d",
2360 		    state->id_max_sqseg, IBD_MAX_SQSEG);
2361 	}
2362 
2363 	/*
2364 	 * Translating the virtual address regions into physical regions
2365 	 * for using the Reserved LKey feature results in a wr sgl that
2366 	 * is a little longer. Since failing ibt_map_mem_iov() is costly,
2367 	 * we'll fix a high-water mark (65%) for when we should stop.
2368 	 */
2369 	state->id_max_sqseg_hiwm = (state->id_max_sqseg * 65) / 100;
2370 
2371 	/*
2372 	 * 5. Set number of recv and send wqes after checking hca maximum
2373 	 *    channel size
2374 	 */
2375 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_RWQE) {
2376 		state->id_num_rwqe = hca_attrs.hca_max_chan_sz;
2377 	} else {
2378 		state->id_num_rwqe = IBD_NUM_RWQE;
2379 	}
2380 	state->id_rx_bufs_outstanding_limit = state->id_num_rwqe - IBD_RWQE_MIN;
2381 	if (hca_attrs.hca_max_chan_sz < IBD_NUM_SWQE) {
2382 		state->id_num_swqe = hca_attrs.hca_max_chan_sz;
2383 	} else {
2384 		state->id_num_swqe = IBD_NUM_SWQE;
2385 	}
2386 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2387 
2388 	return (DDI_SUCCESS);
2389 }
2390 
2391 static int
2392 ibd_unattach(ibd_state_t *state, dev_info_t *dip)
2393 {
2394 	int instance;
2395 	uint32_t progress = state->id_mac_state;
2396 	ibt_status_t ret;
2397 
2398 	if (progress & IBD_DRV_MAC_REGISTERED) {
2399 		(void) mac_unregister(state->id_mh);
2400 		state->id_mac_state &= (~IBD_DRV_MAC_REGISTERED);
2401 	}
2402 
2403 	if (progress & IBD_DRV_PD_ALLOCD) {
2404 		if ((ret = ibt_free_pd(state->id_hca_hdl,
2405 		    state->id_pd_hdl)) != IBT_SUCCESS) {
2406 			ibd_print_warn(state, "failed to free "
2407 			    "protection domain, ret=%d", ret);
2408 		}
2409 		state->id_pd_hdl = NULL;
2410 		state->id_mac_state &= (~IBD_DRV_PD_ALLOCD);
2411 	}
2412 
2413 	if (progress & IBD_DRV_HCA_OPENED) {
2414 		if ((ret = ibt_close_hca(state->id_hca_hdl)) !=
2415 		    IBT_SUCCESS) {
2416 			ibd_print_warn(state, "failed to close "
2417 			    "HCA device, ret=%d", ret);
2418 		}
2419 		state->id_hca_hdl = NULL;
2420 		state->id_mac_state &= (~IBD_DRV_HCA_OPENED);
2421 	}
2422 
2423 	if (progress & IBD_DRV_IBTL_ATTACH_DONE) {
2424 		if ((ret = ibt_detach(state->id_ibt_hdl)) != IBT_SUCCESS) {
2425 			ibd_print_warn(state,
2426 			    "ibt_detach() failed, ret=%d", ret);
2427 		}
2428 		state->id_ibt_hdl = NULL;
2429 		state->id_mac_state &= (~IBD_DRV_IBTL_ATTACH_DONE);
2430 	}
2431 
2432 	if (progress & IBD_DRV_TXINTR_ADDED) {
2433 		ddi_remove_softintr(state->id_tx);
2434 		state->id_tx = NULL;
2435 		state->id_mac_state &= (~IBD_DRV_TXINTR_ADDED);
2436 	}
2437 
2438 	if (progress & IBD_DRV_RXINTR_ADDED) {
2439 		ddi_remove_softintr(state->id_rx);
2440 		state->id_rx = NULL;
2441 		state->id_mac_state &= (~IBD_DRV_RXINTR_ADDED);
2442 	}
2443 
2444 	if (progress & IBD_DRV_STATE_INITIALIZED) {
2445 		ibd_state_fini(state);
2446 		state->id_mac_state &= (~IBD_DRV_STATE_INITIALIZED);
2447 	}
2448 
2449 	instance = ddi_get_instance(dip);
2450 	ddi_soft_state_free(ibd_list, instance);
2451 
2452 	return (DDI_SUCCESS);
2453 }
2454 
2455 /*
2456  * Attach device to the IO framework.
2457  */
2458 static int
2459 ibd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
2460 {
2461 	ibd_state_t *state = NULL;
2462 	ib_guid_t hca_guid;
2463 	int instance;
2464 	ibt_status_t ret;
2465 	int rv;
2466 
2467 	/*
2468 	 * IBD doesn't support suspend/resume
2469 	 */
2470 	if (cmd != DDI_ATTACH)
2471 		return (DDI_FAILURE);
2472 
2473 	/*
2474 	 * Allocate softstate structure
2475 	 */
2476 	instance = ddi_get_instance(dip);
2477 	if (ddi_soft_state_zalloc(ibd_list, instance) == DDI_FAILURE)
2478 		return (DDI_FAILURE);
2479 	state = ddi_get_soft_state(ibd_list, instance);
2480 
2481 	/*
2482 	 * Initialize mutexes and condition variables
2483 	 */
2484 	if (ibd_state_init(state, dip) != DDI_SUCCESS) {
2485 		DPRINT(10, "ibd_attach: failed in ibd_state_init()");
2486 		goto attach_fail;
2487 	}
2488 	state->id_mac_state |= IBD_DRV_STATE_INITIALIZED;
2489 
2490 	/*
2491 	 * Allocate rx,tx softintr
2492 	 */
2493 	if (ibd_rx_softintr == 1) {
2494 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_rx,
2495 		    NULL, NULL, ibd_intr, (caddr_t)state)) != DDI_SUCCESS) {
2496 			DPRINT(10, "ibd_attach: failed in "
2497 			    "ddi_add_softintr(id_rx),  ret=%d", rv);
2498 			goto attach_fail;
2499 		}
2500 		state->id_mac_state |= IBD_DRV_RXINTR_ADDED;
2501 	}
2502 	if (ibd_tx_softintr == 1) {
2503 		if ((rv = ddi_add_softintr(dip, DDI_SOFTINT_LOW, &state->id_tx,
2504 		    NULL, NULL, ibd_tx_recycle,
2505 		    (caddr_t)state)) != DDI_SUCCESS) {
2506 			DPRINT(10, "ibd_attach: failed in "
2507 			    "ddi_add_softintr(id_tx), ret=%d", rv);
2508 			goto attach_fail;
2509 		}
2510 		state->id_mac_state |= IBD_DRV_TXINTR_ADDED;
2511 	}
2512 
2513 	/*
2514 	 * Obtain IBA P_Key, port number and HCA guid and validate
2515 	 * them (for P_Key, only full members are allowed as per
2516 	 * IPoIB specification; neither port number nor HCA guid
2517 	 * can be zero)
2518 	 */
2519 	if ((state->id_pkey = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2520 	    "port-pkey", IB_PKEY_INVALID_LIMITED)) <= IB_PKEY_INVALID_FULL) {
2521 		DPRINT(10, "ibd_attach: port device has wrong partition (0x%x)",
2522 		    state->id_pkey);
2523 		goto attach_fail;
2524 	}
2525 	if ((state->id_port = ddi_prop_get_int(DDI_DEV_T_ANY, dip, 0,
2526 	    "port-number", 0)) == 0) {
2527 		DPRINT(10, "ibd_attach: invalid port number (%d)",
2528 		    state->id_port);
2529 		goto attach_fail;
2530 	}
2531 	if ((hca_guid = ddi_prop_get_int64(DDI_DEV_T_ANY, dip, 0,
2532 	    "hca-guid", 0)) == 0) {
2533 		DPRINT(10, "ibd_attach: port hca has invalid guid (0x%llx)",
2534 		    hca_guid);
2535 		goto attach_fail;
2536 	}
2537 
2538 	/*
2539 	 * Attach to IBTL
2540 	 */
2541 	if ((ret = ibt_attach(&ibd_clnt_modinfo, dip, state,
2542 	    &state->id_ibt_hdl)) != IBT_SUCCESS) {
2543 		DPRINT(10, "ibd_attach: failed in ibt_attach(), ret=%d", ret);
2544 		goto attach_fail;
2545 	}
2546 	state->id_mac_state |= IBD_DRV_IBTL_ATTACH_DONE;
2547 
2548 	/*
2549 	 * Open the HCA
2550 	 */
2551 	if ((ret = ibt_open_hca(state->id_ibt_hdl, hca_guid,
2552 	    &state->id_hca_hdl)) != IBT_SUCCESS) {
2553 		DPRINT(10, "ibd_attach: ibt_open_hca() failed, ret=%d", ret);
2554 		goto attach_fail;
2555 	}
2556 	state->id_mac_state |= IBD_DRV_HCA_OPENED;
2557 
2558 	/*
2559 	 * Record capabilities
2560 	 */
2561 	(void) ibd_record_capab(state, dip);
2562 
2563 	/*
2564 	 * Allocate a protection domain on the HCA
2565 	 */
2566 	if ((ret = ibt_alloc_pd(state->id_hca_hdl, IBT_PD_NO_FLAGS,
2567 	    &state->id_pd_hdl)) != IBT_SUCCESS) {
2568 		DPRINT(10, "ibd_attach: ibt_alloc_pd() failed, ret=%d", ret);
2569 		goto attach_fail;
2570 	}
2571 	state->id_mac_state |= IBD_DRV_PD_ALLOCD;
2572 
2573 
2574 	/*
2575 	 * Register ibd interfaces with the Nemo framework
2576 	 */
2577 	if (ibd_register_mac(state, dip) != IBT_SUCCESS) {
2578 		DPRINT(10, "ibd_attach: failed in ibd_register_mac()");
2579 		goto attach_fail;
2580 	}
2581 	state->id_mac_state |= IBD_DRV_MAC_REGISTERED;
2582 
2583 	/*
2584 	 * We're done with everything we could to make the attach
2585 	 * succeed.  All the buffer allocations and IPoIB broadcast
2586 	 * group joins are deferred to when the interface instance
2587 	 * is actually plumbed to avoid wasting memory.
2588 	 */
2589 	return (DDI_SUCCESS);
2590 
2591 attach_fail:
2592 	(void) ibd_unattach(state, dip);
2593 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(*state))
2594 	return (DDI_FAILURE);
2595 }
2596 
2597 /*
2598  * Detach device from the IO framework.
2599  */
2600 static int
2601 ibd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
2602 {
2603 	ibd_state_t *state;
2604 	int instance;
2605 
2606 	/*
2607 	 * IBD doesn't support suspend/resume
2608 	 */
2609 	if (cmd != DDI_DETACH)
2610 		return (DDI_FAILURE);
2611 
2612 	/*
2613 	 * Get the instance softstate
2614 	 */
2615 	instance = ddi_get_instance(dip);
2616 	state = ddi_get_soft_state(ibd_list, instance);
2617 
2618 	/*
2619 	 * Release all resources we're holding still.  Note that if we'd
2620 	 * done ibd_attach(), ibd_m_start() and ibd_m_stop() correctly
2621 	 * so far, we should find all the flags we need in id_mac_state.
2622 	 */
2623 	(void) ibd_unattach(state, dip);
2624 
2625 	return (DDI_SUCCESS);
2626 }
2627 
2628 /*
2629  * Pre ibt_attach() driver initialization
2630  */
2631 static int
2632 ibd_state_init(ibd_state_t *state, dev_info_t *dip)
2633 {
2634 	char buf[64];
2635 
2636 	mutex_init(&state->id_link_mutex, NULL, MUTEX_DRIVER, NULL);
2637 	state->id_link_state = LINK_STATE_UNKNOWN;
2638 
2639 	mutex_init(&state->id_trap_lock, NULL, MUTEX_DRIVER, NULL);
2640 	cv_init(&state->id_trap_cv, NULL, CV_DEFAULT, NULL);
2641 	state->id_trap_stop = B_TRUE;
2642 	state->id_trap_inprog = 0;
2643 
2644 	mutex_init(&state->id_scq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2645 	mutex_init(&state->id_rcq_poll_lock, NULL, MUTEX_DRIVER, NULL);
2646 	state->id_dip = dip;
2647 
2648 	mutex_init(&state->id_sched_lock, NULL, MUTEX_DRIVER, NULL);
2649 
2650 	mutex_init(&state->id_tx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2651 	mutex_enter(&state->id_tx_list.dl_mutex);
2652 	state->id_tx_list.dl_head = NULL;
2653 	state->id_tx_list.dl_pending_sends = B_FALSE;
2654 	state->id_tx_list.dl_cnt = 0;
2655 	mutex_exit(&state->id_tx_list.dl_mutex);
2656 	mutex_init(&state->id_tx_rel_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2657 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
2658 	state->id_tx_rel_list.dl_head = NULL;
2659 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
2660 	state->id_tx_rel_list.dl_cnt = 0;
2661 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
2662 	mutex_init(&state->id_txpost_lock, NULL, MUTEX_DRIVER, NULL);
2663 	state->id_tx_busy = 0;
2664 	mutex_init(&state->id_lso_lock, NULL, MUTEX_DRIVER, NULL);
2665 
2666 	state->id_rx_list.dl_bufs_outstanding = 0;
2667 	state->id_rx_list.dl_cnt = 0;
2668 	mutex_init(&state->id_rx_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2669 	mutex_init(&state->id_rx_free_list.dl_mutex, NULL, MUTEX_DRIVER, NULL);
2670 	(void) sprintf(buf, "ibd_req%d", ddi_get_instance(dip));
2671 	state->id_req_kmc = kmem_cache_create(buf, sizeof (ibd_req_t),
2672 	    0, NULL, NULL, NULL, NULL, NULL, 0);
2673 
2674 	mutex_init(&state->id_macst_lock, NULL, MUTEX_DRIVER, NULL);
2675 	cv_init(&state->id_macst_cv, NULL, CV_DEFAULT, NULL);
2676 
2677 	return (DDI_SUCCESS);
2678 }
2679 
2680 /*
2681  * Post ibt_detach() driver deconstruction
2682  */
2683 static void
2684 ibd_state_fini(ibd_state_t *state)
2685 {
2686 	cv_destroy(&state->id_macst_cv);
2687 	mutex_destroy(&state->id_macst_lock);
2688 
2689 	kmem_cache_destroy(state->id_req_kmc);
2690 
2691 	mutex_destroy(&state->id_rx_list.dl_mutex);
2692 	mutex_destroy(&state->id_rx_free_list.dl_mutex);
2693 
2694 	mutex_destroy(&state->id_txpost_lock);
2695 	mutex_destroy(&state->id_tx_list.dl_mutex);
2696 	mutex_destroy(&state->id_tx_rel_list.dl_mutex);
2697 	mutex_destroy(&state->id_lso_lock);
2698 
2699 	mutex_destroy(&state->id_sched_lock);
2700 	mutex_destroy(&state->id_scq_poll_lock);
2701 	mutex_destroy(&state->id_rcq_poll_lock);
2702 
2703 	cv_destroy(&state->id_trap_cv);
2704 	mutex_destroy(&state->id_trap_lock);
2705 	mutex_destroy(&state->id_link_mutex);
2706 }
2707 
2708 /*
2709  * Fetch link speed from SA for snmp ifspeed reporting.
2710  */
2711 static uint64_t
2712 ibd_get_portspeed(ibd_state_t *state)
2713 {
2714 	int			ret;
2715 	ibt_path_info_t		path;
2716 	ibt_path_attr_t		path_attr;
2717 	uint8_t			num_paths;
2718 	uint64_t		ifspeed;
2719 
2720 	/*
2721 	 * Due to serdes 8b10b encoding on the wire, 2.5 Gbps on wire
2722 	 * translates to 2 Gbps data rate. Thus, 1X single data rate is
2723 	 * 2000000000. Start with that as default.
2724 	 */
2725 	ifspeed = 2000000000;
2726 
2727 	bzero(&path_attr, sizeof (path_attr));
2728 
2729 	/*
2730 	 * Get the port speed from Loopback path information.
2731 	 */
2732 	path_attr.pa_dgids = &state->id_sgid;
2733 	path_attr.pa_num_dgids = 1;
2734 	path_attr.pa_sgid = state->id_sgid;
2735 
2736 	if (ibt_get_paths(state->id_ibt_hdl, IBT_PATH_NO_FLAGS,
2737 	    &path_attr, 1, &path, &num_paths) != IBT_SUCCESS)
2738 		goto earlydone;
2739 
2740 	if (num_paths < 1)
2741 		goto earlydone;
2742 
2743 	/*
2744 	 * In case SA does not return an expected value, report the default
2745 	 * speed as 1X.
2746 	 */
2747 	ret = 1;
2748 	switch (path.pi_prim_cep_path.cep_adds_vect.av_srate) {
2749 		case IBT_SRATE_2:	/*  1X SDR i.e 2.5 Gbps */
2750 			ret = 1;
2751 			break;
2752 		case IBT_SRATE_10:	/*  4X SDR or 1X QDR i.e 10 Gbps */
2753 			ret = 4;
2754 			break;
2755 		case IBT_SRATE_30:	/* 12X SDR i.e 30 Gbps */
2756 			ret = 12;
2757 			break;
2758 		case IBT_SRATE_5:	/*  1X DDR i.e  5 Gbps */
2759 			ret = 2;
2760 			break;
2761 		case IBT_SRATE_20:	/*  4X DDR or 8X SDR i.e 20 Gbps */
2762 			ret = 8;
2763 			break;
2764 		case IBT_SRATE_40:	/*  8X DDR or 4X QDR i.e 40 Gbps */
2765 			ret = 16;
2766 			break;
2767 		case IBT_SRATE_60:	/* 12X DDR i.e 60 Gbps */
2768 			ret = 24;
2769 			break;
2770 		case IBT_SRATE_80:	/*  8X QDR i.e 80 Gbps */
2771 			ret = 32;
2772 			break;
2773 		case IBT_SRATE_120:	/* 12X QDR i.e 120 Gbps */
2774 			ret = 48;
2775 			break;
2776 	}
2777 
2778 	ifspeed *= ret;
2779 
2780 earlydone:
2781 	return (ifspeed);
2782 }
2783 
2784 /*
2785  * Search input mcg list (id_mc_full or id_mc_non) for an entry
2786  * representing the input mcg mgid.
2787  */
2788 static ibd_mce_t *
2789 ibd_mcache_find(ib_gid_t mgid, struct list *mlist)
2790 {
2791 	ibd_mce_t *ptr = list_head(mlist);
2792 
2793 	/*
2794 	 * Do plain linear search.
2795 	 */
2796 	while (ptr != NULL) {
2797 		if (bcmp(&mgid, &ptr->mc_info.mc_adds_vect.av_dgid,
2798 		    sizeof (ib_gid_t)) == 0)
2799 			return (ptr);
2800 		ptr = list_next(mlist, ptr);
2801 	}
2802 	return (NULL);
2803 }
2804 
2805 /*
2806  * Execute IBA JOIN.
2807  */
2808 static ibt_status_t
2809 ibd_iba_join(ibd_state_t *state, ib_gid_t mgid, ibd_mce_t *mce)
2810 {
2811 	ibt_mcg_attr_t mcg_attr;
2812 
2813 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
2814 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
2815 	mcg_attr.mc_mgid = mgid;
2816 	mcg_attr.mc_join_state = mce->mc_jstate;
2817 	mcg_attr.mc_scope = state->id_scope;
2818 	mcg_attr.mc_pkey = state->id_pkey;
2819 	mcg_attr.mc_flow = state->id_mcinfo->mc_adds_vect.av_flow;
2820 	mcg_attr.mc_sl = state->id_mcinfo->mc_adds_vect.av_srvl;
2821 	mcg_attr.mc_tclass = state->id_mcinfo->mc_adds_vect.av_tclass;
2822 	return (ibt_join_mcg(state->id_sgid, &mcg_attr, &mce->mc_info,
2823 	    NULL, NULL));
2824 }
2825 
2826 /*
2827  * This code JOINs the port in the proper way (depending on the join
2828  * state) so that IBA fabric will forward mcg packets to/from the port.
2829  * It also attaches the QPN to the mcg so it can receive those mcg
2830  * packets. This code makes sure not to attach the mcg to the QP if
2831  * that has been previously done due to the mcg being joined with a
2832  * different join state, even though this is not required by SWG_0216,
2833  * refid 3610.
2834  */
2835 static ibd_mce_t *
2836 ibd_join_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
2837 {
2838 	ibt_status_t ibt_status;
2839 	ibd_mce_t *mce, *tmce, *omce = NULL;
2840 	boolean_t do_attach = B_TRUE;
2841 
2842 	DPRINT(2, "ibd_join_group : join_group state %d : %016llx:%016llx\n",
2843 	    jstate, mgid.gid_prefix, mgid.gid_guid);
2844 
2845 	/*
2846 	 * For enable_multicast Full member joins, we need to do some
2847 	 * extra work. If there is already an mce on the list that
2848 	 * indicates full membership, that means the membership has
2849 	 * not yet been dropped (since the disable_multicast was issued)
2850 	 * because there are pending Tx's to the mcg; in that case, just
2851 	 * mark the mce not to be reaped when the Tx completion queues
2852 	 * an async reap operation.
2853 	 *
2854 	 * If there is already an mce on the list indicating sendonly
2855 	 * membership, try to promote to full membership. Be careful
2856 	 * not to deallocate the old mce, since there might be an AH
2857 	 * pointing to it; instead, update the old mce with new data
2858 	 * that tracks the full membership.
2859 	 */
2860 	if ((jstate == IB_MC_JSTATE_FULL) && ((omce =
2861 	    IBD_MCACHE_FIND_FULL(state, mgid)) != NULL)) {
2862 		if (omce->mc_jstate == IB_MC_JSTATE_FULL) {
2863 			ASSERT(omce->mc_fullreap);
2864 			omce->mc_fullreap = B_FALSE;
2865 			return (omce);
2866 		} else {
2867 			ASSERT(omce->mc_jstate == IB_MC_JSTATE_SEND_ONLY_NON);
2868 		}
2869 	}
2870 
2871 	/*
2872 	 * Allocate the ibd_mce_t to track this JOIN.
2873 	 */
2874 	mce = kmem_zalloc(sizeof (ibd_mce_t), KM_SLEEP);
2875 	mce->mc_fullreap = B_FALSE;
2876 	mce->mc_jstate = jstate;
2877 
2878 	if ((ibt_status = ibd_iba_join(state, mgid, mce)) != IBT_SUCCESS) {
2879 		DPRINT(10, "ibd_join_group : failed ibt_join_mcg() %d",
2880 		    ibt_status);
2881 		kmem_free(mce, sizeof (ibd_mce_t));
2882 		return (NULL);
2883 	}
2884 
2885 	/*
2886 	 * Is an IBA attach required? Not if the interface is already joined
2887 	 * to the mcg in a different appropriate join state.
2888 	 */
2889 	if (jstate == IB_MC_JSTATE_NON) {
2890 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
2891 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
2892 			do_attach = B_FALSE;
2893 	} else if (jstate == IB_MC_JSTATE_FULL) {
2894 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
2895 			do_attach = B_FALSE;
2896 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
2897 		do_attach = B_FALSE;
2898 	}
2899 
2900 	if (do_attach) {
2901 		/*
2902 		 * Do the IBA attach.
2903 		 */
2904 		DPRINT(10, "ibd_join_group: ibt_attach_mcg \n");
2905 		if ((ibt_status = ibt_attach_mcg(state->id_chnl_hdl,
2906 		    &mce->mc_info)) != IBT_SUCCESS) {
2907 			DPRINT(10, "ibd_join_group : failed qp attachment "
2908 			    "%d\n", ibt_status);
2909 			/*
2910 			 * NOTE that we should probably preserve the join info
2911 			 * in the list and later try to leave again at detach
2912 			 * time.
2913 			 */
2914 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2915 			    state->id_sgid, jstate);
2916 			kmem_free(mce, sizeof (ibd_mce_t));
2917 			return (NULL);
2918 		}
2919 	}
2920 
2921 	/*
2922 	 * Insert the ibd_mce_t in the proper list.
2923 	 */
2924 	if (jstate == IB_MC_JSTATE_NON) {
2925 		IBD_MCACHE_INSERT_NON(state, mce);
2926 	} else {
2927 		/*
2928 		 * Set up the mc_req fields used for reaping the
2929 		 * mcg in case of delayed tx completion (see
2930 		 * ibd_tx_cleanup()). Also done for sendonly join in
2931 		 * case we are promoted to fullmembership later and
2932 		 * keep using the same mce.
2933 		 */
2934 		mce->mc_req.rq_gid = mgid;
2935 		mce->mc_req.rq_ptr = mce;
2936 		/*
2937 		 * Check whether this is the case of trying to join
2938 		 * full member, and we were already joined send only.
2939 		 * We try to drop our SendOnly membership, but it is
2940 		 * possible that the mcg does not exist anymore (and
2941 		 * the subnet trap never reached us), so the leave
2942 		 * operation might fail.
2943 		 */
2944 		if (omce != NULL) {
2945 			(void) ibt_leave_mcg(state->id_sgid, mgid,
2946 			    state->id_sgid, IB_MC_JSTATE_SEND_ONLY_NON);
2947 			omce->mc_jstate = IB_MC_JSTATE_FULL;
2948 			bcopy(&mce->mc_info, &omce->mc_info,
2949 			    sizeof (ibt_mcg_info_t));
2950 			kmem_free(mce, sizeof (ibd_mce_t));
2951 			return (omce);
2952 		}
2953 		mutex_enter(&state->id_mc_mutex);
2954 		IBD_MCACHE_INSERT_FULL(state, mce);
2955 		mutex_exit(&state->id_mc_mutex);
2956 	}
2957 
2958 	return (mce);
2959 }
2960 
2961 /*
2962  * Called during port up event handling to attempt to reacquire full
2963  * membership to an mcg. Stripped down version of ibd_join_group().
2964  * Note that it is possible that the mcg might have gone away, and
2965  * gets recreated at this point.
2966  */
2967 static void
2968 ibd_reacquire_group(ibd_state_t *state, ibd_mce_t *mce)
2969 {
2970 	ib_gid_t mgid;
2971 
2972 	/*
2973 	 * If the mc_fullreap flag is set, or this join fails, a subsequent
2974 	 * reap/leave is going to try to leave the group. We could prevent
2975 	 * that by adding a boolean flag into ibd_mce_t, if required.
2976 	 */
2977 	if (mce->mc_fullreap)
2978 		return;
2979 
2980 	mgid = mce->mc_info.mc_adds_vect.av_dgid;
2981 
2982 	DPRINT(2, "ibd_reacquire_group : %016llx:%016llx\n", mgid.gid_prefix,
2983 	    mgid.gid_guid);
2984 
2985 	if (ibd_iba_join(state, mgid, mce) != IBT_SUCCESS)
2986 		ibd_print_warn(state, "Failure on port up to rejoin "
2987 		    "multicast gid %016llx:%016llx",
2988 		    (u_longlong_t)mgid.gid_prefix,
2989 		    (u_longlong_t)mgid.gid_guid);
2990 }
2991 
2992 /*
2993  * This code handles delayed Tx completion cleanups for mcg's to which
2994  * disable_multicast has been issued, regular mcg related cleanups during
2995  * disable_multicast, disable_promiscuous and mcg traps, as well as
2996  * cleanups during driver detach time. Depending on the join state,
2997  * it deletes the mce from the appropriate list and issues the IBA
2998  * leave/detach; except in the disable_multicast case when the mce
2999  * is left on the active list for a subsequent Tx completion cleanup.
3000  */
3001 static void
3002 ibd_async_reap_group(ibd_state_t *state, ibd_mce_t *mce, ib_gid_t mgid,
3003     uint8_t jstate)
3004 {
3005 	ibd_mce_t *tmce;
3006 	boolean_t do_detach = B_TRUE;
3007 
3008 	/*
3009 	 * Before detaching, we must check whether the other list
3010 	 * contains the mcg; if we detach blindly, the consumer
3011 	 * who set up the other list will also stop receiving
3012 	 * traffic.
3013 	 */
3014 	if (jstate == IB_MC_JSTATE_FULL) {
3015 		/*
3016 		 * The following check is only relevant while coming
3017 		 * from the Tx completion path in the reap case.
3018 		 */
3019 		if (!mce->mc_fullreap)
3020 			return;
3021 		mutex_enter(&state->id_mc_mutex);
3022 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3023 		mutex_exit(&state->id_mc_mutex);
3024 		if (IBD_MCACHE_FIND_NON(state, mgid) != NULL)
3025 			do_detach = B_FALSE;
3026 	} else if (jstate == IB_MC_JSTATE_NON) {
3027 		IBD_MCACHE_PULLOUT_NON(state, mce);
3028 		tmce = IBD_MCACHE_FIND_FULL(state, mgid);
3029 		if ((tmce != NULL) && (tmce->mc_jstate == IB_MC_JSTATE_FULL))
3030 			do_detach = B_FALSE;
3031 	} else {	/* jstate == IB_MC_JSTATE_SEND_ONLY_NON */
3032 		mutex_enter(&state->id_mc_mutex);
3033 		IBD_MCACHE_PULLOUT_FULL(state, mce);
3034 		mutex_exit(&state->id_mc_mutex);
3035 		do_detach = B_FALSE;
3036 	}
3037 
3038 	/*
3039 	 * If we are reacting to a mcg trap and leaving our sendonly or
3040 	 * non membership, the mcg is possibly already gone, so attempting
3041 	 * to leave might fail. On the other hand, we must try to leave
3042 	 * anyway, since this might be a trap from long ago, and we could
3043 	 * have potentially sendonly joined to a recent incarnation of
3044 	 * the mcg and are about to loose track of this information.
3045 	 */
3046 	if (do_detach) {
3047 		DPRINT(2, "ibd_async_reap_group : ibt_detach_mcg : "
3048 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3049 		(void) ibt_detach_mcg(state->id_chnl_hdl, &mce->mc_info);
3050 	}
3051 
3052 	(void) ibt_leave_mcg(state->id_sgid, mgid, state->id_sgid, jstate);
3053 	kmem_free(mce, sizeof (ibd_mce_t));
3054 }
3055 
3056 /*
3057  * Async code executed due to multicast and promiscuous disable requests
3058  * and mcg trap handling; also executed during driver detach. Mostly, a
3059  * leave and detach is done; except for the fullmember case when Tx
3060  * requests are pending, whence arrangements are made for subsequent
3061  * cleanup on Tx completion.
3062  */
3063 static void
3064 ibd_leave_group(ibd_state_t *state, ib_gid_t mgid, uint8_t jstate)
3065 {
3066 	ipoib_mac_t mcmac;
3067 	boolean_t recycled;
3068 	ibd_mce_t *mce;
3069 
3070 	DPRINT(2, "ibd_leave_group : leave_group state %d : %016llx:%016llx\n",
3071 	    jstate, mgid.gid_prefix, mgid.gid_guid);
3072 
3073 	if (jstate == IB_MC_JSTATE_NON) {
3074 		recycled = B_TRUE;
3075 		mce = IBD_MCACHE_FIND_NON(state, mgid);
3076 		/*
3077 		 * In case we are handling a mcg trap, we might not find
3078 		 * the mcg in the non list.
3079 		 */
3080 		if (mce == NULL) {
3081 			return;
3082 		}
3083 	} else {
3084 		mce = IBD_MCACHE_FIND_FULL(state, mgid);
3085 
3086 		/*
3087 		 * In case we are handling a mcg trap, make sure the trap
3088 		 * is not arriving late; if we have an mce that indicates
3089 		 * that we are already a fullmember, that would be a clear
3090 		 * indication that the trap arrived late (ie, is for a
3091 		 * previous incarnation of the mcg).
3092 		 */
3093 		if (jstate == IB_MC_JSTATE_SEND_ONLY_NON) {
3094 			if ((mce == NULL) || (mce->mc_jstate ==
3095 			    IB_MC_JSTATE_FULL)) {
3096 				return;
3097 			}
3098 		} else {
3099 			ASSERT(jstate == IB_MC_JSTATE_FULL);
3100 
3101 			/*
3102 			 * If join group failed, mce will be NULL here.
3103 			 * This is because in GLDv3 driver, set multicast
3104 			 *  will always return success.
3105 			 */
3106 			if (mce == NULL) {
3107 				return;
3108 			}
3109 
3110 			mce->mc_fullreap = B_TRUE;
3111 		}
3112 
3113 		/*
3114 		 * If no pending Tx's remain that reference the AH
3115 		 * for the mcg, recycle it from active to free list.
3116 		 * Else in the IB_MC_JSTATE_FULL case, just mark the AH,
3117 		 * so the last completing Tx will cause an async reap
3118 		 * operation to be invoked, at which time we will drop our
3119 		 * membership to the mcg so that the pending Tx's complete
3120 		 * successfully. Refer to comments on "AH and MCE active
3121 		 * list manipulation" at top of this file. The lock protects
3122 		 * against Tx fast path and Tx cleanup code.
3123 		 */
3124 		mutex_enter(&state->id_ac_mutex);
3125 		ibd_h2n_mac(&mcmac, IB_MC_QPN, mgid.gid_prefix, mgid.gid_guid);
3126 		recycled = ibd_acache_recycle(state, &mcmac, (jstate ==
3127 		    IB_MC_JSTATE_SEND_ONLY_NON));
3128 		mutex_exit(&state->id_ac_mutex);
3129 	}
3130 
3131 	if (recycled) {
3132 		DPRINT(2, "ibd_leave_group : leave_group reaping : "
3133 		    "%016llx:%016llx\n", mgid.gid_prefix, mgid.gid_guid);
3134 		ibd_async_reap_group(state, mce, mgid, jstate);
3135 	}
3136 }
3137 
3138 /*
3139  * Find the broadcast address as defined by IPoIB; implicitly
3140  * determines the IBA scope, mtu, tclass etc of the link the
3141  * interface is going to be a member of.
3142  */
3143 static ibt_status_t
3144 ibd_find_bgroup(ibd_state_t *state)
3145 {
3146 	ibt_mcg_attr_t mcg_attr;
3147 	uint_t numg;
3148 	uchar_t scopes[] = { IB_MC_SCOPE_SUBNET_LOCAL,
3149 	    IB_MC_SCOPE_SITE_LOCAL, IB_MC_SCOPE_ORG_LOCAL,
3150 	    IB_MC_SCOPE_GLOBAL };
3151 	int i, mcgmtu;
3152 	boolean_t found = B_FALSE;
3153 	int ret;
3154 	ibt_mcg_info_t mcg_info;
3155 
3156 	state->id_bgroup_created = B_FALSE;
3157 
3158 query_bcast_grp:
3159 	bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3160 	mcg_attr.mc_pkey = state->id_pkey;
3161 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3162 	state->id_mgid.gid_guid = IB_MGID_IPV4_LOWGRP_MASK;
3163 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3164 
3165 	for (i = 0; i < sizeof (scopes)/sizeof (scopes[0]); i++) {
3166 		state->id_scope = mcg_attr.mc_scope = scopes[i];
3167 
3168 		/*
3169 		 * Look for the IPoIB broadcast group.
3170 		 */
3171 		_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3172 		state->id_mgid.gid_prefix =
3173 		    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3174 		    ((uint64_t)state->id_scope << 48) |
3175 		    ((uint32_t)(state->id_pkey << 16)));
3176 		mcg_attr.mc_mgid = state->id_mgid;
3177 		_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3178 		if (ibt_query_mcg(state->id_sgid, &mcg_attr, 1,
3179 		    &state->id_mcinfo, &numg) == IBT_SUCCESS) {
3180 			found = B_TRUE;
3181 			break;
3182 		}
3183 	}
3184 
3185 	if (!found) {
3186 		if (ibd_create_broadcast_group) {
3187 			/*
3188 			 * If we created the broadcast group, but failed to
3189 			 * find it, we can't do anything except leave the
3190 			 * one we created and return failure.
3191 			 */
3192 			if (state->id_bgroup_created) {
3193 				ibd_print_warn(state, "IPoIB broadcast group "
3194 				    "absent. Unable to query after create.");
3195 				goto find_bgroup_fail;
3196 			}
3197 
3198 			/*
3199 			 * Create the ipoib broadcast group if it didn't exist
3200 			 */
3201 			bzero(&mcg_attr, sizeof (ibt_mcg_attr_t));
3202 			mcg_attr.mc_qkey = IBD_DEFAULT_QKEY;
3203 			mcg_attr.mc_join_state = IB_MC_JSTATE_FULL;
3204 			mcg_attr.mc_scope = IB_MC_SCOPE_SUBNET_LOCAL;
3205 			mcg_attr.mc_pkey = state->id_pkey;
3206 			mcg_attr.mc_flow = 0;
3207 			mcg_attr.mc_sl = 0;
3208 			mcg_attr.mc_tclass = 0;
3209 			_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_mgid))
3210 			state->id_mgid.gid_prefix =
3211 			    (((uint64_t)IB_MCGID_IPV4_PREFIX << 32) |
3212 			    ((uint64_t)IB_MC_SCOPE_SUBNET_LOCAL << 48) |
3213 			    ((uint32_t)(state->id_pkey << 16)));
3214 			mcg_attr.mc_mgid = state->id_mgid;
3215 			_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_mgid))
3216 
3217 			if ((ret = ibt_join_mcg(state->id_sgid, &mcg_attr,
3218 			    &mcg_info, NULL, NULL)) != IBT_SUCCESS) {
3219 				ibd_print_warn(state, "IPoIB broadcast group "
3220 				    "absent, create failed: ret = %d\n", ret);
3221 				state->id_bgroup_created = B_FALSE;
3222 				return (IBT_FAILURE);
3223 			}
3224 			state->id_bgroup_created = B_TRUE;
3225 			goto query_bcast_grp;
3226 		} else {
3227 			ibd_print_warn(state, "IPoIB broadcast group absent");
3228 			return (IBT_FAILURE);
3229 		}
3230 	}
3231 
3232 	/*
3233 	 * Assert that the mcg mtu <= id_mtu. Fill in updated id_mtu.
3234 	 */
3235 	mcgmtu = (128 << state->id_mcinfo->mc_mtu);
3236 	if (state->id_mtu < mcgmtu) {
3237 		ibd_print_warn(state, "IPoIB broadcast group MTU %d "
3238 		    "greater than port's maximum MTU %d", mcgmtu,
3239 		    state->id_mtu);
3240 		ibt_free_mcg_info(state->id_mcinfo, 1);
3241 		goto find_bgroup_fail;
3242 	}
3243 	state->id_mtu = mcgmtu;
3244 
3245 	return (IBT_SUCCESS);
3246 
3247 find_bgroup_fail:
3248 	if (state->id_bgroup_created) {
3249 		(void) ibt_leave_mcg(state->id_sgid,
3250 		    mcg_info.mc_adds_vect.av_dgid, state->id_sgid,
3251 		    IB_MC_JSTATE_FULL);
3252 	}
3253 
3254 	return (IBT_FAILURE);
3255 }
3256 
3257 static int
3258 ibd_alloc_tx_copybufs(ibd_state_t *state)
3259 {
3260 	ibt_mr_attr_t mem_attr;
3261 
3262 	/*
3263 	 * Allocate one big chunk for all regular tx copy bufs
3264 	 */
3265 	state->id_tx_buf_sz = state->id_mtu;
3266 	if (state->id_lso_policy && state->id_lso_capable &&
3267 	    (IBD_TX_BUF_SZ > state->id_mtu)) {
3268 		state->id_tx_buf_sz = IBD_TX_BUF_SZ;
3269 	}
3270 
3271 	state->id_tx_bufs = kmem_zalloc(state->id_num_swqe *
3272 	    state->id_tx_buf_sz, KM_SLEEP);
3273 
3274 	state->id_tx_wqes = kmem_zalloc(state->id_num_swqe *
3275 	    sizeof (ibd_swqe_t), KM_SLEEP);
3276 
3277 	/*
3278 	 * Do one memory registration on the entire txbuf area
3279 	 */
3280 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_tx_bufs;
3281 	mem_attr.mr_len = state->id_num_swqe * state->id_tx_buf_sz;
3282 	mem_attr.mr_as = NULL;
3283 	mem_attr.mr_flags = IBT_MR_SLEEP;
3284 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3285 	    &state->id_tx_mr_hdl, &state->id_tx_mr_desc) != IBT_SUCCESS) {
3286 		DPRINT(10, "ibd_alloc_tx_copybufs: ibt_register_mr failed");
3287 		kmem_free(state->id_tx_wqes,
3288 		    state->id_num_swqe * sizeof (ibd_swqe_t));
3289 		kmem_free(state->id_tx_bufs,
3290 		    state->id_num_swqe * state->id_tx_buf_sz);
3291 		state->id_tx_bufs = NULL;
3292 		return (DDI_FAILURE);
3293 	}
3294 
3295 	return (DDI_SUCCESS);
3296 }
3297 
3298 static int
3299 ibd_alloc_tx_lsobufs(ibd_state_t *state)
3300 {
3301 	ibt_mr_attr_t mem_attr;
3302 	ibd_lsobuf_t *buflist;
3303 	ibd_lsobuf_t *lbufp;
3304 	ibd_lsobuf_t *tail;
3305 	ibd_lsobkt_t *bktp;
3306 	uint8_t *membase;
3307 	uint8_t *memp;
3308 	uint_t memsz;
3309 	int i;
3310 
3311 	/*
3312 	 * Allocate the lso bucket
3313 	 */
3314 	bktp = kmem_zalloc(sizeof (ibd_lsobkt_t), KM_SLEEP);
3315 
3316 	/*
3317 	 * Allocate the entire lso memory and register it
3318 	 */
3319 	memsz = IBD_NUM_LSO_BUFS * IBD_LSO_BUFSZ;
3320 	membase = kmem_zalloc(memsz, KM_SLEEP);
3321 
3322 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)membase;
3323 	mem_attr.mr_len = memsz;
3324 	mem_attr.mr_as = NULL;
3325 	mem_attr.mr_flags = IBT_MR_SLEEP;
3326 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl,
3327 	    &mem_attr, &bktp->bkt_mr_hdl, &bktp->bkt_mr_desc) != IBT_SUCCESS) {
3328 		DPRINT(10, "ibd_alloc_tx_lsobufs: ibt_register_mr failed");
3329 		kmem_free(membase, memsz);
3330 		kmem_free(bktp, sizeof (ibd_lsobkt_t));
3331 		return (DDI_FAILURE);
3332 	}
3333 
3334 	mutex_enter(&state->id_lso_lock);
3335 
3336 	/*
3337 	 * Now allocate the buflist.  Note that the elements in the buflist and
3338 	 * the buffers in the lso memory have a permanent 1-1 relation, so we
3339 	 * can always derive the address of a buflist entry from the address of
3340 	 * an lso buffer.
3341 	 */
3342 	buflist = kmem_zalloc(IBD_NUM_LSO_BUFS * sizeof (ibd_lsobuf_t),
3343 	    KM_SLEEP);
3344 
3345 	/*
3346 	 * Set up the lso buf chain
3347 	 */
3348 	memp = membase;
3349 	lbufp = buflist;
3350 	for (i = 0; i < IBD_NUM_LSO_BUFS; i++) {
3351 		lbufp->lb_isfree = 1;
3352 		lbufp->lb_buf = memp;
3353 		lbufp->lb_next = lbufp + 1;
3354 
3355 		tail = lbufp;
3356 
3357 		memp += IBD_LSO_BUFSZ;
3358 		lbufp++;
3359 	}
3360 	tail->lb_next = NULL;
3361 
3362 	/*
3363 	 * Set up the LSO buffer information in ibd state
3364 	 */
3365 	bktp->bkt_bufl = buflist;
3366 	bktp->bkt_free_head = buflist;
3367 	bktp->bkt_mem = membase;
3368 	bktp->bkt_nelem = IBD_NUM_LSO_BUFS;
3369 	bktp->bkt_nfree = bktp->bkt_nelem;
3370 
3371 	state->id_lso = bktp;
3372 	mutex_exit(&state->id_lso_lock);
3373 
3374 	return (DDI_SUCCESS);
3375 }
3376 
3377 /*
3378  * Statically allocate Tx buffer list(s).
3379  */
3380 static int
3381 ibd_init_txlist(ibd_state_t *state)
3382 {
3383 	ibd_swqe_t *swqe;
3384 	ibt_lkey_t lkey;
3385 	int i;
3386 	uint_t len;
3387 	uint8_t *bufaddr;
3388 
3389 	if (ibd_alloc_tx_copybufs(state) != DDI_SUCCESS)
3390 		return (DDI_FAILURE);
3391 
3392 	if (state->id_lso_policy && state->id_lso_capable) {
3393 		if (ibd_alloc_tx_lsobufs(state) != DDI_SUCCESS)
3394 			state->id_lso_policy = B_FALSE;
3395 	}
3396 
3397 	/*
3398 	 * Allocate and setup the swqe list
3399 	 */
3400 	lkey = state->id_tx_mr_desc.md_lkey;
3401 	bufaddr = state->id_tx_bufs;
3402 	len = state->id_tx_buf_sz;
3403 	swqe = state->id_tx_wqes;
3404 	mutex_enter(&state->id_tx_list.dl_mutex);
3405 	for (i = 0; i < state->id_num_swqe; i++, swqe++, bufaddr += len) {
3406 		swqe->swqe_type = IBD_WQE_SEND;
3407 		swqe->swqe_next = NULL;
3408 		swqe->swqe_im_mblk = NULL;
3409 
3410 		swqe->swqe_copybuf.ic_sgl.ds_va = (ib_vaddr_t)(uintptr_t)
3411 		    bufaddr;
3412 		swqe->swqe_copybuf.ic_sgl.ds_key = lkey;
3413 		swqe->swqe_copybuf.ic_sgl.ds_len = 0; /* set in send */
3414 
3415 		swqe->w_swr.wr_id = (ibt_wrid_t)(uintptr_t)swqe;
3416 		swqe->w_swr.wr_flags = IBT_WR_NO_FLAGS;
3417 		swqe->w_swr.wr_trans = IBT_UD_SRV;
3418 
3419 		/* These are set in send */
3420 		swqe->w_swr.wr_nds = 0;
3421 		swqe->w_swr.wr_sgl = NULL;
3422 		swqe->w_swr.wr_opcode = IBT_WRC_SEND;
3423 
3424 		/* add to list */
3425 		state->id_tx_list.dl_cnt++;
3426 		swqe->swqe_next = state->id_tx_list.dl_head;
3427 		state->id_tx_list.dl_head = SWQE_TO_WQE(swqe);
3428 	}
3429 	mutex_exit(&state->id_tx_list.dl_mutex);
3430 
3431 	return (DDI_SUCCESS);
3432 }
3433 
3434 static int
3435 ibd_acquire_lsobufs(ibd_state_t *state, uint_t req_sz, ibt_wr_ds_t *sgl_p,
3436     uint32_t *nds_p)
3437 {
3438 	ibd_lsobkt_t *bktp;
3439 	ibd_lsobuf_t *lbufp;
3440 	ibd_lsobuf_t *nextp;
3441 	ibt_lkey_t lso_lkey;
3442 	uint_t frag_sz;
3443 	uint_t num_needed;
3444 	int i;
3445 
3446 	ASSERT(sgl_p != NULL);
3447 	ASSERT(nds_p != NULL);
3448 	ASSERT(req_sz != 0);
3449 
3450 	/*
3451 	 * Determine how many bufs we'd need for the size requested
3452 	 */
3453 	num_needed = req_sz / IBD_LSO_BUFSZ;
3454 	if ((frag_sz = req_sz % IBD_LSO_BUFSZ) != 0)
3455 		num_needed++;
3456 
3457 	mutex_enter(&state->id_lso_lock);
3458 
3459 	/*
3460 	 * If we don't have enough lso bufs, return failure
3461 	 */
3462 	ASSERT(state->id_lso != NULL);
3463 	bktp = state->id_lso;
3464 	if (bktp->bkt_nfree < num_needed) {
3465 		mutex_exit(&state->id_lso_lock);
3466 		return (-1);
3467 	}
3468 
3469 	/*
3470 	 * Pick the first 'num_needed' bufs from the free list
3471 	 */
3472 	lso_lkey = bktp->bkt_mr_desc.md_lkey;
3473 	lbufp = bktp->bkt_free_head;
3474 	for (i = 0; i < num_needed; i++) {
3475 		ASSERT(lbufp->lb_isfree != 0);
3476 		ASSERT(lbufp->lb_buf != NULL);
3477 
3478 		nextp = lbufp->lb_next;
3479 
3480 		sgl_p[i].ds_va = (ib_vaddr_t)(uintptr_t)lbufp->lb_buf;
3481 		sgl_p[i].ds_key = lso_lkey;
3482 		sgl_p[i].ds_len = IBD_LSO_BUFSZ;
3483 
3484 		lbufp->lb_isfree = 0;
3485 		lbufp->lb_next = NULL;
3486 
3487 		lbufp = nextp;
3488 	}
3489 	bktp->bkt_free_head = lbufp;
3490 
3491 	/*
3492 	 * If the requested size is not a multiple of IBD_LSO_BUFSZ, we need
3493 	 * to adjust the last sgl entry's length. Since we know we need atleast
3494 	 * one, the i-1 use below is ok.
3495 	 */
3496 	if (frag_sz) {
3497 		sgl_p[i-1].ds_len = frag_sz;
3498 	}
3499 
3500 	/*
3501 	 * Update nfree count and return
3502 	 */
3503 	bktp->bkt_nfree -= num_needed;
3504 
3505 	mutex_exit(&state->id_lso_lock);
3506 
3507 	*nds_p = num_needed;
3508 
3509 	return (0);
3510 }
3511 
3512 static void
3513 ibd_release_lsobufs(ibd_state_t *state, ibt_wr_ds_t *sgl_p, uint32_t nds)
3514 {
3515 	ibd_lsobkt_t *bktp;
3516 	ibd_lsobuf_t *lbufp;
3517 	uint8_t *lso_mem_end;
3518 	uint_t ndx;
3519 	int i;
3520 
3521 	mutex_enter(&state->id_lso_lock);
3522 
3523 	bktp = state->id_lso;
3524 	ASSERT(bktp != NULL);
3525 
3526 	lso_mem_end = bktp->bkt_mem + bktp->bkt_nelem * IBD_LSO_BUFSZ;
3527 	for (i = 0; i < nds; i++) {
3528 		uint8_t *va;
3529 
3530 		va = (uint8_t *)(uintptr_t)sgl_p[i].ds_va;
3531 		ASSERT(va >= bktp->bkt_mem && va < lso_mem_end);
3532 
3533 		/*
3534 		 * Figure out the buflist element this sgl buffer corresponds
3535 		 * to and put it back at the head
3536 		 */
3537 		ndx = (va - bktp->bkt_mem) / IBD_LSO_BUFSZ;
3538 		lbufp = bktp->bkt_bufl + ndx;
3539 
3540 		ASSERT(lbufp->lb_isfree == 0);
3541 		ASSERT(lbufp->lb_buf == va);
3542 
3543 		lbufp->lb_isfree = 1;
3544 		lbufp->lb_next = bktp->bkt_free_head;
3545 		bktp->bkt_free_head = lbufp;
3546 	}
3547 	bktp->bkt_nfree += nds;
3548 
3549 	mutex_exit(&state->id_lso_lock);
3550 }
3551 
3552 static void
3553 ibd_free_tx_copybufs(ibd_state_t *state)
3554 {
3555 	/*
3556 	 * Unregister txbuf mr
3557 	 */
3558 	if (ibt_deregister_mr(state->id_hca_hdl,
3559 	    state->id_tx_mr_hdl) != IBT_SUCCESS) {
3560 		DPRINT(10, "ibd_free_tx_copybufs: ibt_deregister_mr failed");
3561 	}
3562 	state->id_tx_mr_hdl = NULL;
3563 
3564 	/*
3565 	 * Free txbuf memory
3566 	 */
3567 	kmem_free(state->id_tx_wqes, state->id_num_swqe * sizeof (ibd_swqe_t));
3568 	kmem_free(state->id_tx_bufs, state->id_num_swqe * state->id_tx_buf_sz);
3569 	state->id_tx_wqes = NULL;
3570 	state->id_tx_bufs = NULL;
3571 }
3572 
3573 static void
3574 ibd_free_tx_lsobufs(ibd_state_t *state)
3575 {
3576 	ibd_lsobkt_t *bktp;
3577 
3578 	mutex_enter(&state->id_lso_lock);
3579 
3580 	if ((bktp = state->id_lso) == NULL) {
3581 		mutex_exit(&state->id_lso_lock);
3582 		return;
3583 	}
3584 
3585 	/*
3586 	 * First, free the buflist
3587 	 */
3588 	ASSERT(bktp->bkt_bufl != NULL);
3589 	kmem_free(bktp->bkt_bufl, bktp->bkt_nelem * sizeof (ibd_lsobuf_t));
3590 
3591 	/*
3592 	 * Unregister the LSO memory and free it
3593 	 */
3594 	ASSERT(bktp->bkt_mr_hdl != NULL);
3595 	if (ibt_deregister_mr(state->id_hca_hdl,
3596 	    bktp->bkt_mr_hdl) != IBT_SUCCESS) {
3597 		DPRINT(10,
3598 		    "ibd_free_lsobufs: ibt_deregister_mr failed");
3599 	}
3600 	ASSERT(bktp->bkt_mem);
3601 	kmem_free(bktp->bkt_mem, bktp->bkt_nelem * IBD_LSO_BUFSZ);
3602 
3603 	/*
3604 	 * Finally free the bucket
3605 	 */
3606 	kmem_free(bktp, sizeof (ibd_lsobkt_t));
3607 	state->id_lso = NULL;
3608 
3609 	mutex_exit(&state->id_lso_lock);
3610 }
3611 
3612 /*
3613  * Free the statically allocated Tx buffer list.
3614  */
3615 static void
3616 ibd_fini_txlist(ibd_state_t *state)
3617 {
3618 	ibd_swqe_t *node;
3619 
3620 	/*
3621 	 * Free the allocated swqes
3622 	 */
3623 	mutex_enter(&state->id_tx_list.dl_mutex);
3624 	while (state->id_tx_list.dl_head != NULL) {
3625 		node = WQE_TO_SWQE(state->id_tx_list.dl_head);
3626 		state->id_tx_list.dl_head = node->swqe_next;
3627 		ASSERT(state->id_tx_list.dl_cnt > 0);
3628 		state->id_tx_list.dl_cnt--;
3629 	}
3630 	ASSERT(state->id_tx_list.dl_cnt == 0);
3631 	mutex_exit(&state->id_tx_list.dl_mutex);
3632 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
3633 	while (state->id_tx_rel_list.dl_head != NULL) {
3634 		node = WQE_TO_SWQE(state->id_tx_rel_list.dl_head);
3635 		state->id_tx_rel_list.dl_head = node->swqe_next;
3636 		ASSERT(state->id_tx_rel_list.dl_cnt > 0);
3637 		state->id_tx_rel_list.dl_cnt--;
3638 	}
3639 	ASSERT(state->id_tx_rel_list.dl_cnt == 0);
3640 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
3641 
3642 	ibd_free_tx_lsobufs(state);
3643 	ibd_free_tx_copybufs(state);
3644 }
3645 
3646 static void
3647 ibd_post_recv_task(ibd_rwqe_t *rwqe, ibd_rwqe_t *tail)
3648 {
3649 	uint_t		i;
3650 	uint_t		num_posted;
3651 	ibt_status_t	ibt_status;
3652 	ibt_recv_wr_t	wrs[IBD_RX_POST_CNT];
3653 	ibd_state_t	*state = rwqe->w_state;
3654 
3655 	mutex_enter(&state->id_rx_post_lock);
3656 	if (state->id_rx_post_busy) {
3657 		tail->rwqe_next = state->id_rx_post_head;
3658 		state->id_rx_post_head = RWQE_TO_WQE(rwqe);
3659 		mutex_exit(&state->id_rx_post_lock);
3660 		return;
3661 	}
3662 	state->id_rx_post_busy = 1;
3663 	mutex_exit(&state->id_rx_post_lock);
3664 
3665 loop:
3666 	/* Post the IBD_RX_POST_CNT receive work requests pointed to by arg. */
3667 	for (i = 0; i < IBD_RX_POST_CNT; i++) {
3668 		wrs[i] = rwqe->w_rwr;
3669 		rwqe = WQE_TO_RWQE(rwqe->rwqe_next);
3670 	}
3671 
3672 	/*
3673 	 * If posting fails for some reason, we'll never receive
3674 	 * completion intimation, so we'll need to cleanup. But
3675 	 * we need to make sure we don't clean up nodes whose
3676 	 * wrs have been successfully posted. We assume that the
3677 	 * hca driver returns on the first failure to post and
3678 	 * therefore the first 'num_posted' entries don't need
3679 	 * cleanup here.
3680 	 */
3681 	atomic_add_32(&state->id_rx_list.dl_cnt, IBD_RX_POST_CNT);
3682 
3683 	num_posted = 0;
3684 	ibt_status = ibt_post_recv(state->id_chnl_hdl,
3685 	    wrs, IBD_RX_POST_CNT, &num_posted);
3686 	if (ibt_status != IBT_SUCCESS) {
3687 		ibd_print_warn(state, "ibd_post_recv: FATAL: "
3688 		    "posting multiple wrs failed: "
3689 		    "requested=%d, done=%d, ret=%d",
3690 		    IBD_RX_POST_CNT, num_posted, ibt_status);
3691 		atomic_add_32(&state->id_rx_list.dl_cnt,
3692 		    -(IBD_RX_POST_CNT - num_posted));
3693 		/* This cannot happen! */
3694 	}
3695 	if (rwqe != NULL)	/* more rwqes on our list? */
3696 		goto loop;
3697 
3698 	/* check if we have a new list */
3699 	mutex_enter(&state->id_rx_post_lock);
3700 	if ((rwqe = WQE_TO_RWQE(state->id_rx_post_head)) != NULL) {
3701 		state->id_rx_post_head = NULL;
3702 		mutex_exit(&state->id_rx_post_lock);
3703 		goto loop;
3704 	}
3705 	state->id_rx_post_busy = 0;
3706 	mutex_exit(&state->id_rx_post_lock);
3707 }
3708 
3709 /* macro explained below */
3710 #define	RX_QUEUE_HASH(rwqe) \
3711 	(((uintptr_t)(rwqe) >> 8) & (state->id_rx_nqueues - 1))
3712 
3713 /*
3714  * Add a rwqe to one of the the Rx lists.  If the list is large enough
3715  * (exactly IBD_RX_POST_CNT), post the list to the hardware.
3716  *
3717  * Note: one of 2^N lists is chosen via a hash.  This is done
3718  * because using one list is contentious.  If the first list is busy
3719  * (mutex_tryenter fails), use a second list (just call mutex_enter).
3720  *
3721  * The number 8 in RX_QUEUE_HASH is a random choice that provides
3722  * even distribution of mapping rwqes to the 2^N queues.
3723  */
3724 static void
3725 ibd_post_recv(ibd_state_t *state, ibd_rwqe_t *rwqe)
3726 {
3727 	ibd_rx_queue_t	*rxp;
3728 	ibd_rwqe_t	*tail;
3729 
3730 	rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe);
3731 
3732 	if (!mutex_tryenter(&rxp->rx_post_lock)) {
3733 		/* Failed.  Try a different queue ("ptr + 16" ensures that). */
3734 		rxp = state->id_rx_queues + RX_QUEUE_HASH(rwqe + 16);
3735 		mutex_enter(&rxp->rx_post_lock);
3736 	}
3737 	rwqe->rwqe_next = rxp->rx_head;
3738 	if (rxp->rx_cnt == 0)
3739 		rxp->rx_tail = RWQE_TO_WQE(rwqe);
3740 	if (++rxp->rx_cnt == IBD_RX_POST_CNT) {
3741 		rxp->rx_head = NULL;
3742 		tail = WQE_TO_RWQE(rxp->rx_tail);
3743 		rxp->rx_cnt = 0;
3744 	} else {
3745 		rxp->rx_head = RWQE_TO_WQE(rwqe);
3746 		rwqe = NULL;
3747 	}
3748 	rxp->rx_stat++;
3749 	mutex_exit(&rxp->rx_post_lock);
3750 	if (rwqe) {
3751 		ibd_post_recv_task(rwqe, tail);
3752 	}
3753 }
3754 
3755 static int
3756 ibd_alloc_rx_copybufs(ibd_state_t *state)
3757 {
3758 	ibt_mr_attr_t mem_attr;
3759 	int i;
3760 
3761 	/*
3762 	 * Allocate one big chunk for all regular rx copy bufs
3763 	 */
3764 	state->id_rx_buf_sz = state->id_mtu + IPOIB_GRH_SIZE;
3765 
3766 	state->id_rx_bufs = kmem_zalloc(state->id_num_rwqe *
3767 	    state->id_rx_buf_sz, KM_SLEEP);
3768 
3769 	state->id_rx_wqes = kmem_zalloc(state->id_num_rwqe *
3770 	    sizeof (ibd_rwqe_t), KM_SLEEP);
3771 
3772 	state->id_rx_nqueues = 1 << IBD_LOG_RX_POST;
3773 	state->id_rx_queues = kmem_zalloc(state->id_rx_nqueues *
3774 	    sizeof (ibd_rx_queue_t), KM_SLEEP);
3775 	for (i = 0; i < state->id_rx_nqueues; i++) {
3776 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3777 		mutex_init(&rxp->rx_post_lock, NULL, MUTEX_DRIVER, NULL);
3778 	}
3779 
3780 	/*
3781 	 * Do one memory registration on the entire rxbuf area
3782 	 */
3783 	mem_attr.mr_vaddr = (uint64_t)(uintptr_t)state->id_rx_bufs;
3784 	mem_attr.mr_len = state->id_num_rwqe * state->id_rx_buf_sz;
3785 	mem_attr.mr_as = NULL;
3786 	mem_attr.mr_flags = IBT_MR_SLEEP | IBT_MR_ENABLE_LOCAL_WRITE;
3787 	if (ibt_register_mr(state->id_hca_hdl, state->id_pd_hdl, &mem_attr,
3788 	    &state->id_rx_mr_hdl, &state->id_rx_mr_desc) != IBT_SUCCESS) {
3789 		DPRINT(10, "ibd_alloc_rx_copybufs: ibt_register_mr failed");
3790 		kmem_free(state->id_rx_wqes,
3791 		    state->id_num_rwqe * sizeof (ibd_rwqe_t));
3792 		kmem_free(state->id_rx_bufs,
3793 		    state->id_num_rwqe * state->id_rx_buf_sz);
3794 		state->id_rx_bufs = NULL;
3795 		state->id_rx_wqes = NULL;
3796 		return (DDI_FAILURE);
3797 	}
3798 
3799 	return (DDI_SUCCESS);
3800 }
3801 
3802 /*
3803  * Allocate the statically allocated Rx buffer list.
3804  */
3805 static int
3806 ibd_init_rxlist(ibd_state_t *state)
3807 {
3808 	ibd_rwqe_t *rwqe;
3809 	ibt_lkey_t lkey;
3810 	int i;
3811 	uint_t len;
3812 	uint8_t *bufaddr;
3813 
3814 	if (ibd_alloc_rx_copybufs(state) != DDI_SUCCESS)
3815 		return (DDI_FAILURE);
3816 
3817 	/*
3818 	 * Allocate and setup the rwqe list
3819 	 */
3820 	lkey = state->id_rx_mr_desc.md_lkey;
3821 	rwqe = state->id_rx_wqes;
3822 	bufaddr = state->id_rx_bufs;
3823 	len = state->id_rx_buf_sz;
3824 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++, bufaddr += len) {
3825 		rwqe->rwqe_type = IBD_WQE_RECV;
3826 		rwqe->w_state = state;
3827 		rwqe->w_freeing_wqe = B_FALSE;
3828 		rwqe->w_freemsg_cb.free_func = ibd_freemsg_cb;
3829 		rwqe->w_freemsg_cb.free_arg = (char *)rwqe;
3830 
3831 		rwqe->rwqe_copybuf.ic_bufaddr = bufaddr;
3832 
3833 		if ((rwqe->rwqe_im_mblk = desballoc(bufaddr, len, 0,
3834 		    &rwqe->w_freemsg_cb)) == NULL) {
3835 			DPRINT(10, "ibd_init_rxlist : failed in desballoc()");
3836 			rwqe->rwqe_copybuf.ic_bufaddr = NULL;
3837 			ibd_fini_rxlist(state);
3838 			return (DDI_FAILURE);
3839 		}
3840 
3841 		rwqe->rwqe_copybuf.ic_sgl.ds_key = lkey;
3842 		rwqe->rwqe_copybuf.ic_sgl.ds_va =
3843 		    (ib_vaddr_t)(uintptr_t)bufaddr;
3844 		rwqe->rwqe_copybuf.ic_sgl.ds_len = len;
3845 		rwqe->w_rwr.wr_id = (ibt_wrid_t)(uintptr_t)rwqe;
3846 		rwqe->w_rwr.wr_nds = 1;
3847 		rwqe->w_rwr.wr_sgl = &rwqe->rwqe_copybuf.ic_sgl;
3848 
3849 		ibd_post_recv(state, rwqe);
3850 	}
3851 
3852 	return (DDI_SUCCESS);
3853 }
3854 
3855 static void
3856 ibd_free_rx_copybufs(ibd_state_t *state)
3857 {
3858 	int i;
3859 
3860 	/*
3861 	 * Unregister rxbuf mr
3862 	 */
3863 	if (ibt_deregister_mr(state->id_hca_hdl,
3864 	    state->id_rx_mr_hdl) != IBT_SUCCESS) {
3865 		DPRINT(10, "ibd_free_rx_copybufs: ibt_deregister_mr failed");
3866 	}
3867 	state->id_rx_mr_hdl = NULL;
3868 
3869 	/*
3870 	 * Free rxbuf memory
3871 	 */
3872 	for (i = 0; i < state->id_rx_nqueues; i++) {
3873 		ibd_rx_queue_t *rxp = state->id_rx_queues + i;
3874 		mutex_destroy(&rxp->rx_post_lock);
3875 	}
3876 	kmem_free(state->id_rx_queues, state->id_rx_nqueues *
3877 	    sizeof (ibd_rx_queue_t));
3878 	kmem_free(state->id_rx_wqes, state->id_num_rwqe * sizeof (ibd_rwqe_t));
3879 	kmem_free(state->id_rx_bufs, state->id_num_rwqe * state->id_rx_buf_sz);
3880 	state->id_rx_queues = NULL;
3881 	state->id_rx_wqes = NULL;
3882 	state->id_rx_bufs = NULL;
3883 }
3884 
3885 /*
3886  * Free the statically allocated Rx buffer list.
3887  *
3888  */
3889 static void
3890 ibd_fini_rxlist(ibd_state_t *state)
3891 {
3892 	ibd_rwqe_t *rwqe;
3893 	int i;
3894 
3895 	mutex_enter(&state->id_rx_list.dl_mutex);
3896 	rwqe = state->id_rx_wqes;
3897 	for (i = 0; i < state->id_num_rwqe; i++, rwqe++) {
3898 		if (rwqe->rwqe_im_mblk != NULL) {
3899 			rwqe->w_freeing_wqe = B_TRUE;
3900 			freemsg(rwqe->rwqe_im_mblk);
3901 		}
3902 	}
3903 	mutex_exit(&state->id_rx_list.dl_mutex);
3904 
3905 	ibd_free_rx_copybufs(state);
3906 }
3907 
3908 /*
3909  * Free an allocated recv wqe.
3910  */
3911 /* ARGSUSED */
3912 static void
3913 ibd_free_rwqe(ibd_state_t *state, ibd_rwqe_t *rwqe)
3914 {
3915 	/*
3916 	 * desballoc() failed (no memory).
3917 	 *
3918 	 * This rwqe is placed on a free list so that it
3919 	 * can be reinstated when memory is available.
3920 	 *
3921 	 * NOTE: no code currently exists to reinstate
3922 	 * these "lost" rwqes.
3923 	 */
3924 	mutex_enter(&state->id_rx_free_list.dl_mutex);
3925 	state->id_rx_free_list.dl_cnt++;
3926 	rwqe->rwqe_next = state->id_rx_free_list.dl_head;
3927 	state->id_rx_free_list.dl_head = RWQE_TO_WQE(rwqe);
3928 	mutex_exit(&state->id_rx_free_list.dl_mutex);
3929 }
3930 
3931 /*
3932  * IBA Rx completion queue handler. Guaranteed to be single
3933  * threaded and nonreentrant for this CQ.
3934  */
3935 /* ARGSUSED */
3936 static void
3937 ibd_rcq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3938 {
3939 	ibd_state_t *state = (ibd_state_t *)arg;
3940 
3941 	atomic_add_64(&state->id_num_intrs, 1);
3942 
3943 	if (ibd_rx_softintr == 1) {
3944 		mutex_enter(&state->id_rcq_poll_lock);
3945 		if (state->id_rcq_poll_busy & IBD_CQ_POLLING) {
3946 			state->id_rcq_poll_busy |= IBD_REDO_CQ_POLLING;
3947 			mutex_exit(&state->id_rcq_poll_lock);
3948 			return;
3949 		} else {
3950 			mutex_exit(&state->id_rcq_poll_lock);
3951 			ddi_trigger_softintr(state->id_rx);
3952 		}
3953 	} else
3954 		(void) ibd_intr((caddr_t)state);
3955 }
3956 
3957 /*
3958  * CQ handler for Tx completions, when the Tx CQ is in
3959  * interrupt driven mode.
3960  */
3961 /* ARGSUSED */
3962 static void
3963 ibd_scq_handler(ibt_cq_hdl_t cq_hdl, void *arg)
3964 {
3965 	ibd_state_t *state = (ibd_state_t *)arg;
3966 
3967 	atomic_add_64(&state->id_num_intrs, 1);
3968 
3969 	if (ibd_tx_softintr == 1) {
3970 		mutex_enter(&state->id_scq_poll_lock);
3971 		if (state->id_scq_poll_busy & IBD_CQ_POLLING) {
3972 			state->id_scq_poll_busy |= IBD_REDO_CQ_POLLING;
3973 			mutex_exit(&state->id_scq_poll_lock);
3974 			return;
3975 		} else {
3976 			mutex_exit(&state->id_scq_poll_lock);
3977 			ddi_trigger_softintr(state->id_tx);
3978 		}
3979 	} else
3980 		(void) ibd_tx_recycle((caddr_t)state);
3981 }
3982 
3983 /*
3984  * Multicast group create/delete trap handler. These will be delivered
3985  * on a kernel thread (handling can thus block) and can be invoked
3986  * concurrently. The handler can be invoked anytime after it is
3987  * registered and before ibt_detach().
3988  */
3989 /* ARGSUSED */
3990 static void
3991 ibd_snet_notices_handler(void *arg, ib_gid_t gid, ibt_subnet_event_code_t code,
3992     ibt_subnet_event_t *event)
3993 {
3994 	ibd_state_t *state = (ibd_state_t *)arg;
3995 	ibd_req_t *req;
3996 
3997 	/*
3998 	 * The trap handler will get invoked once for every event for
3999 	 * every port. The input "gid" is the GID0 of the port the
4000 	 * trap came in on; we just need to act on traps that came
4001 	 * to our port, meaning the port on which the ipoib interface
4002 	 * resides. Since ipoib uses GID0 of the port, we just match
4003 	 * the gids to check whether we need to handle the trap.
4004 	 */
4005 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4006 	if (bcmp(&gid, &state->id_sgid, sizeof (ib_gid_t)) != 0)
4007 		return;
4008 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4009 
4010 	DPRINT(10, "ibd_notices_handler : %d\n", code);
4011 
4012 	switch (code) {
4013 		case IBT_SM_EVENT_UNAVAILABLE:
4014 			/*
4015 			 * If we are in promiscuous mode or have
4016 			 * sendnonmembers, we need to print a warning
4017 			 * message right now. Else, just store the
4018 			 * information, print when we enter promiscuous
4019 			 * mode or attempt nonmember send. We might
4020 			 * also want to stop caching sendnonmember.
4021 			 */
4022 			ibd_print_warn(state, "IBA multicast support "
4023 			    "degraded due to unavailability of multicast "
4024 			    "traps");
4025 			break;
4026 		case IBT_SM_EVENT_AVAILABLE:
4027 			/*
4028 			 * If we printed a warning message above or
4029 			 * while trying to nonmember send or get into
4030 			 * promiscuous mode, print an okay message.
4031 			 */
4032 			ibd_print_warn(state, "IBA multicast support "
4033 			    "restored due to availability of multicast "
4034 			    "traps");
4035 			break;
4036 		case IBT_SM_EVENT_MCG_CREATED:
4037 		case IBT_SM_EVENT_MCG_DELETED:
4038 			/*
4039 			 * Common processing of creation/deletion traps.
4040 			 * First check if the instance is being
4041 			 * [de]initialized; back off then, without doing
4042 			 * anything more, since we are not sure if the
4043 			 * async thread is around, or whether we might
4044 			 * be racing with the detach code in ibd_m_stop()
4045 			 * that scans the mcg list.
4046 			 */
4047 			if (!ibd_async_safe(state))
4048 				return;
4049 
4050 			req = kmem_cache_alloc(state->id_req_kmc, KM_SLEEP);
4051 			req->rq_gid = event->sm_notice_gid;
4052 			req->rq_ptr = (void *)code;
4053 			ibd_queue_work_slot(state, req, IBD_ASYNC_TRAP);
4054 			break;
4055 	}
4056 }
4057 
4058 static void
4059 ibd_async_trap(ibd_state_t *state, ibd_req_t *req)
4060 {
4061 	ib_gid_t mgid = req->rq_gid;
4062 	ibt_subnet_event_code_t code = (ibt_subnet_event_code_t)req->rq_ptr;
4063 
4064 	DPRINT(10, "ibd_async_trap : %d\n", code);
4065 
4066 	/*
4067 	 * Atomically search the nonmember and sendonlymember lists and
4068 	 * delete.
4069 	 */
4070 	ibd_leave_group(state, mgid, IB_MC_JSTATE_SEND_ONLY_NON);
4071 
4072 	if (state->id_prom_op == IBD_OP_COMPLETED) {
4073 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4074 
4075 		/*
4076 		 * If in promiscuous mode, try to join/attach to the new
4077 		 * mcg. Given the unreliable out-of-order mode of trap
4078 		 * delivery, we can never be sure whether it is a problem
4079 		 * if the join fails. Thus, we warn the admin of a failure
4080 		 * if this was a creation trap. Note that the trap might
4081 		 * actually be reporting a long past event, and the mcg
4082 		 * might already have been deleted, thus we might be warning
4083 		 * in vain.
4084 		 */
4085 		if ((ibd_join_group(state, mgid, IB_MC_JSTATE_NON) ==
4086 		    NULL) && (code == IBT_SM_EVENT_MCG_CREATED))
4087 			ibd_print_warn(state, "IBA promiscuous mode missed "
4088 			    "new multicast gid %016llx:%016llx",
4089 			    (u_longlong_t)mgid.gid_prefix,
4090 			    (u_longlong_t)mgid.gid_guid);
4091 	}
4092 
4093 	/*
4094 	 * Free the request slot allocated by the subnet event thread.
4095 	 */
4096 	ibd_async_done(state);
4097 }
4098 
4099 /*
4100  * GLDv3 entry point to get capabilities.
4101  */
4102 static boolean_t
4103 ibd_m_getcapab(void *arg, mac_capab_t cap, void *cap_data)
4104 {
4105 	ibd_state_t *state = arg;
4106 
4107 	switch (cap) {
4108 	case MAC_CAPAB_HCKSUM: {
4109 		uint32_t *txflags = cap_data;
4110 
4111 		/*
4112 		 * We either do full checksum or not do it at all
4113 		 */
4114 		if (state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL)
4115 			*txflags = HCK_FULLCKSUM | HCKSUM_INET_FULL_V4;
4116 		else
4117 			return (B_FALSE);
4118 		break;
4119 	}
4120 
4121 	case MAC_CAPAB_LSO: {
4122 		mac_capab_lso_t *cap_lso = cap_data;
4123 
4124 		/*
4125 		 * In addition to the capability and policy, since LSO
4126 		 * relies on hw checksum, we'll not enable LSO if we
4127 		 * don't have hw checksum.  Of course, if the HCA doesn't
4128 		 * provide the reserved lkey capability, enabling LSO will
4129 		 * actually affect performance adversely, so we'll disable
4130 		 * LSO even for that case.
4131 		 */
4132 		if (!state->id_lso_policy || !state->id_lso_capable)
4133 			return (B_FALSE);
4134 
4135 		if ((state->id_hwcksum_capab & IBT_HCA_CKSUM_FULL) == 0)
4136 			return (B_FALSE);
4137 
4138 		if (state->id_hca_res_lkey_capab == 0) {
4139 			ibd_print_warn(state, "no reserved-lkey capability, "
4140 			    "disabling LSO");
4141 			return (B_FALSE);
4142 		}
4143 
4144 		cap_lso->lso_flags = LSO_TX_BASIC_TCP_IPV4;
4145 		cap_lso->lso_basic_tcp_ipv4.lso_max = state->id_lso_maxlen - 1;
4146 		break;
4147 	}
4148 
4149 	default:
4150 		return (B_FALSE);
4151 	}
4152 
4153 	return (B_TRUE);
4154 }
4155 
4156 static int
4157 ibd_get_port_details(ibd_state_t *state)
4158 {
4159 	ibt_hca_portinfo_t *port_infop;
4160 	ibt_status_t ret;
4161 	uint_t psize, port_infosz;
4162 
4163 	mutex_enter(&state->id_link_mutex);
4164 
4165 	/*
4166 	 * Query for port information
4167 	 */
4168 	ret = ibt_query_hca_ports(state->id_hca_hdl, state->id_port,
4169 	    &port_infop, &psize, &port_infosz);
4170 	if ((ret != IBT_SUCCESS) || (psize != 1)) {
4171 		mutex_exit(&state->id_link_mutex);
4172 		DPRINT(10, "ibd_get_port_details: ibt_query_hca_ports() "
4173 		    "failed, ret=%d", ret);
4174 		return (ENETDOWN);
4175 	}
4176 
4177 	/*
4178 	 * If the link already went down by the time we get here,
4179 	 * give up
4180 	 */
4181 	if (port_infop->p_linkstate != IBT_PORT_ACTIVE) {
4182 		mutex_exit(&state->id_link_mutex);
4183 		ibt_free_portinfo(port_infop, port_infosz);
4184 		DPRINT(10, "ibd_get_port_details: port is not active");
4185 		return (ENETDOWN);
4186 	}
4187 
4188 	/*
4189 	 * If the link is active, verify the pkey
4190 	 */
4191 	if ((ret = ibt_pkey2index(state->id_hca_hdl, state->id_port,
4192 	    state->id_pkey, &state->id_pkix)) != IBT_SUCCESS) {
4193 		mutex_exit(&state->id_link_mutex);
4194 		ibt_free_portinfo(port_infop, port_infosz);
4195 		DPRINT(10, "ibd_get_port_details: ibt_pkey2index "
4196 		    "failed, ret=%d", ret);
4197 		return (ENONET);
4198 	}
4199 
4200 	state->id_mtu = (128 << port_infop->p_mtu);
4201 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_sgid))
4202 	state->id_sgid = *port_infop->p_sgid_tbl;
4203 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_sgid))
4204 	state->id_link_state = LINK_STATE_UP;
4205 
4206 	mutex_exit(&state->id_link_mutex);
4207 	ibt_free_portinfo(port_infop, port_infosz);
4208 
4209 	/*
4210 	 * Now that the port is active, record the port speed
4211 	 */
4212 	state->id_link_speed = ibd_get_portspeed(state);
4213 
4214 	return (0);
4215 }
4216 
4217 static int
4218 ibd_alloc_cqs(ibd_state_t *state)
4219 {
4220 	ibt_hca_attr_t hca_attrs;
4221 	ibt_cq_attr_t cq_attr;
4222 	ibt_status_t ret;
4223 	uint32_t real_size;
4224 
4225 	ret = ibt_query_hca(state->id_hca_hdl, &hca_attrs);
4226 	ASSERT(ret == IBT_SUCCESS);
4227 
4228 	/*
4229 	 * Allocate Rx/combined CQ:
4230 	 * Theoretically, there is no point in having more than #rwqe
4231 	 * plus #swqe cqe's, except that the CQ will be signaled for
4232 	 * overflow when the last wqe completes, if none of the previous
4233 	 * cqe's have been polled. Thus, we allocate just a few less wqe's
4234 	 * to make sure such overflow does not occur.
4235 	 */
4236 	cq_attr.cq_sched = NULL;
4237 	cq_attr.cq_flags = IBT_CQ_NO_FLAGS;
4238 
4239 	/*
4240 	 * Allocate Receive CQ.
4241 	 */
4242 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_rwqe + 1)) {
4243 		cq_attr.cq_size = state->id_num_rwqe + 1;
4244 	} else {
4245 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4246 		state->id_num_rwqe = cq_attr.cq_size - 1;
4247 	}
4248 
4249 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4250 	    &state->id_rcq_hdl, &real_size)) != IBT_SUCCESS) {
4251 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(rcq) "
4252 		    "failed, ret=%d\n", ret);
4253 		return (DDI_FAILURE);
4254 	}
4255 
4256 	if ((ret = ibt_modify_cq(state->id_rcq_hdl,
4257 	    ibd_rxcomp_count, ibd_rxcomp_usec, 0)) != IBT_SUCCESS) {
4258 		DPRINT(10, "ibd_alloc_cqs: Receive CQ interrupt "
4259 		    "moderation failed, ret=%d\n", ret);
4260 	}
4261 
4262 	/* make the #rx wc's the same as max rx chain size */
4263 	state->id_rxwcs_size = IBD_MAX_RX_MP_LEN;
4264 	state->id_rxwcs = kmem_alloc(sizeof (ibt_wc_t) *
4265 	    state->id_rxwcs_size, KM_SLEEP);
4266 
4267 	/*
4268 	 * Allocate Send CQ.
4269 	 */
4270 	if (hca_attrs.hca_max_cq_sz >= (state->id_num_swqe + 1)) {
4271 		cq_attr.cq_size = state->id_num_swqe + 1;
4272 	} else {
4273 		cq_attr.cq_size = hca_attrs.hca_max_cq_sz;
4274 		state->id_num_swqe = cq_attr.cq_size - 1;
4275 	}
4276 
4277 	if ((ret = ibt_alloc_cq(state->id_hca_hdl, &cq_attr,
4278 	    &state->id_scq_hdl, &real_size)) != IBT_SUCCESS) {
4279 		DPRINT(10, "ibd_alloc_cqs: ibt_alloc_cq(scq) "
4280 		    "failed, ret=%d\n", ret);
4281 		kmem_free(state->id_rxwcs, sizeof (ibt_wc_t) *
4282 		    state->id_rxwcs_size);
4283 		(void) ibt_free_cq(state->id_rcq_hdl);
4284 		return (DDI_FAILURE);
4285 	}
4286 	if ((ret = ibt_modify_cq(state->id_scq_hdl,
4287 	    ibd_txcomp_count, ibd_txcomp_usec, 0)) != IBT_SUCCESS) {
4288 		DPRINT(10, "ibd_alloc_cqs: Send CQ interrupt "
4289 		    "moderation failed, ret=%d\n", ret);
4290 	}
4291 
4292 	state->id_txwcs_size = IBD_TX_POLL_THRESH;
4293 	state->id_txwcs = kmem_alloc(sizeof (ibt_wc_t) *
4294 	    state->id_txwcs_size, KM_SLEEP);
4295 
4296 	/*
4297 	 * Print message in case we could not allocate as many wqe's
4298 	 * as was requested.
4299 	 */
4300 	if (state->id_num_rwqe != IBD_NUM_RWQE) {
4301 		ibd_print_warn(state, "Setting #rwqe = %d instead of default "
4302 		    "%d", state->id_num_rwqe, IBD_NUM_RWQE);
4303 	}
4304 	if (state->id_num_swqe != IBD_NUM_SWQE) {
4305 		ibd_print_warn(state, "Setting #swqe = %d instead of default "
4306 		    "%d", state->id_num_swqe, IBD_NUM_SWQE);
4307 	}
4308 
4309 	return (DDI_SUCCESS);
4310 }
4311 
4312 static int
4313 ibd_setup_ud_channel(ibd_state_t *state)
4314 {
4315 	ibt_ud_chan_alloc_args_t ud_alloc_attr;
4316 	ibt_ud_chan_query_attr_t ud_chan_attr;
4317 	ibt_status_t ret;
4318 
4319 	ud_alloc_attr.ud_flags  = IBT_ALL_SIGNALED;
4320 	if (state->id_hca_res_lkey_capab)
4321 		ud_alloc_attr.ud_flags |= IBT_FAST_REG_RES_LKEY;
4322 	if (state->id_lso_policy && state->id_lso_capable)
4323 		ud_alloc_attr.ud_flags |= IBT_USES_LSO;
4324 
4325 	ud_alloc_attr.ud_hca_port_num	= state->id_port;
4326 	ud_alloc_attr.ud_sizes.cs_sq_sgl = state->id_max_sqseg;
4327 	ud_alloc_attr.ud_sizes.cs_rq_sgl = IBD_MAX_RQSEG;
4328 	ud_alloc_attr.ud_sizes.cs_sq    = state->id_num_swqe;
4329 	ud_alloc_attr.ud_sizes.cs_rq    = state->id_num_rwqe;
4330 	ud_alloc_attr.ud_qkey		= state->id_mcinfo->mc_qkey;
4331 	ud_alloc_attr.ud_scq		= state->id_scq_hdl;
4332 	ud_alloc_attr.ud_rcq		= state->id_rcq_hdl;
4333 	ud_alloc_attr.ud_pd		= state->id_pd_hdl;
4334 	ud_alloc_attr.ud_pkey_ix	= state->id_pkix;
4335 	ud_alloc_attr.ud_clone_chan	= NULL;
4336 
4337 	if ((ret = ibt_alloc_ud_channel(state->id_hca_hdl, IBT_ACHAN_NO_FLAGS,
4338 	    &ud_alloc_attr, &state->id_chnl_hdl, NULL)) != IBT_SUCCESS) {
4339 		DPRINT(10, "ibd_setup_ud_channel: ibt_alloc_ud_channel() "
4340 		    "failed, ret=%d\n", ret);
4341 		return (DDI_FAILURE);
4342 	}
4343 
4344 	if ((ret = ibt_query_ud_channel(state->id_chnl_hdl,
4345 	    &ud_chan_attr)) != IBT_SUCCESS) {
4346 		DPRINT(10, "ibd_setup_ud_channel: ibt_query_ud_channel() "
4347 		    "failed, ret=%d\n", ret);
4348 		(void) ibt_free_channel(state->id_chnl_hdl);
4349 		return (DDI_FAILURE);
4350 	}
4351 
4352 	state->id_qpnum = ud_chan_attr.ud_qpn;
4353 
4354 	return (DDI_SUCCESS);
4355 }
4356 
4357 static int
4358 ibd_undo_start(ibd_state_t *state, link_state_t cur_link_state)
4359 {
4360 	uint32_t progress = state->id_mac_state;
4361 	uint_t attempts;
4362 	ibt_status_t ret;
4363 	ib_gid_t mgid;
4364 	ibd_mce_t *mce;
4365 	uint8_t jstate;
4366 
4367 	/*
4368 	 * Before we try to stop/undo whatever we did in ibd_start(),
4369 	 * we need to mark the link state appropriately to prevent the
4370 	 * ip layer from using this instance for any new transfers. Note
4371 	 * that if the original state of the link was "up" when we're
4372 	 * here, we'll set the final link state to "unknown", to behave
4373 	 * in the same fashion as other ethernet drivers.
4374 	 */
4375 	mutex_enter(&state->id_link_mutex);
4376 	if (cur_link_state == LINK_STATE_DOWN) {
4377 		state->id_link_state = cur_link_state;
4378 	} else {
4379 		state->id_link_state = LINK_STATE_UNKNOWN;
4380 	}
4381 	mutex_exit(&state->id_link_mutex);
4382 	mac_link_update(state->id_mh, state->id_link_state);
4383 
4384 	state->id_mac_state &= (~IBD_DRV_PORT_DETAILS_OBTAINED);
4385 	if (progress & IBD_DRV_STARTED) {
4386 		state->id_mac_state &= (~IBD_DRV_STARTED);
4387 	}
4388 
4389 	/*
4390 	 * First, stop receive interrupts; this stops the driver from
4391 	 * handing up buffers to higher layers.  Wait for receive buffers
4392 	 * to be returned and give up after 5 seconds.
4393 	 */
4394 	if (progress & IBD_DRV_RCQ_NOTIFY_ENABLED) {
4395 
4396 		ibt_set_cq_handler(state->id_rcq_hdl, 0, 0);
4397 
4398 		attempts = 50;
4399 		while (state->id_rx_list.dl_bufs_outstanding > 0) {
4400 			delay(drv_usectohz(100000));
4401 			if (--attempts == 0) {
4402 				/*
4403 				 * There are pending bufs with the network
4404 				 * layer and we have no choice but to wait
4405 				 * for them to be done with. Reap all the
4406 				 * Tx/Rx completions that were posted since
4407 				 * we turned off the notification and
4408 				 * return failure.
4409 				 */
4410 				DPRINT(2, "ibd_undo_start: "
4411 				    "reclaiming failed");
4412 				ibd_poll_rcq(state, state->id_rcq_hdl);
4413 				ibt_set_cq_handler(state->id_rcq_hdl,
4414 				    ibd_rcq_handler, state);
4415 				return (DDI_FAILURE);
4416 			}
4417 		}
4418 		state->id_mac_state &= (~IBD_DRV_RCQ_NOTIFY_ENABLED);
4419 	}
4420 
4421 	if (progress & IBD_DRV_SM_NOTICES_REGISTERED) {
4422 		ibt_register_subnet_notices(state->id_ibt_hdl, NULL, NULL);
4423 
4424 		mutex_enter(&state->id_trap_lock);
4425 		state->id_trap_stop = B_TRUE;
4426 		while (state->id_trap_inprog > 0)
4427 			cv_wait(&state->id_trap_cv, &state->id_trap_lock);
4428 		mutex_exit(&state->id_trap_lock);
4429 
4430 		state->id_mac_state &= (~IBD_DRV_SM_NOTICES_REGISTERED);
4431 	}
4432 
4433 	if (progress & IBD_DRV_SCQ_NOTIFY_ENABLED) {
4434 		/*
4435 		 * Flushing the channel ensures that all pending WQE's
4436 		 * are marked with flush_error and handed to the CQ. It
4437 		 * does not guarantee the invocation of the CQ handler.
4438 		 * This call is guaranteed to return successfully for
4439 		 * UD QPNs.
4440 		 */
4441 		if ((ret = ibt_flush_channel(state->id_chnl_hdl)) !=
4442 		    IBT_SUCCESS) {
4443 			DPRINT(10, "ibd_undo_start: flush_channel "
4444 			    "failed, ret=%d", ret);
4445 		}
4446 
4447 		/*
4448 		 * Turn off Tx interrupts and poll. By the time the polling
4449 		 * returns an empty indicator, we are sure we have seen all
4450 		 * pending Tx callbacks. Note that after the call to
4451 		 * ibt_set_cq_handler() returns, the old handler is
4452 		 * guaranteed not to be invoked anymore.
4453 		 */
4454 		ibt_set_cq_handler(state->id_scq_hdl, 0, 0);
4455 		ibd_poll_scq(state, state->id_scq_hdl);
4456 
4457 		state->id_mac_state &= (~IBD_DRV_SCQ_NOTIFY_ENABLED);
4458 	}
4459 
4460 	if (progress & IBD_DRV_ASYNC_THR_CREATED) {
4461 		/*
4462 		 * No new async requests will be posted since the device
4463 		 * link state has been marked as unknown; completion handlers
4464 		 * have been turned off, so Tx handler will not cause any
4465 		 * more IBD_ASYNC_REAP requests.
4466 		 *
4467 		 * Queue a request for the async thread to exit, which will
4468 		 * be serviced after any pending ones. This can take a while,
4469 		 * specially if the SM is unreachable, since IBMF will slowly
4470 		 * timeout each SM request issued by the async thread.  Reap
4471 		 * the thread before continuing on, we do not want it to be
4472 		 * lingering in modunloaded code (or we could move the reap
4473 		 * to ibd_detach(), provided we keep track of the current
4474 		 * id_async_thrid somewhere safe).
4475 		 */
4476 		ibd_queue_work_slot(state, &state->id_ah_req, IBD_ASYNC_EXIT);
4477 		thread_join(state->id_async_thrid);
4478 
4479 		state->id_mac_state &= (~IBD_DRV_ASYNC_THR_CREATED);
4480 	}
4481 
4482 	if (progress & IBD_DRV_BCAST_GROUP_JOINED) {
4483 		/*
4484 		 * Drop all residual full/non membership. This includes full
4485 		 * membership to the broadcast group, and any nonmembership
4486 		 * acquired during transmits. We do this after the Tx completion
4487 		 * handlers are done, since those might result in some late
4488 		 * leaves; this also eliminates a potential race with that
4489 		 * path wrt the mc full list insert/delete. Trap handling
4490 		 * has also been suppressed at this point. Thus, no locks
4491 		 * are required while traversing the mc full list.
4492 		 */
4493 		DPRINT(2, "ibd_undo_start: clear full cache entries");
4494 		mce = list_head(&state->id_mc_full);
4495 		while (mce != NULL) {
4496 			mgid = mce->mc_info.mc_adds_vect.av_dgid;
4497 			jstate = mce->mc_jstate;
4498 			mce = list_next(&state->id_mc_full, mce);
4499 			ibd_leave_group(state, mgid, jstate);
4500 		}
4501 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_JOINED);
4502 	}
4503 
4504 	if (progress & IBD_DRV_RXLIST_ALLOCD) {
4505 		ibd_fini_rxlist(state);
4506 		state->id_mac_state &= (~IBD_DRV_RXLIST_ALLOCD);
4507 	}
4508 
4509 	if (progress & IBD_DRV_TXLIST_ALLOCD) {
4510 		ibd_fini_txlist(state);
4511 		state->id_mac_state &= (~IBD_DRV_TXLIST_ALLOCD);
4512 	}
4513 
4514 	if (progress & IBD_DRV_UD_CHANNEL_SETUP) {
4515 		if ((ret = ibt_free_channel(state->id_chnl_hdl)) !=
4516 		    IBT_SUCCESS) {
4517 			DPRINT(10, "ibd_undo_start: free_channel "
4518 			    "failed, ret=%d", ret);
4519 		}
4520 
4521 		state->id_mac_state &= (~IBD_DRV_UD_CHANNEL_SETUP);
4522 	}
4523 
4524 	if (progress & IBD_DRV_CQS_ALLOCD) {
4525 		kmem_free(state->id_txwcs,
4526 		    sizeof (ibt_wc_t) * state->id_txwcs_size);
4527 		if ((ret = ibt_free_cq(state->id_scq_hdl)) !=
4528 		    IBT_SUCCESS) {
4529 			DPRINT(10, "ibd_undo_start: free_cq(scq) "
4530 			    "failed, ret=%d", ret);
4531 		}
4532 
4533 		kmem_free(state->id_rxwcs,
4534 		    sizeof (ibt_wc_t) * state->id_rxwcs_size);
4535 		if ((ret = ibt_free_cq(state->id_rcq_hdl)) != IBT_SUCCESS) {
4536 			DPRINT(10, "ibd_undo_start: free_cq(rcq) failed, "
4537 			    "ret=%d", ret);
4538 		}
4539 
4540 		state->id_txwcs = NULL;
4541 		state->id_rxwcs = NULL;
4542 		state->id_scq_hdl = NULL;
4543 		state->id_rcq_hdl = NULL;
4544 
4545 		state->id_mac_state &= (~IBD_DRV_CQS_ALLOCD);
4546 	}
4547 
4548 	if (progress & IBD_DRV_ACACHE_INITIALIZED) {
4549 		mutex_enter(&state->id_ac_mutex);
4550 		mod_hash_destroy_hash(state->id_ah_active_hash);
4551 		mutex_exit(&state->id_ac_mutex);
4552 		ibd_acache_fini(state);
4553 
4554 		state->id_mac_state &= (~IBD_DRV_ACACHE_INITIALIZED);
4555 	}
4556 
4557 	if (progress & IBD_DRV_BCAST_GROUP_FOUND) {
4558 		/*
4559 		 * If we'd created the ipoib broadcast group and had
4560 		 * successfully joined it, leave it now
4561 		 */
4562 		if (state->id_bgroup_created) {
4563 			mgid = state->id_mcinfo->mc_adds_vect.av_dgid;
4564 			jstate = IB_MC_JSTATE_FULL;
4565 			(void) ibt_leave_mcg(state->id_sgid, mgid,
4566 			    state->id_sgid, jstate);
4567 		}
4568 		ibt_free_mcg_info(state->id_mcinfo, 1);
4569 
4570 		state->id_mac_state &= (~IBD_DRV_BCAST_GROUP_FOUND);
4571 	}
4572 
4573 	return (DDI_SUCCESS);
4574 }
4575 
4576 /*
4577  * These pair of routines are used to set/clear the condition that
4578  * the caller is likely to do something to change the id_mac_state.
4579  * If there's already someone doing either a start or a stop (possibly
4580  * due to the async handler detecting a pkey relocation event, a plumb
4581  * or dlpi_open, or an unplumb or dlpi_close coming in), we wait until
4582  * that's done.
4583  */
4584 static void
4585 ibd_set_mac_progress(ibd_state_t *state, uint_t flag)
4586 {
4587 	mutex_enter(&state->id_macst_lock);
4588 	while (state->id_mac_state & IBD_DRV_RESTART_IN_PROGRESS)
4589 		cv_wait(&state->id_macst_cv, &state->id_macst_lock);
4590 
4591 	state->id_mac_state |= flag;
4592 	mutex_exit(&state->id_macst_lock);
4593 }
4594 
4595 static void
4596 ibd_clr_mac_progress(ibd_state_t *state, uint_t flag)
4597 {
4598 	mutex_enter(&state->id_macst_lock);
4599 	state->id_mac_state &= (~flag);
4600 	cv_signal(&state->id_macst_cv);
4601 	mutex_exit(&state->id_macst_lock);
4602 }
4603 
4604 /*
4605  * GLDv3 entry point to start hardware.
4606  */
4607 /*ARGSUSED*/
4608 static int
4609 ibd_m_start(void *arg)
4610 {
4611 	ibd_state_t *state = arg;
4612 	int	ret;
4613 
4614 	ibd_set_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4615 
4616 	ret = ibd_start(state);
4617 
4618 	ibd_clr_mac_progress(state, IBD_DRV_START_IN_PROGRESS);
4619 
4620 	return (ret);
4621 }
4622 
4623 static int
4624 ibd_start(ibd_state_t *state)
4625 {
4626 	kthread_t *kht;
4627 	int err;
4628 	ibt_status_t ret;
4629 
4630 	if (state->id_mac_state & IBD_DRV_STARTED)
4631 		return (DDI_SUCCESS);
4632 
4633 	/*
4634 	 * Get port details; if we fail here, very likely the port
4635 	 * state is inactive or the pkey can't be verified.
4636 	 */
4637 	if ((err = ibd_get_port_details(state)) != 0) {
4638 		DPRINT(10, "ibd_start: ibd_get_port_details() failed");
4639 		goto start_fail;
4640 	}
4641 	state->id_mac_state |= IBD_DRV_PORT_DETAILS_OBTAINED;
4642 
4643 	/*
4644 	 * Find the IPoIB broadcast group
4645 	 */
4646 	if (ibd_find_bgroup(state) != IBT_SUCCESS) {
4647 		DPRINT(10, "ibd_start: ibd_find_bgroup() failed");
4648 		err = ENOTACTIVE;
4649 		goto start_fail;
4650 	}
4651 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_FOUND;
4652 
4653 	/*
4654 	 * Initialize per-interface caches and lists; if we fail here,
4655 	 * it is most likely due to a lack of resources
4656 	 */
4657 	if (ibd_acache_init(state) != DDI_SUCCESS) {
4658 		DPRINT(10, "ibd_start: ibd_acache_init() failed");
4659 		err = ENOMEM;
4660 		goto start_fail;
4661 	}
4662 	state->id_mac_state |= IBD_DRV_ACACHE_INITIALIZED;
4663 
4664 	/*
4665 	 * Allocate send and receive completion queues
4666 	 */
4667 	if (ibd_alloc_cqs(state) != DDI_SUCCESS) {
4668 		DPRINT(10, "ibd_start: ibd_alloc_cqs() failed");
4669 		err = ENOMEM;
4670 		goto start_fail;
4671 	}
4672 	state->id_mac_state |= IBD_DRV_CQS_ALLOCD;
4673 
4674 	/*
4675 	 * Setup a UD channel
4676 	 */
4677 	if (ibd_setup_ud_channel(state) != DDI_SUCCESS) {
4678 		err = ENOMEM;
4679 		DPRINT(10, "ibd_start: ibd_setup_ud_channel() failed");
4680 		goto start_fail;
4681 	}
4682 	state->id_mac_state |= IBD_DRV_UD_CHANNEL_SETUP;
4683 
4684 	/*
4685 	 * Allocate and initialize the tx buffer list
4686 	 */
4687 	if (ibd_init_txlist(state) != DDI_SUCCESS) {
4688 		DPRINT(10, "ibd_start: ibd_init_txlist() failed");
4689 		err = ENOMEM;
4690 		goto start_fail;
4691 	}
4692 	state->id_mac_state |= IBD_DRV_TXLIST_ALLOCD;
4693 
4694 	/*
4695 	 * Create the send cq handler here
4696 	 */
4697 	ibt_set_cq_handler(state->id_scq_hdl, ibd_scq_handler, state);
4698 	if ((ret = ibt_enable_cq_notify(state->id_scq_hdl,
4699 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4700 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(scq) "
4701 		    "failed, ret=%d", ret);
4702 		err = EINVAL;
4703 		goto start_fail;
4704 	}
4705 	state->id_mac_state |= IBD_DRV_SCQ_NOTIFY_ENABLED;
4706 
4707 	/*
4708 	 * Allocate and initialize the rx buffer list
4709 	 */
4710 	if (ibd_init_rxlist(state) != DDI_SUCCESS) {
4711 		DPRINT(10, "ibd_start: ibd_init_rxlist() failed");
4712 		err = ENOMEM;
4713 		goto start_fail;
4714 	}
4715 	state->id_mac_state |= IBD_DRV_RXLIST_ALLOCD;
4716 
4717 	/*
4718 	 * Join IPoIB broadcast group
4719 	 */
4720 	if (ibd_join_group(state, state->id_mgid, IB_MC_JSTATE_FULL) == NULL) {
4721 		DPRINT(10, "ibd_start: ibd_join_group() failed");
4722 		err = ENOTACTIVE;
4723 		goto start_fail;
4724 	}
4725 	state->id_mac_state |= IBD_DRV_BCAST_GROUP_JOINED;
4726 
4727 	/*
4728 	 * Create the async thread; thread_create never fails.
4729 	 */
4730 	kht = thread_create(NULL, 0, ibd_async_work, state, 0, &p0,
4731 	    TS_RUN, minclsyspri);
4732 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4733 	state->id_async_thrid = kht->t_did;
4734 	_NOTE(NOW_VISIBLE_TO_OTHER_THREADS(state->id_async_thrid))
4735 	state->id_mac_state |= IBD_DRV_ASYNC_THR_CREATED;
4736 
4737 	/*
4738 	 * When we did mac_register() in ibd_attach(), we didn't register
4739 	 * the real macaddr and we didn't have the true port mtu. Now that
4740 	 * we're almost ready, set the local mac address and broadcast
4741 	 * addresses and update gldv3 about the real values of these
4742 	 * parameters.
4743 	 */
4744 	ibd_h2n_mac(&state->id_macaddr, state->id_qpnum,
4745 	    state->id_sgid.gid_prefix, state->id_sgid.gid_guid);
4746 	ibd_h2n_mac(&state->id_bcaddr, IB_QPN_MASK,
4747 	    state->id_mgid.gid_prefix, state->id_mgid.gid_guid);
4748 
4749 	(void) mac_maxsdu_update(state->id_mh, state->id_mtu - IPOIB_HDRSIZE);
4750 	mac_unicst_update(state->id_mh, (uint8_t *)&state->id_macaddr);
4751 
4752 	/*
4753 	 * Setup the receive cq handler
4754 	 */
4755 	ibt_set_cq_handler(state->id_rcq_hdl, ibd_rcq_handler, state);
4756 	if ((ret = ibt_enable_cq_notify(state->id_rcq_hdl,
4757 	    IBT_NEXT_COMPLETION)) != IBT_SUCCESS) {
4758 		DPRINT(10, "ibd_start: ibt_enable_cq_notify(rcq) "
4759 		    "failed, ret=%d", ret);
4760 		err = EINVAL;
4761 		goto start_fail;
4762 	}
4763 	state->id_mac_state |= IBD_DRV_RCQ_NOTIFY_ENABLED;
4764 
4765 	/*
4766 	 * Setup the subnet notices handler after we've initialized the acache/
4767 	 * mcache and started the async thread, both of which are required for
4768 	 * the trap handler to function properly.
4769 	 *
4770 	 * Now that the async thread has been started (and we've already done
4771 	 * a mac_register() during attach so mac_tx_update() can be called
4772 	 * if necessary without any problem), we can enable the trap handler
4773 	 * to queue requests to the async thread.
4774 	 */
4775 	ibt_register_subnet_notices(state->id_ibt_hdl,
4776 	    ibd_snet_notices_handler, state);
4777 	mutex_enter(&state->id_trap_lock);
4778 	state->id_trap_stop = B_FALSE;
4779 	mutex_exit(&state->id_trap_lock);
4780 	state->id_mac_state |= IBD_DRV_SM_NOTICES_REGISTERED;
4781 
4782 	/*
4783 	 * Indicate link status to GLDv3 and higher layers. By default,
4784 	 * we assume we are in up state (which must have been true at
4785 	 * least at the time the broadcast mcg's were probed); if there
4786 	 * were any up/down transitions till the time we come here, the
4787 	 * async handler will have updated last known state, which we
4788 	 * use to tell GLDv3. The async handler will not send any
4789 	 * notifications to GLDv3 till we reach here in the initialization
4790 	 * sequence.
4791 	 */
4792 	state->id_mac_state |= IBD_DRV_STARTED;
4793 	mac_link_update(state->id_mh, state->id_link_state);
4794 
4795 	return (DDI_SUCCESS);
4796 
4797 start_fail:
4798 	/*
4799 	 * If we ran into a problem during ibd_start() and ran into
4800 	 * some other problem during undoing our partial work, we can't
4801 	 * do anything about it.  Ignore any errors we might get from
4802 	 * ibd_undo_start() and just return the original error we got.
4803 	 */
4804 	(void) ibd_undo_start(state, LINK_STATE_DOWN);
4805 	return (err);
4806 }
4807 
4808 /*
4809  * GLDv3 entry point to stop hardware from receiving packets.
4810  */
4811 /*ARGSUSED*/
4812 static void
4813 ibd_m_stop(void *arg)
4814 {
4815 	ibd_state_t *state = (ibd_state_t *)arg;
4816 
4817 	ibd_set_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4818 
4819 	(void) ibd_undo_start(state, state->id_link_state);
4820 
4821 	ibd_clr_mac_progress(state, IBD_DRV_STOP_IN_PROGRESS);
4822 }
4823 
4824 /*
4825  * GLDv3 entry point to modify device's mac address. We do not
4826  * allow address modifications.
4827  */
4828 static int
4829 ibd_m_unicst(void *arg, const uint8_t *macaddr)
4830 {
4831 	ibd_state_t *state = arg;
4832 
4833 	/*
4834 	 * Don't bother even comparing the macaddr if we haven't
4835 	 * completed ibd_m_start().
4836 	 */
4837 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4838 		return (0);
4839 
4840 	if (bcmp(macaddr, &state->id_macaddr, IPOIB_ADDRL) == 0)
4841 		return (0);
4842 	else
4843 		return (EINVAL);
4844 }
4845 
4846 /*
4847  * The blocking part of the IBA join/leave operations are done out
4848  * of here on the async thread.
4849  */
4850 static void
4851 ibd_async_multicast(ibd_state_t *state, ib_gid_t mgid, int op)
4852 {
4853 	DPRINT(3, "ibd_async_multicast : async_setmc op %d :"
4854 	    "%016llx:%016llx\n", op, mgid.gid_prefix, mgid.gid_guid);
4855 
4856 	if (op == IBD_ASYNC_JOIN) {
4857 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_FULL) == NULL) {
4858 			ibd_print_warn(state, "Join multicast group failed :"
4859 			"%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4860 		}
4861 	} else {
4862 		/*
4863 		 * Here, we must search for the proper mcg_info and
4864 		 * use that to leave the group.
4865 		 */
4866 		ibd_leave_group(state, mgid, IB_MC_JSTATE_FULL);
4867 	}
4868 }
4869 
4870 /*
4871  * GLDv3 entry point for multicast enable/disable requests.
4872  * This function queues the operation to the async thread and
4873  * return success for a valid multicast address.
4874  */
4875 static int
4876 ibd_m_multicst(void *arg, boolean_t add, const uint8_t *mcmac)
4877 {
4878 	ibd_state_t *state = (ibd_state_t *)arg;
4879 	ipoib_mac_t maddr, *mcast;
4880 	ib_gid_t mgid;
4881 	ibd_req_t *req;
4882 
4883 	/*
4884 	 * If we haven't completed ibd_m_start(), async thread wouldn't
4885 	 * have been started and id_bcaddr wouldn't be set, so there's
4886 	 * no point in continuing.
4887 	 */
4888 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
4889 		return (0);
4890 
4891 	/*
4892 	 * The incoming multicast address might not be aligned properly
4893 	 * on a 4 byte boundary to be considered an ipoib_mac_t. We force
4894 	 * it to look like one though, to get the offsets of the mc gid,
4895 	 * since we know we are not going to dereference any values with
4896 	 * the ipoib_mac_t pointer.
4897 	 */
4898 	bcopy(mcmac, &maddr, sizeof (ipoib_mac_t));
4899 	mcast = &maddr;
4900 
4901 	/*
4902 	 * Check validity of MCG address. We could additionally check
4903 	 * that a enable/disable is not being issued on the "broadcast"
4904 	 * mcg, but since this operation is only invokable by privileged
4905 	 * programs anyway, we allow the flexibility to those dlpi apps.
4906 	 * Note that we do not validate the "scope" of the IBA mcg.
4907 	 */
4908 	if ((ntohl(mcast->ipoib_qpn) & IB_QPN_MASK) != IB_MC_QPN)
4909 		return (EINVAL);
4910 
4911 	/*
4912 	 * fill in multicast pkey and scope
4913 	 */
4914 	IBD_FILL_SCOPE_PKEY(mcast, state->id_scope, state->id_pkey);
4915 
4916 	/*
4917 	 * If someone is trying to JOIN/LEAVE the broadcast group, we do
4918 	 * nothing (i.e. we stay JOINed to the broadcast group done in
4919 	 * ibd_m_start()), to mimic ethernet behavior. IPv4 specifically
4920 	 * requires to be joined to broadcast groups at all times.
4921 	 * ibd_join_group() has an ASSERT(omce->mc_fullreap) that also
4922 	 * depends on this.
4923 	 */
4924 	if (bcmp(mcast, &state->id_bcaddr, IPOIB_ADDRL) == 0)
4925 		return (0);
4926 
4927 	ibd_n2h_gid(mcast, &mgid);
4928 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
4929 	if (req == NULL)
4930 		return (ENOMEM);
4931 
4932 	req->rq_gid = mgid;
4933 
4934 	if (add) {
4935 		DPRINT(1, "ibd_m_multicst : %016llx:%016llx\n",
4936 		    mgid.gid_prefix, mgid.gid_guid);
4937 		ibd_queue_work_slot(state, req, IBD_ASYNC_JOIN);
4938 	} else {
4939 		DPRINT(1, "ibd_m_multicst : unset_multicast : "
4940 		    "%016llx:%016llx", mgid.gid_prefix, mgid.gid_guid);
4941 		ibd_queue_work_slot(state, req, IBD_ASYNC_LEAVE);
4942 	}
4943 	return (0);
4944 }
4945 
4946 /*
4947  * The blocking part of the IBA promiscuous operations are done
4948  * out of here on the async thread. The dlpireq parameter indicates
4949  * whether this invocation is due to a dlpi request or due to
4950  * a port up/down event.
4951  */
4952 static void
4953 ibd_async_unsetprom(ibd_state_t *state)
4954 {
4955 	ibd_mce_t *mce = list_head(&state->id_mc_non);
4956 	ib_gid_t mgid;
4957 
4958 	DPRINT(2, "ibd_async_unsetprom : async_unset_promisc");
4959 
4960 	while (mce != NULL) {
4961 		mgid = mce->mc_info.mc_adds_vect.av_dgid;
4962 		mce = list_next(&state->id_mc_non, mce);
4963 		ibd_leave_group(state, mgid, IB_MC_JSTATE_NON);
4964 	}
4965 	state->id_prom_op = IBD_OP_NOTSTARTED;
4966 }
4967 
4968 /*
4969  * The blocking part of the IBA promiscuous operations are done
4970  * out of here on the async thread. The dlpireq parameter indicates
4971  * whether this invocation is due to a dlpi request or due to
4972  * a port up/down event.
4973  */
4974 static void
4975 ibd_async_setprom(ibd_state_t *state)
4976 {
4977 	ibt_mcg_attr_t mcg_attr;
4978 	ibt_mcg_info_t *mcg_info;
4979 	ib_gid_t mgid;
4980 	uint_t numg;
4981 	int i;
4982 	char ret = IBD_OP_COMPLETED;
4983 
4984 	DPRINT(2, "ibd_async_setprom : async_set_promisc");
4985 
4986 	/*
4987 	 * Obtain all active MC groups on the IB fabric with
4988 	 * specified criteria (scope + Pkey + Qkey + mtu).
4989 	 */
4990 	bzero(&mcg_attr, sizeof (mcg_attr));
4991 	mcg_attr.mc_pkey = state->id_pkey;
4992 	mcg_attr.mc_scope = state->id_scope;
4993 	mcg_attr.mc_qkey = state->id_mcinfo->mc_qkey;
4994 	mcg_attr.mc_mtu_req.r_mtu = state->id_mcinfo->mc_mtu;
4995 	mcg_attr.mc_mtu_req.r_selector = IBT_EQU;
4996 	if (ibt_query_mcg(state->id_sgid, &mcg_attr, 0, &mcg_info, &numg) !=
4997 	    IBT_SUCCESS) {
4998 		ibd_print_warn(state, "Could not get list of IBA multicast "
4999 		    "groups");
5000 		ret = IBD_OP_ERRORED;
5001 		goto done;
5002 	}
5003 
5004 	/*
5005 	 * Iterate over the returned mcg's and join as NonMember
5006 	 * to the IP mcg's.
5007 	 */
5008 	for (i = 0; i < numg; i++) {
5009 		/*
5010 		 * Do a NonMember JOIN on the MC group.
5011 		 */
5012 		mgid = mcg_info[i].mc_adds_vect.av_dgid;
5013 		if (ibd_join_group(state, mgid, IB_MC_JSTATE_NON) == NULL)
5014 			ibd_print_warn(state, "IBA promiscuous mode missed "
5015 			    "multicast gid %016llx:%016llx",
5016 			    (u_longlong_t)mgid.gid_prefix,
5017 			    (u_longlong_t)mgid.gid_guid);
5018 	}
5019 
5020 	ibt_free_mcg_info(mcg_info, numg);
5021 	DPRINT(4, "ibd_async_setprom : async_set_promisc completes");
5022 done:
5023 	state->id_prom_op = ret;
5024 }
5025 
5026 /*
5027  * GLDv3 entry point for multicast promiscuous enable/disable requests.
5028  * GLDv3 assumes phys state receives more packets than multi state,
5029  * which is not true for IPoIB. Thus, treat the multi and phys
5030  * promiscuous states the same way to work with GLDv3's assumption.
5031  */
5032 static int
5033 ibd_m_promisc(void *arg, boolean_t on)
5034 {
5035 	ibd_state_t *state = (ibd_state_t *)arg;
5036 	ibd_req_t *req;
5037 
5038 	/*
5039 	 * Async thread wouldn't have been started if we haven't
5040 	 * passed ibd_m_start()
5041 	 */
5042 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5043 		return (0);
5044 
5045 	req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5046 	if (req == NULL)
5047 		return (ENOMEM);
5048 	if (on) {
5049 		DPRINT(1, "ibd_m_promisc : set_promisc : %d", on);
5050 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMON);
5051 	} else {
5052 		DPRINT(1, "ibd_m_promisc : unset_promisc");
5053 		ibd_queue_work_slot(state, req, IBD_ASYNC_PROMOFF);
5054 	}
5055 
5056 	return (0);
5057 }
5058 
5059 /*
5060  * GLDv3 entry point for gathering statistics.
5061  */
5062 static int
5063 ibd_m_stat(void *arg, uint_t stat, uint64_t *val)
5064 {
5065 	ibd_state_t *state = (ibd_state_t *)arg;
5066 
5067 	switch (stat) {
5068 	case MAC_STAT_IFSPEED:
5069 		*val = state->id_link_speed;
5070 		break;
5071 	case MAC_STAT_MULTIRCV:
5072 		*val = state->id_multi_rcv;
5073 		break;
5074 	case MAC_STAT_BRDCSTRCV:
5075 		*val = state->id_brd_rcv;
5076 		break;
5077 	case MAC_STAT_MULTIXMT:
5078 		*val = state->id_multi_xmt;
5079 		break;
5080 	case MAC_STAT_BRDCSTXMT:
5081 		*val = state->id_brd_xmt;
5082 		break;
5083 	case MAC_STAT_RBYTES:
5084 		*val = state->id_rcv_bytes;
5085 		break;
5086 	case MAC_STAT_IPACKETS:
5087 		*val = state->id_rcv_pkt;
5088 		break;
5089 	case MAC_STAT_OBYTES:
5090 		*val = state->id_xmt_bytes;
5091 		break;
5092 	case MAC_STAT_OPACKETS:
5093 		*val = state->id_xmt_pkt;
5094 		break;
5095 	case MAC_STAT_OERRORS:
5096 		*val = state->id_ah_error;	/* failed AH translation */
5097 		break;
5098 	case MAC_STAT_IERRORS:
5099 		*val = 0;
5100 		break;
5101 	case MAC_STAT_NOXMTBUF:
5102 		*val = state->id_tx_short;
5103 		break;
5104 	case MAC_STAT_NORCVBUF:
5105 	default:
5106 		return (ENOTSUP);
5107 	}
5108 
5109 	return (0);
5110 }
5111 
5112 static void
5113 ibd_async_txsched(ibd_state_t *state)
5114 {
5115 	ibd_resume_transmission(state);
5116 }
5117 
5118 static void
5119 ibd_resume_transmission(ibd_state_t *state)
5120 {
5121 	int flag;
5122 	int met_thresh = 0;
5123 	int thresh = 0;
5124 	int ret = -1;
5125 
5126 	mutex_enter(&state->id_sched_lock);
5127 	if (state->id_sched_needed & IBD_RSRC_SWQE) {
5128 		mutex_enter(&state->id_tx_list.dl_mutex);
5129 		mutex_enter(&state->id_tx_rel_list.dl_mutex);
5130 		met_thresh = state->id_tx_list.dl_cnt +
5131 		    state->id_tx_rel_list.dl_cnt;
5132 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5133 		mutex_exit(&state->id_tx_list.dl_mutex);
5134 		thresh = IBD_FREE_SWQES_THRESH;
5135 		flag = IBD_RSRC_SWQE;
5136 	} else if (state->id_sched_needed & IBD_RSRC_LSOBUF) {
5137 		ASSERT(state->id_lso != NULL);
5138 		mutex_enter(&state->id_lso_lock);
5139 		met_thresh = state->id_lso->bkt_nfree;
5140 		thresh = IBD_FREE_LSOS_THRESH;
5141 		mutex_exit(&state->id_lso_lock);
5142 		flag = IBD_RSRC_LSOBUF;
5143 		if (met_thresh > thresh)
5144 			state->id_sched_lso_cnt++;
5145 	}
5146 	if (met_thresh > thresh) {
5147 		state->id_sched_needed &= ~flag;
5148 		state->id_sched_cnt++;
5149 		ret = 0;
5150 	}
5151 	mutex_exit(&state->id_sched_lock);
5152 
5153 	if (ret == 0)
5154 		mac_tx_update(state->id_mh);
5155 }
5156 
5157 /*
5158  * Release the send wqe back into free list.
5159  */
5160 static void
5161 ibd_release_swqe(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail, int n)
5162 {
5163 	/*
5164 	 * Add back on Tx list for reuse.
5165 	 */
5166 	ASSERT(tail->swqe_next == NULL);
5167 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5168 	state->id_tx_rel_list.dl_pending_sends = B_FALSE;
5169 	tail->swqe_next = state->id_tx_rel_list.dl_head;
5170 	state->id_tx_rel_list.dl_head = SWQE_TO_WQE(head);
5171 	state->id_tx_rel_list.dl_cnt += n;
5172 	mutex_exit(&state->id_tx_rel_list.dl_mutex);
5173 }
5174 
5175 /*
5176  * Acquire a send wqe from free list.
5177  * Returns error number and send wqe pointer.
5178  */
5179 static ibd_swqe_t *
5180 ibd_acquire_swqe(ibd_state_t *state)
5181 {
5182 	ibd_swqe_t *wqe;
5183 
5184 	mutex_enter(&state->id_tx_rel_list.dl_mutex);
5185 	if (state->id_tx_rel_list.dl_head != NULL) {
5186 		/* transfer id_tx_rel_list to id_tx_list */
5187 		state->id_tx_list.dl_head =
5188 		    state->id_tx_rel_list.dl_head;
5189 		state->id_tx_list.dl_cnt =
5190 		    state->id_tx_rel_list.dl_cnt;
5191 		state->id_tx_list.dl_pending_sends = B_FALSE;
5192 
5193 		/* clear id_tx_rel_list */
5194 		state->id_tx_rel_list.dl_head = NULL;
5195 		state->id_tx_rel_list.dl_cnt = 0;
5196 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5197 
5198 		wqe = WQE_TO_SWQE(state->id_tx_list.dl_head);
5199 		state->id_tx_list.dl_cnt -= 1;
5200 		state->id_tx_list.dl_head = wqe->swqe_next;
5201 	} else {	/* no free swqe */
5202 		mutex_exit(&state->id_tx_rel_list.dl_mutex);
5203 		state->id_tx_list.dl_pending_sends = B_TRUE;
5204 		DPRINT(5, "ibd_acquire_swqe: out of Tx wqe");
5205 		state->id_tx_short++;
5206 		wqe = NULL;
5207 	}
5208 	return (wqe);
5209 }
5210 
5211 static int
5212 ibd_setup_lso(ibd_swqe_t *node, mblk_t *mp, uint32_t mss,
5213     ibt_ud_dest_hdl_t ud_dest)
5214 {
5215 	mblk_t	*nmp;
5216 	int iph_len, tcph_len;
5217 	ibt_wr_lso_t *lso;
5218 	uintptr_t ip_start, tcp_start;
5219 	uint8_t *dst;
5220 	uint_t pending, mblen;
5221 
5222 	/*
5223 	 * The code in ibd_send would've set 'wr.ud.udwr_dest' by default;
5224 	 * we need to adjust it here for lso.
5225 	 */
5226 	lso = &(node->w_swr.wr.ud_lso);
5227 	lso->lso_ud_dest = ud_dest;
5228 	lso->lso_mss = mss;
5229 
5230 	/*
5231 	 * Calculate the LSO header size and set it in the UD LSO structure.
5232 	 * Note that the only assumption we make is that each of the IPoIB,
5233 	 * IP and TCP headers will be contained in a single mblk fragment;
5234 	 * together, the headers may span multiple mblk fragments.
5235 	 */
5236 	nmp = mp;
5237 	ip_start = (uintptr_t)(nmp->b_rptr) + IPOIB_HDRSIZE;
5238 	if (ip_start >= (uintptr_t)(nmp->b_wptr)) {
5239 		ip_start = (uintptr_t)nmp->b_cont->b_rptr
5240 		    + (ip_start - (uintptr_t)(nmp->b_wptr));
5241 		nmp = nmp->b_cont;
5242 
5243 	}
5244 	iph_len = IPH_HDR_LENGTH((ipha_t *)ip_start);
5245 
5246 	tcp_start = ip_start + iph_len;
5247 	if (tcp_start >= (uintptr_t)(nmp->b_wptr)) {
5248 		tcp_start = (uintptr_t)nmp->b_cont->b_rptr
5249 		    + (tcp_start - (uintptr_t)(nmp->b_wptr));
5250 		nmp = nmp->b_cont;
5251 	}
5252 	tcph_len = TCP_HDR_LENGTH((tcph_t *)tcp_start);
5253 	lso->lso_hdr_sz = IPOIB_HDRSIZE + iph_len + tcph_len;
5254 
5255 	/*
5256 	 * If the lso header fits entirely within a single mblk fragment,
5257 	 * we'll avoid an additional copy of the lso header here and just
5258 	 * pass the b_rptr of the mblk directly.
5259 	 *
5260 	 * If this isn't true, we'd have to allocate for it explicitly.
5261 	 */
5262 	if (lso->lso_hdr_sz <= MBLKL(mp)) {
5263 		lso->lso_hdr = mp->b_rptr;
5264 	} else {
5265 		/* On work completion, remember to free this allocated hdr */
5266 		lso->lso_hdr = kmem_zalloc(lso->lso_hdr_sz, KM_NOSLEEP);
5267 		if (lso->lso_hdr == NULL) {
5268 			DPRINT(10, "ibd_setup_lso: couldn't allocate lso hdr, "
5269 			    "sz = %d", lso->lso_hdr_sz);
5270 			lso->lso_hdr_sz = 0;
5271 			lso->lso_mss = 0;
5272 			return (-1);
5273 		}
5274 	}
5275 
5276 	/*
5277 	 * Copy in the lso header only if we need to
5278 	 */
5279 	if (lso->lso_hdr != mp->b_rptr) {
5280 		dst = lso->lso_hdr;
5281 		pending = lso->lso_hdr_sz;
5282 
5283 		for (nmp = mp; nmp && pending; nmp = nmp->b_cont) {
5284 			mblen = MBLKL(nmp);
5285 			if (pending > mblen) {
5286 				bcopy(nmp->b_rptr, dst, mblen);
5287 				dst += mblen;
5288 				pending -= mblen;
5289 			} else {
5290 				bcopy(nmp->b_rptr, dst, pending);
5291 				break;
5292 			}
5293 		}
5294 	}
5295 
5296 	return (0);
5297 }
5298 
5299 static void
5300 ibd_free_lsohdr(ibd_swqe_t *node, mblk_t *mp)
5301 {
5302 	ibt_wr_lso_t *lso;
5303 
5304 	if ((!node) || (!mp))
5305 		return;
5306 
5307 	/*
5308 	 * Free any header space that we might've allocated if we
5309 	 * did an LSO
5310 	 */
5311 	if (node->w_swr.wr_opcode == IBT_WRC_SEND_LSO) {
5312 		lso = &(node->w_swr.wr.ud_lso);
5313 		if ((lso->lso_hdr) && (lso->lso_hdr != mp->b_rptr)) {
5314 			kmem_free(lso->lso_hdr, lso->lso_hdr_sz);
5315 			lso->lso_hdr = NULL;
5316 			lso->lso_hdr_sz = 0;
5317 		}
5318 	}
5319 }
5320 
5321 static void
5322 ibd_post_send(ibd_state_t *state, ibd_swqe_t *node)
5323 {
5324 	uint_t		i;
5325 	uint_t		num_posted;
5326 	uint_t		n_wrs;
5327 	ibt_status_t	ibt_status;
5328 	ibt_send_wr_t	wrs[IBD_MAX_TX_POST_MULTIPLE];
5329 	ibd_swqe_t	*tx_head, *elem;
5330 	ibd_swqe_t	*nodes[IBD_MAX_TX_POST_MULTIPLE];
5331 
5332 	/* post the one request, then check for more */
5333 	ibt_status = ibt_post_send(state->id_chnl_hdl,
5334 	    &node->w_swr, 1, NULL);
5335 	if (ibt_status != IBT_SUCCESS) {
5336 		ibd_print_warn(state, "ibd_post_send: "
5337 		    "posting one wr failed: ret=%d", ibt_status);
5338 		ibd_tx_cleanup(state, node);
5339 	}
5340 
5341 	tx_head = NULL;
5342 	for (;;) {
5343 		if (tx_head == NULL) {
5344 			mutex_enter(&state->id_txpost_lock);
5345 			tx_head = state->id_tx_head;
5346 			if (tx_head == NULL) {
5347 				state->id_tx_busy = 0;
5348 				mutex_exit(&state->id_txpost_lock);
5349 				return;
5350 			}
5351 			state->id_tx_head = NULL;
5352 			mutex_exit(&state->id_txpost_lock);
5353 		}
5354 
5355 		/*
5356 		 * Collect pending requests, IBD_MAX_TX_POST_MULTIPLE wrs
5357 		 * at a time if possible, and keep posting them.
5358 		 */
5359 		for (n_wrs = 0, elem = tx_head;
5360 		    (elem) && (n_wrs < IBD_MAX_TX_POST_MULTIPLE);
5361 		    elem = WQE_TO_SWQE(elem->swqe_next), n_wrs++) {
5362 			nodes[n_wrs] = elem;
5363 			wrs[n_wrs] = elem->w_swr;
5364 		}
5365 		tx_head = elem;
5366 
5367 		ASSERT(n_wrs != 0);
5368 
5369 		/*
5370 		 * If posting fails for some reason, we'll never receive
5371 		 * completion intimation, so we'll need to cleanup. But
5372 		 * we need to make sure we don't clean up nodes whose
5373 		 * wrs have been successfully posted. We assume that the
5374 		 * hca driver returns on the first failure to post and
5375 		 * therefore the first 'num_posted' entries don't need
5376 		 * cleanup here.
5377 		 */
5378 		num_posted = 0;
5379 		ibt_status = ibt_post_send(state->id_chnl_hdl,
5380 		    wrs, n_wrs, &num_posted);
5381 		if (ibt_status != IBT_SUCCESS) {
5382 			ibd_print_warn(state, "ibd_post_send: "
5383 			    "posting multiple wrs failed: "
5384 			    "requested=%d, done=%d, ret=%d",
5385 			    n_wrs, num_posted, ibt_status);
5386 
5387 			for (i = num_posted; i < n_wrs; i++)
5388 				ibd_tx_cleanup(state, nodes[i]);
5389 		}
5390 	}
5391 }
5392 
5393 static int
5394 ibd_prepare_sgl(ibd_state_t *state, mblk_t *mp, ibd_swqe_t *node,
5395     uint_t lsohdr_sz)
5396 {
5397 	ibt_wr_ds_t *sgl;
5398 	ibt_status_t ibt_status;
5399 	mblk_t *nmp;
5400 	mblk_t *data_mp;
5401 	uchar_t *bufp;
5402 	size_t blksize;
5403 	size_t skip;
5404 	size_t avail;
5405 	uint_t pktsize;
5406 	uint_t frag_len;
5407 	uint_t pending_hdr;
5408 	int nmblks;
5409 	int i;
5410 
5411 	/*
5412 	 * Let's skip ahead to the data if this is LSO
5413 	 */
5414 	data_mp = mp;
5415 	pending_hdr = 0;
5416 	if (lsohdr_sz) {
5417 		pending_hdr = lsohdr_sz;
5418 		for (nmp = mp; nmp; nmp = nmp->b_cont) {
5419 			frag_len = nmp->b_wptr - nmp->b_rptr;
5420 			if (frag_len > pending_hdr)
5421 				break;
5422 			pending_hdr -= frag_len;
5423 		}
5424 		data_mp = nmp;	/* start of data past lso header */
5425 		ASSERT(data_mp != NULL);
5426 	}
5427 
5428 	/*
5429 	 * Calculate the size of message data and number of msg blocks
5430 	 */
5431 	pktsize = 0;
5432 	for (nmblks = 0, nmp = data_mp; nmp != NULL;
5433 	    nmp = nmp->b_cont, nmblks++) {
5434 		pktsize += MBLKL(nmp);
5435 	}
5436 	pktsize -= pending_hdr;
5437 
5438 	/*
5439 	 * We only do ibt_map_mem_iov() if the pktsize is above the
5440 	 * "copy-threshold", and if the number of mp fragments is less than
5441 	 * the maximum acceptable.
5442 	 */
5443 	if ((state->id_hca_res_lkey_capab) &&
5444 	    (pktsize > IBD_TX_COPY_THRESH) &&
5445 	    (nmblks < state->id_max_sqseg_hiwm)) {
5446 		ibt_iov_t iov_arr[IBD_MAX_SQSEG];
5447 		ibt_iov_attr_t iov_attr;
5448 
5449 		iov_attr.iov_as = NULL;
5450 		iov_attr.iov = iov_arr;
5451 		iov_attr.iov_buf = NULL;
5452 		iov_attr.iov_list_len = nmblks;
5453 		iov_attr.iov_wr_nds = state->id_max_sqseg;
5454 		iov_attr.iov_lso_hdr_sz = lsohdr_sz;
5455 		iov_attr.iov_flags = IBT_IOV_SLEEP;
5456 
5457 		for (nmp = data_mp, i = 0; i < nmblks; i++, nmp = nmp->b_cont) {
5458 			iov_arr[i].iov_addr = (caddr_t)(void *)nmp->b_rptr;
5459 			iov_arr[i].iov_len = MBLKL(nmp);
5460 			if (i == 0) {
5461 				iov_arr[i].iov_addr += pending_hdr;
5462 				iov_arr[i].iov_len -= pending_hdr;
5463 			}
5464 		}
5465 
5466 		node->w_buftype = IBD_WQE_MAPPED;
5467 		node->w_swr.wr_sgl = node->w_sgl;
5468 
5469 		ibt_status = ibt_map_mem_iov(state->id_hca_hdl, &iov_attr,
5470 		    (ibt_all_wr_t *)&node->w_swr, &node->w_mi_hdl);
5471 		if (ibt_status != IBT_SUCCESS) {
5472 			ibd_print_warn(state, "ibd_send: ibt_map_mem_iov "
5473 			    "failed, nmblks=%d, ret=%d\n", nmblks, ibt_status);
5474 			goto ibd_copy_path;
5475 		}
5476 
5477 		return (0);
5478 	}
5479 
5480 ibd_copy_path:
5481 	if (pktsize <= state->id_tx_buf_sz) {
5482 		node->swqe_copybuf.ic_sgl.ds_len = pktsize;
5483 		node->w_swr.wr_nds = 1;
5484 		node->w_swr.wr_sgl = &node->swqe_copybuf.ic_sgl;
5485 		node->w_buftype = IBD_WQE_TXBUF;
5486 
5487 		/*
5488 		 * Even though this is the copy path for transfers less than
5489 		 * id_tx_buf_sz, it could still be an LSO packet.  If so, it
5490 		 * is possible the first data mblk fragment (data_mp) still
5491 		 * contains part of the LSO header that we need to skip.
5492 		 */
5493 		bufp = (uchar_t *)(uintptr_t)node->w_swr.wr_sgl->ds_va;
5494 		for (nmp = data_mp; nmp != NULL; nmp = nmp->b_cont) {
5495 			blksize = MBLKL(nmp) - pending_hdr;
5496 			bcopy(nmp->b_rptr + pending_hdr, bufp, blksize);
5497 			bufp += blksize;
5498 			pending_hdr = 0;
5499 		}
5500 
5501 		return (0);
5502 	}
5503 
5504 	/*
5505 	 * Copy path for transfers greater than id_tx_buf_sz
5506 	 */
5507 	node->w_swr.wr_sgl = node->w_sgl;
5508 	if (ibd_acquire_lsobufs(state, pktsize,
5509 	    node->w_swr.wr_sgl, &(node->w_swr.wr_nds)) != 0) {
5510 		DPRINT(10, "ibd_prepare_sgl: lso bufs acquire failed");
5511 		return (-1);
5512 	}
5513 	node->w_buftype = IBD_WQE_LSOBUF;
5514 
5515 	/*
5516 	 * Copy the larger-than-id_tx_buf_sz packet into a set of
5517 	 * fixed-sized, pre-mapped LSO buffers. Note that we might
5518 	 * need to skip part of the LSO header in the first fragment
5519 	 * as before.
5520 	 */
5521 	nmp = data_mp;
5522 	skip = pending_hdr;
5523 	for (i = 0; i < node->w_swr.wr_nds; i++) {
5524 		sgl = node->w_swr.wr_sgl + i;
5525 		bufp = (uchar_t *)(uintptr_t)sgl->ds_va;
5526 		avail = IBD_LSO_BUFSZ;
5527 		while (nmp && avail) {
5528 			blksize = MBLKL(nmp) - skip;
5529 			if (blksize > avail) {
5530 				bcopy(nmp->b_rptr + skip, bufp, avail);
5531 				skip += avail;
5532 				avail = 0;
5533 			} else {
5534 				bcopy(nmp->b_rptr + skip, bufp, blksize);
5535 				skip = 0;
5536 				avail -= blksize;
5537 				bufp += blksize;
5538 				nmp = nmp->b_cont;
5539 			}
5540 		}
5541 	}
5542 
5543 	return (0);
5544 }
5545 
5546 /*
5547  * Schedule a completion queue polling to reap the resource we're
5548  * short on.  If we implement the change to reap tx completions
5549  * in a separate thread, we'll need to wake up that thread here.
5550  */
5551 static int
5552 ibd_sched_poll(ibd_state_t *state, int resource_type, int q_flag)
5553 {
5554 	ibd_req_t *req;
5555 
5556 	mutex_enter(&state->id_sched_lock);
5557 	state->id_sched_needed |= resource_type;
5558 	mutex_exit(&state->id_sched_lock);
5559 
5560 	/*
5561 	 * If we are asked to queue a work entry, we need to do it
5562 	 */
5563 	if (q_flag) {
5564 		req = kmem_cache_alloc(state->id_req_kmc, KM_NOSLEEP);
5565 		if (req == NULL)
5566 			return (-1);
5567 
5568 		ibd_queue_work_slot(state, req, IBD_ASYNC_SCHED);
5569 	}
5570 
5571 	return (0);
5572 }
5573 
5574 /*
5575  * The passed in packet has this format:
5576  * IPOIB_ADDRL b dest addr :: 2b sap :: 2b 0's :: data
5577  */
5578 static boolean_t
5579 ibd_send(ibd_state_t *state, mblk_t *mp)
5580 {
5581 	ibd_ace_t *ace;
5582 	ibd_swqe_t *node;
5583 	ipoib_mac_t *dest;
5584 	ib_header_info_t *ipibp;
5585 	ip6_t *ip6h;
5586 	uint_t pktsize;
5587 	uint32_t mss;
5588 	uint32_t hckflags;
5589 	uint32_t lsoflags = 0;
5590 	uint_t lsohdr_sz = 0;
5591 	int ret, len;
5592 	boolean_t dofree = B_FALSE;
5593 	boolean_t rc;
5594 
5595 	/*
5596 	 * If we aren't done with the device initialization and start,
5597 	 * we shouldn't be here.
5598 	 */
5599 	if ((state->id_mac_state & IBD_DRV_STARTED) == 0)
5600 		return (B_FALSE);
5601 
5602 	mutex_enter(&state->id_tx_list.dl_mutex);
5603 	node = WQE_TO_SWQE(state->id_tx_list.dl_head);
5604 	if (node != NULL) {
5605 		state->id_tx_list.dl_cnt -= 1;
5606 		state->id_tx_list.dl_head = node->swqe_next;
5607 	} else {
5608 		node = ibd_acquire_swqe(state);
5609 	}
5610 	mutex_exit(&state->id_tx_list.dl_mutex);
5611 	if (node == NULL) {
5612 		/*
5613 		 * If we don't have an swqe available, schedule a transmit
5614 		 * completion queue cleanup and hold off on sending more
5615 		 * more packets until we have some free swqes
5616 		 */
5617 		if (ibd_sched_poll(state, IBD_RSRC_SWQE, 0) == 0)
5618 			return (B_FALSE);
5619 
5620 		/*
5621 		 * If a poll cannot be scheduled, we have no choice but
5622 		 * to drop this packet
5623 		 */
5624 		ibd_print_warn(state, "ibd_send: no swqe, pkt drop");
5625 		return (B_TRUE);
5626 	}
5627 
5628 	/*
5629 	 * Initialize the commonly used fields in swqe to NULL to protect
5630 	 * against ibd_tx_cleanup accidentally misinterpreting these on a
5631 	 * failure.
5632 	 */
5633 	node->swqe_im_mblk = NULL;
5634 	node->w_swr.wr_nds = 0;
5635 	node->w_swr.wr_sgl = NULL;
5636 	node->w_swr.wr_opcode = IBT_WRC_SEND;
5637 
5638 	/*
5639 	 * Obtain an address handle for the destination.
5640 	 */
5641 	ipibp = (ib_header_info_t *)mp->b_rptr;
5642 	dest = (ipoib_mac_t *)&ipibp->ib_dst;
5643 	if ((ntohl(dest->ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5644 		IBD_FILL_SCOPE_PKEY(dest, state->id_scope, state->id_pkey);
5645 
5646 	pktsize = msgsize(mp);
5647 
5648 	atomic_add_64(&state->id_xmt_bytes, pktsize);
5649 	atomic_inc_64(&state->id_xmt_pkt);
5650 	if (bcmp(&ipibp->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
5651 		atomic_inc_64(&state->id_brd_xmt);
5652 	else if ((ntohl(ipibp->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
5653 		atomic_inc_64(&state->id_multi_xmt);
5654 
5655 	if ((ace = ibd_acache_lookup(state, dest, &ret, 1)) != NULL) {
5656 		node->w_ahandle = ace;
5657 		node->w_swr.wr.ud.udwr_dest = ace->ac_dest;
5658 	} else {
5659 		DPRINT(5,
5660 		    "ibd_send: acache lookup %s for %08X:%08X:%08X:%08X:%08X",
5661 		    ((ret == EFAULT) ? "failed" : "queued"),
5662 		    htonl(dest->ipoib_qpn), htonl(dest->ipoib_gidpref[0]),
5663 		    htonl(dest->ipoib_gidpref[1]),
5664 		    htonl(dest->ipoib_gidsuff[0]),
5665 		    htonl(dest->ipoib_gidsuff[1]));
5666 		node->w_ahandle = NULL;
5667 
5668 		/*
5669 		 * Here if ibd_acache_lookup() returns EFAULT, it means ibd
5670 		 * can not find a path for the specific dest address. We
5671 		 * should get rid of this kind of packet.  We also should get
5672 		 * rid of the packet if we cannot schedule a poll via the
5673 		 * async thread.  For the normal case, ibd will return the
5674 		 * packet to upper layer and wait for AH creating.
5675 		 *
5676 		 * Note that we always queue a work slot entry for the async
5677 		 * thread when we fail AH lookup (even in intr mode); this is
5678 		 * due to the convoluted way the code currently looks for AH.
5679 		 */
5680 		if (ret == EFAULT) {
5681 			dofree = B_TRUE;
5682 			rc = B_TRUE;
5683 		} else if (ibd_sched_poll(state, IBD_RSRC_SWQE, 1) != 0) {
5684 			dofree = B_TRUE;
5685 			rc = B_TRUE;
5686 		} else {
5687 			dofree = B_FALSE;
5688 			rc = B_FALSE;
5689 		}
5690 		goto ibd_send_fail;
5691 	}
5692 
5693 	/*
5694 	 * For ND6 packets, padding is at the front of the source lladdr.
5695 	 * Insert the padding at front.
5696 	 */
5697 	if (ntohs(ipibp->ipib_rhdr.ipoib_type) == ETHERTYPE_IPV6) {
5698 		if (MBLKL(mp) < sizeof (ib_header_info_t) + IPV6_HDR_LEN) {
5699 			if (!pullupmsg(mp, IPV6_HDR_LEN +
5700 			    sizeof (ib_header_info_t))) {
5701 				DPRINT(10, "ibd_send: pullupmsg failure ");
5702 				dofree = B_TRUE;
5703 				rc = B_TRUE;
5704 				goto ibd_send_fail;
5705 			}
5706 			ipibp = (ib_header_info_t *)mp->b_rptr;
5707 		}
5708 		ip6h = (ip6_t *)((uchar_t *)ipibp +
5709 		    sizeof (ib_header_info_t));
5710 		len = ntohs(ip6h->ip6_plen);
5711 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
5712 			mblk_t	*pad;
5713 
5714 			pad = allocb(4, 0);
5715 			pad->b_wptr = (uchar_t *)pad->b_rptr + 4;
5716 			linkb(mp, pad);
5717 			if (MBLKL(mp) < sizeof (ib_header_info_t) +
5718 			    IPV6_HDR_LEN + len + 4) {
5719 				if (!pullupmsg(mp, sizeof (ib_header_info_t) +
5720 				    IPV6_HDR_LEN + len + 4)) {
5721 					DPRINT(10, "ibd_send: pullupmsg "
5722 					    "failure ");
5723 					dofree = B_TRUE;
5724 					rc = B_TRUE;
5725 					goto ibd_send_fail;
5726 				}
5727 				ip6h = (ip6_t *)((uchar_t *)mp->b_rptr +
5728 				    sizeof (ib_header_info_t));
5729 			}
5730 
5731 			/* LINTED: E_CONSTANT_CONDITION */
5732 			IBD_PAD_NSNA(ip6h, len, IBD_SEND);
5733 		}
5734 	}
5735 
5736 	mp->b_rptr += sizeof (ib_addrs_t);
5737 
5738 	/*
5739 	 * Do LSO and checksum related work here.  For LSO send, adjust the
5740 	 * ud destination, the opcode and the LSO header information to the
5741 	 * work request.
5742 	 */
5743 	lso_info_get(mp, &mss, &lsoflags);
5744 	if ((lsoflags & HW_LSO) != HW_LSO) {
5745 		node->w_swr.wr_opcode = IBT_WRC_SEND;
5746 		lsohdr_sz = 0;
5747 	} else {
5748 		if (ibd_setup_lso(node, mp, mss, ace->ac_dest) != 0) {
5749 			/*
5750 			 * The routine can only fail if there's no memory; we
5751 			 * can only drop the packet if this happens
5752 			 */
5753 			ibd_print_warn(state,
5754 			    "ibd_send: no memory, lso posting failed");
5755 			dofree = B_TRUE;
5756 			rc = B_TRUE;
5757 			goto ibd_send_fail;
5758 		}
5759 
5760 		node->w_swr.wr_opcode = IBT_WRC_SEND_LSO;
5761 		lsohdr_sz = (node->w_swr.wr.ud_lso).lso_hdr_sz;
5762 	}
5763 
5764 	hcksum_retrieve(mp, NULL, NULL, NULL, NULL, NULL, NULL, &hckflags);
5765 	if ((hckflags & HCK_FULLCKSUM) == HCK_FULLCKSUM)
5766 		node->w_swr.wr_flags |= IBT_WR_SEND_CKSUM;
5767 	else
5768 		node->w_swr.wr_flags &= ~IBT_WR_SEND_CKSUM;
5769 
5770 	/*
5771 	 * Prepare the sgl for posting; the routine can only fail if there's
5772 	 * no lso buf available for posting. If this is the case, we should
5773 	 * probably resched for lso bufs to become available and then try again.
5774 	 */
5775 	if (ibd_prepare_sgl(state, mp, node, lsohdr_sz) != 0) {
5776 		if (ibd_sched_poll(state, IBD_RSRC_LSOBUF, 1) != 0) {
5777 			dofree = B_TRUE;
5778 			rc = B_TRUE;
5779 		} else {
5780 			dofree = B_FALSE;
5781 			rc = B_FALSE;
5782 		}
5783 		goto ibd_send_fail;
5784 	}
5785 	node->swqe_im_mblk = mp;
5786 
5787 	/*
5788 	 * Queue the wqe to hardware; since we can now simply queue a
5789 	 * post instead of doing it serially, we cannot assume anything
5790 	 * about the 'node' after ibd_post_send() returns.
5791 	 */
5792 	node->swqe_next = NULL;
5793 
5794 	mutex_enter(&state->id_txpost_lock);
5795 	if (state->id_tx_busy) {
5796 		if (state->id_tx_head) {
5797 			state->id_tx_tail->swqe_next =
5798 			    SWQE_TO_WQE(node);
5799 		} else {
5800 			state->id_tx_head = node;
5801 		}
5802 		state->id_tx_tail = node;
5803 		mutex_exit(&state->id_txpost_lock);
5804 	} else {
5805 		state->id_tx_busy = 1;
5806 		mutex_exit(&state->id_txpost_lock);
5807 		ibd_post_send(state, node);
5808 	}
5809 
5810 	return (B_TRUE);
5811 
5812 ibd_send_fail:
5813 	if (node && mp)
5814 		ibd_free_lsohdr(node, mp);
5815 
5816 	if (dofree)
5817 		freemsg(mp);
5818 
5819 	if (node != NULL)
5820 		ibd_tx_cleanup(state, node);
5821 
5822 	return (rc);
5823 }
5824 
5825 /*
5826  * GLDv3 entry point for transmitting datagram.
5827  */
5828 static mblk_t *
5829 ibd_m_tx(void *arg, mblk_t *mp)
5830 {
5831 	ibd_state_t *state = (ibd_state_t *)arg;
5832 	mblk_t *next;
5833 
5834 	if (state->id_link_state != LINK_STATE_UP) {
5835 		freemsgchain(mp);
5836 		mp = NULL;
5837 	}
5838 
5839 	while (mp != NULL) {
5840 		next = mp->b_next;
5841 		mp->b_next = NULL;
5842 		if (ibd_send(state, mp) == B_FALSE) {
5843 			/* Send fail */
5844 			mp->b_next = next;
5845 			break;
5846 		}
5847 		mp = next;
5848 	}
5849 
5850 	return (mp);
5851 }
5852 
5853 /*
5854  * this handles Tx and Rx completions. With separate CQs, this handles
5855  * only Rx completions.
5856  */
5857 static uint_t
5858 ibd_intr(caddr_t arg)
5859 {
5860 	ibd_state_t *state = (ibd_state_t *)arg;
5861 
5862 	ibd_poll_rcq(state, state->id_rcq_hdl);
5863 
5864 	return (DDI_INTR_CLAIMED);
5865 }
5866 
5867 /*
5868  * Poll and fully drain the send cq
5869  */
5870 static void
5871 ibd_drain_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5872 {
5873 	ibt_wc_t *wcs = state->id_txwcs;
5874 	uint_t numwcs = state->id_txwcs_size;
5875 	ibd_wqe_t *wqe;
5876 	ibd_swqe_t *head, *tail;
5877 	ibt_wc_t *wc;
5878 	uint_t num_polled;
5879 	int i;
5880 
5881 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5882 		head = tail = NULL;
5883 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5884 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5885 			ASSERT(wqe->w_type == IBD_WQE_SEND);
5886 			if (wc->wc_status != IBT_WC_SUCCESS) {
5887 				/*
5888 				 * Channel being torn down.
5889 				 */
5890 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5891 					DPRINT(5, "ibd_drain_scq: flush error");
5892 					/*
5893 					 * Only invoke the Tx handler to
5894 					 * release possibly held resources
5895 					 * like AH refcount etc.
5896 					 */
5897 					DPRINT(10, "ibd_drain_scq: Bad "
5898 					    "status %d", wc->wc_status);
5899 				}
5900 				return;	/* give up.  no need to clean up */
5901 			}
5902 			/*
5903 			 * Add this swqe to the list to be cleaned up.
5904 			 */
5905 			if (head)
5906 				tail->swqe_next = wqe;
5907 			else
5908 				head = WQE_TO_SWQE(wqe);
5909 			tail = WQE_TO_SWQE(wqe);
5910 		}
5911 		tail->swqe_next = NULL;
5912 		ibd_tx_cleanup_list(state, head, tail);
5913 
5914 		/*
5915 		 * Resume any blocked transmissions if possible
5916 		 */
5917 		ibd_resume_transmission(state);
5918 	}
5919 }
5920 
5921 /*
5922  * Poll and fully drain the receive cq
5923  */
5924 static void
5925 ibd_drain_rcq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5926 {
5927 	ibt_wc_t *wcs = state->id_rxwcs;
5928 	uint_t numwcs = state->id_rxwcs_size;
5929 	ibd_wqe_t *wqe;
5930 	ibt_wc_t *wc;
5931 	uint_t num_polled;
5932 	int i;
5933 	mblk_t *head, *tail, *mp;
5934 
5935 	while (ibt_poll_cq(cq_hdl, wcs, numwcs, &num_polled) == IBT_SUCCESS) {
5936 		head = tail = NULL;
5937 		for (i = 0, wc = wcs; i < num_polled; i++, wc++) {
5938 			wqe = (ibd_wqe_t *)(uintptr_t)wc->wc_id;
5939 			ASSERT(wqe->w_type == IBD_WQE_RECV);
5940 			if (wc->wc_status != IBT_WC_SUCCESS) {
5941 				/*
5942 				 * Channel being torn down.
5943 				 */
5944 				if (wc->wc_status == IBT_WC_WR_FLUSHED_ERR) {
5945 					DPRINT(5, "ibd_drain_rcq: flush error");
5946 					/*
5947 					 * Do not invoke Rx handler because
5948 					 * it might add buffers to the Rx pool
5949 					 * when we are trying to deinitialize.
5950 					 */
5951 					continue;
5952 				}
5953 			}
5954 			mp = ibd_process_rx(state, WQE_TO_RWQE(wqe), wc);
5955 			if (mp == NULL)
5956 				continue;
5957 
5958 			/*
5959 			 * Add this mp to the list to send to the nw layer.
5960 			 */
5961 			if (head)
5962 				tail->b_next = mp;
5963 			else
5964 				head = mp;
5965 			tail = mp;
5966 		}
5967 		if (head)
5968 			mac_rx(state->id_mh, state->id_rh, head);
5969 	}
5970 }
5971 
5972 /*
5973  * Common code for interrupt handling as well as for polling
5974  * for all completed wqe's while detaching.
5975  */
5976 static void
5977 ibd_poll_scq(ibd_state_t *state, ibt_cq_hdl_t cq_hdl)
5978 {
5979 	int flag, redo_flag;
5980 	int redo = 1;
5981 
5982 	flag = IBD_CQ_POLLING;
5983 	redo_flag = IBD_REDO_CQ_POLLING;
5984 
5985 	mutex_enter(&state->id_scq_poll_lock);
5986 	if (state->id_scq_poll_busy & flag) {
5987 		ibd_print_warn(state, "ibd_poll_scq: multiple polling threads");
5988 		state->id_scq_poll_busy |= redo_flag;
5989 		mutex_exit(&state->id_scq_poll_lock);
5990 		return;
5991 	}
5992 	state->id_scq_poll_busy |= flag;
5993 	mutex_exit(&state->id_scq_poll_lock);
5994 
5995 	/*
5996 	 * In some cases (eg detaching), this code can be invoked on
5997 	 * any cpu after disabling cq notification (thus no concurrency
5998 	 * exists). Apart from that, the following applies normally:
5999 	 * Transmit completion handling could be from any cpu if
6000 	 * Tx CQ is poll driven, but always on Tx interrupt cpu if Tx CQ
6001 	 * is interrupt driven.
6002 	 */
6003 
6004 	/*
6005 	 * Poll and drain the CQ
6006 	 */
6007 	ibd_drain_scq(state, cq_hdl);
6008 
6009 	/*
6010 	 * Enable CQ notifications and redrain the cq to catch any
6011 	 * completions we might have missed after the ibd_drain_scq()
6012 	 * above and before the ibt_enable_cq_notify() that follows.
6013 	 * Finally, service any new requests to poll the cq that
6014 	 * could've come in after the ibt_enable_cq_notify().
6015 	 */
6016 	do {
6017 		if (ibt_enable_cq_notify(cq_hdl, IBT_NEXT_COMPLETION) !=
6018 		    IBT_SUCCESS) {
6019 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6020 		}
6021 
6022 		ibd_drain_scq(state, cq_hdl);
6023 
6024 		mutex_enter(&state->id_scq_poll_lock);
6025 		if (state->id_scq_poll_busy & redo_flag)
6026 			state->id_scq_poll_busy &= ~redo_flag;
6027 		else {
6028 			state->id_scq_poll_busy &= ~flag;
6029 			redo = 0;
6030 		}
6031 		mutex_exit(&state->id_scq_poll_lock);
6032 
6033 	} while (redo);
6034 }
6035 
6036 /*
6037  * Common code for interrupt handling as well as for polling
6038  * for all completed wqe's while detaching.
6039  */
6040 static void
6041 ibd_poll_rcq(ibd_state_t *state, ibt_cq_hdl_t rcq)
6042 {
6043 	int flag, redo_flag;
6044 	int redo = 1;
6045 
6046 	flag = IBD_CQ_POLLING;
6047 	redo_flag = IBD_REDO_CQ_POLLING;
6048 
6049 	mutex_enter(&state->id_rcq_poll_lock);
6050 	if (state->id_rcq_poll_busy & flag) {
6051 		ibd_print_warn(state, "ibd_poll_rcq: multiple polling threads");
6052 		state->id_rcq_poll_busy |= redo_flag;
6053 		mutex_exit(&state->id_rcq_poll_lock);
6054 		return;
6055 	}
6056 	state->id_rcq_poll_busy |= flag;
6057 	mutex_exit(&state->id_rcq_poll_lock);
6058 
6059 	/*
6060 	 * Poll and drain the CQ
6061 	 */
6062 	ibd_drain_rcq(state, rcq);
6063 
6064 	/*
6065 	 * Enable CQ notifications and redrain the cq to catch any
6066 	 * completions we might have missed after the ibd_drain_cq()
6067 	 * above and before the ibt_enable_cq_notify() that follows.
6068 	 * Finally, service any new requests to poll the cq that
6069 	 * could've come in after the ibt_enable_cq_notify().
6070 	 */
6071 	do {
6072 		if (ibt_enable_cq_notify(rcq, IBT_NEXT_COMPLETION) !=
6073 		    IBT_SUCCESS) {
6074 			DPRINT(10, "ibd_intr: ibt_enable_cq_notify() failed");
6075 		}
6076 
6077 		ibd_drain_rcq(state, rcq);
6078 
6079 		mutex_enter(&state->id_rcq_poll_lock);
6080 		if (state->id_rcq_poll_busy & redo_flag)
6081 			state->id_rcq_poll_busy &= ~redo_flag;
6082 		else {
6083 			state->id_rcq_poll_busy &= ~flag;
6084 			redo = 0;
6085 		}
6086 		mutex_exit(&state->id_rcq_poll_lock);
6087 
6088 	} while (redo);
6089 }
6090 
6091 /*
6092  * Unmap the memory area associated with a given swqe.
6093  */
6094 static void
6095 ibd_unmap_mem(ibd_state_t *state, ibd_swqe_t *swqe)
6096 {
6097 	ibt_status_t stat;
6098 
6099 	DPRINT(20, "ibd_unmap_mem: wqe=%p, seg=%d\n", swqe, swqe->w_swr.wr_nds);
6100 
6101 	if (swqe->w_mi_hdl) {
6102 		if ((stat = ibt_unmap_mem_iov(state->id_hca_hdl,
6103 		    swqe->w_mi_hdl)) != IBT_SUCCESS) {
6104 			DPRINT(10,
6105 			    "failed in ibt_unmap_mem_iov, ret=%d\n", stat);
6106 		}
6107 		swqe->w_mi_hdl = NULL;
6108 	}
6109 	swqe->w_swr.wr_nds = 0;
6110 }
6111 
6112 static void
6113 ibd_dec_ref_ace(ibd_state_t *state, ibd_ace_t *ace)
6114 {
6115 	/*
6116 	 * The recycling logic can be eliminated from here
6117 	 * and put into the async thread if we create another
6118 	 * list to hold ACE's for unjoined mcg's.
6119 	 */
6120 	if (DEC_REF_DO_CYCLE(ace)) {
6121 		ibd_mce_t *mce;
6122 
6123 		/*
6124 		 * Check with the lock taken: we decremented
6125 		 * reference count without the lock, and some
6126 		 * transmitter might already have bumped the
6127 		 * reference count (possible in case of multicast
6128 		 * disable when we leave the AH on the active
6129 		 * list). If not still 0, get out, leaving the
6130 		 * recycle bit intact.
6131 		 *
6132 		 * Atomically transition the AH from active
6133 		 * to free list, and queue a work request to
6134 		 * leave the group and destroy the mce. No
6135 		 * transmitter can be looking at the AH or
6136 		 * the MCE in between, since we have the
6137 		 * ac_mutex lock. In the SendOnly reap case,
6138 		 * it is not necessary to hold the ac_mutex
6139 		 * and recheck the ref count (since the AH was
6140 		 * taken off the active list), we just do it
6141 		 * to have uniform processing with the Full
6142 		 * reap case.
6143 		 */
6144 		mutex_enter(&state->id_ac_mutex);
6145 		mce = ace->ac_mce;
6146 		if (GET_REF_CYCLE(ace) == 0) {
6147 			CLEAR_REFCYCLE(ace);
6148 			/*
6149 			 * Identify the case of fullmember reap as
6150 			 * opposed to mcg trap reap. Also, port up
6151 			 * might set ac_mce to NULL to indicate Tx
6152 			 * cleanup should do no more than put the
6153 			 * AH in the free list (see ibd_async_link).
6154 			 */
6155 			if (mce != NULL) {
6156 				ace->ac_mce = NULL;
6157 				IBD_ACACHE_PULLOUT_ACTIVE(state, ace);
6158 				/*
6159 				 * mc_req was initialized at mce
6160 				 * creation time.
6161 				 */
6162 				ibd_queue_work_slot(state,
6163 				    &mce->mc_req, IBD_ASYNC_REAP);
6164 			}
6165 			IBD_ACACHE_INSERT_FREE(state, ace);
6166 		}
6167 		mutex_exit(&state->id_ac_mutex);
6168 	}
6169 }
6170 
6171 /*
6172  * Common code that deals with clean ups after a successful or
6173  * erroneous transmission attempt.
6174  */
6175 static void
6176 ibd_tx_cleanup(ibd_state_t *state, ibd_swqe_t *swqe)
6177 {
6178 	ibd_ace_t *ace = swqe->w_ahandle;
6179 
6180 	DPRINT(20, "ibd_tx_cleanup %p\n", swqe);
6181 
6182 	/*
6183 	 * If this was a dynamic mapping in ibd_send(), we need to
6184 	 * unmap here. If this was an lso buffer we'd used for sending,
6185 	 * we need to release the lso buf to the pool, since the resource
6186 	 * is scarce. However, if this was simply a normal send using
6187 	 * the copybuf (present in each swqe), we don't need to release it.
6188 	 */
6189 	if (swqe->swqe_im_mblk != NULL) {
6190 		if (swqe->w_buftype == IBD_WQE_MAPPED) {
6191 			ibd_unmap_mem(state, swqe);
6192 		} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6193 			ibd_release_lsobufs(state,
6194 			    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6195 		}
6196 		ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6197 		freemsg(swqe->swqe_im_mblk);
6198 		swqe->swqe_im_mblk = NULL;
6199 	}
6200 
6201 	/*
6202 	 * Drop the reference count on the AH; it can be reused
6203 	 * now for a different destination if there are no more
6204 	 * posted sends that will use it. This can be eliminated
6205 	 * if we can always associate each Tx buffer with an AH.
6206 	 * The ace can be null if we are cleaning up from the
6207 	 * ibd_send() error path.
6208 	 */
6209 	if (ace != NULL) {
6210 		ibd_dec_ref_ace(state, ace);
6211 	}
6212 
6213 	/*
6214 	 * Release the send wqe for reuse.
6215 	 */
6216 	swqe->swqe_next = NULL;
6217 	ibd_release_swqe(state, swqe, swqe, 1);
6218 }
6219 
6220 static void
6221 ibd_tx_cleanup_list(ibd_state_t *state, ibd_swqe_t *head, ibd_swqe_t *tail)
6222 {
6223 	ibd_ace_t *ace;
6224 	ibd_swqe_t *swqe;
6225 	int n = 0;
6226 
6227 	DPRINT(20, "ibd_tx_cleanup_list %p %p\n", head, tail);
6228 
6229 	for (swqe = head; swqe != NULL; swqe = WQE_TO_SWQE(swqe->swqe_next)) {
6230 
6231 		/*
6232 		 * If this was a dynamic mapping in ibd_send(), we need to
6233 		 * unmap here. If this was an lso buffer we'd used for sending,
6234 		 * we need to release the lso buf to the pool, since the
6235 		 * resource is scarce. However, if this was simply a normal
6236 		 * send using the copybuf (present in each swqe), we don't need
6237 		 * to release it.
6238 		 */
6239 		if (swqe->swqe_im_mblk != NULL) {
6240 			if (swqe->w_buftype == IBD_WQE_MAPPED) {
6241 				ibd_unmap_mem(state, swqe);
6242 			} else if (swqe->w_buftype == IBD_WQE_LSOBUF) {
6243 				ibd_release_lsobufs(state,
6244 				    swqe->w_swr.wr_sgl, swqe->w_swr.wr_nds);
6245 			}
6246 			ibd_free_lsohdr(swqe, swqe->swqe_im_mblk);
6247 			freemsg(swqe->swqe_im_mblk);
6248 			swqe->swqe_im_mblk = NULL;
6249 		}
6250 
6251 		/*
6252 		 * Drop the reference count on the AH; it can be reused
6253 		 * now for a different destination if there are no more
6254 		 * posted sends that will use it. This can be eliminated
6255 		 * if we can always associate each Tx buffer with an AH.
6256 		 * The ace can be null if we are cleaning up from the
6257 		 * ibd_send() error path.
6258 		 */
6259 		ace = swqe->w_ahandle;
6260 		if (ace != NULL) {
6261 			ibd_dec_ref_ace(state, ace);
6262 		}
6263 		n++;
6264 	}
6265 
6266 	/*
6267 	 * Release the send wqes for reuse.
6268 	 */
6269 	ibd_release_swqe(state, head, tail, n);
6270 }
6271 
6272 /*
6273  * Processing to be done after receipt of a packet; hand off to GLD
6274  * in the format expected by GLD.  The received packet has this
6275  * format: 2b sap :: 00 :: data.
6276  */
6277 static mblk_t *
6278 ibd_process_rx(ibd_state_t *state, ibd_rwqe_t *rwqe, ibt_wc_t *wc)
6279 {
6280 	ib_header_info_t *phdr;
6281 	mblk_t *mp;
6282 	ipoib_hdr_t *ipibp;
6283 	ipha_t *iphap;
6284 	ip6_t *ip6h;
6285 	int len;
6286 	ib_msglen_t pkt_len = wc->wc_bytes_xfer;
6287 	uint32_t bufs;
6288 
6289 	atomic_add_32(&state->id_rx_list.dl_cnt, -1);
6290 
6291 	/*
6292 	 * Track number handed to upper layer, and number still
6293 	 * available to receive packets.
6294 	 */
6295 	bufs = atomic_add_32_nv(&state->id_rx_list.dl_bufs_outstanding, 1);
6296 
6297 	/* Never run out of rwqes, use allocb when running low */
6298 	if (bufs >= state->id_rx_bufs_outstanding_limit) {
6299 		atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6300 		atomic_inc_32(&state->id_rx_allocb);
6301 		mp = allocb(pkt_len, BPRI_HI);
6302 		if (mp) {
6303 			bcopy(rwqe->rwqe_im_mblk->b_rptr, mp->b_rptr, pkt_len);
6304 			ibd_post_recv(state, rwqe);
6305 		} else {	/* no memory */
6306 			atomic_inc_32(&state->id_rx_allocb_failed);
6307 			ibd_post_recv(state, rwqe);
6308 			return (NULL);
6309 		}
6310 	} else {
6311 		mp = rwqe->rwqe_im_mblk;
6312 	}
6313 
6314 
6315 	/*
6316 	 * Adjust write pointer depending on how much data came in.
6317 	 */
6318 	mp->b_wptr = mp->b_rptr + pkt_len;
6319 
6320 	/*
6321 	 * Make sure this is NULL or we're in trouble.
6322 	 */
6323 	if (mp->b_next != NULL) {
6324 		ibd_print_warn(state,
6325 		    "ibd_process_rx: got duplicate mp from rcq?");
6326 		mp->b_next = NULL;
6327 	}
6328 
6329 	/*
6330 	 * the IB link will deliver one of the IB link layer
6331 	 * headers called, the Global Routing Header (GRH).
6332 	 * ibd driver uses the information in GRH to build the
6333 	 * Header_info structure and pass it with the datagram up
6334 	 * to GLDv3.
6335 	 * If the GRH is not valid, indicate to GLDv3 by setting
6336 	 * the VerTcFlow field to 0.
6337 	 */
6338 	phdr = (ib_header_info_t *)mp->b_rptr;
6339 	if (wc->wc_flags & IBT_WC_GRH_PRESENT) {
6340 		phdr->ib_grh.ipoib_sqpn = htonl(wc->wc_qpn);
6341 
6342 		/* if it is loop back packet, just drop it. */
6343 		if (bcmp(&phdr->ib_grh.ipoib_sqpn, &state->id_macaddr,
6344 		    IPOIB_ADDRL) == 0) {
6345 			freemsg(mp);
6346 			return (NULL);
6347 		}
6348 
6349 		ovbcopy(&phdr->ib_grh.ipoib_sqpn, &phdr->ib_src,
6350 		    sizeof (ipoib_mac_t));
6351 		if (*(uint8_t *)(phdr->ib_grh.ipoib_dgid_pref) == 0xFF) {
6352 			phdr->ib_dst.ipoib_qpn = htonl(IB_MC_QPN);
6353 			IBD_CLEAR_SCOPE_PKEY(&phdr->ib_dst);
6354 		} else {
6355 			phdr->ib_dst.ipoib_qpn = state->id_macaddr.ipoib_qpn;
6356 		}
6357 	} else {
6358 		/*
6359 		 * It can not be a IBA multicast packet. Must have been
6360 		 * unicast for us. Just copy the interface address to dst.
6361 		 */
6362 		phdr->ib_grh.ipoib_vertcflow = 0;
6363 		ovbcopy(&state->id_macaddr, &phdr->ib_dst,
6364 		    sizeof (ipoib_mac_t));
6365 	}
6366 
6367 	/*
6368 	 * For ND6 packets, padding is at the front of the source/target
6369 	 * lladdr. However the inet6 layer is not aware of it, hence remove
6370 	 * the padding from such packets.
6371 	 */
6372 	ipibp = (ipoib_hdr_t *)((uchar_t *)mp->b_rptr + sizeof (ipoib_pgrh_t));
6373 	if (ntohs(ipibp->ipoib_type) == ETHERTYPE_IPV6) {
6374 		ip6h = (ip6_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6375 		len = ntohs(ip6h->ip6_plen);
6376 		if (ip6h->ip6_nxt == IPPROTO_ICMPV6) {
6377 			/* LINTED: E_CONSTANT_CONDITION */
6378 			IBD_PAD_NSNA(ip6h, len, IBD_RECV);
6379 		}
6380 	}
6381 
6382 	/*
6383 	 * Update statistics
6384 	 */
6385 	atomic_add_64(&state->id_rcv_bytes, pkt_len);
6386 	atomic_inc_64(&state->id_rcv_pkt);
6387 	if (bcmp(&phdr->ib_dst, &state->id_bcaddr, IPOIB_ADDRL) == 0)
6388 		atomic_inc_64(&state->id_brd_rcv);
6389 	else if ((ntohl(phdr->ib_dst.ipoib_qpn) & IB_QPN_MASK) == IB_MC_QPN)
6390 		atomic_inc_64(&state->id_multi_rcv);
6391 
6392 	iphap = (ipha_t *)((uchar_t *)ipibp + sizeof (ipoib_hdr_t));
6393 	/*
6394 	 * Set receive checksum status in mp
6395 	 * Hardware checksumming can be considered valid only if:
6396 	 * 1. CQE.IP_OK bit is set
6397 	 * 2. CQE.CKSUM = 0xffff
6398 	 * 3. IPv6 routing header is not present in the packet
6399 	 * 4. If there are no IP_OPTIONS in the IP HEADER
6400 	 */
6401 
6402 	if (((wc->wc_flags & IBT_WC_CKSUM_OK) == IBT_WC_CKSUM_OK) &&
6403 	    (wc->wc_cksum == 0xFFFF) &&
6404 	    (iphap->ipha_version_and_hdr_length == IP_SIMPLE_HDR_VERSION)) {
6405 		(void) hcksum_assoc(mp, NULL, NULL, 0, 0, 0, 0,
6406 		    HCK_FULLCKSUM | HCK_FULLCKSUM_OK, 0);
6407 	}
6408 
6409 	return (mp);
6410 }
6411 
6412 /*
6413  * Callback code invoked from STREAMs when the receive data buffer is
6414  * free for recycling.
6415  */
6416 static void
6417 ibd_freemsg_cb(char *arg)
6418 {
6419 	ibd_rwqe_t *rwqe = (ibd_rwqe_t *)arg;
6420 	ibd_state_t *state = rwqe->w_state;
6421 
6422 	/*
6423 	 * If the wqe is being destructed, do not attempt recycling.
6424 	 */
6425 	if (rwqe->w_freeing_wqe == B_TRUE) {
6426 		DPRINT(6, "ibd_freemsg: wqe being freed");
6427 		return;
6428 	}
6429 
6430 	rwqe->rwqe_im_mblk = desballoc(rwqe->rwqe_copybuf.ic_bufaddr,
6431 	    state->id_mtu + IPOIB_GRH_SIZE, 0, &rwqe->w_freemsg_cb);
6432 	if (rwqe->rwqe_im_mblk == NULL) {
6433 		ibd_free_rwqe(state, rwqe);
6434 		DPRINT(6, "ibd_freemsg: desballoc failed");
6435 		return;
6436 	}
6437 
6438 	ibd_post_recv(state, rwqe);
6439 
6440 	atomic_add_32(&state->id_rx_list.dl_bufs_outstanding, -1);
6441 }
6442 
6443 static uint_t
6444 ibd_tx_recycle(caddr_t arg)
6445 {
6446 	ibd_state_t *state = (ibd_state_t *)arg;
6447 
6448 	/*
6449 	 * Poll for completed entries
6450 	 */
6451 	ibd_poll_scq(state, state->id_scq_hdl);
6452 
6453 	return (DDI_INTR_CLAIMED);
6454 }
6455 
6456 #ifdef IBD_LOGGING
6457 static void
6458 ibd_log_init(void)
6459 {
6460 	ibd_lbuf = kmem_zalloc(IBD_LOG_SZ, KM_SLEEP);
6461 	ibd_lbuf_ndx = 0;
6462 
6463 	mutex_init(&ibd_lbuf_lock, NULL, MUTEX_DRIVER, NULL);
6464 }
6465 
6466 static void
6467 ibd_log_fini(void)
6468 {
6469 	if (ibd_lbuf)
6470 		kmem_free(ibd_lbuf, IBD_LOG_SZ);
6471 	ibd_lbuf_ndx = 0;
6472 	ibd_lbuf = NULL;
6473 
6474 	mutex_destroy(&ibd_lbuf_lock);
6475 }
6476 
6477 static void
6478 ibd_log(const char *fmt, ...)
6479 {
6480 	va_list	ap;
6481 	uint32_t off;
6482 	uint32_t msglen;
6483 	char tmpbuf[IBD_DMAX_LINE];
6484 
6485 	if (ibd_lbuf == NULL)
6486 		return;
6487 
6488 	va_start(ap, fmt);
6489 	msglen = vsnprintf(tmpbuf, IBD_DMAX_LINE, fmt, ap);
6490 	va_end(ap);
6491 
6492 	if (msglen >= IBD_DMAX_LINE)
6493 		msglen = IBD_DMAX_LINE - 1;
6494 
6495 	mutex_enter(&ibd_lbuf_lock);
6496 
6497 	off = ibd_lbuf_ndx;		/* current msg should go here */
6498 	if ((ibd_lbuf_ndx) && (ibd_lbuf[ibd_lbuf_ndx-1] != '\n'))
6499 		ibd_lbuf[ibd_lbuf_ndx-1] = '\n';
6500 
6501 	ibd_lbuf_ndx += msglen;		/* place where next msg should start */
6502 	ibd_lbuf[ibd_lbuf_ndx] = 0;	/* current msg should terminate */
6503 
6504 	if (ibd_lbuf_ndx >= (IBD_LOG_SZ - 2 * IBD_DMAX_LINE))
6505 		ibd_lbuf_ndx = 0;
6506 
6507 	mutex_exit(&ibd_lbuf_lock);
6508 
6509 	bcopy(tmpbuf, ibd_lbuf+off, msglen);	/* no lock needed for this */
6510 }
6511 #endif
6512