1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #include <sys/types.h>
27 #include <sys/ddi.h>
29 #include <sys/socket.h>
30 #include <netinet/in.h>
31 #include <sys/sunddi.h>
32 #include <sys/sysmacros.h>
33 #include <sys/iscsi_protocol.h>
34 
35 #include <sys/ib/clients/iser/iser.h>
36 #include <sys/ib/clients/iser/iser_idm.h>
37 
38 /*
39  * iser_ib.c
40  * Routines for InfiniBand transport for iSER
41  *
42  * This file contains the routines to interface with the IBT API to attach and
43  * allocate IB resources, handle async events, and post recv work requests.
44  *
45  */
46 
47 static iser_hca_t *iser_ib_gid2hca(ib_gid_t gid);
48 static iser_hca_t *iser_ib_guid2hca(ib_guid_t guid);
49 
50 static iser_hca_t *iser_ib_alloc_hca(ib_guid_t guid);
51 static int iser_ib_free_hca(iser_hca_t *hca);
52 static int iser_ib_update_hcaports(iser_hca_t *hca);
53 static int iser_ib_init_hcas(void);
54 static int iser_ib_fini_hcas(void);
55 
56 static iser_sbind_t *iser_ib_get_bind(
57     iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid);
58 static int iser_ib_activate_port(
59     idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid);
60 static void iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid);
61 
62 static void iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size);
63 static void iser_ib_fini_qp(iser_qp_t *qp);
64 
65 static int iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size,
66     ibt_cq_hdl_t *cq_hdl);
67 
68 static void iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
69     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
70     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs);
71 
72 static void iser_ib_handle_portup_event(ibt_hca_hdl_t hdl,
73     ibt_async_event_t *event);
74 static void iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl,
75     ibt_async_event_t *event);
76 static void iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl,
77     ibt_async_event_t *event);
78 
79 static void iser_ib_post_recv_task(void *arg);
80 
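/*
 * IBT client module information. This is passed to ibt_attach() in
 * iser_ib_init() and names iser_ib_async_handler() as our asynchronous
 * event handler.
 */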
81 static struct ibt_clnt_modinfo_s iser_ib_modinfo = {
82 	IBTI_V_CURR,
83 	IBT_STORAGE_DEV,
84 	iser_ib_async_handler,
85 	NULL,
86 	"iSER"
87 };
88 
89 /*
90  * iser_ib_init
91  *
 * This function attaches iSER to IBTF as a client, creates the global work
 * request cache and builds the list of available HCAs. Registering and
 * binding the iSER service with IBTF is deferred until the target requires
 * it.
94  */
95 int
96 iser_ib_init(void)
97 {
98 	int		status;
99 
100 	/* Register with IBTF */
101 	status = ibt_attach(&iser_ib_modinfo, iser_state->is_dip, iser_state,
102 	    &iser_state->is_ibhdl);
103 	if (status != DDI_SUCCESS) {
104 		ISER_LOG(CE_NOTE, "iser_ib_init: ibt_attach failed (0x%x)",
105 		    status);
106 		return (DDI_FAILURE);
107 	}
108 
109 	/* Create the global work request kmem_cache */
110 	iser_state->iser_wr_cache = kmem_cache_create("iser_wr_cache",
111 	    sizeof (iser_wr_t), 0, NULL, NULL, NULL,
112 	    iser_state, NULL, KM_SLEEP);
113 
114 	/* Populate our list of HCAs */
115 	status = iser_ib_init_hcas();
116 	if (status != DDI_SUCCESS) {
117 		/* HCAs failed to initialize, tear it down */
118 		kmem_cache_destroy(iser_state->iser_wr_cache);
119 		(void) ibt_detach(iser_state->is_ibhdl);
120 		iser_state->is_ibhdl = NULL;
121 		ISER_LOG(CE_NOTE, "iser_ib_init: failed to initialize HCAs");
122 		return (DDI_FAILURE);
123 	}
124 
125 	/* Target will register iSER as a service with IBTF when required */
126 
127 	/* Target will bind this service when it comes online */
128 
129 	return (DDI_SUCCESS);
130 }
131 
132 /*
133  * iser_ib_fini
134  *
 * This function unbinds and deregisters the iSER service from IBTF.
136  */
137 int
138 iser_ib_fini(void)
139 {
140 	/* IDM would have already disabled all the services */
141 
142 	/* Teardown the HCA list and associated resources */
143 	if (iser_ib_fini_hcas() != DDI_SUCCESS)
144 		return (DDI_FAILURE);
145 
146 	/* Teardown the global work request kmem_cache */
147 	kmem_cache_destroy(iser_state->iser_wr_cache);
148 
149 	/* Deregister with IBTF */
150 	if (iser_state->is_ibhdl != NULL) {
151 		(void) ibt_detach(iser_state->is_ibhdl);
152 		iser_state->is_ibhdl = NULL;
153 	}
154 
155 	return (DDI_SUCCESS);
156 }
157 
158 /*
159  * iser_ib_register_service
160  *
161  * This function registers the iSER service using the RDMA-Aware Service ID.
162  */
163 int
164 iser_ib_register_service(idm_svc_t *idm_svc)
165 {
166 	ibt_srv_desc_t	srvdesc;
167 	iser_svc_t	*iser_svc;
168 	int		status;
169 
170 	bzero(&srvdesc, sizeof (ibt_srv_desc_t));
171 
172 	/* Set up IBTI client callback handler from the CM */
173 	srvdesc.sd_handler = iser_ib_cm_handler;
174 
175 	srvdesc.sd_flags = IBT_SRV_NO_FLAGS;
176 
177 	iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
178 
	/* Register the iSER service with IBTF */
180 	status = ibt_register_service(
181 	    iser_state->is_ibhdl, &srvdesc,
182 	    iser_svc->is_svcid, 1, &iser_svc->is_srvhdl, NULL);
183 
184 	return (status);
185 }
186 
187 /*
188  * iser_ib_bind_service
189  *
 * This function binds a given iSER service on all available HCA ports. The
 * current specification does not allow the user to specify transport
 * bindings for each iSCSI target, so the ULP invokes this function to bind
 * the target to all available iSER ports after checking for the presence of
 * an IB HCA. iSER is "configured" whenever an IB-capable IP address exists.
 * The lack of active IB ports is a less fatal condition; sockets will be
 * used as the transport even though an InfiniBand HCA is configured but
 * currently unusable.
197  *
198  */
199 int
200 iser_ib_bind_service(idm_svc_t *idm_svc)
201 {
202 	iser_hca_t	*hca;
203 	ib_gid_t	gid;
204 	int		num_ports = 0;
205 	int		num_binds = 0;
206 	int		num_inactive_binds = 0; /* if HCA ports inactive */
207 	int		status;
208 	int		i;
209 
210 	ASSERT(idm_svc != NULL);
211 	ASSERT(idm_svc->is_iser_svc != NULL);
212 
213 	/* Register the iSER service on all available ports */
214 	mutex_enter(&iser_state->is_hcalist_lock);
215 
216 	for (hca = list_head(&iser_state->is_hcalist);
217 	    hca != NULL;
218 	    hca = list_next(&iser_state->is_hcalist, hca)) {
219 
220 		for (i = 0; i < hca->hca_num_ports; i++) {
221 			num_ports++;
222 			if (hca->hca_port_info[i].p_linkstate !=
223 			    IBT_PORT_ACTIVE) {
224 				/*
225 				 * Move on. We will attempt to bind service
226 				 * in our async handler if the port comes up
227 				 * at a later time.
228 				 */
229 				num_inactive_binds++;
230 				continue;
231 			}
232 
233 			gid = hca->hca_port_info[i].p_sgid_tbl[0];
234 
235 			/* If the port is already bound, skip */
236 			if (iser_ib_get_bind(
237 			    idm_svc->is_iser_svc, hca->hca_guid, gid) == NULL) {
238 
239 				status = iser_ib_activate_port(
240 				    idm_svc, hca->hca_guid, gid);
241 				if (status != IBT_SUCCESS) {
242 					ISER_LOG(CE_NOTE,
243 					    "iser_ib_bind_service: "
244 					    "iser_ib_activate_port failure "
245 					    "(0x%x)", status);
246 					continue;
247 				}
248 			}
249 			num_binds++;
250 		}
251 	}
252 	mutex_exit(&iser_state->is_hcalist_lock);
253 
254 	if (num_binds) {
255 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Service available on "
256 		    "(%d) of (%d) ports", num_binds, num_ports);
257 		return (ISER_STATUS_SUCCESS);
258 	} else if (num_inactive_binds) {
259 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Could not bind "
260 		    "service, HCA ports are not active.");
261 		/*
262 		 * still considered success, the async handler will bind
263 		 * the service when the port comes up at a later time
264 		 */
265 		return (ISER_STATUS_SUCCESS);
266 	} else {
267 		ISER_LOG(CE_NOTE, "iser_ib_bind_service: Did not bind service");
268 		return (ISER_STATUS_FAIL);
269 	}
270 }
271 
272 /*
273  * iser_ib_unbind_service
274  *
 * This function unbinds a given iSER service on all HCA ports.
276  */
277 void
278 iser_ib_unbind_service(idm_svc_t *idm_svc)
279 {
280 	iser_svc_t	*iser_svc;
281 	iser_sbind_t	*is_sbind, *next_sb;
282 
283 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
284 
285 		iser_svc = idm_svc->is_iser_svc;
286 
287 		for (is_sbind = list_head(&iser_svc->is_sbindlist);
288 		    is_sbind != NULL;
289 		    is_sbind = next_sb) {
290 			next_sb = list_next(&iser_svc->is_sbindlist, is_sbind);
291 			ibt_unbind_service(iser_svc->is_srvhdl,
292 			    is_sbind->is_sbindhdl);
293 			list_remove(&iser_svc->is_sbindlist, is_sbind);
294 			kmem_free(is_sbind, sizeof (iser_sbind_t));
295 		}
296 	}
297 }
298 
299 /* ARGSUSED */
300 void
301 iser_ib_deregister_service(idm_svc_t *idm_svc)
302 {
303 	iser_svc_t	*iser_svc;
304 
305 	if (idm_svc != NULL && idm_svc->is_iser_svc != NULL) {
306 
307 		iser_svc = (iser_svc_t *)idm_svc->is_iser_svc;
308 		ibt_deregister_service(iser_state->is_ibhdl,
309 		    iser_svc->is_srvhdl);
310 		ibt_release_ip_sid(iser_svc->is_svcid);
311 	}
312 }
313 
314 /*
315  * iser_ib_get_paths
316  * This function finds the IB path between the local and the remote address.
317  *
318  */
319 int
320 iser_ib_get_paths(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip,
321     ibt_path_info_t *path, ibt_path_ip_src_t *path_src_ip)
322 {
323 	ibt_ip_path_attr_t	ipattr;
324 	int			status;
325 
326 	(void) bzero(&ipattr, sizeof (ibt_ip_path_attr_t));
327 	ipattr.ipa_dst_ip	= remote_ip;
328 	ipattr.ipa_src_ip	= *local_ip;
329 	ipattr.ipa_max_paths	= 1;
330 	ipattr.ipa_ndst		= 1;
331 
332 	(void) bzero(path, sizeof (ibt_path_info_t));
333 	status = ibt_get_ip_paths(iser_state->is_ibhdl, IBT_PATH_NO_FLAGS,
334 	    &ipattr, path, NULL, path_src_ip);
335 	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_get_paths: ibt_get_ip_paths "
		    "failure: status (%d)", status);
338 		return (status);
339 	}
340 
341 	if (local_ip != NULL) {
342 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: IP[%x to %x]",
343 		    local_ip->un.ip4addr, remote_ip->un.ip4addr);
344 	} else {
345 		ISER_LOG(CE_NOTE, "iser_ib_get_paths success: "
346 		    "IP[INADDR_ANY to %x]", remote_ip->un.ip4addr);
347 	}
348 
349 	return (ISER_STATUS_SUCCESS);
350 }
351 
352 /*
353  * iser_ib_alloc_rc_channel
354  *
355  * This function allocates a reliable communication channel using the specified
356  * channel attributes.
357  */
358 iser_chan_t *
359 iser_ib_alloc_rc_channel(ibt_ip_addr_t *local_ip, ibt_ip_addr_t *remote_ip)
360 {
361 
362 	iser_chan_t			*chan;
363 	ib_gid_t			lgid;
364 	uint8_t				hca_port; /* from path */
365 	iser_hca_t			*hca;
366 	ibt_path_ip_src_t		path_src_ip;
367 	ibt_rc_chan_alloc_args_t	chanargs;
368 	uint_t				sq_size, rq_size;
369 	int				status;
370 
371 	chan = kmem_zalloc(sizeof (iser_chan_t), KM_SLEEP);
372 
373 	mutex_init(&chan->ic_lock, NULL, MUTEX_DRIVER, NULL);
374 	mutex_init(&chan->ic_sq_post_lock, NULL, MUTEX_DRIVER, NULL);
375 
376 	/* Lookup a path to the given destination */
377 	status = iser_ib_get_paths(local_ip, remote_ip, &chan->ic_ibt_path,
378 	    &path_src_ip);
379 
380 	if (status != ISER_STATUS_SUCCESS) {
381 		ISER_LOG(CE_NOTE, "iser_ib_get_paths failed: status (%d)",
382 		    status);
383 		mutex_destroy(&chan->ic_lock);
384 		mutex_destroy(&chan->ic_sq_post_lock);
385 		kmem_free(chan, sizeof (iser_chan_t));
386 		return (NULL);
387 	}
388 
389 	/* get the local gid from the path info */
390 	lgid = chan->ic_ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
391 
392 	/* get the hca port from the path info */
393 	hca_port = chan->ic_ibt_path.pi_prim_cep_path.cep_hca_port_num;
394 
395 	/* Lookup the hca using the gid in the path info */
396 	hca = iser_ib_gid2hca(lgid);
397 	if (hca == NULL) {
398 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
399 		    "to lookup HCA handle");
400 		mutex_destroy(&chan->ic_lock);
401 		mutex_destroy(&chan->ic_sq_post_lock);
402 		kmem_free(chan, sizeof (iser_chan_t));
403 		return (NULL);
404 	}
405 
406 	/* Set up the iSER channel handle with HCA and IP data */
407 	chan->ic_hca		= hca;
408 	chan->ic_localip	= path_src_ip.ip_primary;
409 	chan->ic_remoteip	= *remote_ip;
410 
411 	/*
412 	 * Determine the queue sizes, based upon the HCA query data.
413 	 * For our Work Queues, we will use either our default value,
414 	 * or the HCA's maximum value, whichever is smaller.
415 	 */
416 	sq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_SENDQ_SIZE);
417 	rq_size = min(hca->hca_attr.hca_max_chan_sz, ISER_IB_RECVQ_SIZE);
418 
419 	/*
420 	 * For our Completion Queues, we again check the device maximum.
421 	 * We want to end up with CQs that are the next size up from the
422 	 * WQs they are servicing so that they have some overhead.
423 	 */
424 	if (hca->hca_attr.hca_max_cq_sz >= (sq_size + 1)) {
425 		chan->ic_sendcq_sz = sq_size + 1;
426 	} else {
427 		chan->ic_sendcq_sz = hca->hca_attr.hca_max_cq_sz;
428 		sq_size = chan->ic_sendcq_sz - 1;
429 	}
430 
431 	if (hca->hca_attr.hca_max_cq_sz >= (rq_size + 1)) {
432 		chan->ic_recvcq_sz = rq_size + 1;
433 	} else {
434 		chan->ic_recvcq_sz = hca->hca_attr.hca_max_cq_sz;
435 		rq_size = chan->ic_recvcq_sz - 1;
436 	}
437 
438 	/* Initialize the iSER channel's QP handle */
439 	iser_ib_init_qp(chan, sq_size, rq_size);
440 
441 	/* Set up the Send Completion Queue */
442 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_sendcq_sz,
443 	    &chan->ic_sendcq);
444 	if (status != ISER_STATUS_SUCCESS) {
445 		iser_ib_fini_qp(&chan->ic_qp);
446 		mutex_destroy(&chan->ic_lock);
447 		mutex_destroy(&chan->ic_sq_post_lock);
448 		kmem_free(chan, sizeof (iser_chan_t));
449 		return (NULL);
450 	}
451 	ibt_set_cq_handler(chan->ic_sendcq, iser_ib_sendcq_handler, chan);
452 	ibt_enable_cq_notify(chan->ic_sendcq, IBT_NEXT_COMPLETION);
453 
454 	/* Set up the Receive Completion Queue */
455 	status = iser_ib_setup_cq(hca->hca_hdl, chan->ic_recvcq_sz,
456 	    &chan->ic_recvcq);
457 	if (status != ISER_STATUS_SUCCESS) {
458 		(void) ibt_free_cq(chan->ic_sendcq);
459 		iser_ib_fini_qp(&chan->ic_qp);
460 		mutex_destroy(&chan->ic_lock);
461 		mutex_destroy(&chan->ic_sq_post_lock);
462 		kmem_free(chan, sizeof (iser_chan_t));
463 		return (NULL);
464 	}
465 	ibt_set_cq_handler(chan->ic_recvcq, iser_ib_recvcq_handler, chan);
466 	ibt_enable_cq_notify(chan->ic_recvcq, IBT_NEXT_COMPLETION);
467 
468 	/* Setup the channel arguments */
469 	iser_ib_setup_chanargs(hca_port, chan->ic_sendcq, chan->ic_recvcq,
470 	    sq_size, rq_size, hca->hca_pdhdl, &chanargs);
471 
472 	status = ibt_alloc_rc_channel(hca->hca_hdl,
473 	    IBT_ACHAN_NO_FLAGS, &chanargs, &chan->ic_chanhdl, NULL);
474 	if (status != IBT_SUCCESS) {
475 		ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel: failed "
476 		    "ibt_alloc_rc_channel: status (%d)", status);
477 		(void) ibt_free_cq(chan->ic_sendcq);
478 		(void) ibt_free_cq(chan->ic_recvcq);
479 		iser_ib_fini_qp(&chan->ic_qp);
480 		mutex_destroy(&chan->ic_lock);
481 		mutex_destroy(&chan->ic_sq_post_lock);
482 		kmem_free(chan, sizeof (iser_chan_t));
483 		return (NULL);
484 	}
485 
486 	/* Set the 'channel' as the client private data */
487 	(void) ibt_set_chan_private(chan->ic_chanhdl, chan);
488 
489 	ISER_LOG(CE_NOTE, "iser_ib_alloc_rc_channel success: "
490 	    "chanhdl (0x%p), IP:[%llx to %llx], lgid (%llx:%llx), HCA(%llx) %d",
491 	    (void *)chan->ic_chanhdl,
492 	    (longlong_t)local_ip->un.ip4addr,
493 	    (longlong_t)remote_ip->un.ip4addr,
494 	    (longlong_t)lgid.gid_prefix, (longlong_t)lgid.gid_guid,
495 	    (longlong_t)hca->hca_guid, hca_port);
496 
497 	return (chan);
498 }
499 
500 /*
501  * iser_ib_open_rc_channel
502  * This function opens a RC connection on the given allocated RC channel
503  */
504 int
505 iser_ib_open_rc_channel(iser_chan_t *chan)
506 {
507 	ibt_ip_cm_info_t	ipcm_info;
508 	iser_private_data_t	iser_priv_data;
509 	ibt_chan_open_args_t	ocargs;
510 	ibt_rc_returns_t	ocreturns;
511 	int			status;
512 
513 	mutex_enter(&chan->ic_lock);
514 
515 	/*
516 	 * For connection establishment, the initiator sends a CM REQ using the
517 	 * iSER RDMA-Aware Service ID. Included are the source and destination
518 	 * IP addresses, and the src port.
519 	 */
520 	bzero(&ipcm_info, sizeof (ibt_ip_cm_info_t));
521 	ipcm_info.src_addr = chan->ic_localip;
522 	ipcm_info.dst_addr = chan->ic_remoteip;
523 	ipcm_info.src_port = chan->ic_lport;
524 
	/*
	 * The CM private data field carries the iSER connection parameters,
	 * such as the Zero Based Virtual Address Exception (ZBVAE) and Send
	 * With Invalidate Exception (SIE) bits.
	 *
	 * Solaris IBT does not currently support zero based virtual
	 * addressing or send with invalidate, so both exception bits are set.
	 */
532 	iser_priv_data.rsvd1	= 0;
533 	iser_priv_data.sie	= 1;
534 	iser_priv_data.zbvae	= 1;
535 
536 	status = ibt_format_ip_private_data(&ipcm_info,
537 	    sizeof (iser_private_data_t), &iser_priv_data);
538 	if (status != IBT_SUCCESS) {
539 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
540 		mutex_exit(&chan->ic_lock);
541 		return (status);
542 	}
543 
544 	/*
545 	 * Set the SID we are attempting to connect to, based upon the
546 	 * remote port number.
547 	 */
548 	chan->ic_ibt_path.pi_sid = ibt_get_ip_sid(IPPROTO_TCP, chan->ic_rport);
549 
550 	/* Set up the args for the channel open */
551 	bzero(&ocargs, sizeof (ibt_chan_open_args_t));
552 	ocargs.oc_path			= &chan->ic_ibt_path;
553 	ocargs.oc_cm_handler		= iser_ib_cm_handler;
554 	ocargs.oc_cm_clnt_private	= iser_state;
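	/*
	 * Allow up to 4 outstanding RDMA Read/Atomic operations in each
	 * direction, and use a retry count of 2 for path and RNR NAK
	 * errors.
	 */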
555 	ocargs.oc_rdma_ra_out		= 4;
556 	ocargs.oc_rdma_ra_in		= 4;
557 	ocargs.oc_path_retry_cnt	= 2;
558 	ocargs.oc_path_rnr_retry_cnt	= 2;
559 	ocargs.oc_priv_data_len		= sizeof (iser_private_data_t);
560 	ocargs.oc_priv_data		= &iser_priv_data;
561 
562 	bzero(&ocreturns, sizeof (ibt_rc_returns_t));
563 
564 	status = ibt_open_rc_channel(chan->ic_chanhdl,
565 	    IBT_OCHAN_NO_FLAGS, IBT_BLOCKING, &ocargs, &ocreturns);
566 
567 	if (status != IBT_SUCCESS) {
568 		ISER_LOG(CE_NOTE, "iser_ib_open_rc_channel failed: %d", status);
569 		mutex_exit(&chan->ic_lock);
570 		return (status);
571 	}
572 
573 	mutex_exit(&chan->ic_lock);
574 	return (IDM_STATUS_SUCCESS);
575 }
576 
577 /*
578  * iser_ib_close_rc_channel
579  * This function closes the RC channel related to this iser_chan handle.
 * We invoke this in a blocking context, with no callbacks registered.
581  */
582 void
583 iser_ib_close_rc_channel(iser_chan_t *chan)
584 {
585 	int			status;
586 
587 	mutex_enter(&chan->ic_lock);
588 	status = ibt_close_rc_channel(chan->ic_chanhdl, IBT_BLOCKING, NULL,
589 	    0, NULL, NULL, 0);
590 	if (status != IBT_SUCCESS) {
591 		ISER_LOG(CE_NOTE, "iser_ib_close_rc_channel: "
592 		    "ibt_close_rc_channel failed: status (%d)", status);
593 	}
594 	mutex_exit(&chan->ic_lock);
595 }
596 
597 /*
598  * iser_ib_free_rc_channel
599  *
600  * This function tears down an RC channel's QP initialization and frees it.
601  * Note that we do not need synchronization here; the channel has been
 * closed already, so we should only have completion polling occurring.  Once
603  * complete, we are free to free the IBTF channel, WQ and CQ resources, and
604  * our own related resources.
605  */
606 void
607 iser_ib_free_rc_channel(iser_chan_t *chan)
608 {
609 	iser_qp_t	*iser_qp;
610 
611 	iser_qp = &chan->ic_qp;
612 
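	/*
	 * The caller holds chan->ic_conn->ic_lock; it is dropped and
	 * re-acquired around the delays below so that completion handling
	 * can drain the queues while we wait.
	 */
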
613 	/* Ensure the SQ is empty */
614 	while (chan->ic_sq_post_count != 0) {
615 		mutex_exit(&chan->ic_conn->ic_lock);
616 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
617 		mutex_enter(&chan->ic_conn->ic_lock);
618 	}
619 	mutex_destroy(&chan->ic_sq_post_lock);
620 
621 	/* Ensure the RQ is empty */
622 	(void) ibt_flush_channel(chan->ic_chanhdl);
623 	mutex_enter(&iser_qp->qp_lock);
624 	while (iser_qp->rq_level != 0) {
625 		mutex_exit(&iser_qp->qp_lock);
626 		mutex_exit(&chan->ic_conn->ic_lock);
627 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
628 		mutex_enter(&chan->ic_conn->ic_lock);
629 		mutex_enter(&iser_qp->qp_lock);
630 	}
631 
632 	/* Free our QP handle */
633 	mutex_exit(&iser_qp->qp_lock);
634 	(void) iser_ib_fini_qp(iser_qp);
635 
636 	/* Free the IBT channel resources */
637 	(void) ibt_free_channel(chan->ic_chanhdl);
638 	chan->ic_chanhdl = NULL;
639 
640 	/* Free the CQs */
641 	ibt_free_cq(chan->ic_sendcq);
642 	ibt_free_cq(chan->ic_recvcq);
643 
644 	/* Free the chan handle */
645 	mutex_destroy(&chan->ic_lock);
646 	kmem_free(chan, sizeof (iser_chan_t));
647 }
648 
649 /*
650  * iser_ib_post_recv
651  *
 * This function keeps the RQ full on a given channel. It normally runs
 * from a taskq; it checks the current fill level of the RQ and posts as
 * many WRs as necessary to fill it again.
656  */
657 
658 int
659 iser_ib_post_recv_async(ibt_channel_hdl_t chanhdl)
660 {
661 	iser_chan_t	*chan;
662 	int		status;
663 
664 	/* Pull our iSER channel handle from the private data */
665 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
666 
667 	/*
668 	 * Caller must check that chan->ic_conn->ic_stage indicates
	 * the connection is active (not closing, not closed) and it must
	 * hold the mutex across both the check and the call to this function.
671 	 */
672 	ASSERT(mutex_owned(&chan->ic_conn->ic_lock));
673 	ASSERT((chan->ic_conn->ic_stage >= ISER_CONN_STAGE_IC_CONNECTED) &&
674 	    (chan->ic_conn->ic_stage <= ISER_CONN_STAGE_LOGGED_IN));
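	/*
	 * Take a hold on the IDM connection so that it remains valid until
	 * the dispatched task runs; the task (or the failure path below)
	 * releases the hold.
	 */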
675 	idm_conn_hold(chan->ic_conn->ic_idmc);
676 	status = ddi_taskq_dispatch(iser_taskq, iser_ib_post_recv_task,
677 	    (void *)chanhdl, DDI_NOSLEEP);
678 	if (status != DDI_SUCCESS) {
679 		idm_conn_rele(chan->ic_conn->ic_idmc);
680 	}
681 
682 	return (status);
683 }
684 
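/*
 * iser_ib_post_recv_task
 *
 * Taskq wrapper around iser_ib_post_recv(); releases the connection hold
 * taken in iser_ib_post_recv_async() once the RQ has been replenished.
 */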
685 static void
686 iser_ib_post_recv_task(void *arg)
687 {
688 	ibt_channel_hdl_t	chanhdl = arg;
689 	iser_chan_t		*chan;
690 
691 	/* Pull our iSER channel handle from the private data */
692 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
693 
694 	iser_ib_post_recv(chanhdl);
695 	idm_conn_rele(chan->ic_conn->ic_idmc);
696 }
697 
698 void
699 iser_ib_post_recv(ibt_channel_hdl_t chanhdl)
700 {
701 	iser_chan_t	*chan;
702 	iser_hca_t	*hca;
703 	iser_msg_t	*msg;
704 	ibt_recv_wr_t	*wrlist, wr[ISER_IB_RQ_POST_MAX];
705 	int		rq_space, msg_ret;
706 	int		total_num, npost;
707 	uint_t		nposted;
708 	int		status, i;
709 	iser_qp_t	*iser_qp;
710 	ib_gid_t	lgid;
711 
712 	/* Pull our iSER channel handle from the private data */
713 	chan = (iser_chan_t *)ibt_get_chan_private(chanhdl);
714 
715 	ASSERT(chan != NULL);
716 
717 	mutex_enter(&chan->ic_conn->ic_lock);
718 
719 	/* Bail out if the connection is closed; no need for more recv WRs */
720 	if ((chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSING) ||
721 	    (chan->ic_conn->ic_stage == ISER_CONN_STAGE_CLOSED)) {
722 		mutex_exit(&chan->ic_conn->ic_lock);
723 		return;
724 	}
725 
726 	/* get the QP handle from the iser_chan */
727 	iser_qp = &chan->ic_qp;
728 
729 	/* get the local gid from the path info */
730 	lgid = chan->ic_ibt_path.pi_prim_cep_path.cep_adds_vect.av_sgid;
731 
732 	/* get the hca port from the path info */
733 	hca = iser_ib_gid2hca(lgid);
734 	if (hca == NULL) {
735 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to retrieve "
736 		    "HCA handle");
737 		mutex_exit(&chan->ic_conn->ic_lock);
738 		return;
739 	}
740 
741 	/* check for space to post on the RQ */
742 	mutex_enter(&iser_qp->qp_lock);
743 	rq_space = iser_qp->rq_depth - iser_qp->rq_level;
744 	if (rq_space == 0) {
745 		/* The RQ is full, clear the pending flag and return */
746 		iser_qp->rq_taskqpending = B_FALSE;
747 		mutex_exit(&iser_qp->qp_lock);
748 		mutex_exit(&chan->ic_conn->ic_lock);
749 		return;
750 	}
751 
752 	/* Keep track of the lowest value for rq_min_post_level */
753 	if (iser_qp->rq_level < iser_qp->rq_min_post_level)
754 		iser_qp->rq_min_post_level = iser_qp->rq_level;
755 
756 	mutex_exit(&iser_qp->qp_lock);
757 
758 	/* we've room to post, so pull from the msg cache */
759 	msg = iser_msg_get(hca, rq_space, &msg_ret);
760 	if (msg == NULL) {
761 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: no message handles "
762 		    "available in msg cache currently");
763 		/*
764 		 * There are no messages on the cache. Wait a half-
765 		 * second, then try again.
766 		 */
767 		delay(drv_usectohz(ISER_DELAY_HALF_SECOND));
768 		status = iser_ib_post_recv_async(chanhdl);
769 		if (status != DDI_SUCCESS) {
770 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
771 			    "redispatch routine");
772 			/* Failed to dispatch, clear pending flag */
773 			mutex_enter(&iser_qp->qp_lock);
774 			iser_qp->rq_taskqpending = B_FALSE;
775 			mutex_exit(&iser_qp->qp_lock);
776 		}
777 		mutex_exit(&chan->ic_conn->ic_lock);
778 		return;
779 	}
780 
781 	if (msg_ret != rq_space) {
782 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: requested number of "
783 		    "messages not allocated: requested (%d) allocated (%d)",
784 		    rq_space, msg_ret);
785 		/* We got some, but not all, of our requested depth */
786 		rq_space = msg_ret;
787 	}
788 
789 	/*
790 	 * Now, walk through the allocated WRs and post them,
791 	 * ISER_IB_RQ_POST_MAX (or less) at a time.
792 	 */
793 	wrlist = &wr[0];
794 	total_num = rq_space;
795 
796 	while (total_num) {
797 		/* determine the number to post on this iteration */
798 		npost = (total_num > ISER_IB_RQ_POST_MAX) ?
799 		    ISER_IB_RQ_POST_MAX : total_num;
800 
801 		/* build a list of WRs from the msg list */
802 		for (i = 0; i < npost; i++) {
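			/*
			 * The message pointer doubles as the WR ID so that
			 * the receive completion handler can recover the
			 * message buffer from the completed WR.
			 */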
803 			wrlist[i].wr_id		= (ibt_wrid_t)(uintptr_t)msg;
804 			wrlist[i].wr_nds	= ISER_IB_SGLIST_SIZE;
805 			wrlist[i].wr_sgl	= &msg->msg_ds;
806 			msg = msg->nextp;
807 		}
808 
809 		/* post the list to the RQ */
810 		nposted = 0;
811 		status = ibt_post_recv(chanhdl, wrlist, npost, &nposted);
812 		if ((status != IBT_SUCCESS) || (nposted != npost)) {
813 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: ibt_post_recv "
814 			    "failed: requested (%d) posted (%d) status (%d)",
815 			    npost, nposted, status);
816 			total_num -= nposted;
817 			break;
818 		}
819 
820 		/* decrement total number to post by the number posted */
821 		total_num -= nposted;
822 	}
823 
824 	mutex_enter(&iser_qp->qp_lock);
825 	if (total_num != 0) {
826 		ISER_LOG(CE_NOTE, "iser_ib_post_recv: unable to fill RQ, "
827 		    "failed to post (%d) WRs", total_num);
828 		iser_qp->rq_level += rq_space - total_num;
829 	} else {
830 		iser_qp->rq_level += rq_space;
831 	}
832 
833 	/*
834 	 * Now that we've filled the RQ, check that all of the recv WRs
835 	 * haven't just been immediately consumed. If so, taskqpending is
836 	 * still B_TRUE, so we need to fire off a taskq thread to post
837 	 * more WRs.
838 	 */
839 	if (iser_qp->rq_level == 0) {
840 		mutex_exit(&iser_qp->qp_lock);
841 		status = iser_ib_post_recv_async(chanhdl);
842 		if (status != DDI_SUCCESS) {
843 			ISER_LOG(CE_NOTE, "iser_ib_post_recv: failed to "
844 			    "dispatch followup routine");
845 			/* Failed to dispatch, clear pending flag */
846 			mutex_enter(&iser_qp->qp_lock);
847 			iser_qp->rq_taskqpending = B_FALSE;
848 			mutex_exit(&iser_qp->qp_lock);
849 		}
850 	} else {
851 		/*
852 		 * We're done, we've filled the RQ. Clear the taskq
853 		 * flag so that we can run again.
854 		 */
855 		iser_qp->rq_taskqpending = B_FALSE;
856 		mutex_exit(&iser_qp->qp_lock);
857 	}
858 
859 	mutex_exit(&chan->ic_conn->ic_lock);
860 }
861 
862 /*
863  * iser_ib_handle_portup_event()
864  * This handles the IBT_EVENT_PORT_UP unaffiliated asynchronous event.
865  *
 * To bring the port over seamlessly and configure the CM service for inbound
 * iSER service requests on the newly active port, the existing IDM services
 * are checked for iSER support. If an iSER service was already created, that
 * service is simply bound to the gid of the newly active port. If, on the
 * other hand, no CM service exists (i.e. only socket communication is in
 * use), a new CM service is first registered with the saved service
 * parameters and then bound to the newly active port.
874  *
875  */
876 /* ARGSUSED */
877 static void
878 iser_ib_handle_portup_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
879 {
880 	iser_hca_t		*hca;
881 	ib_gid_t		gid;
882 	idm_svc_t		*idm_svc;
883 	int			status;
884 
885 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: HCA(0x%llx) port(%d)",
886 	    (longlong_t)event->ev_hca_guid, event->ev_port);
887 
888 	/*
889 	 * Query all ports on the HCA and update the port information
	 * maintained in the iser_hca_t structure
891 	 */
892 	hca = iser_ib_guid2hca(event->ev_hca_guid);
893 	if (hca == NULL) {
894 
895 		/* HCA is just made available, first port on that HCA */
896 		hca = iser_ib_alloc_hca(event->ev_hca_guid);
897 
898 		mutex_enter(&iser_state->is_hcalist_lock);
899 		list_insert_tail(&iser_state->is_hcalist, hca);
900 		iser_state->is_num_hcas++;
901 		mutex_exit(&iser_state->is_hcalist_lock);
902 
903 	} else {
904 
905 		status = iser_ib_update_hcaports(hca);
906 
907 		if (status != IBT_SUCCESS) {
908 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
909 			    "status(0x%x): iser_ib_update_hcaports failed: "
910 			    "HCA(0x%llx) port(%d)", status,
911 			    (longlong_t)event->ev_hca_guid, event->ev_port);
912 			return;
913 		}
914 	}
915 
916 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
917 
918 	/*
919 	 * Iterate through the global list of IDM target services
920 	 * and check for existing iSER CM service.
921 	 */
922 	mutex_enter(&idm.idm_global_mutex);
923 	for (idm_svc = list_head(&idm.idm_tgt_svc_list);
924 	    idm_svc != NULL;
925 	    idm_svc = list_next(&idm.idm_tgt_svc_list, idm_svc)) {
926 
927 
928 		if (idm_svc->is_iser_svc == NULL) {
929 
930 			/* Establish a new CM service for iSER requests */
931 			status = iser_tgt_svc_create(
932 			    &idm_svc->is_svc_req, idm_svc);
933 
934 			if (status != IBT_SUCCESS) {
935 				ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
936 				    "status(0x%x): iser_tgt_svc_create failed: "
937 				    "HCA(0x%llx) port(%d)", status,
938 				    (longlong_t)event->ev_hca_guid,
939 				    event->ev_port);
940 
941 				continue;
942 			}
943 		}
944 
945 		status = iser_ib_activate_port(
946 		    idm_svc, event->ev_hca_guid, gid);
947 		if (status != IBT_SUCCESS) {
948 
949 			ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event "
950 			    "status(0x%x): Bind service on port "
951 			    "(%llx:%llx) failed",
952 			    status, (longlong_t)gid.gid_prefix,
953 			    (longlong_t)gid.gid_guid);
954 
955 			continue;
956 		}
957 		ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event: service bound "
958 		    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
959 		    event->ev_port);
960 	}
961 	mutex_exit(&idm.idm_global_mutex);
962 
963 	ISER_LOG(CE_NOTE, "iser_ib_handle_portup_event success: "
964 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
965 	    event->ev_port);
966 }
967 
968 /*
969  * iser_ib_handle_portdown_event()
970  * This handles the IBT_EVENT_PORT_DOWN unaffiliated asynchronous error.
971  *
972  * Unconfigure the CM service on the deactivated port and teardown the
973  * connections that are using the CM service.
974  */
975 /* ARGSUSED */
976 static void
977 iser_ib_handle_portdown_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
978 {
979 	iser_hca_t		*hca;
980 	ib_gid_t		gid;
981 	int			status;
982 
983 	/*
984 	 * Query all ports on the HCA and update the port information
	 * maintained in the iser_hca_t structure
986 	 */
987 	hca = iser_ib_guid2hca(event->ev_hca_guid);
988 	ASSERT(hca != NULL);
989 
990 	status = iser_ib_update_hcaports(hca);
991 	if (status != IBT_SUCCESS) {
992 		ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event status(0x%x): "
		    "iser_ib_update_hcaports failed: HCA(0x%llx) port(%d)",
994 		    status, (longlong_t)event->ev_hca_guid, event->ev_port);
995 		return;
996 	}
997 
	/* get the gid of the port that just went down */
999 	gid = hca->hca_port_info[event->ev_port - 1].p_sgid_tbl[0];
1000 	iser_ib_deactivate_port(event->ev_hca_guid, gid);
1001 
1002 	ISER_LOG(CE_NOTE, "iser_ib_handle_portdown_event success: "
1003 	    "HCA(0x%llx) port(%d)", (longlong_t)event->ev_hca_guid,
1004 	    event->ev_port);
1005 }
1006 
1007 /*
1008  * iser_ib_handle_hca_detach_event()
1009  * Quiesce all activity bound for the port, teardown the connection, unbind
1010  * iSER services on all ports and release the HCA handle.
1011  */
1012 /* ARGSUSED */
1013 static void
1014 iser_ib_handle_hca_detach_event(ibt_hca_hdl_t hdl, ibt_async_event_t *event)
1015 {
1016 	iser_hca_t	*nexthca, *hca;
1017 	int		i, status;
1018 
1019 	ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: HCA(0x%llx)",
1020 	    (longlong_t)event->ev_hca_guid);
1021 
	hca = iser_ib_guid2hca(event->ev_hca_guid);
	if (hca == NULL) {
		ISER_LOG(CE_NOTE, "iser_ib_handle_hca_detach_event: "
		    "unknown HCA(0x%llx)", (longlong_t)event->ev_hca_guid);
		return;
	}

	for (i = 0; i < hca->hca_num_ports; i++) {
1024 		iser_ib_deactivate_port(hca->hca_guid,
1025 		    hca->hca_port_info[i].p_sgid_tbl[0]);
1026 	}
1027 
1028 	/*
1029 	 * Update the HCA list maintained in the iser_state. Free the
1030 	 * resources allocated to the HCA, i.e. caches, protection domain
1031 	 */
1032 	mutex_enter(&iser_state->is_hcalist_lock);
1033 
1034 	for (hca = list_head(&iser_state->is_hcalist);
1035 	    hca != NULL;
1036 	    hca = nexthca) {
1037 
1038 		nexthca = list_next(&iser_state->is_hcalist, hca);
1039 
1040 		if (hca->hca_guid == event->ev_hca_guid) {
1041 
1042 			list_remove(&iser_state->is_hcalist, hca);
1043 			iser_state->is_num_hcas--;
1044 
1045 			status = iser_ib_free_hca(hca);
1046 			if (status != DDI_SUCCESS) {
1047 				ISER_LOG(CE_WARN, "iser_ib_handle_hca_detach: "
1048 				    "Failed to free hca(%p)", (void *)hca);
1049 				list_insert_tail(&iser_state->is_hcalist, hca);
1050 				iser_state->is_num_hcas++;
1051 			}
1052 			/* No way to return status to IBT if this fails */
1053 		}
1054 	}
1055 	mutex_exit(&iser_state->is_hcalist_lock);
1056 
1057 }
1058 
1059 /*
1060  * iser_ib_async_handler
 * An IBT asynchronous event handler is registered with the framework via
 * the ibt_attach() routine. This function handles the following
 * asynchronous events:
1064  * IBT_EVENT_PORT_UP
1065  * IBT_ERROR_PORT_DOWN
1066  * IBT_HCA_ATTACH_EVENT
1067  * IBT_HCA_DETACH_EVENT
1068  */
1069 /* ARGSUSED */
1070 void
1071 iser_ib_async_handler(void *clntp, ibt_hca_hdl_t hdl, ibt_async_code_t code,
1072     ibt_async_event_t *event)
1073 {
1074 	switch (code) {
1075 	case IBT_EVENT_PORT_UP:
1076 		iser_ib_handle_portup_event(hdl, event);
1077 		break;
1078 
1079 	case IBT_ERROR_PORT_DOWN:
1080 		iser_ib_handle_portdown_event(hdl, event);
1081 		break;
1082 
1083 	case IBT_HCA_ATTACH_EVENT:
1084 		/*
		 * A new HCA device is available for use; ignore this
		 * event because the corresponding IBT_EVENT_PORT_UP
		 * events will be triggered and handled accordingly.
1088 		 */
1089 		break;
1090 
1091 	case IBT_HCA_DETACH_EVENT:
1092 		iser_ib_handle_hca_detach_event(hdl, event);
1093 		break;
1094 
1095 	default:
1096 		break;
1097 	}
1098 }
1099 
1100 /*
1101  * iser_ib_init_hcas
1102  *
1103  * This function opens all the HCA devices, gathers the HCA state information
1104  * and adds the HCA handle for each HCA found in the iser_soft_state.
1105  */
1106 static int
1107 iser_ib_init_hcas(void)
1108 {
1109 	ib_guid_t	*guid;
1110 	int		num_hcas;
1111 	int		i;
1112 	iser_hca_t	*hca;
1113 
1114 	/* Retrieve the HCA list */
1115 	num_hcas = ibt_get_hca_list(&guid);
1116 	if (num_hcas == 0) {
1117 		/*
1118 		 * This shouldn't happen, but might if we have all HCAs
1119 		 * detach prior to initialization.
1120 		 */
1121 		return (DDI_FAILURE);
1122 	}
1123 
1124 	/* Initialize the hcalist lock */
1125 	mutex_init(&iser_state->is_hcalist_lock, NULL, MUTEX_DRIVER, NULL);
1126 
1127 	/* Create the HCA list */
1128 	list_create(&iser_state->is_hcalist, sizeof (iser_hca_t),
1129 	    offsetof(iser_hca_t, hca_node));
1130 
1131 	for (i = 0; i < num_hcas; i++) {
1132 
1133 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: initializing HCA "
1134 		    "(0x%llx)", (longlong_t)guid[i]);
1135 
1136 		hca = iser_ib_alloc_hca(guid[i]);
1137 		if (hca == NULL) {
1138 			/* This shouldn't happen, teardown and fail */
1139 			(void) iser_ib_fini_hcas();
1140 			(void) ibt_free_hca_list(guid, num_hcas);
1141 			return (DDI_FAILURE);
1142 		}
1143 
1144 		mutex_enter(&iser_state->is_hcalist_lock);
1145 		list_insert_tail(&iser_state->is_hcalist, hca);
1146 		iser_state->is_num_hcas++;
1147 		mutex_exit(&iser_state->is_hcalist_lock);
1148 
1149 	}
1150 
1151 	/* Free the IBT HCA list */
1152 	(void) ibt_free_hca_list(guid, num_hcas);
1153 
1154 	/* Check that we've initialized at least one HCA */
1155 	mutex_enter(&iser_state->is_hcalist_lock);
1156 	if (list_is_empty(&iser_state->is_hcalist)) {
1157 		ISER_LOG(CE_NOTE, "iser_ib_init_hcas: failed to initialize "
1158 		    "any HCAs");
1159 
1160 		mutex_exit(&iser_state->is_hcalist_lock);
1161 		(void) iser_ib_fini_hcas();
1162 		return (DDI_FAILURE);
1163 	}
1164 	mutex_exit(&iser_state->is_hcalist_lock);
1165 
1166 	return (DDI_SUCCESS);
1167 }
1168 
1169 /*
1170  * iser_ib_fini_hcas
1171  *
1172  * Teardown the iSER HCA list initialized above.
1173  */
1174 static int
1175 iser_ib_fini_hcas(void)
1176 {
1177 	iser_hca_t	*nexthca, *hca;
1178 	int		status;
1179 
1180 	mutex_enter(&iser_state->is_hcalist_lock);
1181 	for (hca = list_head(&iser_state->is_hcalist);
1182 	    hca != NULL;
1183 	    hca = nexthca) {
1184 
1185 		nexthca = list_next(&iser_state->is_hcalist, hca);
1186 
1187 		list_remove(&iser_state->is_hcalist, hca);
1188 
1189 		status = iser_ib_free_hca(hca);
		if (status != DDI_SUCCESS) {
1191 			ISER_LOG(CE_NOTE, "iser_ib_fini_hcas: failed to free "
1192 			    "HCA during fini");
1193 			list_insert_tail(&iser_state->is_hcalist, hca);
1194 			return (DDI_FAILURE);
1195 		}
1196 
1197 		iser_state->is_num_hcas--;
1198 
1199 	}
1200 	mutex_exit(&iser_state->is_hcalist_lock);
1201 	list_destroy(&iser_state->is_hcalist);
1202 	mutex_destroy(&iser_state->is_hcalist_lock);
1203 
1204 	return (DDI_SUCCESS);
1205 }
1206 
1207 /*
1208  * iser_ib_alloc_hca
1209  *
 * This function opens the given HCA device, gathers the HCA state
 * information, allocates a protection domain and the per-HCA message and
 * data caches, and returns an initialized iser_hca_t handle.
1212  */
1213 static iser_hca_t *
1214 iser_ib_alloc_hca(ib_guid_t guid)
1215 {
1216 	iser_hca_t	*hca;
1217 	int		status;
1218 
1219 	/* Allocate an iser_hca_t HCA handle */
1220 	hca = (iser_hca_t *)kmem_zalloc(sizeof (iser_hca_t), KM_SLEEP);
1221 
1222 	/* Open this HCA */
1223 	status = ibt_open_hca(iser_state->is_ibhdl, guid, &hca->hca_hdl);
1224 	if (status != IBT_SUCCESS) {
1225 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_open_hca failed:"
1226 		    " guid (0x%llx) status (0x%x)", (longlong_t)guid, status);
1227 		kmem_free(hca, sizeof (iser_hca_t));
1228 		return (NULL);
1229 	}
1230 
1231 	hca->hca_guid		= guid;
1232 	hca->hca_clnt_hdl	= iser_state->is_ibhdl;
1233 
1234 	/* Query the HCA */
1235 	status = ibt_query_hca(hca->hca_hdl, &hca->hca_attr);
1236 	if (status != IBT_SUCCESS) {
1237 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_query_hca "
1238 		    "failure: guid (0x%llx) status (0x%x)",
1239 		    (longlong_t)guid, status);
1240 		(void) ibt_close_hca(hca->hca_hdl);
1241 		kmem_free(hca, sizeof (iser_hca_t));
1242 		return (NULL);
1243 	}
1244 
1245 	/* Query all ports on the HCA */
1246 	status = ibt_query_hca_ports(hca->hca_hdl, 0,
1247 	    &hca->hca_port_info, &hca->hca_num_ports,
1248 	    &hca->hca_port_info_sz);
1249 	if (status != IBT_SUCCESS) {
1250 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: "
1251 		    "ibt_query_hca_ports failure: guid (0x%llx) "
1252 		    "status (0x%x)", (longlong_t)guid, status);
1253 		(void) ibt_close_hca(hca->hca_hdl);
1254 		kmem_free(hca, sizeof (iser_hca_t));
1255 		return (NULL);
1256 	}
1257 
1258 	/* Allocate a single PD on this HCA */
1259 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS,
1260 	    &hca->hca_pdhdl);
1261 	if (status != IBT_SUCCESS) {
1262 		ISER_LOG(CE_NOTE, "iser_ib_alloc_hca: ibt_alloc_pd "
1263 		    "failure: guid (0x%llx) status (0x%x)",
1264 		    (longlong_t)guid, status);
1265 		(void) ibt_close_hca(hca->hca_hdl);
1266 		ibt_free_portinfo(hca->hca_port_info, hca->hca_port_info_sz);
1267 		kmem_free(hca, sizeof (iser_hca_t));
1268 		return (NULL);
1269 	}
1270 
1271 	/* Initialize the message and data MR caches for this HCA */
1272 	iser_init_hca_caches(hca);
1273 
1274 	return (hca);
1275 }
1276 
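/*
 * iser_ib_free_hca
 *
 * This function frees the resources associated with an HCA handle: the
 * message and data caches, the protection domain, the cached port
 * information and the HCA handle itself. If teardown fails partway, it
 * attempts to restore the HCA to its previous state or marks it failed.
 */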
1277 static int
1278 iser_ib_free_hca(iser_hca_t *hca)
1279 {
1280 	int			status;
1281 	ibt_hca_portinfo_t	*hca_port_info;
1282 	uint_t			hca_port_info_sz;
1283 
1284 	ASSERT(hca != NULL);
1285 	if (hca->hca_failed)
1286 		return (DDI_FAILURE);
1287 
1288 	hca_port_info = hca->hca_port_info;
1289 	hca_port_info_sz = hca->hca_port_info_sz;
1290 
1291 	/*
1292 	 * Free the memory regions before freeing
1293 	 * the associated protection domain
1294 	 */
1295 	iser_fini_hca_caches(hca);
1296 
1297 	status = ibt_free_pd(hca->hca_hdl, hca->hca_pdhdl);
1298 	if (status != IBT_SUCCESS) {
1299 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to free PD "
1300 		    "status=0x%x", status);
1301 		goto out_caches;
1302 	}
1303 
1304 	status = ibt_close_hca(hca->hca_hdl);
1305 	if (status != IBT_SUCCESS) {
		ISER_LOG(CE_NOTE, "iser_ib_free_hca: failed to close HCA "
1307 		    "status=0x%x", status);
1308 		goto out_pd;
1309 	}
1310 
1311 	ibt_free_portinfo(hca_port_info, hca_port_info_sz);
1312 
1313 	kmem_free(hca, sizeof (iser_hca_t));
1314 	return (DDI_SUCCESS);
1315 
1316 	/*
1317 	 * We only managed to partially tear down the HCA, try to put it back
1318 	 * like it was before returning.
1319 	 */
1320 out_pd:
1321 	status = ibt_alloc_pd(hca->hca_hdl, IBT_PD_NO_FLAGS, &hca->hca_pdhdl);
1322 	if (status != IBT_SUCCESS) {
1323 		hca->hca_failed = B_TRUE;
1324 		/* Report error and exit */
1325 		ISER_LOG(CE_NOTE, "iser_ib_free_hca: could not re-alloc PD "
1326 		    "status=0x%x", status);
1327 		return (DDI_FAILURE);
1328 	}
1329 
1330 out_caches:
1331 	iser_init_hca_caches(hca);
1332 
1333 	return (DDI_FAILURE);
1334 }
1335 
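/*
 * iser_ib_update_hcaports
 *
 * This function re-queries the ports on the given HCA and replaces the
 * cached port information with the newly returned data.
 */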
1336 static int
1337 iser_ib_update_hcaports(iser_hca_t *hca)
1338 {
1339 	ibt_hca_portinfo_t	*pinfop, *oldpinfop;
1340 	uint_t			size, oldsize, nport;
1341 	int			status;
1342 
1343 	ASSERT(hca != NULL);
1344 
1345 	status = ibt_query_hca_ports(hca->hca_hdl, 0, &pinfop, &nport, &size);
1346 	if (status != IBT_SUCCESS) {
1347 		ISER_LOG(CE_NOTE, "ibt_query_hca_ports failed: %d", status);
1348 		return (status);
1349 	}
1350 
1351 	oldpinfop = hca->hca_port_info;
1352 	oldsize	= hca->hca_port_info_sz;
1353 	hca->hca_port_info = pinfop;
1354 	hca->hca_port_info_sz = size;
1355 
1356 	(void) ibt_free_portinfo(oldpinfop, oldsize);
1357 
1358 	return (IBT_SUCCESS);
1359 }
1360 
1361 /*
1362  * iser_ib_gid2hca
1363  * Given a gid, find the corresponding hca
1364  */
1365 iser_hca_t *
1366 iser_ib_gid2hca(ib_gid_t gid)
1367 {
1368 
1369 	iser_hca_t	*hca;
1370 	int		i;
1371 
1372 	mutex_enter(&iser_state->is_hcalist_lock);
1373 	for (hca = list_head(&iser_state->is_hcalist);
1374 	    hca != NULL;
1375 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1376 
1377 		for (i = 0; i < hca->hca_num_ports; i++) {
1378 			if ((hca->hca_port_info[i].p_sgid_tbl[0].gid_prefix ==
1379 			    gid.gid_prefix) &&
1380 			    (hca->hca_port_info[i].p_sgid_tbl[0].gid_guid ==
1381 			    gid.gid_guid)) {
1382 
1383 				mutex_exit(&iser_state->is_hcalist_lock);
1384 
1385 				return (hca);
1386 			}
1387 		}
1388 	}
1389 	mutex_exit(&iser_state->is_hcalist_lock);
1390 	return (NULL);
1391 }
1392 
1393 /*
1394  * iser_ib_guid2hca
1395  * Given a HCA guid, find the corresponding HCA
1396  */
1397 iser_hca_t *
1398 iser_ib_guid2hca(ib_guid_t guid)
1399 {
1400 
1401 	iser_hca_t	*hca;
1402 
1403 	mutex_enter(&iser_state->is_hcalist_lock);
1404 	for (hca = list_head(&iser_state->is_hcalist);
1405 	    hca != NULL;
1406 	    hca = list_next(&iser_state->is_hcalist, hca)) {
1407 
1408 		if (hca->hca_guid == guid) {
1409 			mutex_exit(&iser_state->is_hcalist_lock);
1410 			return (hca);
1411 		}
1412 	}
1413 	mutex_exit(&iser_state->is_hcalist_lock);
1414 	return (NULL);
1415 }
1416 
1417 /*
1418  * iser_ib_conv_sockaddr2ibtaddr
1419  * This function converts a socket address into the IBT format
1420  */
1421 void iser_ib_conv_sockaddr2ibtaddr(
1422     idm_sockaddr_t *saddr, ibt_ip_addr_t *ibt_addr)
1423 {
1424 	if (saddr == NULL) {
1425 		ibt_addr->family = AF_UNSPEC;
1426 		ibt_addr->un.ip4addr = 0;
1427 	} else {
1428 		switch (saddr->sin.sa_family) {
1429 		case AF_INET:
1430 
1431 			ibt_addr->family	= saddr->sin4.sin_family;
1432 			ibt_addr->un.ip4addr	= saddr->sin4.sin_addr.s_addr;
1433 			break;
1434 
1435 		case AF_INET6:
1436 
1437 			ibt_addr->family	= saddr->sin6.sin6_family;
1438 			ibt_addr->un.ip6addr	= saddr->sin6.sin6_addr;
1439 			break;
1440 
1441 		default:
1442 			ibt_addr->family = AF_UNSPEC;
1443 		}
1444 
1445 	}
1446 }
1447 
1448 /*
1449  * iser_ib_conv_ibtaddr2sockaddr
1450  * This function converts an IBT ip address handle to a sockaddr
1451  */
1452 void iser_ib_conv_ibtaddr2sockaddr(struct sockaddr_storage *ss,
1453     ibt_ip_addr_t *ibt_addr, in_port_t port)
1454 {
1455 	struct sockaddr_in *sin;
1456 	struct sockaddr_in6 *sin6;
1457 
1458 	switch (ibt_addr->family) {
1459 	case AF_INET:
1460 	case AF_UNSPEC:
1461 
1462 		sin = (struct sockaddr_in *)ibt_addr;
1463 		sin->sin_port = ntohs(port);
1464 		bcopy(sin, ss, sizeof (struct sockaddr_in));
1465 		break;
1466 
1467 	case AF_INET6:
1468 
1469 		sin6 = (struct sockaddr_in6 *)ibt_addr;
1470 		sin6->sin6_port = ntohs(port);
1471 		bcopy(sin6, ss, sizeof (struct sockaddr_in6));
1472 		break;
1473 
1474 	default:
1475 		ISER_LOG(CE_NOTE, "iser_ib_conv_ibtaddr2sockaddr: "
1476 		    "unknown family type: 0x%x", ibt_addr->family);
1477 	}
1478 }
1479 
1480 /*
1481  * iser_ib_setup_cq
1482  * This function sets up the Completion Queue size and allocates the specified
1483  * Completion Queue
1484  */
1485 static int
1486 iser_ib_setup_cq(ibt_hca_hdl_t hca_hdl, uint_t cq_size, ibt_cq_hdl_t *cq_hdl)
1487 {
1488 
1489 	ibt_cq_attr_t		cq_attr;
1490 	int			status;
1491 
1492 	cq_attr.cq_size		= cq_size;
1493 	cq_attr.cq_sched	= 0;
1494 	cq_attr.cq_flags	= IBT_CQ_NO_FLAGS;
1495 
1496 	/* Allocate a Completion Queue */
1497 	status = ibt_alloc_cq(hca_hdl, &cq_attr, cq_hdl, NULL);
1498 	if (status != IBT_SUCCESS) {
1499 		ISER_LOG(CE_NOTE, "iser_ib_setup_cq: ibt_alloc_cq failure (%d)",
1500 		    status);
1501 		return (status);
1502 	}
1503 
1504 	return (ISER_STATUS_SUCCESS);
1505 }
1506 
1507 /*
1508  * iser_ib_setup_chanargs
 *
 * This function fills in the channel allocation arguments used by
 * ibt_alloc_rc_channel() when allocating an iSER RC channel.
 */
1511 static void
1512 iser_ib_setup_chanargs(uint8_t hca_port, ibt_cq_hdl_t scq_hdl,
1513     ibt_cq_hdl_t rcq_hdl, uint_t sq_size, uint_t rq_size,
1514     ibt_pd_hdl_t hca_pdhdl, ibt_rc_chan_alloc_args_t *cargs)
1515 {
1516 
1517 	bzero(cargs, sizeof (ibt_rc_chan_alloc_args_t));
1518 
1519 	/*
	 * Set up the size of the channel's send queue, receive queue and the
1521 	 * maximum number of elements in a scatter gather list of work requests
1522 	 * posted to the send and receive queues.
1523 	 */
1524 	cargs->rc_sizes.cs_sq		= sq_size;
1525 	cargs->rc_sizes.cs_rq		= rq_size;
1526 	cargs->rc_sizes.cs_sq_sgl	= ISER_IB_SGLIST_SIZE;
1527 	cargs->rc_sizes.cs_rq_sgl	= ISER_IB_SGLIST_SIZE;
1528 
1529 	/*
	 * Signal all work requests; every WR posted to the send queue will
	 * generate a send completion.
1532 	 */
1533 	cargs->rc_flags			= IBT_ALL_SIGNALED;
1534 
1535 	/* Enable RDMA read and RDMA write on the channel end points */
1536 	cargs->rc_control		= IBT_CEP_RDMA_RD | IBT_CEP_RDMA_WR;
1537 
1538 	/* Set the local hca port on which the channel is allocated */
1539 	cargs->rc_hca_port_num		= hca_port;
1540 
1541 	/* Set the Send and Receive Completion Queue handles */
1542 	cargs->rc_scq			= scq_hdl;
1543 	cargs->rc_rcq			= rcq_hdl;
1544 
1545 	/* Set the protection domain associated with the channel */
1546 	cargs->rc_pd			= hca_pdhdl;
1547 
1548 	/* No SRQ usage */
1549 	cargs->rc_srq			= NULL;
1550 }
1551 
1552 /*
1553  * iser_ib_init_qp
1554  * Initialize the QP handle
1555  */
1556 void
1557 iser_ib_init_qp(iser_chan_t *chan, uint_t sq_size, uint_t rq_size)
1558 {
1559 	/* Initialize the handle lock */
1560 	mutex_init(&chan->ic_qp.qp_lock, NULL, MUTEX_DRIVER, NULL);
1561 
1562 	/* Record queue sizes */
1563 	chan->ic_qp.sq_size = sq_size;
1564 	chan->ic_qp.rq_size = rq_size;
1565 
1566 	/* Initialize the RQ monitoring data */
1567 	chan->ic_qp.rq_depth  = rq_size;
1568 	chan->ic_qp.rq_level  = 0;
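	/*
	 * Low water mark: when the RQ fill level drops below this value,
	 * more recv WRs are posted (see iser_ib_post_recv()).
	 */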
1569 	chan->ic_qp.rq_lwm = (chan->ic_recvcq_sz * ISER_IB_RQ_LWM_PCT) / 100;
1570 
1571 	/* Initialize the taskq flag */
1572 	chan->ic_qp.rq_taskqpending = B_FALSE;
1573 }
1574 
1575 /*
1576  * iser_ib_fini_qp
1577  * Teardown the QP handle
1578  */
1579 void
1580 iser_ib_fini_qp(iser_qp_t *qp)
1581 {
1582 	/* Destroy the handle lock */
1583 	mutex_destroy(&qp->qp_lock);
1584 }
1585 
1586 static int
1587 iser_ib_activate_port(idm_svc_t *idm_svc, ib_guid_t guid, ib_gid_t gid)
1588 {
1589 	iser_svc_t	*iser_svc;
1590 	iser_sbind_t	*is_sbind;
1591 	int		status;
1592 
1593 	iser_svc = idm_svc->is_iser_svc;
1594 
1595 	/*
1596 	 * Save the address of the service bind handle in the
1597 	 * iser_svc_t to undo the service binding at a later time
1598 	 */
1599 	is_sbind = kmem_zalloc(sizeof (iser_sbind_t), KM_SLEEP);
1600 	is_sbind->is_gid	= gid;
1601 	is_sbind->is_guid	= guid;
1602 
1603 	status  = ibt_bind_service(iser_svc->is_srvhdl, gid, NULL,
1604 	    idm_svc, &is_sbind->is_sbindhdl);
1605 
1606 	if (status != IBT_SUCCESS) {
1607 		ISER_LOG(CE_NOTE, "iser_ib_activate_port: status(0x%x): "
1608 		    "Bind service(%llx) on port(%llx:%llx) failed",
1609 		    status, (longlong_t)iser_svc->is_svcid,
1610 		    (longlong_t)gid.gid_prefix, (longlong_t)gid.gid_guid);
1611 
1612 		kmem_free(is_sbind, sizeof (iser_sbind_t));
1613 
1614 		return (status);
1615 	}
1616 
1617 	list_insert_tail(&iser_svc->is_sbindlist, is_sbind);
1618 
1619 	return (IBT_SUCCESS);
1620 }
1621 
1622 static void
1623 iser_ib_deactivate_port(ib_guid_t hca_guid, ib_gid_t gid)
1624 {
1625 	iser_svc_t	*iser_svc;
1626 	iser_conn_t	*iser_conn;
1627 	iser_sbind_t	*is_sbind;
1628 	idm_conn_t	*idm_conn;
1629 
1630 	/*
1631 	 * Iterate through the global list of IDM target connections.
1632 	 * Issue a TRANSPORT_FAIL for any connections on this port, and
1633 	 * if there is a bound service running on the port, tear it down.
1634 	 */
1635 	mutex_enter(&idm.idm_global_mutex);
1636 	for (idm_conn = list_head(&idm.idm_tgt_conn_list);
1637 	    idm_conn != NULL;
1638 	    idm_conn = list_next(&idm.idm_tgt_conn_list, idm_conn)) {
1639 
1640 		if (idm_conn->ic_transport_type != IDM_TRANSPORT_TYPE_ISER) {
1641 			/* this is not an iSER connection, skip it */
1642 			continue;
1643 		}
1644 
1645 		iser_conn = idm_conn->ic_transport_private;
1646 		if (iser_conn->ic_chan->ic_ibt_path.pi_hca_guid != hca_guid) {
1647 			/* this iSER connection is on a different port */
1648 			continue;
1649 		}
1650 
1651 		/* Fail the transport for this connection */
1652 		idm_conn_event(idm_conn, CE_TRANSPORT_FAIL, IDM_STATUS_FAIL);
1653 
1654 		if (idm_conn->ic_conn_type == CONN_TYPE_INI) {
1655 			/* initiator connection, nothing else to do */
1656 			continue;
1657 		}
1658 
1659 		/* Check for a service binding */
1660 		iser_svc = idm_conn->ic_svc_binding->is_iser_svc;
1661 		is_sbind = iser_ib_get_bind(iser_svc, hca_guid, gid);
1662 		if (is_sbind != NULL) {
1663 			/* This service is still bound, tear it down */
1664 			ibt_unbind_service(iser_svc->is_srvhdl,
1665 			    is_sbind->is_sbindhdl);
1666 			list_remove(&iser_svc->is_sbindlist, is_sbind);
1667 			kmem_free(is_sbind, sizeof (iser_sbind_t));
1668 		}
1669 	}
1670 	mutex_exit(&idm.idm_global_mutex);
1671 }
1672 
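/*
 * iser_ib_get_bind
 *
 * This function looks up an existing service bind handle on the given
 * iSER service for the specified HCA GUID and port GID, returning NULL
 * if no such binding exists.
 */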
1673 static iser_sbind_t *
1674 iser_ib_get_bind(iser_svc_t *iser_svc, ib_guid_t hca_guid, ib_gid_t gid)
1675 {
1676 	iser_sbind_t	*is_sbind;
1677 
1678 	for (is_sbind = list_head(&iser_svc->is_sbindlist);
1679 	    is_sbind != NULL;
1680 	    is_sbind = list_next(&iser_svc->is_sbindlist, is_sbind)) {
1681 
1682 		if ((is_sbind->is_guid == hca_guid) &&
1683 		    (is_sbind->is_gid.gid_prefix == gid.gid_prefix) &&
1684 		    (is_sbind->is_gid.gid_guid == gid.gid_guid)) {
1685 			return (is_sbind);
1686 		}
1687 	}
1688 	return (NULL);
1689 }
1690