/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * I/O Controller functions for the Solaris COMSTAR SCSI RDMA Protocol
 * Target (SRPT) port provider.
 */
#include <sys/types.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <sys/ib/ibtl/ibti.h>
#include <sys/sdt.h>

#include "srp.h"
#include "srpt_impl.h"
#include "srpt_ioc.h"
#include "srpt_stp.h"
#include "srpt_ch.h"

/*
 * srpt_ioc_srq_size - Tunable parameter that specifies the number
 * of receive WQ entries that can be posted to the IOC shared
 * receive queue.
 */
uint32_t	srpt_ioc_srq_size = SRPT_DEFAULT_IOC_SRQ_SIZE;
extern uint16_t srpt_send_msg_depth;

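/*
 * Both tunables can be overridden at boot time in /etc/system, e.g.
 *
 *	set srpt:srpt_ioc_srq_size = 0x1000
 *
 * (illustrative value; the effective SRQ size is additionally capped
 * by the HCA's maximum supported SRQ size at attach time).
 */
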
/* IOC profile capabilities mask must be big-endian */
typedef struct srpt_ioc_opcap_bits_s {
#if	defined(_BIT_FIELDS_LTOH)
	uint8_t		af:1,
			at:1,
			wf:1,
			wt:1,
			rf:1,
			rt:1,
			sf:1,
			st:1;
#elif	defined(_BIT_FIELDS_HTOL)
	uint8_t		st:1,
			sf:1,
			rt:1,
			rf:1,
			wt:1,
			wf:1,
			at:1,
			af:1;
#else
#error	One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
#endif
} srpt_ioc_opcap_bits_t;

typedef union {
	srpt_ioc_opcap_bits_t	bits;
	uint8_t			mask;
} srpt_ioc_opcap_mask_t;

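/*
 * Illustrative sketch (mirrors srpt_ioc_init_profile() below):
 * capability bits are set through the bitfield view and the composite
 * byte is then stored in the IOC profile, e.g.
 *
 *	srpt_ioc_opcap_mask_t capmask = {0};
 *
 *	capmask.bits.st = 1;
 *	ioc->ioc_profile.ioc_ctrl_opcap_mask = capmask.mask;
 *
 * The union keeps the wire byte correct for either host bit order.
 */
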
/*
 * vmem arena variables - values derived from iSER
 */
#define	SRPT_MR_QUANTSIZE	0x400			/* 1K */
#define	SRPT_MIN_CHUNKSIZE	0x100000		/* 1MB */

/* use less memory on 32-bit kernels, where address space is scarce */
#ifdef _LP64
#define	SRPT_BUF_MR_CHUNKSIZE	0x1000000		/* 16MB */
#define	SRPT_BUF_POOL_MAX	0x40000000		/* 1GB */
#else
#define	SRPT_BUF_MR_CHUNKSIZE	0x400000		/* 4MB */
#define	SRPT_BUF_POOL_MAX	0x4000000		/* 64MB */
#endif

static ibt_mr_flags_t	srpt_dbuf_mr_flags =
    IBT_MR_ENABLE_LOCAL_WRITE | IBT_MR_ENABLE_REMOTE_WRITE |
    IBT_MR_ENABLE_REMOTE_READ;

void srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
	ibt_async_code_t code, ibt_async_event_t *event);

static struct ibt_clnt_modinfo_s srpt_ibt_modinfo = {
	IBTI_V_CURR,
	IBT_STORAGE_DEV,
	srpt_ioc_ib_async_hdlr,
	NULL,
	"srpt"
};

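/*
 * IBTF invokes srpt_ioc_ib_async_hdlr() (registered via the modinfo
 * above at ibt_attach() time) for HCA attach/detach and port state
 * change events; see the handler below.
 */
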
static srpt_ioc_t *srpt_ioc_init(ib_guid_t guid);
static void srpt_ioc_fini(srpt_ioc_t *ioc);

static srpt_vmem_pool_t *srpt_vmem_create(const char *name, srpt_ioc_t *ioc,
    ib_memlen_t chunksize, uint64_t maxsize, ibt_mr_flags_t flags);
static void *srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size);
static int srpt_vmem_mr_compare(const void *a, const void *b);
static srpt_mr_t *srpt_vmem_chunk_alloc(srpt_vmem_pool_t *vm_pool,
    ib_memlen_t chunksize);
static void srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool);
static void srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size);
static srpt_mr_t *srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr,
    ib_memlen_t len);
static void srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr);
static void srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr);
static int srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr);

/*
 * srpt_ioc_attach() - I/O Controller attach
 *
 * Attach to IBTF and initialize I/O controllers. The srpt_ctxt->sc_rwlock
 * should be held outside of this call.
 */
int
srpt_ioc_attach()
{
	int		status;
	int		hca_cnt;
	int		hca_ndx;
	ib_guid_t	*guid;
	srpt_ioc_t	*ioc;

	ASSERT(srpt_ctxt != NULL);

	/*
	 * Attach to IBTF and initialize a list of IB devices.  Each
	 * HCA will be represented by an I/O Controller.
	 */
	status = ibt_attach(&srpt_ibt_modinfo, srpt_ctxt->sc_dip,
	    srpt_ctxt, &srpt_ctxt->sc_ibt_hdl);
	if (status != DDI_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_attach, ibt_attach failed (0x%x)",
		    status);
		return (DDI_FAILURE);
	}

	hca_cnt = ibt_get_hca_list(&guid);
	if (hca_cnt < 1) {
		/*
		 * not a fatal error.  Service will be up and
		 * waiting for ATTACH events.
		 */
		SRPT_DPRINTF_L2("ioc_attach, no HCA found");
		return (DDI_SUCCESS);
	}

	for (hca_ndx = 0; hca_ndx < hca_cnt; hca_ndx++) {
		SRPT_DPRINTF_L2("ioc_attach, adding I/O"
		    " Controller (%016llx)", (u_longlong_t)guid[hca_ndx]);

		ioc = srpt_ioc_init(guid[hca_ndx]);
		if (ioc == NULL) {
			SRPT_DPRINTF_L1("ioc_attach, ioc_init GUID(%016llx)"
			    " failed", (u_longlong_t)guid[hca_ndx]);
			continue;
		}
		list_insert_tail(&srpt_ctxt->sc_ioc_list, ioc);
		SRPT_DPRINTF_L2("ioc_attach, I/O Controller ibt HCA hdl (%p)",
		    (void *)ioc->ioc_ibt_hdl);
		srpt_ctxt->sc_num_iocs++;
	}

	ibt_free_hca_list(guid, hca_cnt);
	SRPT_DPRINTF_L3("ioc_attach, added %d I/O Controller(s)",
	    srpt_ctxt->sc_num_iocs);
	return (DDI_SUCCESS);
}

/*
 * srpt_ioc_detach() - I/O Controller detach
 *
 * srpt_ctxt->sc_rwlock should be held outside of this call.
 */
void
srpt_ioc_detach()
{
	srpt_ioc_t	*ioc;

	ASSERT(srpt_ctxt != NULL);

	while ((ioc = list_head(&srpt_ctxt->sc_ioc_list)) != NULL) {
		list_remove(&srpt_ctxt->sc_ioc_list, ioc);
		SRPT_DPRINTF_L2("ioc_detach, removing I/O Controller(%p)"
		    " (%016llx), ibt_hdl(%p)",
		    (void *)ioc,
		    (u_longlong_t)ioc->ioc_guid,
		    (void *)ioc->ioc_ibt_hdl);
		srpt_ioc_fini(ioc);
	}

	(void) ibt_detach(srpt_ctxt->sc_ibt_hdl);
	srpt_ctxt->sc_ibt_hdl = NULL;
}

/*
 * srpt_ioc_init() - I/O Controller initialization
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static srpt_ioc_t *
srpt_ioc_init(ib_guid_t guid)
{
	ibt_status_t		status;
	srpt_ioc_t		*ioc;
	ibt_hca_attr_t		hca_attr;
	uint_t			iu_ndx;
	uint_t			err_ndx;
	ibt_mr_attr_t		mr_attr;
	ibt_mr_desc_t		mr_desc;
	srpt_iu_t		*iu;
	ibt_srq_sizes_t		srq_attr;
	char			namebuf[32];
	size_t			iu_offset;

	status = ibt_query_hca_byguid(guid, &hca_attr);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, HCA query error (%d)",
		    status);
		return (NULL);
	}

	ioc = srpt_ioc_get_locked(guid);
	if (ioc != NULL) {
		SRPT_DPRINTF_L1("ioc_init, HCA already exists");
		return (NULL);
	}

	ioc = kmem_zalloc(sizeof (srpt_ioc_t), KM_SLEEP);

	rw_init(&ioc->ioc_rwlock, NULL, RW_DRIVER, NULL);
	rw_enter(&ioc->ioc_rwlock, RW_WRITER);

	bcopy(&hca_attr, &ioc->ioc_attr, sizeof (ibt_hca_attr_t));

	SRPT_DPRINTF_L2("ioc_init, HCA max mr=%d, mrlen=%lld",
	    hca_attr.hca_max_memr, (u_longlong_t)hca_attr.hca_max_memr_len);
	ioc->ioc_guid   = guid;

	status = ibt_open_hca(srpt_ctxt->sc_ibt_hdl, guid, &ioc->ioc_ibt_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT open failed (%d)", status);
		goto hca_open_err;
	}

	status = ibt_alloc_pd(ioc->ioc_ibt_hdl, IBT_PD_NO_FLAGS,
	    &ioc->ioc_pd_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT create PD failed (%d)", status);
		goto pd_alloc_err;
	}

	/*
	 * We require hardware support for SRQs.  We use a common SRQ to
	 * reduce channel memory consumption.
	 */
	if ((ioc->ioc_attr.hca_flags & IBT_HCA_SRQ) == 0) {
		SRPT_DPRINTF_L0("ioc_init, no SRQ capability, not supported");
		goto srq_alloc_err;
	}

	SRPT_DPRINTF_L3("ioc_init, Using shared receive queues, max srq work"
	    " queue size(%d), def size = %d", ioc->ioc_attr.hca_max_srqs_sz,
	    srpt_ioc_srq_size);
	srq_attr.srq_wr_sz = min(srpt_ioc_srq_size,
	    ioc->ioc_attr.hca_max_srqs_sz);
	srq_attr.srq_sgl_sz = 1;

	status = ibt_alloc_srq(ioc->ioc_ibt_hdl, IBT_SRQ_NO_FLAGS,
	    ioc->ioc_pd_hdl, &srq_attr, &ioc->ioc_srq_hdl,
	    &ioc->ioc_srq_attr);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IBT create SRQ failed(%d)", status);
		goto srq_alloc_err;
	}

	SRPT_DPRINTF_L2("ioc_init, SRQ WR size(%d), SG size(%d)",
	    ioc->ioc_srq_attr.srq_wr_sz, ioc->ioc_srq_attr.srq_sgl_sz);

	ibt_set_srq_private(ioc->ioc_srq_hdl, ioc);

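	/*
	 * The IOC pointer stored in the SRQ handle above lets completion
	 * and async paths map the SRQ back to its owning IOC (via
	 * ibt_get_srq_private()).
	 */
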
	/*
	 * Allocate a pool of SRP IU message buffers and post them to
	 * the I/O Controller SRQ.  We let the SRQ manage the free IU
	 * messages.
	 */
	ioc->ioc_num_iu_entries =
	    min(srq_attr.srq_wr_sz, srpt_ioc_srq_size) - 1;

	ioc->ioc_iu_pool = kmem_zalloc(sizeof (srpt_iu_t) *
	    ioc->ioc_num_iu_entries, KM_SLEEP);

	ioc->ioc_iu_bufs = kmem_alloc(SRPT_DEFAULT_SEND_MSG_SIZE *
	    ioc->ioc_num_iu_entries, KM_SLEEP);

	if ((ioc->ioc_iu_pool == NULL) || (ioc->ioc_iu_bufs == NULL)) {
		SRPT_DPRINTF_L1("ioc_init, failed to allocate SRQ IUs");
		goto srq_iu_alloc_err;
	}

	mr_attr.mr_vaddr = (ib_vaddr_t)(uintptr_t)ioc->ioc_iu_bufs;
	mr_attr.mr_len   = SRPT_DEFAULT_SEND_MSG_SIZE * ioc->ioc_num_iu_entries;
	mr_attr.mr_as    = NULL;
	mr_attr.mr_flags = IBT_MR_ENABLE_LOCAL_WRITE;

	status = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
	    &mr_attr, &ioc->ioc_iu_mr_hdl, &mr_desc);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, IU buffer pool MR err(%d)",
		    status);
		goto srq_iu_alloc_err;
	}

	for (iu_ndx = 0, iu = ioc->ioc_iu_pool; iu_ndx <
	    ioc->ioc_num_iu_entries; iu_ndx++, iu++) {

		iu_offset = (iu_ndx * SRPT_DEFAULT_SEND_MSG_SIZE);
		iu->iu_buf = (void *)((uintptr_t)ioc->ioc_iu_bufs + iu_offset);

		mutex_init(&iu->iu_lock, NULL, MUTEX_DRIVER, NULL);

		iu->iu_sge.ds_va  = mr_desc.md_vaddr + iu_offset;
		iu->iu_sge.ds_key = mr_desc.md_lkey;
		iu->iu_sge.ds_len = SRPT_DEFAULT_SEND_MSG_SIZE;
		iu->iu_ioc	  = ioc;
		iu->iu_pool_ndx   = iu_ndx;

		status = srpt_ioc_post_recv_iu(ioc, &ioc->ioc_iu_pool[iu_ndx]);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, SRQ IU post err(%d)",
			    status);
			/*
			 * This IU's mutex is not covered by the cleanup
			 * loop below, which stops short of iu_ndx, so
			 * destroy it here before unwinding.
			 */
			mutex_destroy(&iu->iu_lock);
			goto srq_iu_post_err;
		}
	}

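	/*
	 * Each receive WR posted above carries its IU's kernel pointer
	 * as the wr_id (see srpt_ioc_post_recv_iu()), so the receive
	 * completion path can recover the IU directly from the work
	 * completion.
	 */
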
	/*
	 * Initialize the dbuf vmem arena
	 */
	(void) snprintf(namebuf, sizeof (namebuf),
	    "srpt_buf_pool_%016llX", (u_longlong_t)guid);
	ioc->ioc_dbuf_pool = srpt_vmem_create(namebuf, ioc,
	    SRPT_BUF_MR_CHUNKSIZE, SRPT_BUF_POOL_MAX, srpt_dbuf_mr_flags);

	if (ioc->ioc_dbuf_pool == NULL) {
		goto stmf_db_alloc_err;
	}

	/*
	 * Allocate the I/O Controller STMF data buffer allocator.  The
	 * data store will span all targets associated with this IOC.
	 */
	ioc->ioc_stmf_ds = stmf_alloc(STMF_STRUCT_DBUF_STORE, 0, 0);
	if (ioc->ioc_stmf_ds == NULL) {
		SRPT_DPRINTF_L1("ioc_init, STMF DBUF alloc failure for IOC");
		goto stmf_db_alloc_err;
	}
	ioc->ioc_stmf_ds->ds_alloc_data_buf = &srpt_ioc_ds_alloc_dbuf;
	ioc->ioc_stmf_ds->ds_free_data_buf  = &srpt_ioc_ds_free_dbuf;
	ioc->ioc_stmf_ds->ds_port_private   = ioc;

	rw_exit(&ioc->ioc_rwlock);
	return (ioc);

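/*
 * Error unwind: each label below releases the resources acquired before
 * the corresponding failure point and falls through into the labels
 * that follow, so a single goto frees everything held so far.
 */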
stmf_db_alloc_err:
	if (ioc->ioc_dbuf_pool != NULL) {
		srpt_vmem_destroy(ioc->ioc_dbuf_pool);
	}

srq_iu_post_err:
	if (ioc->ioc_iu_mr_hdl != NULL) {
		status = ibt_deregister_mr(ioc->ioc_ibt_hdl,
		    ioc->ioc_iu_mr_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, error deregistering"
			    " memory region (%d)", status);
		}
	}
	for (err_ndx = 0, iu = ioc->ioc_iu_pool; err_ndx < iu_ndx;
	    err_ndx++, iu++) {
		mutex_destroy(&iu->iu_lock);
	}

srq_iu_alloc_err:
	if (ioc->ioc_iu_bufs != NULL) {
		kmem_free(ioc->ioc_iu_bufs, SRPT_DEFAULT_SEND_MSG_SIZE *
		    ioc->ioc_num_iu_entries);
	}
	if (ioc->ioc_iu_pool != NULL) {
		kmem_free(ioc->ioc_iu_pool,
		    sizeof (srpt_iu_t) * ioc->ioc_num_iu_entries);
	}
	if (ioc->ioc_srq_hdl != NULL) {
		status = ibt_free_srq(ioc->ioc_srq_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1("ioc_init, error freeing SRQ (%d)",
			    status);
		}
	}

srq_alloc_err:
	status = ibt_free_pd(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, free PD error (%d)", status);
	}

pd_alloc_err:
	status = ibt_close_hca(ioc->ioc_ibt_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_init, close ioc error (%d)", status);
	}

hca_open_err:
	rw_exit(&ioc->ioc_rwlock);
	rw_destroy(&ioc->ioc_rwlock);
	kmem_free(ioc, sizeof (*ioc));
	return (NULL);
}

/*
 * srpt_ioc_fini() - I/O Controller Cleanup
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
static void
srpt_ioc_fini(srpt_ioc_t *ioc)
{
	int		status;
	int		ndx;

	/*
	 * Note driver flows will have already taken all SRP
	 * services running on the I/O Controller off-line.
	 */
	rw_enter(&ioc->ioc_rwlock, RW_WRITER);
	if (ioc->ioc_ibt_hdl != NULL) {
		if (ioc->ioc_stmf_ds != NULL) {
			stmf_free(ioc->ioc_stmf_ds);
		}

		if (ioc->ioc_srq_hdl != NULL) {
			SRPT_DPRINTF_L4("ioc_fini, freeing SRQ");
			status = ibt_free_srq(ioc->ioc_srq_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, free SRQ"
				    " error (%d)", status);
			}
		}

		if (ioc->ioc_iu_mr_hdl != NULL) {
			status = ibt_deregister_mr(
			    ioc->ioc_ibt_hdl, ioc->ioc_iu_mr_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, error deregistering"
				    " memory region (%d)", status);
			}
		}

		if (ioc->ioc_iu_bufs != NULL) {
			kmem_free(ioc->ioc_iu_bufs, SRPT_DEFAULT_SEND_MSG_SIZE *
			    ioc->ioc_num_iu_entries);
		}

		if (ioc->ioc_iu_pool != NULL) {
			SRPT_DPRINTF_L4("ioc_fini, freeing IU entries");
			for (ndx = 0; ndx < ioc->ioc_num_iu_entries; ndx++) {
				mutex_destroy(&ioc->ioc_iu_pool[ndx].iu_lock);
			}

			SRPT_DPRINTF_L4("ioc_fini, free IU pool struct");
			kmem_free(ioc->ioc_iu_pool,
			    sizeof (srpt_iu_t) * (ioc->ioc_num_iu_entries));
			ioc->ioc_iu_pool = NULL;
			ioc->ioc_num_iu_entries = 0;
		}

		if (ioc->ioc_dbuf_pool != NULL) {
			srpt_vmem_destroy(ioc->ioc_dbuf_pool);
		}

		if (ioc->ioc_pd_hdl != NULL) {
			status = ibt_free_pd(ioc->ioc_ibt_hdl,
			    ioc->ioc_pd_hdl);
			if (status != IBT_SUCCESS) {
				SRPT_DPRINTF_L1("ioc_fini, free PD"
				    " error (%d)", status);
			}
		}

		status = ibt_close_hca(ioc->ioc_ibt_hdl);
		if (status != IBT_SUCCESS) {
			SRPT_DPRINTF_L1(
			    "ioc_fini, close ioc error (%d)", status);
		}
	}
	rw_exit(&ioc->ioc_rwlock);
	rw_destroy(&ioc->ioc_rwlock);
	kmem_free(ioc, sizeof (srpt_ioc_t));
}

/*
 * srpt_ioc_port_active() - I/O Controller port active
 */
static void
srpt_ioc_port_active(ibt_async_event_t *event)
{
	ibt_status_t		status;
	srpt_ioc_t		*ioc;

	ASSERT(event != NULL);

	SRPT_DPRINTF_L3("ioc_port_active event handler, invoked");

	/*
	 * Find the HCA in question; if the HCA has completed
	 * initialization and the SRP target service for the
	 * I/O Controller exists, then bind this port.
	 */
	ioc = srpt_ioc_get(event->ev_hca_guid);

	if (ioc == NULL) {
		SRPT_DPRINTF_L2("ioc_port_active, I/O Controller not"
		    " active");
		return;
	}

	if (ioc->ioc_tgt_port == NULL) {
		SRPT_DPRINTF_L2("ioc_port_active, I/O Controller target"
		    " port not defined");
		return;
	}

	/*
	 * We take the target lock here to serialize this operation
	 * with any STMF initiated target state transitions.  If
	 * SRP is off-line then the service handle is NULL.
	 */
	mutex_enter(&ioc->ioc_tgt_port->tp_lock);

	if (ioc->ioc_tgt_port->tp_ibt_svc_hdl != NULL) {
		status = srpt_ioc_svc_bind(ioc->ioc_tgt_port, event->ev_port);
		if (status != IBT_SUCCESS &&
		    status != IBT_HCA_PORT_NOT_ACTIVE) {
			SRPT_DPRINTF_L1("ioc_port_active, bind failed (%d)",
			    status);
		}
	}
	mutex_exit(&ioc->ioc_tgt_port->tp_lock);
}

/*
 * srpt_ioc_port_down()
 */
static void
srpt_ioc_port_down(ibt_async_event_t *event)
{
	srpt_ioc_t		*ioc;
	srpt_target_port_t	*tgt;
	srpt_channel_t		*ch;
	srpt_channel_t		*next_ch;

	SRPT_DPRINTF_L3("ioc_port_down event handler, invoked");

	/*
	 * Find the HCA in question; if the HCA has completed
	 * initialization and the SRP target service for the
	 * I/O Controller exists, then log out initiators
	 * connected through this port.
	 */
	ioc = srpt_ioc_get(event->ev_hca_guid);

	if (ioc == NULL) {
		SRPT_DPRINTF_L2("ioc_port_down, I/O Controller not"
		    " active");
		return;
	}

	/*
	 * We only have one target now, but we could go through all
	 * SCSI target ports if more are added.
	 */
	tgt = ioc->ioc_tgt_port;
	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_port_down, I/O Controller target"
		    " port not defined");
		return;
	}
	mutex_enter(&tgt->tp_lock);

	/*
	 * For all channels logged in through this port, initiate a
	 * disconnect.
	 */
	mutex_enter(&tgt->tp_ch_list_lock);
	ch = list_head(&tgt->tp_ch_list);
	while (ch != NULL) {
		next_ch = list_next(&tgt->tp_ch_list, ch);
		if (ch->ch_session && (ch->ch_session->ss_hw_port ==
		    event->ev_port)) {
			srpt_ch_disconnect(ch);
		}
		ch = next_ch;
	}
	mutex_exit(&tgt->tp_ch_list_lock);

	mutex_exit(&tgt->tp_lock);
}

/*
 * srpt_ioc_ib_async_hdlr - I/O Controller IB asynchronous events
 */
/* ARGSUSED */
void
srpt_ioc_ib_async_hdlr(void *clnt, ibt_hca_hdl_t hdl,
	ibt_async_code_t code, ibt_async_event_t *event)
{
	srpt_ioc_t		*ioc;
	srpt_channel_t		*ch;

	switch (code) {
	case IBT_EVENT_PORT_UP:
		srpt_ioc_port_active(event);
		break;

	case IBT_ERROR_PORT_DOWN:
		srpt_ioc_port_down(event);
		break;

	case IBT_HCA_ATTACH_EVENT:
		rw_enter(&srpt_ctxt->sc_rwlock, RW_WRITER);
		ioc = srpt_ioc_init(event->ev_hca_guid);

		if (ioc == NULL) {
			rw_exit(&srpt_ctxt->sc_rwlock);
			SRPT_DPRINTF_L1("ib_async_hdlr, HCA_ATTACH"
			    " event failed to initialize HCA (0x%016llx)",
			    (u_longlong_t)event->ev_hca_guid);
			return;
		}
		SRPT_DPRINTF_L2("HCA_ATTACH_EVENT: I/O Controller"
		    " ibt hdl (%p)",
		    (void *)ioc->ioc_ibt_hdl);

		rw_enter(&ioc->ioc_rwlock, RW_WRITER);
		ioc->ioc_tgt_port = srpt_stp_alloc_port(ioc, ioc->ioc_guid);
		if (ioc->ioc_tgt_port == NULL) {
			SRPT_DPRINTF_L1("ioc_ib_async_hdlr, alloc SCSI "
			    "target port error for HCA (0x%016llx)",
			    (u_longlong_t)event->ev_hca_guid);
			rw_exit(&ioc->ioc_rwlock);
			srpt_ioc_fini(ioc);
			rw_exit(&srpt_ctxt->sc_rwlock);
			return;
		}

		/*
		 * New HCA added with default SCSI Target Port, SRP service
		 * will be started when SCSI Target Port is brought
		 * on-line by STMF.
		 */
		srpt_ctxt->sc_num_iocs++;
		list_insert_tail(&srpt_ctxt->sc_ioc_list, ioc);

		rw_exit(&ioc->ioc_rwlock);
		rw_exit(&srpt_ctxt->sc_rwlock);
		break;

	case IBT_HCA_DETACH_EVENT:
		SRPT_DPRINTF_L1(
		    "ioc_ib_async_hdlr, HCA_DETACH_EVENT received.");
		break;

	case IBT_EVENT_EMPTY_CHAN:
		/* Channel in ERROR state is now empty */
		ch = (srpt_channel_t *)ibt_get_chan_private(event->ev_chan_hdl);
		SRPT_DPRINTF_L3(
		    "ioc_ib_async_hdlr, received empty channel error on %p",
		    (void *)ch);
		break;

	default:
		SRPT_DPRINTF_L2("ioc_ib_async_hdlr, event not "
		    "handled (%d)", code);
		break;
	}
}

/*
 * srpt_ioc_svc_bind()
 */
ibt_status_t
srpt_ioc_svc_bind(srpt_target_port_t *tgt, uint_t portnum)
{
	ibt_status_t		status;
	srpt_hw_port_t		*port;
	ibt_hca_portinfo_t	*portinfo;
	uint_t			qportinfo_sz;
	uint_t			qportnum;
	ib_gid_t		new_gid;
	srpt_ioc_t		*ioc;
	srpt_session_t		sess;

	ASSERT(tgt != NULL);
	ASSERT(tgt->tp_ioc != NULL);
	ioc = tgt->tp_ioc;

	if (tgt->tp_ibt_svc_hdl == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_bind, NULL SCSI target port"
		    " service");
		return (IBT_INVALID_PARAM);
	}

	if (portnum == 0 || portnum > tgt->tp_nports) {
		SRPT_DPRINTF_L2("ioc_svc_bind, bad port (%d)", portnum);
		return (IBT_INVALID_PARAM);
	}
	status = ibt_query_hca_ports(ioc->ioc_ibt_hdl, portnum,
	    &portinfo, &qportnum, &qportinfo_sz);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("ioc_svc_bind, query port %d error (%d)",
		    portnum, status);
		return (IBT_INVALID_PARAM);
	}

	ASSERT(portinfo != NULL);

	/*
	 * If port is not active do nothing, caller should attempt to bind
	 * after the port goes active.
	 */
	if (portinfo->p_linkstate != IBT_PORT_ACTIVE) {
		SRPT_DPRINTF_L2("ioc_svc_bind, port %d not in active state",
		    portnum);
		ibt_free_portinfo(portinfo, qportinfo_sz);
		return (IBT_HCA_PORT_NOT_ACTIVE);
	}

	port    = &tgt->tp_hw_port[portnum-1];
	new_gid = portinfo->p_sgid_tbl[0];
	ibt_free_portinfo(portinfo, qportinfo_sz);

	/*
	 * If previously bound and the port GID has changed,
	 * rebind to the new GID.
	 */
	if (port->hwp_bind_hdl != NULL) {
		if (new_gid.gid_guid != port->hwp_gid.gid_guid ||
		    new_gid.gid_prefix != port->hwp_gid.gid_prefix) {
			SRPT_DPRINTF_L2("ioc_svc_bind, unregister current"
			    " bind");
			(void) ibt_unbind_service(tgt->tp_ibt_svc_hdl,
			    port->hwp_bind_hdl);
			port->hwp_bind_hdl = NULL;
		}
	}
	SRPT_DPRINTF_L2("ioc_svc_bind, bind service, %016llx:%016llx",
	    (u_longlong_t)new_gid.gid_prefix,
	    (u_longlong_t)new_gid.gid_guid);

	/*
	 * Pass SCSI Target Port as CM private data, the target will always
	 * exist while this service is bound.
	 */
	status = ibt_bind_service(tgt->tp_ibt_svc_hdl, new_gid, NULL, tgt,
	    &port->hwp_bind_hdl);
	if (status != IBT_SUCCESS && status != IBT_CM_SERVICE_EXISTS) {
		SRPT_DPRINTF_L1("ioc_svc_bind, bind error (%d)", status);
		return (status);
	}
	port->hwp_gid.gid_prefix = new_gid.gid_prefix;
	port->hwp_gid.gid_guid = new_gid.gid_guid;

	/* setting up a transient structure for the dtrace probe. */
	bzero(&sess, sizeof (srpt_session_t));
	ALIAS_STR(sess.ss_t_gid, new_gid.gid_prefix, new_gid.gid_guid);
	EUI_STR(sess.ss_t_name, tgt->tp_ibt_svc_id);

	DTRACE_SRP_1(service__up, srpt_session_t, &sess);
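
	/*
	 * This is an SDT probe; it can be observed from user space with
	 * DTrace, e.g. (illustrative):
	 *
	 *	# dtrace -n 'srp:::service-up { trace(timestamp); }'
	 */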

	return (IBT_SUCCESS);
}

/*
 * srpt_ioc_svc_unbind()
 */
void
srpt_ioc_svc_unbind(srpt_target_port_t *tgt, uint_t portnum)
{
	srpt_hw_port_t		*port;
	srpt_session_t		sess;

	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, SCSI target does not exist");
		return;
	}

	if (portnum == 0 || portnum > tgt->tp_nports) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, bad port (%d)", portnum);
		return;
	}
	port = &tgt->tp_hw_port[portnum-1];

	/* setting up a transient structure for the dtrace probe. */
	bzero(&sess, sizeof (srpt_session_t));
	ALIAS_STR(sess.ss_t_gid, port->hwp_gid.gid_prefix,
	    port->hwp_gid.gid_guid);
	EUI_STR(sess.ss_t_name, tgt->tp_ibt_svc_id);

	DTRACE_SRP_1(service__down, srpt_session_t, &sess);

	if (tgt->tp_ibt_svc_hdl != NULL && port->hwp_bind_hdl != NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind, unregister current bind");
		(void) ibt_unbind_service(tgt->tp_ibt_svc_hdl,
		    port->hwp_bind_hdl);
	}
	port->hwp_bind_hdl = NULL;
	port->hwp_gid.gid_prefix = 0;
	port->hwp_gid.gid_guid = 0;
}

/*
 * srpt_ioc_svc_unbind_all()
 */
void
srpt_ioc_svc_unbind_all(srpt_target_port_t *tgt)
{
	uint_t		portnum;

	if (tgt == NULL) {
		SRPT_DPRINTF_L2("ioc_svc_unbind_all, NULL SCSI target port"
		    " specified");
		return;
	}
	for (portnum = 1; portnum <= tgt->tp_nports; portnum++) {
		srpt_ioc_svc_unbind(tgt, portnum);
	}
}

/*
 * srpt_ioc_get_locked()
 *
 * Requires srpt_ctxt->sc_rwlock be held outside of call.
 */
srpt_ioc_t *
srpt_ioc_get_locked(ib_guid_t guid)
{
	srpt_ioc_t	*ioc;

	ioc = list_head(&srpt_ctxt->sc_ioc_list);
	while (ioc != NULL) {
		if (ioc->ioc_guid == guid) {
			break;
		}
		ioc = list_next(&srpt_ctxt->sc_ioc_list, ioc);
	}
	return (ioc);
}

/*
 * srpt_ioc_get()
 */
srpt_ioc_t *
srpt_ioc_get(ib_guid_t guid)
{
	srpt_ioc_t	*ioc;

	rw_enter(&srpt_ctxt->sc_rwlock, RW_READER);
	ioc = srpt_ioc_get_locked(guid);
	rw_exit(&srpt_ctxt->sc_rwlock);
	return (ioc);
}

/*
 * srpt_ioc_post_recv_iu()
 */
ibt_status_t
srpt_ioc_post_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
	ibt_status_t		status;
	ibt_recv_wr_t		wr;
	uint_t			posted;

	ASSERT(ioc != NULL);
	ASSERT(iu != NULL);

	wr.wr_id  = (ibt_wrid_t)(uintptr_t)iu;
	wr.wr_nds = 1;
	wr.wr_sgl = &iu->iu_sge;
	posted    = 0;

	status = ibt_post_srq(ioc->ioc_srq_hdl, &wr, 1, &posted);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L2("ioc_post_recv_iu, post error (%d)",
		    status);
	}
	return (status);
}

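/*
 * A minimal sketch of the matching recovery on the receive completion
 * side (the actual handler lives in the channel code), assuming a work
 * completion wc returned by ibt_poll_cq():
 *
 *	srpt_iu_t *iu = (srpt_iu_t *)(uintptr_t)wc.wc_id;
 *
 * This round-trip works because ibt_wrid_t is wide enough to hold a
 * kernel pointer.
 */
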
/*
 * srpt_ioc_repost_recv_iu()
 */
void
srpt_ioc_repost_recv_iu(srpt_ioc_t *ioc, srpt_iu_t *iu)
{
	srpt_channel_t		*ch;
	ibt_status_t		status;

	ASSERT(iu != NULL);
	ASSERT(mutex_owned(&iu->iu_lock));

	/*
	 * Some additional sanity checks while in debug state, all STMF
	 * related task activities should be complete prior to returning
	 * this IU to the available pool.
	 */
	ASSERT(iu->iu_stmf_task == NULL);
	ASSERT(iu->iu_sq_posted_cnt == 0);

	ch = iu->iu_ch;
	iu->iu_ch = NULL;
	iu->iu_num_rdescs = 0;
	iu->iu_rdescs = NULL;
	iu->iu_tot_xfer_len = 0;
	iu->iu_tag = 0;
	iu->iu_flags = 0;
	iu->iu_sq_posted_cnt = 0;

	status = srpt_ioc_post_recv_iu(ioc, iu);

	if (status != IBT_SUCCESS) {
		/*
		 * Very bad, we should initiate a shutdown of the I/O
		 * Controller here, off-lining any targets associated
		 * with this I/O Controller (and therefore disconnecting
		 * any logins that remain).
		 *
		 * In practice this should never happen, so implementing
		 * that shutdown path has been deferred.
		 */
		SRPT_DPRINTF_L0("ioc_repost_recv_iu, error RX IU (%d)",
		    status);
		ASSERT(0);
	} else if (ch != NULL) {
		atomic_inc_32(&ch->ch_req_lim_delta);
	}
}

/*
 * srpt_ioc_init_profile()
 *
 * SRP I/O Controller serialization lock must be held when this
 * routine is invoked.
 */
void
srpt_ioc_init_profile(srpt_ioc_t *ioc)
{
	srpt_ioc_opcap_mask_t		capmask = {0};

	ASSERT(ioc != NULL);

	ioc->ioc_profile.ioc_guid = h2b64(ioc->ioc_guid);
	(void) memcpy(ioc->ioc_profile.ioc_id_string,
	    "Solaris SRP Target 0.9a", 23);

	/*
	 * Note vendor ID and subsystem ID are 24 bit values.  Low order
	 * 8 bits in vendor ID field is slot and is initialized to zero.
	 * Low order 8 bits of subsystem ID is a reserved field and
	 * initialized to zero.
	 */
	ioc->ioc_profile.ioc_vendorid =
	    h2b32((uint32_t)(ioc->ioc_attr.hca_vendor_id << 8));
	ioc->ioc_profile.ioc_deviceid =
	    h2b32((uint32_t)ioc->ioc_attr.hca_device_id);
	ioc->ioc_profile.ioc_device_ver =
	    h2b16((uint16_t)ioc->ioc_attr.hca_version_id);
	ioc->ioc_profile.ioc_subsys_vendorid =
	    h2b32((uint32_t)(ioc->ioc_attr.hca_vendor_id << 8));
	ioc->ioc_profile.ioc_subsys_id = h2b32(0);
	ioc->ioc_profile.ioc_io_class = h2b16(SRP_REV_16A_IO_CLASS);
	ioc->ioc_profile.ioc_io_subclass = h2b16(SRP_IO_SUBCLASS);
	ioc->ioc_profile.ioc_protocol = h2b16(SRP_PROTOCOL);
	ioc->ioc_profile.ioc_protocol_ver = h2b16(SRP_PROTOCOL_VERSION);
	ioc->ioc_profile.ioc_send_msg_qdepth = h2b16(srpt_send_msg_depth);
	ioc->ioc_profile.ioc_rdma_read_qdepth =
	    ioc->ioc_attr.hca_max_rdma_out_chan;
	ioc->ioc_profile.ioc_send_msg_sz = h2b32(SRPT_DEFAULT_SEND_MSG_SIZE);
	ioc->ioc_profile.ioc_rdma_xfer_sz = h2b32(SRPT_DEFAULT_MAX_RDMA_SIZE);

	capmask.bits.st = 1;	/* Messages can be sent to IOC */
	capmask.bits.sf = 1;	/* Messages can be sent from IOC */
	capmask.bits.rf = 1;	/* RDMA Reads can be sent from IOC */
	capmask.bits.wf = 1;	/* RDMA Writes can be sent from IOC */
	ioc->ioc_profile.ioc_ctrl_opcap_mask = capmask.mask;

	/*
	 * We currently only have one target, but if we had a list we would
	 * go through that list and only count those that are ONLINE when
	 * setting the services count and entries.
	 */
	if (ioc->ioc_tgt_port->tp_srp_enabled) {
		ioc->ioc_profile.ioc_service_entries = 1;
		ioc->ioc_svc.srv_id = h2b64(ioc->ioc_guid);
		(void) snprintf((char *)ioc->ioc_svc.srv_name,
		    IB_DM_MAX_SVC_NAME_LEN, "SRP.T10:%016llx",
		    (u_longlong_t)ioc->ioc_guid);
	} else {
		ioc->ioc_profile.ioc_service_entries = 0;
		ioc->ioc_svc.srv_id = 0;
	}
}

/*
 * srpt_ioc_ds_alloc_dbuf()
 */
/* ARGSUSED */
stmf_data_buf_t *
srpt_ioc_ds_alloc_dbuf(struct scsi_task *task, uint32_t size,
	uint32_t *pminsize, uint32_t flags)
{
	srpt_iu_t		*iu;
	srpt_ioc_t		*ioc;
	srpt_ds_dbuf_t		*dbuf;
	stmf_data_buf_t		*stmf_dbuf;
	void			*buf;
	srpt_mr_t		mr;

	ASSERT(task != NULL);
	iu  = task->task_port_private;
	ioc = iu->iu_ioc;

	SRPT_DPRINTF_L4("ioc_ds_alloc_dbuf, invoked ioc(%p)"
	    " size(%d), flags(%x)",
	    (void *)ioc, size, flags);

	buf = srpt_vmem_alloc(ioc->ioc_dbuf_pool, size);
	if (buf == NULL) {
		return (NULL);
	}

	if (srpt_vmem_mr(ioc->ioc_dbuf_pool, buf, size, &mr) != 0) {
		goto stmf_alloc_err;
	}

	stmf_dbuf = stmf_alloc(STMF_STRUCT_DATA_BUF, sizeof (srpt_ds_dbuf_t),
	    0);
	if (stmf_dbuf == NULL) {
		SRPT_DPRINTF_L2("ioc_ds_alloc_dbuf, stmf_alloc failed");
		goto stmf_alloc_err;
	}

	dbuf = stmf_dbuf->db_port_private;
	dbuf->db_stmf_buf = stmf_dbuf;
	dbuf->db_mr_hdl = mr.mr_hdl;
	dbuf->db_ioc = ioc;
	dbuf->db_sge.ds_va = mr.mr_va;
	dbuf->db_sge.ds_key = mr.mr_lkey;
	dbuf->db_sge.ds_len = size;

	stmf_dbuf->db_buf_size = size;
	stmf_dbuf->db_data_size = size;
	stmf_dbuf->db_relative_offset = 0;
	stmf_dbuf->db_flags = 0;
	stmf_dbuf->db_xfer_status = 0;
	stmf_dbuf->db_sglist_length = 1;
	stmf_dbuf->db_sglist[0].seg_addr = buf;
	stmf_dbuf->db_sglist[0].seg_length = size;

	return (stmf_dbuf);

stmf_alloc_err:
	srpt_vmem_free(ioc->ioc_dbuf_pool, buf, size);

	return (NULL);
}

void
srpt_ioc_ds_free_dbuf(struct stmf_dbuf_store *ds,
	stmf_data_buf_t *dbuf)
{
	srpt_ioc_t	*ioc;

	SRPT_DPRINTF_L4("ioc_ds_free_dbuf, invoked buf (%p)",
	    (void *)dbuf);
	ioc = ds->ds_port_private;

	srpt_vmem_free(ioc->ioc_dbuf_pool, dbuf->db_sglist[0].seg_addr,
	    dbuf->db_buf_size);
	stmf_free(dbuf);
}

/* Memory arena routines */

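/*
 * The dbuf arena is backed by one or more IB-registered chunks.  A pool
 * is created with a single chunk of svp_chunksize bytes; srpt_vmem_alloc()
 * grows it on demand, one chunk at a time, up to svp_max_size.  Each
 * chunk is tracked in an AVL tree keyed by virtual address so that
 * srpt_vmem_mr() can translate any allocated range back to the memory
 * region (and keys) that cover it.
 */
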
static srpt_vmem_pool_t *
srpt_vmem_create(const char *name, srpt_ioc_t *ioc, ib_memlen_t chunksize,
    uint64_t maxsize, ibt_mr_flags_t flags)
{
	srpt_mr_t		*chunk;
	srpt_vmem_pool_t	*result;

	ASSERT(chunksize <= maxsize);

	result = kmem_zalloc(sizeof (srpt_vmem_pool_t), KM_SLEEP);

	result->svp_ioc = ioc;
	result->svp_chunksize = chunksize;
	result->svp_max_size = maxsize;
	result->svp_flags = flags;

	rw_init(&result->svp_lock, NULL, RW_DRIVER, NULL);
	avl_create(&result->svp_mr_list, srpt_vmem_mr_compare,
	    sizeof (srpt_mr_t), offsetof(srpt_mr_t, mr_avl));

	chunk = srpt_vmem_chunk_alloc(result, chunksize);
	if (chunk == NULL) {
		/* Initial chunk allocation or registration failed */
		avl_destroy(&result->svp_mr_list);
		rw_destroy(&result->svp_lock);
		kmem_free(result, sizeof (srpt_vmem_pool_t));
		return (NULL);
	}

	avl_add(&result->svp_mr_list, chunk);
	/* the chunk we got may be smaller than requested; use its length */
	result->svp_total_size = chunk->mr_len;

	result->svp_vmem = vmem_create(name,
	    (void *)(uintptr_t)chunk->mr_va,
	    (size_t)chunk->mr_len, SRPT_MR_QUANTSIZE,
	    NULL, NULL, NULL, 0, VM_SLEEP);

	return (result);
}

static void
srpt_vmem_destroy(srpt_vmem_pool_t *vm_pool)
{
	srpt_mr_t		*chunk;
	srpt_mr_t		*next;

	rw_enter(&vm_pool->svp_lock, RW_WRITER);
	vmem_destroy(vm_pool->svp_vmem);

	chunk = avl_first(&vm_pool->svp_mr_list);

	while (chunk != NULL) {
		next = AVL_NEXT(&vm_pool->svp_mr_list, chunk);
		avl_remove(&vm_pool->svp_mr_list, chunk);
		srpt_vmem_chunk_free(vm_pool, chunk);
		chunk = next;
	}

	avl_destroy(&vm_pool->svp_mr_list);

	rw_exit(&vm_pool->svp_lock);
	rw_destroy(&vm_pool->svp_lock);

	kmem_free(vm_pool, sizeof (srpt_vmem_pool_t));
}

static void *
srpt_vmem_alloc(srpt_vmem_pool_t *vm_pool, size_t size)
{
	void		*result;
	srpt_mr_t	*next;
	ib_memlen_t	chunklen;

	ASSERT(vm_pool != NULL);

	result = vmem_alloc(vm_pool->svp_vmem, size,
	    VM_NOSLEEP | VM_FIRSTFIT);

	if (result != NULL) {
		/* memory successfully allocated */
		return (result);
	}

	/* need more vmem */
	rw_enter(&vm_pool->svp_lock, RW_WRITER);
	chunklen = vm_pool->svp_chunksize;

	if (vm_pool->svp_total_size >= vm_pool->svp_max_size) {
		/* no more room to alloc */
		rw_exit(&vm_pool->svp_lock);
		return (NULL);
	}

	if ((vm_pool->svp_total_size + chunklen) > vm_pool->svp_max_size) {
		chunklen = vm_pool->svp_max_size - vm_pool->svp_total_size;
	}

	next = srpt_vmem_chunk_alloc(vm_pool, chunklen);
	if (next != NULL) {
		/*
		 * Note that the size of the chunk we got
		 * may not be the size we requested.  Use the
		 * length returned in the chunk itself.
		 */
		if (vmem_add(vm_pool->svp_vmem, (void *)(uintptr_t)next->mr_va,
		    next->mr_len, VM_NOSLEEP) == NULL) {
			srpt_vmem_chunk_free(vm_pool, next);
			SRPT_DPRINTF_L2("vmem_add failed");
		} else {
			vm_pool->svp_total_size += next->mr_len;
			avl_add(&vm_pool->svp_mr_list, next);
		}
	}

	rw_exit(&vm_pool->svp_lock);

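	/* Retry; this also covers a concurrent grow by another thread. */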
	result = vmem_alloc(vm_pool->svp_vmem, size, VM_NOSLEEP | VM_FIRSTFIT);

	return (result);
}

static void
srpt_vmem_free(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size)
{
	vmem_free(vm_pool->svp_vmem, vaddr, size);
}

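/*
 * Translate an allocated [vaddr, vaddr + size) range back to the
 * registered chunk that covers it: an exact avl_find() match means
 * vaddr is a chunk base; otherwise avl_nearest(AVL_BEFORE) yields the
 * chunk starting closest below vaddr, which is then checked to confirm
 * it spans the entire range.
 */
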
static int
srpt_vmem_mr(srpt_vmem_pool_t *vm_pool, void *vaddr, size_t size,
    srpt_mr_t *mr)
{
	avl_index_t		where;
	ib_vaddr_t		mrva = (ib_vaddr_t)(uintptr_t)vaddr;
	srpt_mr_t		chunk;
	srpt_mr_t		*nearest;
	ib_vaddr_t		chunk_end;
	int			status = DDI_FAILURE;

	rw_enter(&vm_pool->svp_lock, RW_READER);

	chunk.mr_va = mrva;
	nearest = avl_find(&vm_pool->svp_mr_list, &chunk, &where);

	if (nearest == NULL) {
		nearest = avl_nearest(&vm_pool->svp_mr_list, where,
		    AVL_BEFORE);
	}

	if (nearest != NULL) {
		/* Verify this chunk contains the specified address range */
		ASSERT(nearest->mr_va <= mrva);

		chunk_end = nearest->mr_va + nearest->mr_len;
		if (chunk_end >= mrva + size) {
			mr->mr_hdl = nearest->mr_hdl;
			mr->mr_va = mrva;
			mr->mr_len = size;
			mr->mr_lkey = nearest->mr_lkey;
			mr->mr_rkey = nearest->mr_rkey;
			status = DDI_SUCCESS;
		}
	}

	rw_exit(&vm_pool->svp_lock);
	return (status);
}

static srpt_mr_t *
srpt_vmem_chunk_alloc(srpt_vmem_pool_t *vm_pool, ib_memlen_t chunksize)
{
	void			*chunk = NULL;
	srpt_mr_t		*result = NULL;

	while ((chunk == NULL) && (chunksize >= SRPT_MIN_CHUNKSIZE)) {
		chunk = kmem_alloc(chunksize, KM_NOSLEEP);
		if (chunk == NULL) {
			SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
			    "failed to alloc chunk of %d, trying %d",
			    (int)chunksize, (int)chunksize/2);
			chunksize /= 2;
		}
	}

	if (chunk != NULL) {
		result = srpt_reg_mem(vm_pool, (ib_vaddr_t)(uintptr_t)chunk,
		    chunksize);
		if (result == NULL) {
			SRPT_DPRINTF_L2("srpt_vmem_chunk_alloc: "
			    "chunk registration failed");
			kmem_free(chunk, chunksize);
		}
	}

	return (result);
}

static void
srpt_vmem_chunk_free(srpt_vmem_pool_t *vm_pool, srpt_mr_t *mr)
{
	void			*chunk = (void *)(uintptr_t)mr->mr_va;
	ib_memlen_t		chunksize = mr->mr_len;

	srpt_dereg_mem(vm_pool->svp_ioc, mr);
	kmem_free(chunk, chunksize);
}

static srpt_mr_t *
srpt_reg_mem(srpt_vmem_pool_t *vm_pool, ib_vaddr_t vaddr, ib_memlen_t len)
{
	srpt_mr_t		*result = NULL;
	ibt_mr_attr_t		mr_attr;
	ibt_mr_desc_t		mr_desc;
	ibt_status_t		status;
	srpt_ioc_t		*ioc = vm_pool->svp_ioc;

	result = kmem_zalloc(sizeof (srpt_mr_t), KM_NOSLEEP);
	if (result == NULL) {
		SRPT_DPRINTF_L2("srpt_reg_mem: failed to allocate");
		return (NULL);
	}

	bzero(&mr_attr, sizeof (ibt_mr_attr_t));
	bzero(&mr_desc, sizeof (ibt_mr_desc_t));

	mr_attr.mr_vaddr = vaddr;
	mr_attr.mr_len = len;
	mr_attr.mr_as = NULL;
	mr_attr.mr_flags = vm_pool->svp_flags;

	status = ibt_register_mr(ioc->ioc_ibt_hdl, ioc->ioc_pd_hdl,
	    &mr_attr, &result->mr_hdl, &mr_desc);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L2("srpt_reg_mem: ibt_register_mr "
		    "failed %d", status);
		kmem_free(result, sizeof (srpt_mr_t));
		return (NULL);
	}

	result->mr_va = mr_attr.mr_vaddr;
	result->mr_len = mr_attr.mr_len;
	result->mr_lkey = mr_desc.md_lkey;
	result->mr_rkey = mr_desc.md_rkey;

	return (result);
}

static void
srpt_dereg_mem(srpt_ioc_t *ioc, srpt_mr_t *mr)
{
	ibt_status_t		status;

	status = ibt_deregister_mr(ioc->ioc_ibt_hdl, mr->mr_hdl);
	if (status != IBT_SUCCESS) {
		SRPT_DPRINTF_L1("srpt_dereg_mem, error deregistering MR (%d)",
		    status);
	}
	kmem_free(mr, sizeof (srpt_mr_t));
}

static int
srpt_vmem_mr_compare(const void *a, const void *b)
{
	srpt_mr_t		*mr1 = (srpt_mr_t *)a;
	srpt_mr_t		*mr2 = (srpt_mr_t *)b;

	/* sort and match by virtual address */
	if (mr1->mr_va < mr2->mr_va) {
		return (-1);
	} else if (mr1->mr_va > mr2->mr_va) {
		return (1);
	}

	return (0);
}