1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * hermon_wr.c
29  *    Hermon Work Request Processing Routines
30  *
31  *    Implements all the routines necessary to provide the PostSend(),
32  *    PostRecv() and PostSRQ() verbs.  Also contains all the code
33  *    necessary to implement the Hermon WRID tracking mechanism.
34  */
35 
36 #include <sys/types.h>
37 #include <sys/conf.h>
38 #include <sys/ddi.h>
39 #include <sys/sunddi.h>
40 #include <sys/modctl.h>
41 #include <sys/avl.h>
42 
43 #include <sys/ib/adapters/hermon/hermon.h>
44 
45 static uint32_t hermon_wr_get_immediate(ibt_send_wr_t *wr);
46 static int hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr);
47 static int hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
48     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
49 static int hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
50     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size);
51 static void hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp);
52 static int hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
53     ibt_recv_wr_t *wr, uint64_t *desc);
54 static int hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
55     ibt_recv_wr_t *wr, uint64_t *desc);
56 static void hermon_wqe_sync(void *hdl, uint_t sync_from,
57     uint_t sync_to, uint_t sync_type, uint_t flag);
58 static hermon_workq_avl_t *hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn,
59     uint_t send_or_recv);
60 static void hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl);
61 static void hermon_cq_workq_remove(hermon_cqhdl_t cq,
62     hermon_workq_avl_t *wqavl);
63 
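/*
 * A "null" scatter-gather entry: zero virtual address, the special
 * "invalid" L_Key value (0x100), and zero length.  It is appended to
 * receive and SRQ WQEs (see hermon_wqe_recv_build() and
 * hermon_wqe_srq_build()) to terminate the scatter list, per the PRM.
 */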
64 static	ibt_wr_ds_t	null_sgl = { 0, 0x00000100, 0 };
65 
66 static int
67 hermon_post_send_ud(hermon_state_t *state, hermon_qphdl_t qp,
68     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
69 {
70 	hermon_hw_snd_wqe_ud_t		*ud;
71 	hermon_workq_hdr_t		*wq;
72 	hermon_ahhdl_t			ah;
73 	ibt_ud_dest_t			*dest;
74 	uint64_t			*desc;
75 	uint32_t			desc_sz;
76 	uint32_t			signaled_dbd, solicited;
77 	uint32_t			head, tail, next_tail, qsize_msk;
78 	uint32_t			hdrmwqes;
79 	uint32_t			nopcode, fence, immed_data = 0;
80 	hermon_hw_wqe_sgl_t		*ds;
81 	ibt_wr_ds_t			*sgl;
82 	uint32_t			nds, dnds;
83 	int				i, j, last_ds, num_ds, status;
84 	uint32_t			*wqe_start;
85 	int				sectperwqe;
86 	uint_t				posted_cnt = 0;
87 
88 	/* initialize the FMA retry loop */
89 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
90 
91 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
92 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
93 
	/* make sure we see any update of wq_head */
95 	membar_consumer();
96 
97 	/* Save away some initial QP state */
98 	wq = qp->qp_sq_wqhdr;
99 	qsize_msk = wq->wq_mask;
100 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
101 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
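	/*
	 * Note: "sectperwqe" is actually the WQE size in 32-bit words
	 * ((1 << qp_sq_log_wqesz) is the WQE size in bytes).  The
	 * headroom stamping loops below walk through it sixteen words
	 * (one 64-byte section) at a time.
	 */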
102 
103 	tail	  = wq->wq_tail;
104 	head	  = wq->wq_head;
105 	status	  = DDI_SUCCESS;
106 
107 post_next:
108 	/*
109 	 * Check for "queue full" condition.  If the queue
110 	 * is already full, then no more WQEs can be posted.
111 	 * So break out, ring a doorbell (if necessary) and
112 	 * return an error
113 	 */
114 	if (wq->wq_full != 0) {
115 		status = IBT_QP_FULL;
116 		goto done;
117 	}
118 
119 	next_tail = (tail + 1) & qsize_msk;
120 	if (((tail + hdrmwqes) & qsize_msk) == head) {
121 		wq->wq_full = 1;
122 	}
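	/*
	 * Note: "full" is declared while "hdrmwqes" slots still separate
	 * tail from head.  Those headroom WQEs are kept stamped invalid
	 * (see below) so that the HCA, which may prefetch descriptors
	 * beyond the last one posted, never parses stale WQE contents.
	 */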
123 
124 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
125 
126 	ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
127 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
128 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
129 	    sizeof (hermon_hw_snd_wqe_ud_t));
130 	nds = wr->wr_nds;
131 	sgl = wr->wr_sgl;
132 	num_ds = 0;
133 
134 	/* need to know the count of destination nds for backward loop */
135 	for (dnds = 0, i = 0; i < nds; i++) {
136 		if (sgl[i].ds_len != 0)
137 			dnds++;
138 	}
139 
140 	/*
141 	 * Build a Send or Send_LSO WQE
142 	 */
143 	if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
144 		int total_len;
145 		hermon_hw_wqe_sgl_t *old_ds;
146 
147 		nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
148 		dest = wr->wr.ud_lso.lso_ud_dest;
149 		ah = (hermon_ahhdl_t)dest->ud_ah;
150 		if (ah == NULL) {
151 			status = IBT_AH_HDL_INVALID;
152 			goto done;
153 		}
154 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
155 
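		/*
		 * Reserve room for the 4-byte LSO segment dword plus the
		 * inline LSO headers, rounded up to a 16-byte multiple.
		 */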
156 		total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
157 		if ((uintptr_t)ds + total_len + (nds * 16) >
158 		    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz)) {
159 			status = IBT_QP_SGL_LEN_INVALID;
160 			goto done;
161 		}
162 		bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
163 		    wr->wr.ud_lso.lso_hdr_sz);
164 		old_ds = ds;
165 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
166 		for (i = 0; i < nds; i++) {
167 			if (sgl[i].ds_len == 0)
168 				continue;
169 			HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
170 			num_ds++;
171 			i++;
172 			break;
173 		}
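		/*
		 * Only the first gather entry is built here; the rest of
		 * the SGL is filled in, back-to-front, below.  The
		 * membar_producer() orders the inline header copy (the
		 * bcopy above) and the first data segment ahead of the
		 * write of the LSO segment descriptor itself.
		 */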
174 		membar_producer();
175 		HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
176 		    wr->wr.ud_lso.lso_hdr_sz);
177 	} else if (wr->wr_opcode == IBT_WRC_SEND) {
178 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
179 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
180 			immed_data = wr->wr.ud.udwr_immed;
181 		} else {
182 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
183 		}
184 		dest = wr->wr.ud.udwr_dest;
185 		ah = (hermon_ahhdl_t)dest->ud_ah;
186 		if (ah == NULL) {
187 			status = IBT_AH_HDL_INVALID;
188 			goto done;
189 		}
190 		HERMON_WQE_BUILD_UD(qp, ud, ah, dest);
191 		i = 0;
192 	} else {
193 		status = IBT_QP_OP_TYPE_INVALID;
194 		goto done;
195 	}
196 
197 	if (nds > qp->qp_sq_sgl) {
198 		status = IBT_QP_SGL_LEN_INVALID;
199 		goto done;
200 	}
201 	for (last_ds = num_ds, j = i; j < nds; j++) {
202 		if (sgl[j].ds_len != 0)
203 			last_ds++;	/* real last ds of wqe to fill */
204 	}
205 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
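	/*
	 * Write the data segments in reverse order so that, within each
	 * 64-byte section, the headroom stamp word is overwritten last;
	 * this avoids problems with WQE prefetch by the HCA.
	 */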
206 	for (j = nds; --j >= i; ) {
207 		if (sgl[j].ds_len == 0) {
208 			continue;
209 		}
210 
211 		/*
212 		 * Fill in the Data Segment(s) for the current WQE, using the
213 		 * information contained in the scatter-gather list of the
214 		 * work request.
215 		 */
216 		last_ds--;
217 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
218 	}
219 
220 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
221 
222 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
223 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
224 
225 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
226 
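	/*
	 * Build the control segment, leaving the ownership/opcode dword
	 * untouched for now; it is written last (below), after a memory
	 * barrier, to hand the WQE over to the hardware.
	 */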
227 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data,
228 	    solicited, signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
229 
230 	wq->wq_wrid[tail] = wr->wr_id;
231 
232 	tail = next_tail;
233 
234 	/* Update some of the state in the QP */
235 	wq->wq_tail = tail;
236 
237 	membar_producer();
238 
239 	/* Now set the ownership bit and opcode (first dword). */
240 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
241 
242 	posted_cnt++;
243 	if (--num_wr > 0) {
244 		/* do the invalidate of the headroom */
245 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
246 		    (tail + hdrmwqes) & qsize_msk);
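		/*
		 * Stamp the first word of each 64-byte section of the
		 * next headroom WQE with an invalid value, skipping the
		 * first section (its first dword carries the HW
		 * ownership bit).
		 */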
247 		for (i = 16; i < sectperwqe; i += 16) {
248 			wqe_start[i] = 0xFFFFFFFF;
249 		}
250 
251 		wr++;
252 		goto post_next;
253 	}
254 done:
255 	if (posted_cnt != 0) {
256 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
257 
258 		membar_producer();
259 
260 		/* the FMA retry loop starts for Hermon doorbell register. */
261 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
262 		    fm_status, fm_test_num);
263 
264 		HERMON_UAR_DOORBELL(state, uarhdl,
265 		    (uint64_t *)(void *)&state->hs_uar->send,
266 		    (uint64_t)qp->qp_ring);
267 
268 		/* the FMA retry loop ends. */
269 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
270 		    fm_status, fm_test_num);
271 
272 		/* do the invalidate of the headroom */
273 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
274 		    (tail + hdrmwqes) & qsize_msk);
275 		for (i = 16; i < sectperwqe; i += 16) {
276 			wqe_start[i] = 0xFFFFFFFF;
277 		}
278 	}
279 	if (num_posted != NULL)
280 		*num_posted = posted_cnt;
281 
282 	mutex_exit(&qp->qp_sq_lock);
283 
284 	return (status);
285 
286 pio_error:
287 	mutex_exit(&qp->qp_sq_lock);
288 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
289 	return (ibc_get_ci_failure(0));
290 }
291 
292 static int
293 hermon_post_send_rc(hermon_state_t *state, hermon_qphdl_t qp,
294     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
295 {
296 	uint64_t			*desc;
297 	hermon_workq_hdr_t		*wq;
298 	uint32_t			desc_sz;
299 	uint32_t			signaled_dbd, solicited;
300 	uint32_t			head, tail, next_tail, qsize_msk;
301 	uint32_t			hdrmwqes;
302 	int				status;
303 	uint32_t			nopcode, fence, immed_data = 0;
304 	hermon_hw_snd_wqe_remaddr_t	*rc;
305 	hermon_hw_snd_wqe_atomic_t	*at;
306 	hermon_hw_snd_wqe_bind_t	*bn;
307 	hermon_hw_wqe_sgl_t		*ds;
308 	ibt_wr_ds_t			*sgl;
309 	uint32_t			nds;
310 	int				i, last_ds, num_ds;
311 	uint32_t			*wqe_start;
312 	int				sectperwqe;
313 	uint_t				posted_cnt = 0;
314 
315 	/* initialize the FMA retry loop */
316 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test_num);
317 
318 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
319 	_NOTE(LOCK_RELEASED_AS_SIDE_EFFECT(&qp->qp_sq_lock))
320 
321 	/* make sure we see any update of wq_head */
322 	membar_consumer();
323 
324 	/* Save away some initial QP state */
325 	wq = qp->qp_sq_wqhdr;
326 	qsize_msk = wq->wq_mask;
327 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
328 	sectperwqe = 1 << (qp->qp_sq_log_wqesz - 2);
329 
330 	tail	  = wq->wq_tail;
331 	head	  = wq->wq_head;
332 	status	  = DDI_SUCCESS;
333 
334 post_next:
335 	/*
336 	 * Check for "queue full" condition.  If the queue
337 	 * is already full, then no more WQEs can be posted.
338 	 * So break out, ring a doorbell (if necessary) and
339 	 * return an error
340 	 */
341 	if (wq->wq_full != 0) {
342 		status = IBT_QP_FULL;
343 		goto done;
344 	}
345 	next_tail = (tail + 1) & qsize_msk;
346 	if (((tail + hdrmwqes) & qsize_msk) == head) {
347 		wq->wq_full = 1;
348 	}
349 
350 	desc = HERMON_QP_SQ_ENTRY(qp, tail);
351 
352 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
353 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
354 	nds = wr->wr_nds;
355 	sgl = wr->wr_sgl;
356 	num_ds = 0;
357 
358 	/*
359 	 * Validate the operation type.  For RC requests, we allow
360 	 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
361 	 * operations, and memory window "Bind"
362 	 */
363 	switch (wr->wr_opcode) {
364 	default:
365 		status = IBT_QP_OP_TYPE_INVALID;
366 		goto done;
367 
368 	case IBT_WRC_SEND:
369 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
370 			nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
371 			immed_data = wr->wr.rc.rcwr.send_immed;
372 		} else {
373 			nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
374 		}
375 		break;
376 
377 	/*
378 	 * If this is an RDMA Read or RDMA Write request, then fill
379 	 * in the "Remote Address" header fields.
380 	 */
381 	case IBT_WRC_RDMAW:
382 		if (wr->wr_flags & IBT_WR_SEND_IMMED) {
383 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAWI;
384 			immed_data = wr->wr.rc.rcwr.rdma.rdma_immed;
385 		} else {
386 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
387 		}
388 		/* FALLTHROUGH */
389 	case IBT_WRC_RDMAR:
390 		if (wr->wr_opcode == IBT_WRC_RDMAR)
391 			nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
392 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
393 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
394 
395 		/*
396 		 * Build the Remote Address Segment for the WQE, using
397 		 * the information from the RC work request.
398 		 */
399 		HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
400 
401 		/* Update "ds" for filling in Data Segments (below) */
402 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
403 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
404 		break;
405 
406 	/*
407 	 * If this is one of the Atomic type operations (i.e
408 	 * Compare-Swap or Fetch-Add), then fill in both the "Remote
409 	 * Address" header fields and the "Atomic" header fields.
410 	 */
411 	case IBT_WRC_CSWAP:
412 		nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
413 		/* FALLTHROUGH */
414 	case IBT_WRC_FADD:
415 		if (wr->wr_opcode == IBT_WRC_FADD)
416 			nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
417 		rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
418 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
419 		at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
420 		    sizeof (hermon_hw_snd_wqe_remaddr_t));
421 
422 		/*
423 		 * Build the Remote Address and Atomic Segments for
424 		 * the WQE, using the information from the RC Atomic
425 		 * work request.
426 		 */
427 		HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
428 		HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
429 
430 		/* Update "ds" for filling in Data Segments (below) */
431 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
432 		    sizeof (hermon_hw_snd_wqe_atomic_t));
433 
434 		/*
435 		 * Update "nds" and "sgl" because Atomic requests have
436 		 * only a single Data Segment.
437 		 */
438 		nds = 1;
439 		sgl = wr->wr_sgl;
440 		break;
441 
442 	/*
443 	 * If this is memory window Bind operation, then we call the
444 	 * hermon_wr_bind_check() routine to validate the request and
445 	 * to generate the updated RKey.  If this is successful, then
446 	 * we fill in the WQE's "Bind" header fields.
447 	 */
448 	case IBT_WRC_BIND:
449 		nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
450 		status = hermon_wr_bind_check(state, wr);
451 		if (status != DDI_SUCCESS)
452 			goto done;
453 
454 		bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
455 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
456 
457 		/*
458 		 * Build the Bind Memory Window Segments for the WQE,
459 		 * using the information from the RC Bind memory
460 		 * window work request.
461 		 */
462 		HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
463 
464 		/*
465 		 * Update the "ds" pointer.  Even though the "bind"
466 		 * operation requires no SGLs, this is necessary to
467 		 * facilitate the correct descriptor size calculations
468 		 * (below).
469 		 */
470 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
471 		    sizeof (hermon_hw_snd_wqe_bind_t));
472 		nds = 0;
473 	}
474 
	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based
	 * on the values set up above (i.e. "sgl", "nds", and the "ds"
	 * pointer).  Start by checking for a valid number of SGL entries.
	 */
480 	if (nds > qp->qp_sq_sgl) {
481 		status = IBT_QP_SGL_LEN_INVALID;
482 		goto done;
483 	}
484 
485 	for (last_ds = num_ds, i = 0; i < nds; i++) {
486 		if (sgl[i].ds_len != 0)
487 			last_ds++;	/* real last ds of wqe to fill */
488 	}
489 	desc_sz = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc) >> 0x4;
490 	for (i = nds; --i >= 0; ) {
491 		if (sgl[i].ds_len == 0) {
492 			continue;
493 		}
494 
495 		/*
496 		 * Fill in the Data Segment(s) for the current WQE, using the
497 		 * information contained in the scatter-gather list of the
498 		 * work request.
499 		 */
500 		last_ds--;
501 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[i]);
502 	}
503 
504 	fence = (wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
505 
506 	signaled_dbd = ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
507 	    (wr->wr_flags & IBT_WR_SEND_SIGNAL)) ? 1 : 0;
508 
509 	solicited = (wr->wr_flags & IBT_WR_SEND_SOLICIT) ? 1 : 0;
510 
511 	HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz, fence, immed_data, solicited,
512 	    signaled_dbd, wr->wr_flags & IBT_WR_SEND_CKSUM, qp);
513 
514 	wq->wq_wrid[tail] = wr->wr_id;
515 
516 	tail = next_tail;
517 
518 	/* Update some of the state in the QP */
519 	wq->wq_tail = tail;
520 
521 	membar_producer();
522 
	/* Now set the ownership bit and opcode (first dword). */
524 	HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)desc, nopcode);
525 
526 	posted_cnt++;
527 	if (--num_wr > 0) {
528 		/* do the invalidate of the headroom */
529 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
530 		    (tail + hdrmwqes) & qsize_msk);
531 		for (i = 16; i < sectperwqe; i += 16) {
532 			wqe_start[i] = 0xFFFFFFFF;
533 		}
534 
535 		wr++;
536 		goto post_next;
537 	}
538 done:
539 
540 	if (posted_cnt != 0) {
541 		ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
542 
543 		membar_producer();
544 
545 		/* the FMA retry loop starts for Hermon doorbell register. */
546 		hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
547 		    fm_status, fm_test_num);
548 
549 		/* Ring the doorbell */
550 		HERMON_UAR_DOORBELL(state, uarhdl,
551 		    (uint64_t *)(void *)&state->hs_uar->send,
552 		    (uint64_t)qp->qp_ring);
553 
554 		/* the FMA retry loop ends. */
555 		hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
556 		    fm_status, fm_test_num);
557 
558 		/* do the invalidate of the headroom */
559 		wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp,
560 		    (tail + hdrmwqes) & qsize_msk);
561 		for (i = 16; i < sectperwqe; i += 16) {
562 			wqe_start[i] = 0xFFFFFFFF;
563 		}
564 	}
565 	/*
566 	 * Update the "num_posted" return value (if necessary).
567 	 * Then drop the locks and return success.
568 	 */
569 	if (num_posted != NULL) {
570 		*num_posted = posted_cnt;
571 	}
572 
573 	mutex_exit(&qp->qp_sq_lock);
574 	return (status);
575 
576 pio_error:
577 	mutex_exit(&qp->qp_sq_lock);
578 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
579 	return (ibc_get_ci_failure(0));
580 }
581 
582 /*
583  * hermon_post_send()
584  *    Context: Can be called from interrupt or base context.
585  */
586 int
587 hermon_post_send(hermon_state_t *state, hermon_qphdl_t qp,
588     ibt_send_wr_t *wr, uint_t num_wr, uint_t *num_posted)
589 {
590 	ibt_send_wr_t 			*curr_wr;
591 	hermon_workq_hdr_t		*wq;
592 	hermon_ahhdl_t			ah;
593 	uint64_t			*desc, *prev;
594 	uint32_t			desc_sz;
595 	uint32_t			signaled_dbd, solicited;
596 	uint32_t			head, tail, next_tail, qsize_msk;
597 	uint32_t			sync_from, sync_to;
598 	uint32_t			hdrmwqes;
599 	uint_t				currindx, wrindx, numremain;
600 	uint_t				chainlen;
601 	uint_t				posted_cnt, maxstat;
602 	uint_t				total_posted;
603 	int				status;
604 	uint32_t			nopcode, fence, immed_data = 0;
605 	uint32_t			prev_nopcode;
606 
607 	/* initialize the FMA retry loop */
608 	hermon_pio_init(fm_loop_cnt, fm_status, fm_test);
609 
610 	/*
611 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
612 	 * clients to post to QP memory that is accessible directly by the
613 	 * user.  If the QP memory is user accessible, then return an error.
614 	 */
615 	if (qp->qp_is_umap) {
616 		return (IBT_QP_HDL_INVALID);
617 	}
618 
619 	mutex_enter(&qp->qp_lock);
620 
621 	/*
622 	 * Check QP state.  Can not post Send requests from the "Reset",
623 	 * "Init", or "RTR" states
624 	 */
625 	if ((qp->qp_state == HERMON_QP_RESET) ||
626 	    (qp->qp_state == HERMON_QP_INIT) ||
627 	    (qp->qp_state == HERMON_QP_RTR)) {
628 		mutex_exit(&qp->qp_lock);
629 		return (IBT_QP_STATE_INVALID);
630 	}
631 	mutex_exit(&qp->qp_lock);
632 	mutex_enter(&qp->qp_sq_lock);
633 
634 	if (qp->qp_is_special)
635 		goto post_many;
636 
637 	/* Use these optimized functions most of the time */
638 	if (qp->qp_serv_type == HERMON_QP_UD)
639 		return (hermon_post_send_ud(state, qp, wr, num_wr, num_posted));
640 
641 	if (qp->qp_serv_type == HERMON_QP_RC)
642 		return (hermon_post_send_rc(state, qp, wr, num_wr, num_posted));
643 
644 	if (qp->qp_serv_type == HERMON_QP_UC)
645 		goto post_many;
646 
647 	mutex_exit(&qp->qp_sq_lock);
648 	return (IBT_QP_SRV_TYPE_INVALID);
649 
650 post_many:
651 	/* general loop for non-optimized posting */
652 
	/* make sure we see any update of wq_head */
654 	membar_consumer();
655 
656 	/* Save away some initial QP state */
657 	wq = qp->qp_sq_wqhdr;
658 	qsize_msk = wq->wq_mask;
659 	tail	  = wq->wq_tail;
660 	head	  = wq->wq_head;
661 	hdrmwqes  = qp->qp_sq_hdrmwqes;		/* in WQEs  */
662 
663 	/* Initialize posted_cnt */
664 	posted_cnt = 0;
665 	total_posted = 0;
666 
	/*
	 * For each ibt_send_wr_t in the wr[] list passed in, parse the
	 * request and build a Send WQE.  NOTE:  Because we are potentially
	 * building a chain of WQEs to post, we want to build them all first,
	 * and set the valid (HW Ownership) bit on all but the first.
	 * However, we do not want to validate the first one until the
	 * entire chain of WQEs has been built.  Then, in the final step,
	 * we set the valid bit in the first, flush if needed, and as a last
	 * step ring the appropriate doorbell.  NOTE: the doorbell ring may
	 * NOT be needed if the HCA is already processing, but the doorbell
	 * ring will be done regardless.  NOTE ALSO:  It is possible for
	 * more Work Requests to be posted than the HW will support at one
	 * shot.  If this happens, we need to be able to post and ring
	 * several chains here until the entire request is complete.
	 * NOTE ALSO:  the term "chain" is used to differentiate this from
	 * the Work Request List passed in, and because that is the
	 * terminology from the previous generations of HCA - but the WQEs
	 * are not, in fact, chained together for Hermon.
	 */
686 
687 	wrindx = 0;
688 	numremain = num_wr;
689 	status	  = DDI_SUCCESS;
690 	while ((wrindx < num_wr) && (status == DDI_SUCCESS)) {
691 		/*
692 		 * For the first WQE on a new chain we need "prev" to point
693 		 * to the current descriptor.
694 		 */
695 		prev = HERMON_QP_SQ_ENTRY(qp, tail);
696 
		/*
		 * Unlike Tavor & Arbel, "tail" maintains the number of the
		 * next (i.e. this) WQE to be posted.  Since there is no
		 * backward linking in Hermon, we can always just look ahead.
		 */
702 		/*
703 		 * Before we begin, save the current "tail index" for later
704 		 * DMA sync
705 		 */
706 		/* NOTE: don't need to go back one like arbel/tavor */
707 		sync_from = tail;
708 
709 		/*
710 		 * Break the request up into lists that are less than or
711 		 * equal to the maximum number of WQEs that can be posted
712 		 * per doorbell ring - 256 currently
713 		 */
714 		chainlen = (numremain > HERMON_QP_MAXDESC_PER_DB) ?
715 		    HERMON_QP_MAXDESC_PER_DB : numremain;
716 		numremain -= chainlen;
717 
718 		for (currindx = 0; currindx < chainlen; currindx++, wrindx++) {
719 			/*
720 			 * Check for "queue full" condition.  If the queue
721 			 * is already full, then no more WQEs can be posted.
722 			 * So break out, ring a doorbell (if necessary) and
723 			 * return an error
724 			 */
725 			if (wq->wq_full != 0) {
726 				status = IBT_QP_FULL;
727 				break;
728 			}
729 
730 			/*
731 			 * Increment the "tail index". Check for "queue
732 			 * full" condition incl. headroom.  If we detect that
733 			 * the current work request is going to fill the work
734 			 * queue, then we mark this condition and continue.
735 			 * Don't need >=, because going one-by-one we have to
736 			 * hit it exactly sooner or later
737 			 */
738 
739 			next_tail = (tail + 1) & qsize_msk;
740 			if (((tail + hdrmwqes) & qsize_msk) == head) {
741 				wq->wq_full = 1;
742 			}
743 
744 			/*
745 			 * Get the address of the location where the next
746 			 * Send WQE should be built
747 			 */
748 			desc = HERMON_QP_SQ_ENTRY(qp, tail);
749 			/*
750 			 * Call hermon_wqe_send_build() to build the WQE
751 			 * at the given address.  This routine uses the
752 			 * information in the ibt_send_wr_t list (wr[]) and
753 			 * returns the size of the WQE when it returns.
754 			 */
755 			status = hermon_wqe_send_build(state, qp,
756 			    &wr[wrindx], desc, &desc_sz);
757 			if (status != DDI_SUCCESS) {
758 				break;
759 			}
760 
761 			/*
762 			 * Now, build the Ctrl Segment based on
763 			 * what was just done
764 			 */
765 			curr_wr = &wr[wrindx];
766 
767 			switch (curr_wr->wr_opcode) {
768 			case IBT_WRC_RDMAW:
769 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
770 					nopcode =
771 					    HERMON_WQE_SEND_NOPCODE_RDMAWI;
772 					immed_data =
773 					    hermon_wr_get_immediate(curr_wr);
774 				} else {
775 					nopcode = HERMON_WQE_SEND_NOPCODE_RDMAW;
776 				}
777 				break;
778 
779 			case IBT_WRC_SEND:
780 				if (curr_wr->wr_flags & IBT_WR_SEND_IMMED) {
781 					nopcode = HERMON_WQE_SEND_NOPCODE_SENDI;
782 					immed_data =
783 					    hermon_wr_get_immediate(curr_wr);
784 				} else {
785 					nopcode = HERMON_WQE_SEND_NOPCODE_SEND;
786 				}
787 				break;
788 
789 			case IBT_WRC_SEND_LSO:
790 				nopcode = HERMON_WQE_SEND_NOPCODE_LSO;
791 				break;
792 
793 			case IBT_WRC_RDMAR:
794 				nopcode = HERMON_WQE_SEND_NOPCODE_RDMAR;
795 				break;
796 
797 			case IBT_WRC_CSWAP:
798 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMCS;
799 				break;
800 
801 			case IBT_WRC_FADD:
802 				nopcode = HERMON_WQE_SEND_NOPCODE_ATMFA;
803 				break;
804 
805 			case IBT_WRC_BIND:
806 				nopcode = HERMON_WQE_SEND_NOPCODE_BIND;
807 				break;
808 			}
809 
810 			fence = (curr_wr->wr_flags & IBT_WR_SEND_FENCE) ? 1 : 0;
811 
812 			/*
813 			 * now, build up the control segment, leaving the
814 			 * owner bit as it is
815 			 */
816 
817 			if ((qp->qp_sq_sigtype == HERMON_QP_SQ_ALL_SIGNALED) ||
818 			    (curr_wr->wr_flags & IBT_WR_SEND_SIGNAL)) {
819 				signaled_dbd = 1;
820 			} else {
821 				signaled_dbd = 0;
822 			}
823 			if (curr_wr->wr_flags & IBT_WR_SEND_SOLICIT)
824 				solicited = 1;
825 			else
826 				solicited = 0;
827 
828 			if (qp->qp_is_special) {
829 				ah = (hermon_ahhdl_t)
830 				    curr_wr->wr.ud.udwr_dest->ud_ah;
831 				mutex_enter(&ah->ah_lock);
832 				maxstat = ah->ah_udav->max_stat_rate;
833 				HERMON_WQE_SET_MLX_CTRL_SEGMENT(desc, desc_sz,
834 				    signaled_dbd, maxstat, ah->ah_udav->rlid,
835 				    qp, ah->ah_udav->sl);
836 				mutex_exit(&ah->ah_lock);
837 			} else {
838 				HERMON_WQE_SET_CTRL_SEGMENT(desc, desc_sz,
839 				    fence, immed_data, solicited,
840 				    signaled_dbd, curr_wr->wr_flags &
841 				    IBT_WR_SEND_CKSUM, qp);
842 			}
843 			wq->wq_wrid[tail] = curr_wr->wr_id;
844 
845 			/*
846 			 * If this is not the first descriptor on the current
847 			 * chain, then set the ownership bit.
848 			 */
849 			if (currindx != 0) {		/* not the first */
850 				membar_producer();
851 				HERMON_SET_SEND_WQE_OWNER(qp,
852 				    (uint32_t *)desc, nopcode);
853 			} else
854 				prev_nopcode = nopcode;
855 
856 			/*
857 			 * Update the current "tail index" and increment
858 			 * "posted_cnt"
859 			 */
860 			tail = next_tail;
861 			posted_cnt++;
862 		}
863 
864 		/*
865 		 * If we reach here and there are one or more WQEs which have
866 		 * been successfully built as a chain, we have to finish up
867 		 * and prepare them for writing to the HW
868 		 * The steps are:
869 		 * 	1. do the headroom fixup
870 		 *	2. add in the size of the headroom for the sync
871 		 *	3. write the owner bit for the first WQE
872 		 *	4. sync them
873 		 *	5. fix up the structures
874 		 *	6. hit the doorbell in UAR
875 		 */
876 		if (posted_cnt != 0) {
877 			ddi_acc_handle_t uarhdl = hermon_get_uarhdl(state);
878 
879 			/*
880 			 * Save away updated "tail index" for the DMA sync
881 			 * including the headroom that will be needed
882 			 */
883 			sync_to = (tail + hdrmwqes) & qsize_msk;
884 
885 			/* do the invalidate of the headroom */
886 
887 			hermon_wqe_headroom(tail, qp);
888 
889 			/* Do a DMA sync for current send WQE(s) */
890 			hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_SEND,
891 			    DDI_DMA_SYNC_FORDEV);
892 
893 			/* Update some of the state in the QP */
894 			wq->wq_tail = tail;
895 			total_posted += posted_cnt;
896 			posted_cnt = 0;
897 
898 			membar_producer();
899 
900 			/*
901 			 * Now set the ownership bit of the first
902 			 * one in the chain
903 			 */
904 			HERMON_SET_SEND_WQE_OWNER(qp, (uint32_t *)prev,
905 			    prev_nopcode);
906 
907 			/* the FMA retry loop starts for Hermon doorbell. */
908 			hermon_pio_start(state, uarhdl, pio_error, fm_loop_cnt,
909 			    fm_status, fm_test);
910 
911 			HERMON_UAR_DOORBELL(state, uarhdl,
912 			    (uint64_t *)(void *)&state->hs_uar->send,
913 			    (uint64_t)qp->qp_ring);
914 
915 			/* the FMA retry loop ends. */
916 			hermon_pio_end(state, uarhdl, pio_error, fm_loop_cnt,
917 			    fm_status, fm_test);
918 		}
919 	}
920 
921 	/*
922 	 * Update the "num_posted" return value (if necessary).
923 	 * Then drop the locks and return success.
924 	 */
925 	if (num_posted != NULL) {
926 		*num_posted = total_posted;
927 	}
928 	mutex_exit(&qp->qp_sq_lock);
929 	return (status);
930 
931 pio_error:
932 	mutex_exit(&qp->qp_sq_lock);
933 	hermon_fm_ereport(state, HCA_SYS_ERR, HCA_ERR_SRV_LOST);
934 	return (ibc_get_ci_failure(0));
935 }
936 
937 
938 /*
939  * hermon_post_recv()
940  *    Context: Can be called from interrupt or base context.
941  */
942 int
943 hermon_post_recv(hermon_state_t *state, hermon_qphdl_t qp,
944     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
945 {
946 	uint64_t			*desc;
947 	hermon_workq_hdr_t		*wq;
948 	uint32_t			head, tail, next_tail, qsize_msk;
949 	uint32_t			sync_from, sync_to;
950 	uint_t				wrindx;
951 	uint_t				posted_cnt;
952 	int				status;
953 
954 	/*
955 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
956 	 * clients to post to QP memory that is accessible directly by the
957 	 * user.  If the QP memory is user accessible, then return an error.
958 	 */
959 	if (qp->qp_is_umap) {
960 		return (IBT_QP_HDL_INVALID);
961 	}
962 
963 	/* Initialize posted_cnt */
964 	posted_cnt = 0;
965 
966 	mutex_enter(&qp->qp_lock);
967 
968 	/*
969 	 * Check if QP is associated with an SRQ
970 	 */
971 	if (qp->qp_srq_en == HERMON_QP_SRQ_ENABLED) {
972 		mutex_exit(&qp->qp_lock);
973 		return (IBT_SRQ_IN_USE);
974 	}
975 
976 	/*
977 	 * Check QP state.  Can not post Recv requests from the "Reset" state
978 	 */
979 	if (qp->qp_state == HERMON_QP_RESET) {
980 		mutex_exit(&qp->qp_lock);
981 		return (IBT_QP_STATE_INVALID);
982 	}
983 
984 	/* Check that work request transport type is valid */
985 	if ((qp->qp_serv_type != HERMON_QP_UD) &&
986 	    (qp->qp_serv_type != HERMON_QP_RC) &&
987 	    (qp->qp_serv_type != HERMON_QP_UC)) {
988 		mutex_exit(&qp->qp_lock);
989 		return (IBT_QP_SRV_TYPE_INVALID);
990 	}
991 
992 	mutex_exit(&qp->qp_lock);
993 	mutex_enter(&qp->qp_rq_lock);
994 
	/*
	 * Note: no membar_consumer() is needed here to order reads of
	 * the WRID list, because the mutex_enter() above provides the
	 * same memory-barrier effect.
	 */
1000 
1001 	/* Save away some initial QP state */
1002 	wq = qp->qp_rq_wqhdr;
1003 	qsize_msk = wq->wq_mask;
1004 	tail	  = wq->wq_tail;
1005 	head	  = wq->wq_head;
1006 
1007 	wrindx = 0;
1008 	status	  = DDI_SUCCESS;
1009 	/*
1010 	 * Before we begin, save the current "tail index" for later
1011 	 * DMA sync
1012 	 */
1013 	sync_from = tail;
1014 
1015 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1016 		if (wq->wq_full != 0) {
1017 			status = IBT_QP_FULL;
1018 			break;
1019 		}
1020 		next_tail = (tail + 1) & qsize_msk;
1021 		if (next_tail == head) {
1022 			wq->wq_full = 1;
1023 		}
1024 		desc = HERMON_QP_RQ_ENTRY(qp, tail);
1025 		status = hermon_wqe_recv_build(state, qp, &wr[wrindx], desc);
1026 		if (status != DDI_SUCCESS) {
1027 			break;
1028 		}
1029 
1030 		wq->wq_wrid[tail] = wr[wrindx].wr_id;
1031 		qp->qp_rq_wqecntr++;
1032 
1033 		tail = next_tail;
1034 		posted_cnt++;
1035 	}
1036 
1037 	if (posted_cnt != 0) {
1038 		/* Save away updated "tail index" for the DMA sync */
1039 		sync_to = tail;
1040 
1041 		hermon_wqe_sync(qp, sync_from, sync_to, HERMON_WR_RECV,
1042 		    DDI_DMA_SYNC_FORDEV);
1043 
1044 		wq->wq_tail = tail;
1045 
1046 		membar_producer();	/* ensure wrids are visible */
1047 
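		/*
		 * Note: the Hermon receive queue has no UAR doorbell.
		 * Instead, the HCA reads a doorbell record in memory,
		 * which holds the low 16 bits of the cumulative count
		 * of WQEs posted to the queue.
		 */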
1048 		/* Update the doorbell record w/ wqecntr */
1049 		HERMON_UAR_DB_RECORD_WRITE(qp->qp_rq_vdbr,
1050 		    qp->qp_rq_wqecntr & 0xFFFF);
1051 	}
1052 
1053 	if (num_posted != NULL) {
1054 		*num_posted = posted_cnt;
1055 	}
1058 	mutex_exit(&qp->qp_rq_lock);
1059 	return (status);
1060 }
1061 
1062 /*
1063  * hermon_post_srq()
1064  *    Context: Can be called from interrupt or base context.
1065  */
1066 int
1067 hermon_post_srq(hermon_state_t *state, hermon_srqhdl_t srq,
1068     ibt_recv_wr_t *wr, uint_t num_wr, uint_t *num_posted)
1069 {
1070 	uint64_t			*desc;
1071 	hermon_workq_hdr_t		*wq;
1072 	uint_t				indx, wrindx;
1073 	uint_t				posted_cnt;
1074 	int				status;
1075 
1076 	mutex_enter(&srq->srq_lock);
1077 
1078 	/*
1079 	 * Check for user-mappable QP memory.  Note:  We do not allow kernel
1080 	 * clients to post to QP memory that is accessible directly by the
1081 	 * user.  If the QP memory is user accessible, then return an error.
1082 	 */
1083 	if (srq->srq_is_umap) {
1084 		mutex_exit(&srq->srq_lock);
1085 		return (IBT_SRQ_HDL_INVALID);
1086 	}
1087 
1088 	/*
1089 	 * Check SRQ state.  Can not post Recv requests when SRQ is in error
1090 	 */
1091 	if (srq->srq_state == HERMON_SRQ_STATE_ERROR) {
1092 		mutex_exit(&srq->srq_lock);
1093 		return (IBT_QP_STATE_INVALID);
1094 	}
1095 
1096 	status = DDI_SUCCESS;
1097 	posted_cnt = 0;
1098 	wq = srq->srq_wq_wqhdr;
1099 	indx = wq->wq_head;
1100 
1101 	for (wrindx = 0; wrindx < num_wr; wrindx++) {
1102 
1103 		if (indx == wq->wq_tail) {
1104 			status = IBT_QP_FULL;
1105 			break;
1106 		}
1107 		desc = HERMON_SRQ_WQE_ADDR(srq, indx);
1108 
1109 		wq->wq_wrid[indx] = wr[wrindx].wr_id;
1110 
1111 		status = hermon_wqe_srq_build(state, srq, &wr[wrindx], desc);
1112 		if (status != DDI_SUCCESS) {
1113 			break;
1114 		}
1115 
1116 		hermon_wqe_sync(srq, indx, indx + 1,
1117 		    HERMON_WR_SRQ, DDI_DMA_SYNC_FORDEV);
1118 		posted_cnt++;
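		/*
		 * The SRQ free list is threaded through the WQEs
		 * themselves: the second 16-bit word of each descriptor
		 * (stored big-endian, hence the byte swap) holds the
		 * index of the next free WQE, which becomes the new head.
		 */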
1119 		indx = htons(((uint16_t *)desc)[1]);
1120 		wq->wq_head = indx;
1121 	}
1122 
1123 	if (posted_cnt != 0) {
1124 
1125 		srq->srq_wq_wqecntr += posted_cnt;
1126 
1127 		membar_producer();	/* ensure wrids are visible */
1128 
1129 		/* Ring the doorbell w/ wqecntr */
1130 		HERMON_UAR_DB_RECORD_WRITE(srq->srq_wq_vdbr,
1131 		    srq->srq_wq_wqecntr & 0xFFFF);
1132 	}
1133 
1134 	if (num_posted != NULL) {
1135 		*num_posted = posted_cnt;
1136 	}
1137 
1138 	mutex_exit(&srq->srq_lock);
1139 	return (status);
1140 }
1141 
1142 
1143 /*
1144  * hermon_wqe_send_build()
1145  *    Context: Can be called from interrupt or base context.
1146  */
1147 static int
1148 hermon_wqe_send_build(hermon_state_t *state, hermon_qphdl_t qp,
1149     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1150 {
1151 	hermon_hw_snd_wqe_ud_t		*ud;
1152 	hermon_hw_snd_wqe_remaddr_t	*rc;
1153 	hermon_hw_snd_wqe_atomic_t	*at;
1154 	hermon_hw_snd_wqe_remaddr_t	*uc;
1155 	hermon_hw_snd_wqe_bind_t	*bn;
1156 	hermon_hw_wqe_sgl_t		*ds, *old_ds;
1157 	ibt_ud_dest_t			*dest;
1158 	ibt_wr_ds_t			*sgl;
1159 	hermon_ahhdl_t			ah;
1160 	uint32_t			nds;
1161 	int				i, j, last_ds, num_ds, status;
1162 	int				tmpsize;
1163 
1164 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1165 
1166 	/* Initialize the information for the Data Segments */
1167 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1168 	    sizeof (hermon_hw_snd_wqe_ctrl_t));
1169 	nds = wr->wr_nds;
1170 	sgl = wr->wr_sgl;
1171 	num_ds = 0;
1172 	i = 0;
1173 
	/*
	 * Building a Send WQE depends first and foremost on the transport
	 * type of the Work Request (i.e. UD, RC, or UC).
	 */
1178 	switch (wr->wr_trans) {
1179 	case IBT_UD_SRV:
1180 		/* Ensure that work request transport type matches QP type */
1181 		if (qp->qp_serv_type != HERMON_QP_UD) {
1182 			return (IBT_QP_SRV_TYPE_INVALID);
1183 		}
1184 
1185 		/*
1186 		 * Validate the operation type.  For UD requests, only the
1187 		 * "Send" and "Send LSO" operations are valid.
1188 		 */
1189 		if (wr->wr_opcode != IBT_WRC_SEND &&
1190 		    wr->wr_opcode != IBT_WRC_SEND_LSO) {
1191 			return (IBT_QP_OP_TYPE_INVALID);
1192 		}
1193 
1194 		/*
1195 		 * If this is a Special QP (QP0 or QP1), then we need to
1196 		 * build MLX WQEs instead.  So jump to hermon_wqe_mlx_build()
1197 		 * and return whatever status it returns
1198 		 */
1199 		if (qp->qp_is_special) {
1200 			if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1201 				return (IBT_QP_OP_TYPE_INVALID);
1202 			}
1203 			status = hermon_wqe_mlx_build(state, qp,
1204 			    wr, desc, size);
1205 			return (status);
1206 		}
1207 
1208 		/*
1209 		 * Otherwise, if this is a normal UD Send request, then fill
1210 		 * all the fields in the Hermon UD header for the WQE.  Note:
1211 		 * to do this we'll need to extract some information from the
1212 		 * Address Handle passed with the work request.
1213 		 */
1214 		ud = (hermon_hw_snd_wqe_ud_t *)((uintptr_t)desc +
1215 		    sizeof (hermon_hw_snd_wqe_ctrl_t));
1216 		if (wr->wr_opcode == IBT_WRC_SEND) {
1217 			dest = wr->wr.ud.udwr_dest;
1218 		} else {
1219 			dest = wr->wr.ud_lso.lso_ud_dest;
1220 		}
1221 		ah = (hermon_ahhdl_t)dest->ud_ah;
1222 		if (ah == NULL) {
1223 			return (IBT_AH_HDL_INVALID);
1224 		}
1225 
1226 		/*
1227 		 * Build the Unreliable Datagram Segment for the WQE, using
1228 		 * the information from the address handle and the work
1229 		 * request.
1230 		 */
1231 		/* mutex_enter(&ah->ah_lock); */
1232 		if (wr->wr_opcode == IBT_WRC_SEND) {
1233 			HERMON_WQE_BUILD_UD(qp, ud, ah, wr->wr.ud.udwr_dest);
1234 		} else {	/* IBT_WRC_SEND_LSO */
1235 			HERMON_WQE_BUILD_UD(qp, ud, ah,
1236 			    wr->wr.ud_lso.lso_ud_dest);
1237 		}
1238 		/* mutex_exit(&ah->ah_lock); */
1239 
1240 		/* Update "ds" for filling in Data Segments (below) */
1241 		ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ud +
1242 		    sizeof (hermon_hw_snd_wqe_ud_t));
1243 
1244 		if (wr->wr_opcode == IBT_WRC_SEND_LSO) {
1245 			int total_len;
1246 
1247 			total_len = (4 + 0xf + wr->wr.ud_lso.lso_hdr_sz) & ~0xf;
1248 			if ((uintptr_t)ds + total_len + (nds * 16) >
1249 			    (uintptr_t)desc + (1 << qp->qp_sq_log_wqesz))
1250 				return (IBT_QP_SGL_LEN_INVALID);
1251 
1252 			bcopy(wr->wr.ud_lso.lso_hdr, (uint32_t *)ds + 1,
1253 			    wr->wr.ud_lso.lso_hdr_sz);
1254 			old_ds = ds;
1255 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)ds + total_len);
1256 			for (; i < nds; i++) {
1257 				if (sgl[i].ds_len == 0)
1258 					continue;
1259 				HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds],
1260 				    &sgl[i]);
1261 				num_ds++;
1262 				i++;
1263 				break;
1264 			}
1265 			membar_producer();
1266 			HERMON_WQE_BUILD_LSO(qp, old_ds, wr->wr.ud_lso.lso_mss,
1267 			    wr->wr.ud_lso.lso_hdr_sz);
1268 		}
1269 
1270 		break;
1271 
1272 	case IBT_RC_SRV:
1273 		/* Ensure that work request transport type matches QP type */
1274 		if (qp->qp_serv_type != HERMON_QP_RC) {
1275 			return (IBT_QP_SRV_TYPE_INVALID);
1276 		}
1277 
1278 		/*
1279 		 * Validate the operation type.  For RC requests, we allow
1280 		 * "Send", "RDMA Read", "RDMA Write", various "Atomic"
1281 		 * operations, and memory window "Bind"
1282 		 */
1283 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1284 		    (wr->wr_opcode != IBT_WRC_RDMAR) &&
1285 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1286 		    (wr->wr_opcode != IBT_WRC_CSWAP) &&
1287 		    (wr->wr_opcode != IBT_WRC_FADD) &&
1288 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1289 			return (IBT_QP_OP_TYPE_INVALID);
1290 		}
1291 
		/*
		 * If this is a Send request, then all we need to do is break
		 * out here and begin the Data Segment processing below.
		 */
1296 		if (wr->wr_opcode == IBT_WRC_SEND) {
1297 			break;
1298 		}
1299 
1300 		/*
1301 		 * If this is an RDMA Read or RDMA Write request, then fill
1302 		 * in the "Remote Address" header fields.
1303 		 */
1304 		if ((wr->wr_opcode == IBT_WRC_RDMAR) ||
1305 		    (wr->wr_opcode == IBT_WRC_RDMAW)) {
1306 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1307 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1308 
1309 			/*
1310 			 * Build the Remote Address Segment for the WQE, using
1311 			 * the information from the RC work request.
1312 			 */
1313 			HERMON_WQE_BUILD_REMADDR(qp, rc, &wr->wr.rc.rcwr.rdma);
1314 
1315 			/* Update "ds" for filling in Data Segments (below) */
1316 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)rc +
1317 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1318 			break;
1319 		}
1320 
1321 		/*
1322 		 * If this is one of the Atomic type operations (i.e
1323 		 * Compare-Swap or Fetch-Add), then fill in both the "Remote
1324 		 * Address" header fields and the "Atomic" header fields.
1325 		 */
1326 		if ((wr->wr_opcode == IBT_WRC_CSWAP) ||
1327 		    (wr->wr_opcode == IBT_WRC_FADD)) {
1328 			rc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1329 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1330 			at = (hermon_hw_snd_wqe_atomic_t *)((uintptr_t)rc +
1331 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1332 
1333 			/*
1334 			 * Build the Remote Address and Atomic Segments for
1335 			 * the WQE, using the information from the RC Atomic
1336 			 * work request.
1337 			 */
1338 			HERMON_WQE_BUILD_RC_ATOMIC_REMADDR(qp, rc, wr);
1339 			HERMON_WQE_BUILD_ATOMIC(qp, at, wr->wr.rc.rcwr.atomic);
1340 
1341 			/* Update "ds" for filling in Data Segments (below) */
1342 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)at +
1343 			    sizeof (hermon_hw_snd_wqe_atomic_t));
1344 
			/*
			 * Update "nds" and "sgl" because Atomic requests have
			 * only a single Data Segment (and they are encoded
			 * somewhat differently in the work request).
			 */
1350 			nds = 1;
1351 			sgl = wr->wr_sgl;
1352 			break;
1353 		}
1354 
1355 		/*
1356 		 * If this is memory window Bind operation, then we call the
1357 		 * hermon_wr_bind_check() routine to validate the request and
1358 		 * to generate the updated RKey.  If this is successful, then
1359 		 * we fill in the WQE's "Bind" header fields.
1360 		 */
1361 		if (wr->wr_opcode == IBT_WRC_BIND) {
1362 			status = hermon_wr_bind_check(state, wr);
1363 			if (status != DDI_SUCCESS) {
1364 				return (status);
1365 			}
1366 
1367 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1368 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1369 
1370 			/*
1371 			 * Build the Bind Memory Window Segments for the WQE,
1372 			 * using the information from the RC Bind memory
1373 			 * window work request.
1374 			 */
1375 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.rc.rcwr.bind);
1376 
1377 			/*
1378 			 * Update the "ds" pointer.  Even though the "bind"
1379 			 * operation requires no SGLs, this is necessary to
1380 			 * facilitate the correct descriptor size calculations
1381 			 * (below).
1382 			 */
1383 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1384 			    sizeof (hermon_hw_snd_wqe_bind_t));
1385 			nds = 0;
1386 		}
1387 		break;
1388 
1389 	case IBT_UC_SRV:
1390 		/* Ensure that work request transport type matches QP type */
1391 		if (qp->qp_serv_type != HERMON_QP_UC) {
1392 			return (IBT_QP_SRV_TYPE_INVALID);
1393 		}
1394 
1395 		/*
1396 		 * Validate the operation type.  For UC requests, we only
1397 		 * allow "Send", "RDMA Write", and memory window "Bind".
1398 		 * Note: Unlike RC, UC does not allow "RDMA Read" or "Atomic"
1399 		 * operations
1400 		 */
1401 		if ((wr->wr_opcode != IBT_WRC_SEND) &&
1402 		    (wr->wr_opcode != IBT_WRC_RDMAW) &&
1403 		    (wr->wr_opcode != IBT_WRC_BIND)) {
1404 			return (IBT_QP_OP_TYPE_INVALID);
1405 		}
1406 
		/*
		 * If this is a Send request, then all we need to do is break
		 * out here and begin the Data Segment processing below.
		 */
1411 		if (wr->wr_opcode == IBT_WRC_SEND) {
1412 			break;
1413 		}
1414 
1415 		/*
1416 		 * If this is an RDMA Write request, then fill in the "Remote
1417 		 * Address" header fields.
1418 		 */
1419 		if (wr->wr_opcode == IBT_WRC_RDMAW) {
1420 			uc = (hermon_hw_snd_wqe_remaddr_t *)((uintptr_t)desc +
1421 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1422 
1423 			/*
1424 			 * Build the Remote Address Segment for the WQE, using
1425 			 * the information from the UC work request.
1426 			 */
1427 			HERMON_WQE_BUILD_REMADDR(qp, uc, &wr->wr.uc.ucwr.rdma);
1428 
1429 			/* Update "ds" for filling in Data Segments (below) */
1430 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)uc +
1431 			    sizeof (hermon_hw_snd_wqe_remaddr_t));
1432 			break;
1433 		}
1434 
1435 		/*
1436 		 * If this is memory window Bind operation, then we call the
1437 		 * hermon_wr_bind_check() routine to validate the request and
1438 		 * to generate the updated RKey.  If this is successful, then
1439 		 * we fill in the WQE's "Bind" header fields.
1440 		 */
1441 		if (wr->wr_opcode == IBT_WRC_BIND) {
1442 			status = hermon_wr_bind_check(state, wr);
1443 			if (status != DDI_SUCCESS) {
1444 				return (status);
1445 			}
1446 
1447 			bn = (hermon_hw_snd_wqe_bind_t *)((uintptr_t)desc +
1448 			    sizeof (hermon_hw_snd_wqe_ctrl_t));
1449 
1450 			/*
1451 			 * Build the Bind Memory Window Segments for the WQE,
1452 			 * using the information from the UC Bind memory
1453 			 * window work request.
1454 			 */
1455 			HERMON_WQE_BUILD_BIND(qp, bn, wr->wr.uc.ucwr.bind);
1456 
1457 			/*
1458 			 * Update the "ds" pointer.  Even though the "bind"
1459 			 * operation requires no SGLs, this is necessary to
1460 			 * facilitate the correct descriptor size calculations
1461 			 * (below).
1462 			 */
1463 			ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)bn +
1464 			    sizeof (hermon_hw_snd_wqe_bind_t));
1465 			nds = 0;
1466 		}
1467 		break;
1468 
1469 	default:
1470 		return (IBT_QP_SRV_TYPE_INVALID);
1471 	}
1472 
	/*
	 * Now fill in the Data Segments (SGL) for the Send WQE based on
	 * the values set up above (i.e. "sgl", "nds", and the "ds"
	 * pointer).  Start by checking for a valid number of SGL entries.
	 */
1478 	if (nds > qp->qp_sq_sgl) {
1479 		return (IBT_QP_SGL_LEN_INVALID);
1480 	}
1481 
1482 	/*
1483 	 * For each SGL in the Send Work Request, fill in the Send WQE's data
1484 	 * segments.  Note: We skip any SGL with zero size because Hermon
1485 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1486 	 * the encoding for zero means a 2GB transfer.
1487 	 */
1488 	for (last_ds = num_ds, j = i; j < nds; j++) {
1489 		if (sgl[j].ds_len != 0)
1490 			last_ds++;	/* real last ds of wqe to fill */
1491 	}
1492 
1493 	/*
1494 	 * Return the size of descriptor (in 16-byte chunks)
1495 	 * For Hermon, we want them (for now) to be on stride size
1496 	 * boundaries, which was implicit in Tavor/Arbel
1497 	 *
1498 	 */
1499 	tmpsize = ((uintptr_t)&ds[last_ds] - (uintptr_t)desc);
1500 
1501 	*size = tmpsize >> 0x4;
1502 
1503 	for (j = nds; --j >= i; ) {
1504 		if (sgl[j].ds_len == 0) {
1505 			continue;
1506 		}
1507 
1508 		/*
1509 		 * Fill in the Data Segment(s) for the current WQE, using the
1510 		 * information contained in the scatter-gather list of the
1511 		 * work request.
1512 		 */
1513 		last_ds--;
1514 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[last_ds], &sgl[j]);
1515 	}
1516 
1517 	return (DDI_SUCCESS);
1518 }
1519 
1520 
1521 
1522 /*
1523  * hermon_wqe_mlx_build()
1524  *    Context: Can be called from interrupt or base context.
1525  */
1526 static int
1527 hermon_wqe_mlx_build(hermon_state_t *state, hermon_qphdl_t qp,
1528     ibt_send_wr_t *wr, uint64_t *desc, uint_t *size)
1529 {
1530 	hermon_ahhdl_t		ah;
1531 	hermon_hw_udav_t	*udav;
1532 	ib_lrh_hdr_t		*lrh;
1533 	ib_grh_t		*grh;
1534 	ib_bth_hdr_t		*bth;
1535 	ib_deth_hdr_t		*deth;
1536 	hermon_hw_wqe_sgl_t	*ds;
1537 	ibt_wr_ds_t		*sgl;
1538 	uint8_t			*mgmtclass, *hpoint, *hcount;
1539 	uint32_t		nds, offset, pktlen;
1540 	uint32_t		desc_sz;
1541 	int			i, num_ds;
1542 	int			tmpsize;
1543 
1544 	ASSERT(MUTEX_HELD(&qp->qp_sq_lock));
1545 
1546 	/* Initialize the information for the Data Segments */
1547 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1548 	    sizeof (hermon_hw_mlx_wqe_nextctrl_t));
1549 
1550 	/*
1551 	 * Pull the address handle from the work request. The UDAV will
1552 	 * be used to answer some questions about the request.
1553 	 */
1554 	ah = (hermon_ahhdl_t)wr->wr.ud.udwr_dest->ud_ah;
1555 	if (ah == NULL) {
1556 		return (IBT_AH_HDL_INVALID);
1557 	}
1558 	mutex_enter(&ah->ah_lock);
1559 	udav = ah->ah_udav;
1560 
1561 	/*
1562 	 * If the request is for QP1 and the destination LID is equal to
1563 	 * the Permissive LID, then return an error.  This combination is
1564 	 * not allowed
1565 	 */
1566 	if ((udav->rlid == IB_LID_PERMISSIVE) &&
1567 	    (qp->qp_is_special == HERMON_QP_GSI)) {
1568 		mutex_exit(&ah->ah_lock);
1569 		return (IBT_AH_HDL_INVALID);
1570 	}
1571 
1572 	/*
1573 	 * Calculate the size of the packet headers, including the GRH
1574 	 * (if necessary)
1575 	 */
1576 	desc_sz = sizeof (ib_lrh_hdr_t) + sizeof (ib_bth_hdr_t) +
1577 	    sizeof (ib_deth_hdr_t);
1578 	if (udav->grh) {
1579 		desc_sz += sizeof (ib_grh_t);
1580 	}
1581 
1582 	/*
1583 	 * Begin to build the first "inline" data segment for the packet
1584 	 * headers.  Note:  By specifying "inline" we can build the contents
1585 	 * of the MAD packet headers directly into the work queue (as part
1586 	 * descriptor).  This has the advantage of both speeding things up
1587 	 * and of not requiring the driver to allocate/register any additional
1588 	 * memory for the packet headers.
1589 	 */
1590 	HERMON_WQE_BUILD_INLINE(qp, &ds[0], desc_sz);
1591 	desc_sz += 4;
1592 
1593 	/*
1594 	 * Build Local Route Header (LRH)
1595 	 *    We start here by building the LRH into a temporary location.
1596 	 *    When we have finished we copy the LRH data into the descriptor.
1597 	 *
1598 	 *    Notice that the VL values are hardcoded.  This is not a problem
1599 	 *    because VL15 is decided later based on the value in the MLX
1600 	 *    transport "next/ctrl" header (see the "vl15" bit below), and it
1601 	 *    is otherwise (meaning for QP1) chosen from the SL-to-VL table
1602 	 *    values.  This rule does not hold for loopback packets however
1603 	 *    (all of which bypass the SL-to-VL tables) and it is the reason
1604 	 *    that non-QP0 MADs are setup with VL hardcoded to zero below.
1605 	 *
1606 	 *    Notice also that Source LID is hardcoded to the Permissive LID
1607 	 *    (0xFFFF).  This is also not a problem because if the Destination
1608 	 *    LID is not the Permissive LID, then the "slr" value in the MLX
1609 	 *    transport "next/ctrl" header will be set to zero and the hardware
1610 	 *    will pull the LID from value in the port.
1611 	 */
1612 	lrh = (ib_lrh_hdr_t *)((uintptr_t)&ds[0] + 4);
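	/*
	 * The LRH PktLen is expressed in 4-byte words and covers the
	 * LRH through the ICRC.  The 0x100 accounts for the fixed
	 * 256-byte MAD payload, and the 4 bytes added to desc_sz above
	 * account for the ICRC.
	 */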
1613 	pktlen = (desc_sz + 0x100) >> 2;
1614 	HERMON_WQE_BUILD_MLX_LRH(lrh, qp, udav, pktlen);
1615 
1616 	/*
1617 	 * Build Global Route Header (GRH)
1618 	 *    This is only built if necessary as defined by the "grh" bit in
1619 	 *    the address vector.  Note:  We also calculate the offset to the
1620 	 *    next header (BTH) based on whether or not the "grh" bit is set.
1621 	 */
1622 	if (udav->grh) {
		/*
		 * If the request is for QP0, then return an error.  The
		 * combination of global routing (GRH) and QP0 is not allowed.
		 */
1627 		if (qp->qp_is_special == HERMON_QP_SMI) {
1628 			mutex_exit(&ah->ah_lock);
1629 			return (IBT_AH_HDL_INVALID);
1630 		}
1631 		grh = (ib_grh_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1632 		HERMON_WQE_BUILD_MLX_GRH(state, grh, qp, udav, pktlen);
1633 
1634 		bth = (ib_bth_hdr_t *)((uintptr_t)grh + sizeof (ib_grh_t));
1635 	} else {
1636 		bth = (ib_bth_hdr_t *)((uintptr_t)lrh + sizeof (ib_lrh_hdr_t));
1637 	}
1638 	mutex_exit(&ah->ah_lock);
1639 
1640 
	/*
	 * Build Base Transport Header (BTH)
	 *    Notice that the M, PadCnt, and TVer fields are all set
	 *    to zero implicitly.  This is true for all Management
	 *    Datagrams (MADs), whether GSI or SMI.
	 */
1647 	HERMON_WQE_BUILD_MLX_BTH(state, bth, qp, wr);
1648 
1649 	/*
1650 	 * Build Datagram Extended Transport Header (DETH)
1651 	 */
1652 	deth = (ib_deth_hdr_t *)((uintptr_t)bth + sizeof (ib_bth_hdr_t));
1653 	HERMON_WQE_BUILD_MLX_DETH(deth, qp);
1654 
1655 	/* Ensure that the Data Segment is aligned on a 16-byte boundary */
1656 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)deth + sizeof (ib_deth_hdr_t));
1657 	ds = (hermon_hw_wqe_sgl_t *)(((uintptr_t)ds + 0xF) & ~0xF);
1658 	nds = wr->wr_nds;
1659 	sgl = wr->wr_sgl;
1660 	num_ds = 0;
1661 
	/*
	 * Now fill in the Data Segments (SGL) for the MLX WQE based on the
	 * values set up above (i.e. "sgl", "nds", and the "ds" pointer).
	 * Start by checking for a valid number of SGL entries.
	 */
1667 	if (nds > qp->qp_sq_sgl) {
1668 		return (IBT_QP_SGL_LEN_INVALID);
1669 	}
1670 
1671 	/*
1672 	 * For each SGL in the Send Work Request, fill in the MLX WQE's data
1673 	 * segments.  Note: We skip any SGL with zero size because Hermon
1674 	 * hardware cannot handle a zero for "byte_cnt" in the WQE.  Actually
1675 	 * the encoding for zero means a 2GB transfer.  Because of this special
1676 	 * encoding in the hardware, we mask the requested length with
1677 	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that 2GB will end up encoded as
1678 	 * zero.)
1679 	 */
1680 	mgmtclass = hpoint = hcount = NULL;
1681 	offset = 0;
1682 	for (i = 0; i < nds; i++) {
1683 		if (sgl[i].ds_len == 0) {
1684 			continue;
1685 		}
1686 
1687 		/*
1688 		 * Fill in the Data Segment(s) for the MLX send WQE, using
1689 		 * the information contained in the scatter-gather list of
1690 		 * the work request.
1691 		 */
1692 		HERMON_WQE_BUILD_DATA_SEG_SEND(&ds[num_ds], &sgl[i]);
1693 
1694 		/*
1695 		 * Search through the contents of all MADs posted to QP0 to
1696 		 * initialize pointers to the places where Directed Route "hop
1697 		 * pointer", "hop count", and "mgmtclass" would be.  Hermon
1698 		 * needs these updated (i.e. incremented or decremented, as
1699 		 * necessary) by software.
1700 		 */
1701 		if (qp->qp_is_special == HERMON_QP_SMI) {
1702 
1703 			HERMON_SPECIAL_QP_DRMAD_GET_MGMTCLASS(mgmtclass,
1704 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1705 
1706 			HERMON_SPECIAL_QP_DRMAD_GET_HOPPOINTER(hpoint,
1707 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1708 
1709 			HERMON_SPECIAL_QP_DRMAD_GET_HOPCOUNT(hcount,
1710 			    offset, sgl[i].ds_va, sgl[i].ds_len);
1711 
1712 			offset += sgl[i].ds_len;
1713 		}
1714 		num_ds++;
1715 	}
1716 
1717 	/*
1718 	 * Hermon's Directed Route MADs need to have the "hop pointer"
1719 	 * incremented/decremented (as necessary) depending on whether it is
1720 	 * currently less than or greater than the "hop count" (i.e. whether
1721 	 * the MAD is a request or a response.)
1722 	 */
1723 	if (qp->qp_is_special == HERMON_QP_SMI) {
1724 		HERMON_SPECIAL_QP_DRMAD_DO_HOPPOINTER_MODIFY(*mgmtclass,
1725 		    *hpoint, *hcount);
1726 	}
1727 
	/*
	 * Now fill in the ICRC Data Segment.  This data segment is inlined
	 * just like the packet headers above, but it is only four bytes and
	 * set to zero (to indicate that we wish the hardware to generate
	 * the ICRC).
	 */
1733 	HERMON_WQE_BUILD_INLINE_ICRC(qp, &ds[num_ds], 4, 0);
1734 	num_ds++;
1735 
1736 	/*
1737 	 * Return the size of descriptor (in 16-byte chunks)
1738 	 * For Hermon, we want them (for now) to be on stride size
1739 	 * boundaries, which was implicit in Tavor/Arbel
1740 	 */
1741 	tmpsize = ((uintptr_t)&ds[num_ds] - (uintptr_t)desc);
1742 
1743 	*size = tmpsize >> 0x04;
1744 
1745 	return (DDI_SUCCESS);
1746 }
1747 
1748 
1749 
1750 /*
1751  * hermon_wqe_recv_build()
1752  *    Context: Can be called from interrupt or base context.
1753  */
1754 /* ARGSUSED */
1755 static int
1756 hermon_wqe_recv_build(hermon_state_t *state, hermon_qphdl_t qp,
1757     ibt_recv_wr_t *wr, uint64_t *desc)
1758 {
1759 	hermon_hw_wqe_sgl_t	*ds;
1760 	int			i, num_ds;
1761 
1762 	ASSERT(MUTEX_HELD(&qp->qp_rq_lock));
1763 
	/*
	 * Fill in the Data Segments (SGL) for the Recv WQE.  Note that
	 * no space needs to be reserved for a ctrl segment - there is
	 * none on the Hermon recv queue - but we will need to append an
	 * invalid (null) scatter pointer, per the PRM.
	 */
1770 	ds = (hermon_hw_wqe_sgl_t *)(uintptr_t)desc;
1771 	num_ds = 0;
1772 
1773 	/* Check for valid number of SGL entries */
1774 	if (wr->wr_nds > qp->qp_rq_sgl) {
1775 		return (IBT_QP_SGL_LEN_INVALID);
1776 	}
1777 
1778 	/*
1779 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because the Hermon
	 * hardware cannot handle a zero "byte_cnt" in a WQE; it instead
	 * interprets a zero "byte_cnt" as a 2GB transfer.  Because of this
	 * special encoding, we mask the requested length with
	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that a 2GB request ends up
	 * encoded as zero).
1786 	 */
1787 	for (i = 0; i < wr->wr_nds; i++) {
1788 		if (wr->wr_sgl[i].ds_len == 0) {
1789 			continue;
1790 		}
1791 
1792 		/*
1793 		 * Fill in the Data Segment(s) for the receive WQE, using the
1794 		 * information contained in the scatter-gather list of the
1795 		 * work request.
1796 		 */
1797 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1798 		num_ds++;
1799 	}
1800 
	/* Append the null SGL pointer as a terminator, if needed */
1802 	if (num_ds < qp->qp_rq_sgl) {
1803 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1804 	}
1805 
1806 	return (DDI_SUCCESS);
1807 }
1808 
1809 
1810 
1811 /*
1812  * hermon_wqe_srq_build()
1813  *    Context: Can be called from interrupt or base context.
1814  */
1815 /* ARGSUSED */
1816 static int
1817 hermon_wqe_srq_build(hermon_state_t *state, hermon_srqhdl_t srq,
1818     ibt_recv_wr_t *wr, uint64_t *desc)
1819 {
1820 	hermon_hw_wqe_sgl_t	*ds;
1821 	int			i, num_ds;
1822 
1823 	ASSERT(MUTEX_HELD(&srq->srq_lock));
1824 
1825 	/* Fill in the Data Segments (SGL) for the Recv WQE */
1826 	ds = (hermon_hw_wqe_sgl_t *)((uintptr_t)desc +
1827 	    sizeof (hermon_hw_srq_wqe_next_t));
1828 	num_ds = 0;
1829 
1830 	/* Check for valid number of SGL entries */
1831 	if (wr->wr_nds > srq->srq_wq_sgl) {
1832 		return (IBT_QP_SGL_LEN_INVALID);
1833 	}
1834 
1835 	/*
1836 	 * For each SGL in the Recv Work Request, fill in the Recv WQE's data
	 * segments.  Note: We skip any SGL with zero size because the Hermon
	 * hardware cannot handle a zero "byte_cnt" in a WQE; it instead
	 * interprets a zero "byte_cnt" as a 2GB transfer.  Because of this
	 * special encoding, we mask the requested length with
	 * HERMON_WQE_SGL_BYTE_CNT_MASK (so that a 2GB request ends up
	 * encoded as zero).
1843 	 */
1844 	for (i = 0; i < wr->wr_nds; i++) {
1845 		if (wr->wr_sgl[i].ds_len == 0) {
1846 			continue;
1847 		}
1848 
1849 		/*
1850 		 * Fill in the Data Segment(s) for the receive WQE, using the
1851 		 * information contained in the scatter-gather list of the
1852 		 * work request.
1853 		 */
1854 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &wr->wr_sgl[i]);
1855 		num_ds++;
1856 	}
1857 
1858 	/*
1859 	 * put in the null sgl pointer as well, if needed
1860 	 */
1861 	if (num_ds < srq->srq_wq_sgl) {
1862 		HERMON_WQE_BUILD_DATA_SEG_RECV(&ds[num_ds], &null_sgl);
1863 	}
1864 
1865 	return (DDI_SUCCESS);
1866 }
1867 
1868 
1869 /*
1870  * hermon_wr_get_immediate()
1871  *    Context: Can be called from interrupt or base context.
1872  */
1873 static uint32_t
1874 hermon_wr_get_immediate(ibt_send_wr_t *wr)
1875 {
1876 	/*
1877 	 * This routine extracts the "immediate data" from the appropriate
1878 	 * location in the IBTF work request.  Because of the way the
1879 	 * work request structure is defined, the location for this data
1880 	 * depends on the actual work request operation type.
1881 	 */
1882 
1883 	/* For RDMA Write, test if RC or UC */
1884 	if (wr->wr_opcode == IBT_WRC_RDMAW) {
1885 		if (wr->wr_trans == IBT_RC_SRV) {
1886 			return (wr->wr.rc.rcwr.rdma.rdma_immed);
1887 		} else {  /* IBT_UC_SRV */
1888 			return (wr->wr.uc.ucwr.rdma.rdma_immed);
1889 		}
1890 	}
1891 
1892 	/* For Send, test if RC, UD, or UC */
1893 	if (wr->wr_opcode == IBT_WRC_SEND) {
1894 		if (wr->wr_trans == IBT_RC_SRV) {
1895 			return (wr->wr.rc.rcwr.send_immed);
1896 		} else if (wr->wr_trans == IBT_UD_SRV) {
1897 			return (wr->wr.ud.udwr_immed);
1898 		} else {  /* IBT_UC_SRV */
1899 			return (wr->wr.uc.ucwr.send_immed);
1900 		}
1901 	}
1902 
1903 	/*
	 * For any other type of request, the immediate data is undefined
1905 	 */
1906 	return (0);
1907 }
1908 
1909 /*
1910  * hermon_wqe_headroom()
1911  *	Context: can be called from interrupt or base, currently only from
1912  *	base context.
1913  * Routine that fills in the headroom for the Send Queue
1914  */
1915 
1916 static void
1917 hermon_wqe_headroom(uint_t from, hermon_qphdl_t qp)
1918 {
1919 	uint32_t	*wqe_start, *wqe_top, *wqe_base, qsize;
1920 	int		hdrmwqes, wqesizebytes, sectperwqe;
1921 	uint32_t	invalue;
1922 	int		i, j;
1923 
1924 	qsize	 = qp->qp_sq_bufsz;
1925 	wqesizebytes = 1 << qp->qp_sq_log_wqesz;
1926 	sectperwqe = wqesizebytes >> 6; 	/* 64 bytes/section */
1927 	hdrmwqes = qp->qp_sq_hdrmwqes;
1928 	wqe_base  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, 0);
1929 	wqe_top	  = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, qsize);
1930 	wqe_start = (uint32_t *)HERMON_QP_SQ_ENTRY(qp, from);
1931 
1932 	for (i = 0; i < hdrmwqes; i++)	{
1933 		for (j = 0; j < sectperwqe; j++) {
1934 			if (j == 0) {		/* 1st section of wqe */
				/* preserve the ownership bit */
1936 				invalue = ddi_get32(qp->qp_wqinfo.qa_acchdl,
1937 				    wqe_start) | 0x7FFFFFFF;
1938 			} else {
1939 				/* or just invalidate it */
1940 				invalue = 0xFFFFFFFF;
1941 			}
1942 			ddi_put32(qp->qp_wqinfo.qa_acchdl, wqe_start, invalue);
1943 			wqe_start += 16;	/* move 64 bytes */
1944 		}
1945 		if (wqe_start == wqe_top)	/* hit the end of the queue */
1946 			wqe_start = wqe_base;	/* wrap to start */
1947 	}
1948 }
1949 
1950 /*
1951  * hermon_wqe_sync()
1952  *    Context: Can be called from interrupt or base context.
1953  */
1954 static void
1955 hermon_wqe_sync(void *hdl, uint_t sync_from, uint_t sync_to,
1956     uint_t sync_type, uint_t flag)
1957 {
1958 	hermon_qphdl_t		qp;
1959 	hermon_srqhdl_t		srq;
1960 	uint64_t		*wqe_from, *wqe_to;
1961 	uint64_t		*wq_base, *wq_top, *qp_base;
1962 	ddi_dma_handle_t	dmahdl;
1963 	off_t			offset;
1964 	size_t			length;
1965 	uint32_t		qsize;
1966 	int			status;
1967 
1968 	if (sync_type == HERMON_WR_SRQ) {
1969 		srq = (hermon_srqhdl_t)hdl;
1970 		/* Get the DMA handle from SRQ context */
1971 		dmahdl = srq->srq_mrhdl->mr_bindinfo.bi_dmahdl;
1972 		/* get base addr of the buffer */
1973 		qp_base = (uint64_t *)(void *)srq->srq_wq_buf;
1974 	} else {
1975 		qp = (hermon_qphdl_t)hdl;
1976 		/* Get the DMA handle from QP context */
1977 		dmahdl = qp->qp_mrhdl->mr_bindinfo.bi_dmahdl;
1978 		/* Determine the base address of the QP buffer */
1979 		if (qp->qp_sq_baseaddr == 0) {
1980 			qp_base = (uint64_t *)(void *)(qp->qp_sq_buf);
1981 		} else {
1982 			qp_base = (uint64_t *)(void *)(qp->qp_rq_buf);
1983 		}
1984 	}
1985 
1986 	/*
1987 	 * Depending on the type of the work queue, we grab information
1988 	 * about the address ranges we need to DMA sync.
1989 	 */
1990 
1991 	if (sync_type == HERMON_WR_SEND) {
1992 		wqe_from = HERMON_QP_SQ_ENTRY(qp, sync_from);
1993 		wqe_to   = HERMON_QP_SQ_ENTRY(qp, sync_to);
1994 		qsize	 = qp->qp_sq_bufsz;
1995 
1996 		wq_base = HERMON_QP_SQ_ENTRY(qp, 0);
1997 		wq_top	 = HERMON_QP_SQ_ENTRY(qp, qsize);
1998 	} else if (sync_type == HERMON_WR_RECV) {
1999 		wqe_from = HERMON_QP_RQ_ENTRY(qp, sync_from);
2000 		wqe_to   = HERMON_QP_RQ_ENTRY(qp, sync_to);
2001 		qsize	 = qp->qp_rq_bufsz;
2002 
2003 		wq_base = HERMON_QP_RQ_ENTRY(qp, 0);
2004 		wq_top	 = HERMON_QP_RQ_ENTRY(qp, qsize);
2005 	} else {
2006 		wqe_from = HERMON_SRQ_WQ_ENTRY(srq, sync_from);
2007 		wqe_to   = HERMON_SRQ_WQ_ENTRY(srq, sync_to);
2008 		qsize	 = srq->srq_wq_bufsz;
2009 
2010 		wq_base = HERMON_SRQ_WQ_ENTRY(srq, 0);
2011 		wq_top	 = HERMON_SRQ_WQ_ENTRY(srq, qsize);
2012 	}
2013 
2014 	/*
2015 	 * There are two possible cases for the beginning and end of the WQE
	 * chain we are trying to sync.  Either this is the simple case, where
	 * the end of the chain lies beyond its beginning, or it is the
	 * "wrap-around" case, where the end of the chain has wrapped past the
	 * end of the queue.  In the former case, we simply need to
2020 	 * calculate the span from beginning to end and sync it.  In the latter
2021 	 * case, however, we need to calculate the span from the top of the
2022 	 * work queue to the end of the chain and sync that, and then we need
2023 	 * to find the other portion (from beginning of chain to end of queue)
2024 	 * and sync that as well.  Note: if the "top to end" span is actually
2025 	 * zero length, then we don't do a DMA sync because a zero length DMA
2026 	 * sync unnecessarily syncs the entire work queue.
2027 	 */
2028 	if (wqe_to > wqe_from) {
2029 		/* "From Beginning to End" */
2030 
2031 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2032 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wqe_from);
2033 
2034 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2035 		if (status != DDI_SUCCESS) {
2036 			return;
2037 		}
2038 	} else {
2039 		/* "From Top to End" */
2040 
2041 		offset = (off_t)((uintptr_t)wq_base - (uintptr_t)qp_base);
2042 		length = (size_t)((uintptr_t)wqe_to - (uintptr_t)wq_base);
2043 		if (length) {
2044 			status = ddi_dma_sync(dmahdl, offset, length, flag);
2045 			if (status != DDI_SUCCESS) {
2046 				return;
2047 			}
2048 		}
2049 
2050 		/* "From Beginning to Bottom" */
2051 
2052 		offset = (off_t)((uintptr_t)wqe_from - (uintptr_t)qp_base);
2053 		length = (size_t)((uintptr_t)wq_top - (uintptr_t)wqe_from);
2054 		status = ddi_dma_sync(dmahdl, offset, length, flag);
2055 		if (status != DDI_SUCCESS) {
2056 			return;
2057 		}
2058 	}
2059 }
2060 
2061 
2062 /*
2063  * hermon_wr_bind_check()
2064  *    Context: Can be called from interrupt or base context.
2065  */
2066 /* ARGSUSED */
2067 static int
2068 hermon_wr_bind_check(hermon_state_t *state, ibt_send_wr_t *wr)
2069 {
2070 	ibt_bind_flags_t	bind_flags;
2071 	uint64_t		vaddr, len;
2072 	uint64_t		reg_start_addr, reg_end_addr;
2073 	hermon_mwhdl_t		mw;
2074 	hermon_mrhdl_t		mr;
2075 	hermon_rsrc_t		*mpt;
2076 	uint32_t		new_rkey;
2077 
2078 	/* Check for a valid Memory Window handle in the WR */
2079 	mw = (hermon_mwhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mw_hdl;
2080 	if (mw == NULL) {
2081 		return (IBT_MW_HDL_INVALID);
2082 	}
2083 
2084 	/* Check for a valid Memory Region handle in the WR */
2085 	mr = (hermon_mrhdl_t)wr->wr.rc.rcwr.bind->bind_ibt_mr_hdl;
2086 	if (mr == NULL) {
2087 		return (IBT_MR_HDL_INVALID);
2088 	}
2089 
2090 	mutex_enter(&mr->mr_lock);
2091 	mutex_enter(&mw->mr_lock);
2092 
2093 	/*
2094 	 * Check here to see if the memory region has already been partially
2095 	 * deregistered as a result of a hermon_umap_umemlock_cb() callback.
2096 	 * If so, this is an error, return failure.
2097 	 */
2098 	if ((mr->mr_is_umem) && (mr->mr_umemcookie == NULL)) {
2099 		mutex_exit(&mr->mr_lock);
2100 		mutex_exit(&mw->mr_lock);
2101 		return (IBT_MR_HDL_INVALID);
2102 	}
2103 
2104 	/* Check for a valid Memory Window RKey (i.e. a matching RKey) */
2105 	if (mw->mr_rkey != wr->wr.rc.rcwr.bind->bind_rkey) {
2106 		mutex_exit(&mr->mr_lock);
2107 		mutex_exit(&mw->mr_lock);
2108 		return (IBT_MR_RKEY_INVALID);
2109 	}
2110 
2111 	/* Check for a valid Memory Region LKey (i.e. a matching LKey) */
2112 	if (mr->mr_lkey != wr->wr.rc.rcwr.bind->bind_lkey) {
2113 		mutex_exit(&mr->mr_lock);
2114 		mutex_exit(&mw->mr_lock);
2115 		return (IBT_MR_LKEY_INVALID);
2116 	}
2117 
2118 	/*
2119 	 * Now check for valid "vaddr" and "len".  Note:  We don't check the
2120 	 * "vaddr" range when "len == 0" (i.e. on unbind operations)
2121 	 */
2122 	len = wr->wr.rc.rcwr.bind->bind_len;
2123 	if (len != 0) {
2124 		vaddr = wr->wr.rc.rcwr.bind->bind_va;
2125 		reg_start_addr = mr->mr_bindinfo.bi_addr;
2126 		reg_end_addr   = mr->mr_bindinfo.bi_addr +
2127 		    (mr->mr_bindinfo.bi_len - 1);
2128 		if ((vaddr < reg_start_addr) || (vaddr > reg_end_addr)) {
2129 			mutex_exit(&mr->mr_lock);
2130 			mutex_exit(&mw->mr_lock);
2131 			return (IBT_MR_VA_INVALID);
2132 		}
2133 		vaddr = (vaddr + len) - 1;
2134 		if (vaddr > reg_end_addr) {
2135 			mutex_exit(&mr->mr_lock);
2136 			mutex_exit(&mw->mr_lock);
2137 			return (IBT_MR_LEN_INVALID);
2138 		}
2139 	}
2140 
2141 	/*
2142 	 * Validate the bind access flags.  Remote Write and Atomic access for
2143 	 * the Memory Window require that Local Write access be set in the
2144 	 * corresponding Memory Region.
2145 	 */
2146 	bind_flags = wr->wr.rc.rcwr.bind->bind_flags;
2147 	if (((bind_flags & IBT_WR_BIND_WRITE) ||
2148 	    (bind_flags & IBT_WR_BIND_ATOMIC)) &&
2149 	    !(mr->mr_accflag & IBT_MR_LOCAL_WRITE)) {
2150 		mutex_exit(&mr->mr_lock);
2151 		mutex_exit(&mw->mr_lock);
2152 		return (IBT_MR_ACCESS_REQ_INVALID);
2153 	}
2154 
2155 	/* Calculate the new RKey for the Memory Window */
2156 	mpt = mw->mr_mptrsrcp;
2157 	new_rkey = hermon_mr_keycalc(mpt->hr_indx);
2158 	new_rkey = hermon_mr_key_swap(new_rkey);
2159 
2160 	wr->wr.rc.rcwr.bind->bind_rkey_out = new_rkey;
2161 	mw->mr_rkey = new_rkey;
2162 
2163 	mutex_exit(&mr->mr_lock);
2164 	mutex_exit(&mw->mr_lock);
2165 	return (DDI_SUCCESS);
2166 }
2167 
2168 
2169 /*
2170  * hermon_wrid_from_reset_handling()
2171  *    Context: Can be called from interrupt or base context.
2172  */
2173 /* ARGSUSED */
2174 int
2175 hermon_wrid_from_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2176 {
2177 	hermon_workq_hdr_t	*swq, *rwq;
2178 	uint_t			qp_srq_en;
2179 
2180 	if (qp->qp_is_umap)
2181 		return (DDI_SUCCESS);
2182 
2183 	/* grab the cq lock(s) to modify the wqavl tree */
2184 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2185 #ifdef __lock_lint
2186 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2187 #else
2188 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2189 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2190 #endif
2191 
2192 	/* Chain the newly allocated work queue header to the CQ's list */
2193 	hermon_cq_workq_add(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2194 
2195 	swq = qp->qp_sq_wqhdr;
2196 	swq->wq_head = 0;
2197 	swq->wq_tail = 0;
2198 	swq->wq_full = 0;
2199 
2200 	/*
2201 	 * Now we repeat all the above operations for the receive work queue,
2202 	 * or shared receive work queue.
2203 	 *
2204 	 * Note: We still use the 'qp_rq_cqhdl' even in the SRQ case.
2205 	 */
2206 	qp_srq_en = qp->qp_srq_en;
2207 
2208 #ifdef __lock_lint
2209 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2210 #else
2211 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2212 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2213 	} else {
2214 		rwq = qp->qp_rq_wqhdr;
2215 		rwq->wq_head = 0;
2216 		rwq->wq_tail = 0;
2217 		rwq->wq_full = 0;
2218 		qp->qp_rq_wqecntr = 0;
2219 	}
2220 #endif
2221 	hermon_cq_workq_add(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2222 
2223 #ifdef __lock_lint
2224 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2225 #else
2226 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2227 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2228 	}
2229 #endif
2230 
2231 #ifdef __lock_lint
2232 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2233 #else
2234 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2235 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2236 #endif
2237 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2238 	return (DDI_SUCCESS);
2239 }
2240 
2241 
2242 /*
2243  * hermon_wrid_to_reset_handling()
2244  *    Context: Can be called from interrupt or base context.
2245  */
2246 int
2247 hermon_wrid_to_reset_handling(hermon_state_t *state, hermon_qphdl_t qp)
2248 {
2249 	uint_t			qp_srq_en;
2250 
2251 	if (qp->qp_is_umap)
2252 		return (DDI_SUCCESS);
2253 
2254 	/*
	 * Any entries remaining unpolled in these CQs are now flushed.
	 * Grab the CQ lock(s) before manipulating the lists.
2258 	 */
2259 	mutex_enter(&qp->qp_rq_cqhdl->cq_lock);
2260 #ifdef __lock_lint
2261 	mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2262 #else
2263 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2264 		mutex_enter(&qp->qp_sq_cqhdl->cq_lock);
2265 #endif
2266 
2267 	qp_srq_en = qp->qp_srq_en;
2268 #ifdef __lock_lint
2269 	mutex_enter(&qp->qp_srqhdl->srq_lock);
2270 #else
2271 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2272 		mutex_enter(&qp->qp_srqhdl->srq_lock);
2273 	}
2274 #endif
2275 	/*
2276 	 * Flush the entries on the CQ for this QP's QPN.
2277 	 */
2278 	hermon_cq_entries_flush(state, qp);
2279 
2280 #ifdef __lock_lint
2281 	mutex_exit(&qp->qp_srqhdl->srq_lock);
2282 #else
2283 	if (qp_srq_en == HERMON_QP_SRQ_ENABLED) {
2284 		mutex_exit(&qp->qp_srqhdl->srq_lock);
2285 	}
2286 #endif
2287 
2288 	hermon_cq_workq_remove(qp->qp_rq_cqhdl, &qp->qp_rq_wqavl);
2289 	hermon_cq_workq_remove(qp->qp_sq_cqhdl, &qp->qp_sq_wqavl);
2290 
2291 #ifdef __lock_lint
2292 	mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2293 #else
2294 	if (qp->qp_rq_cqhdl != qp->qp_sq_cqhdl)
2295 		mutex_exit(&qp->qp_sq_cqhdl->cq_lock);
2296 #endif
2297 	mutex_exit(&qp->qp_rq_cqhdl->cq_lock);
2298 
2299 	return (IBT_SUCCESS);
2300 }
2301 
2302 
2303 /*
2304  * hermon_wrid_get_entry()
2305  *    Context: Can be called from interrupt or base context.
2306  */
2307 uint64_t
2308 hermon_wrid_get_entry(hermon_cqhdl_t cq, hermon_hw_cqe_t *cqe)
2309 {
2310 	hermon_workq_avl_t	*wqa;
2311 	hermon_workq_hdr_t	*wq;
2312 	uint64_t		wrid;
2313 	uint_t			send_or_recv, qpnum;
2314 	uint32_t		indx;
2315 
2316 	/*
2317 	 * Determine whether this CQE is a send or receive completion.
2318 	 */
2319 	send_or_recv = HERMON_CQE_SENDRECV_GET(cq, cqe);
2320 
2321 	/* Find the work queue for this QP number (send or receive side) */
2322 	qpnum = HERMON_CQE_QPNUM_GET(cq, cqe);
2323 	wqa = hermon_wrid_wqavl_find(cq, qpnum, send_or_recv);
2324 	wq = wqa->wqa_wq;
2325 
2326 	/*
2327 	 * Regardless of whether the completion is the result of a "success"
2328 	 * or a "failure", we lock the list of "containers" and attempt to
	 * search for the first matching completion (i.e. the first WR
2330 	 * with a matching WQE addr and size).  Once we find it, we pull out
2331 	 * the "wrid" field and return it (see below).  XXX Note: One possible
2332 	 * future enhancement would be to enable this routine to skip over
2333 	 * any "unsignaled" completions to go directly to the next "signaled"
2334 	 * entry on success.
2335 	 */
2336 	indx = HERMON_CQE_WQEADDRSZ_GET(cq, cqe) & wq->wq_mask;
2337 	wrid = wq->wq_wrid[indx];
2338 	if (wqa->wqa_srq_en) {
2339 		struct hermon_sw_srq_s	*srq;
2340 		uint64_t		*desc;
2341 
2342 		/* put wqe back on the srq free list */
2343 		srq = wqa->wqa_srq;
2344 		mutex_enter(&srq->srq_lock);
2345 		desc = HERMON_SRQ_WQE_ADDR(srq, wq->wq_tail);
2346 		((uint16_t *)desc)[1] = htons(indx);
2347 		wq->wq_tail = indx;
2348 		mutex_exit(&srq->srq_lock);
2349 	} else {
2350 		wq->wq_head = (indx + 1) & wq->wq_mask;
2351 		wq->wq_full = 0;
2352 	}
2353 
2354 	return (wrid);
2355 }
2356 
2357 
2358 int
2359 hermon_wrid_workq_compare(const void *p1, const void *p2)
2360 {
2361 	hermon_workq_compare_t	*cmpp;
2362 	hermon_workq_avl_t	*curr;
2363 
2364 	cmpp = (hermon_workq_compare_t *)p1;
2365 	curr = (hermon_workq_avl_t *)p2;
2366 
2367 	if (cmpp->cmp_qpn < curr->wqa_qpn)
2368 		return (-1);
2369 	else if (cmpp->cmp_qpn > curr->wqa_qpn)
2370 		return (+1);
2371 	else if (cmpp->cmp_type < curr->wqa_type)
2372 		return (-1);
2373 	else if (cmpp->cmp_type > curr->wqa_type)
2374 		return (+1);
2375 	else
2376 		return (0);
2377 }
2378 
2379 
2380 /*
 * hermon_wrid_wqavl_find()
2382  *    Context: Can be called from interrupt or base context.
2383  */
2384 static hermon_workq_avl_t *
2385 hermon_wrid_wqavl_find(hermon_cqhdl_t cq, uint_t qpn, uint_t wq_type)
2386 {
2387 	hermon_workq_avl_t	*curr;
2388 	hermon_workq_compare_t	cmp;
2389 
2390 	/*
	 * Search the CQ's AVL tree of work queue headers for the send or
	 * receive work queue that matches this QP number and queue type.
2395 	 */
2396 	cmp.cmp_qpn = qpn;
2397 	cmp.cmp_type = wq_type;
2398 #ifdef __lock_lint
2399 	hermon_wrid_workq_compare(NULL, NULL);
2400 #endif
2401 	curr = avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, NULL);
2402 
2403 	return (curr);
2404 }
2405 
2406 
2407 /*
2408  * hermon_wrid_wqhdr_create()
2409  *    Context: Can be called from base context.
2410  */
2411 /* ARGSUSED */
2412 hermon_workq_hdr_t *
2413 hermon_wrid_wqhdr_create(int bufsz)
2414 {
2415 	hermon_workq_hdr_t	*wqhdr;
2416 
2417 	/*
2418 	 * Allocate space for the wqhdr, and an array to record all the wrids.
2419 	 */
2420 	wqhdr = (hermon_workq_hdr_t *)kmem_zalloc(sizeof (*wqhdr), KM_NOSLEEP);
2421 	if (wqhdr == NULL) {
2422 		return (NULL);
2423 	}
2424 	_NOTE(NOW_INVISIBLE_TO_OTHER_THREADS(*wqhdr))
2425 	wqhdr->wq_wrid = kmem_zalloc(bufsz * sizeof (uint64_t), KM_NOSLEEP);
2426 	if (wqhdr->wq_wrid == NULL) {
2427 		kmem_free(wqhdr, sizeof (*wqhdr));
2428 		return (NULL);
2429 	}
2430 	wqhdr->wq_size = bufsz;
2431 	wqhdr->wq_mask = bufsz - 1;
2432 
2433 	return (wqhdr);
2434 }
2435 
2436 void
2437 hermon_wrid_wqhdr_destroy(hermon_workq_hdr_t *wqhdr)
2438 {
2439 	kmem_free(wqhdr->wq_wrid, wqhdr->wq_size * sizeof (uint64_t));
2440 	kmem_free(wqhdr, sizeof (*wqhdr));
2441 }
2442 
2443 
2444 /*
2445  * hermon_cq_workq_add()
2446  *    Context: Can be called from interrupt or base context.
2447  */
2448 static void
2449 hermon_cq_workq_add(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2450 {
2451 	hermon_workq_compare_t	cmp;
2452 	avl_index_t		where;
2453 
2454 	cmp.cmp_qpn = wqavl->wqa_qpn;
2455 	cmp.cmp_type = wqavl->wqa_type;
2456 #ifdef __lock_lint
2457 	hermon_wrid_workq_compare(NULL, NULL);
2458 #endif
2459 	(void) avl_find(&cq->cq_wrid_wqhdr_avl_tree, &cmp, &where);
2460 	avl_insert(&cq->cq_wrid_wqhdr_avl_tree, wqavl, where);
2461 }
2462 
2463 
2464 /*
2465  * hermon_cq_workq_remove()
2466  *    Context: Can be called from interrupt or base context.
2467  */
2468 static void
2469 hermon_cq_workq_remove(hermon_cqhdl_t cq, hermon_workq_avl_t *wqavl)
2470 {
2471 #ifdef __lock_lint
2472 	hermon_wrid_workq_compare(NULL, NULL);
2473 #endif
2474 	avl_remove(&cq->cq_wrid_wqhdr_avl_tree, wqavl);
2475 }
2476