xref: /freebsd/contrib/ofed/libmlx5/cq.c (revision 0957b409)
1 /*
2  * Copyright (c) 2012 Mellanox Technologies, Inc.  All rights reserved.
3  *
4  * This software is available to you under a choice of one of two
5  * licenses.  You may choose to be licensed under the terms of the GNU
6  * General Public License (GPL) Version 2, available from the file
7  * COPYING in the main directory of this source tree, or the
8  * OpenIB.org BSD license below:
9  *
10  *     Redistribution and use in source and binary forms, with or
11  *     without modification, are permitted provided that the following
12  *     conditions are met:
13  *
14  *      - Redistributions of source code must retain the above
15  *        copyright notice, this list of conditions and the following
16  *        disclaimer.
17  *
18  *      - Redistributions in binary form must reproduce the above
19  *        copyright notice, this list of conditions and the following
20  *        disclaimer in the documentation and/or other materials
21  *        provided with the distribution.
22  *
23  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
24  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
25  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
26  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
27  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
28  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
29  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
30  * SOFTWARE.
31  */
32 
33 #include <config.h>
34 
35 #include <stdio.h>
36 #include <stdlib.h>
37 #include <pthread.h>
38 #include <string.h>
39 #include <errno.h>
40 #include <unistd.h>
41 
42 #include <infiniband/opcode.h>
43 
44 #include "mlx5.h"
45 #include "wqe.h"
46 #include "doorbell.h"
47 
/* Internal return codes used by the CQ polling routines below. */
enum {
	CQ_OK					=  0,
	CQ_EMPTY				= -1,
	CQ_POLL_ERR				= -2
};
53 
/*
 * Modify-CQ operation selectors.
 * NOTE(review): "RESEIZE" looks like a typo for "RESIZE", but the
 * identifier may be referenced elsewhere in the provider; renaming it
 * would need a sweep of all users.
 */
enum {
	MLX5_CQ_MODIFY_RESEIZE = 0,
	MLX5_CQ_MODIFY_MODER = 1,
	MLX5_CQ_MODIFY_MAPPING = 2,
};
59 
/* Tunables for the CQ stall/back-off heuristics (see poll_cq()). */
int mlx5_stall_num_loop = 60;		/* iterations of the fixed stall loop */
int mlx5_stall_cq_poll_min = 60;	/* adaptive mode: lower cycle bound */
int mlx5_stall_cq_poll_max = 100000;	/* adaptive mode: upper cycle bound */
int mlx5_stall_cq_inc_step = 100;	/* adaptive mode: back-off increment */
int mlx5_stall_cq_dec_step = 10;	/* adaptive mode: back-off decrement */
65 
66 static inline uint8_t get_cqe_l3_hdr_type(struct mlx5_cqe64 *cqe)
67 {
68 	return (cqe->l4_hdr_type_etc >> 2) & 0x3;
69 }
70 
71 static void *get_buf_cqe(struct mlx5_buf *buf, int n, int cqe_sz)
72 {
73 	return buf->buf + n * cqe_sz;
74 }
75 
76 static void *get_cqe(struct mlx5_cq *cq, int n)
77 {
78 	return cq->active_buf->buf + n * cq->cqe_sz;
79 }
80 
/*
 * Return the CQE at index @n if it is currently owned by software,
 * NULL otherwise.
 *
 * For CQE sizes larger than 64 bytes the 64-byte hardware CQE lives at
 * offset 64 within the entry, so the opcode/ownership byte is read there.
 * Ownership alternates each time the index wraps the (power-of-two sized)
 * ring: the entry is software-owned when its owner bit matches the
 * wrap-count parity of @n.
 */
static void *get_sw_cqe(struct mlx5_cq *cq, int n)
{
	void *cqe = get_cqe(cq, n & cq->ibv_cq.cqe);
	struct mlx5_cqe64 *cqe64;

	cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64;

	if (likely(mlx5dv_get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibv_cq.cqe + 1)))) {
		return cqe;
	} else {
		return NULL;
	}
}
95 
/* Peek the CQE at the current consumer index, if software-owned. */
static void *next_cqe_sw(struct mlx5_cq *cq)
{
	return get_sw_cqe(cq, cq->cons_index);
}
100 
/*
 * Publish the consumer index to the CQ doorbell record (24-bit field,
 * big-endian) so the hardware knows which CQEs have been consumed.
 */
static void update_cons_index(struct mlx5_cq *cq)
{
	cq->dbrec[MLX5_CQ_SET_CI] = htobe32(cq->cons_index & 0xffffff);
}
105 
/*
 * Fill in wc->opcode (plus byte_len / wc_flags where relevant) for a
 * successful requester completion.  The WQE opcode is carried in the top
 * byte of sop_drop_qpn.  @idx is the SQ slot of the completed WQE, used
 * for UMR WRs to recover the opcode recorded at post time in wr_data.
 * NOTE(review): opcodes not listed here leave wc->opcode untouched.
 */
static inline void handle_good_req(struct ibv_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_wq *wq, int idx)
{
	switch (be32toh(cqe->sop_drop_qpn) >> 24) {
	case MLX5_OPCODE_RDMA_WRITE_IMM:
		wc->wc_flags |= IBV_WC_WITH_IMM;
		SWITCH_FALLTHROUGH;
	case MLX5_OPCODE_RDMA_WRITE:
		wc->opcode    = IBV_WC_RDMA_WRITE;
		break;
	case MLX5_OPCODE_SEND_IMM:
		wc->wc_flags |= IBV_WC_WITH_IMM;
		SWITCH_FALLTHROUGH;
	case MLX5_OPCODE_SEND:
	case MLX5_OPCODE_SEND_INVAL:
		wc->opcode    = IBV_WC_SEND;
		break;
	case MLX5_OPCODE_RDMA_READ:
		wc->opcode    = IBV_WC_RDMA_READ;
		wc->byte_len  = be32toh(cqe->byte_cnt);
		break;
	case MLX5_OPCODE_ATOMIC_CS:
		wc->opcode    = IBV_WC_COMP_SWAP;
		wc->byte_len  = 8;
		break;
	case MLX5_OPCODE_ATOMIC_FA:
		wc->opcode    = IBV_WC_FETCH_ADD;
		wc->byte_len  = 8;
		break;
	case MLX5_OPCODE_UMR:
		/* opcode was stashed per-WQE when the UMR was posted */
		wc->opcode = wq->wr_data[idx];
		break;
	case MLX5_OPCODE_TSO:
		wc->opcode    = IBV_WC_TSO;
		break;
	}
}
142 
/*
 * Lazy (extended-CQ) variant of responder completion handling: stores
 * the wr_id directly on the CQ instead of into an ibv_wc, and copies
 * back inline-scattered receive data when the CQE carries it.  With
 * 64-byte inline scatter, `cqe - 1` is passed so the copy starts 64
 * bytes before the CQE64 — presumably the first half of a 128-byte
 * entry (TODO confirm against the scatter layout).
 *
 * Returns IBV_WC_SUCCESS or an error from the copy helpers.
 */
static inline int handle_responder_lazy(struct mlx5_cq *cq, struct mlx5_cqe64 *cqe,
					struct mlx5_resource *cur_rsc, struct mlx5_srq *srq)
{
	uint16_t	wqe_ctr;
	struct mlx5_wq *wq;
	struct mlx5_qp *qp = rsc_to_mqp(cur_rsc);
	int err = IBV_WC_SUCCESS;

	if (srq) {
		/* SRQ WQEs complete out of order; the CQE names the slot */
		wqe_ctr = be16toh(cqe->wqe_counter);
		cq->ibv_cq.wr_id = srq->wrid[wqe_ctr];
		mlx5_free_srq_wqe(srq, wqe_ctr);
		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe,
						    be32toh(cqe->byte_cnt));
		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe - 1,
						    be32toh(cqe->byte_cnt));
	} else {
		if (likely(cur_rsc->type == MLX5_RSC_TYPE_QP)) {
			wq = &qp->rq;
			if (qp->qp_cap_cache & MLX5_RX_CSUM_VALID)
				cq->flags |= MLX5_CQ_FLAGS_RX_CSUM_VALID;
		} else {
			/* non-QP resource: use the receive WQ of the RWQ */
			wq = &(rsc_to_mrwq(cur_rsc)->rq);
		}

		/* ordinary RQ completes in order: consume the tail slot */
		wqe_ctr = wq->tail & (wq->wqe_cnt - 1);
		cq->ibv_cq.wr_id = wq->wrid[wqe_ctr];
		++wq->tail;
		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe,
						    be32toh(cqe->byte_cnt));
		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe - 1,
						    be32toh(cqe->byte_cnt));
	}

	return err;
}
183 
/*
 * Translate a responder (receive-side) CQE into @wc: byte count, wr_id,
 * opcode, immediate/invalidate data, checksum flag and address vector
 * fields.  Handles both SRQ and ordinary-RQ completions, and copies back
 * inline-scattered payloads.  Returns IBV_WC_SUCCESS or an error from
 * the inline-scatter copy helpers.
 */
static inline int handle_responder(struct ibv_wc *wc, struct mlx5_cqe64 *cqe,
				   struct mlx5_resource *cur_rsc, struct mlx5_srq *srq)
{
	uint16_t	wqe_ctr;
	struct mlx5_wq *wq;
	struct mlx5_qp *qp = rsc_to_mqp(cur_rsc);
	uint8_t g;
	int err = 0;

	wc->byte_len = be32toh(cqe->byte_cnt);
	if (srq) {
		/* SRQ WQEs complete out of order; the CQE names the slot */
		wqe_ctr = be16toh(cqe->wqe_counter);
		wc->wr_id = srq->wrid[wqe_ctr];
		mlx5_free_srq_wqe(srq, wqe_ctr);
		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe,
						    wc->byte_len);
		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
			err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe - 1,
						    wc->byte_len);
	} else {
		if (likely(cur_rsc->type == MLX5_RSC_TYPE_QP)) {
			wq = &qp->rq;
			/*
			 * IP_CSUM_OK is reported only when both L3 and L4
			 * checks passed and the packet is IPv4.
			 */
			if (qp->qp_cap_cache & MLX5_RX_CSUM_VALID)
				wc->wc_flags |= (!!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) &
						 !!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) &
						(get_cqe_l3_hdr_type(cqe) ==
						MLX5_CQE_L3_HDR_TYPE_IPV4)) <<
						IBV_WC_IP_CSUM_OK_SHIFT;
		} else {
			wq = &(rsc_to_mrwq(cur_rsc)->rq);
		}

		/* ordinary RQ completes in order: consume the tail slot */
		wqe_ctr = wq->tail & (wq->wqe_cnt - 1);
		wc->wr_id = wq->wrid[wqe_ctr];
		++wq->tail;
		if (cqe->op_own & MLX5_INLINE_SCATTER_32)
			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe,
						    wc->byte_len);
		else if (cqe->op_own & MLX5_INLINE_SCATTER_64)
			err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe - 1,
						    wc->byte_len);
	}
	if (err)
		return err;

	switch (cqe->op_own >> 4) {
	case MLX5_CQE_RESP_WR_IMM:
		wc->opcode	= IBV_WC_RECV_RDMA_WITH_IMM;
		wc->wc_flags	|= IBV_WC_WITH_IMM;
		/* immediate data is delivered in network byte order as-is */
		wc->imm_data = cqe->imm_inval_pkey;
		break;
	case MLX5_CQE_RESP_SEND:
		wc->opcode   = IBV_WC_RECV;
		break;
	case MLX5_CQE_RESP_SEND_IMM:
		wc->opcode	= IBV_WC_RECV;
		wc->wc_flags	|= IBV_WC_WITH_IMM;
		wc->imm_data = cqe->imm_inval_pkey;
		break;
	case MLX5_CQE_RESP_SEND_INV:
		wc->opcode = IBV_WC_RECV;
		wc->wc_flags |= IBV_WC_WITH_INV;
		/* invalidated rkey, unlike imm_data, is host byte order */
		wc->imm_data = be32toh(cqe->imm_inval_pkey);
		break;
	}
	/* address-vector fields for UD/GRH handling */
	wc->slid	   = be16toh(cqe->slid);
	wc->sl		   = (be32toh(cqe->flags_rqpn) >> 24) & 0xf;
	wc->src_qp	   = be32toh(cqe->flags_rqpn) & 0xffffff;
	wc->dlid_path_bits = cqe->ml_path & 0x7f;
	g = (be32toh(cqe->flags_rqpn) >> 28) & 3;
	wc->wc_flags |= g ? IBV_WC_GRH : 0;
	wc->pkey_index     = be32toh(cqe->imm_inval_pkey) & 0xffff;

	return IBV_WC_SUCCESS;
}
260 
261 static void dump_cqe(FILE *fp, void *buf)
262 {
263 	uint32_t *p = buf;
264 	int i;
265 
266 	for (i = 0; i < 16; i += 4)
267 		fprintf(fp, "%08x %08x %08x %08x\n", be32toh(p[i]), be32toh(p[i + 1]),
268 			be32toh(p[i + 2]), be32toh(p[i + 3]));
269 }
270 
271 static enum ibv_wc_status mlx5_handle_error_cqe(struct mlx5_err_cqe *cqe)
272 {
273 	switch (cqe->syndrome) {
274 	case MLX5_CQE_SYNDROME_LOCAL_LENGTH_ERR:
275 		return IBV_WC_LOC_LEN_ERR;
276 	case MLX5_CQE_SYNDROME_LOCAL_QP_OP_ERR:
277 		return IBV_WC_LOC_QP_OP_ERR;
278 	case MLX5_CQE_SYNDROME_LOCAL_PROT_ERR:
279 		return IBV_WC_LOC_PROT_ERR;
280 	case MLX5_CQE_SYNDROME_WR_FLUSH_ERR:
281 		return IBV_WC_WR_FLUSH_ERR;
282 	case MLX5_CQE_SYNDROME_MW_BIND_ERR:
283 		return IBV_WC_MW_BIND_ERR;
284 	case MLX5_CQE_SYNDROME_BAD_RESP_ERR:
285 		return IBV_WC_BAD_RESP_ERR;
286 	case MLX5_CQE_SYNDROME_LOCAL_ACCESS_ERR:
287 		return IBV_WC_LOC_ACCESS_ERR;
288 	case MLX5_CQE_SYNDROME_REMOTE_INVAL_REQ_ERR:
289 		return IBV_WC_REM_INV_REQ_ERR;
290 	case MLX5_CQE_SYNDROME_REMOTE_ACCESS_ERR:
291 		return IBV_WC_REM_ACCESS_ERR;
292 	case MLX5_CQE_SYNDROME_REMOTE_OP_ERR:
293 		return IBV_WC_REM_OP_ERR;
294 	case MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR:
295 		return IBV_WC_RETRY_EXC_ERR;
296 	case MLX5_CQE_SYNDROME_RNR_RETRY_EXC_ERR:
297 		return IBV_WC_RNR_RETRY_EXC_ERR;
298 	case MLX5_CQE_SYNDROME_REMOTE_ABORTED_ERR:
299 		return IBV_WC_REM_ABORT_ERR;
300 	default:
301 		return IBV_WC_GENERAL_ERR;
302 	}
303 }
304 
#if defined(__x86_64__) || defined (__i386__)
/* Read the CPU timestamp counter via RDTSC (x86 only). */
static inline unsigned long get_cycles(void)
{
	uint32_t low, high;
	uint64_t val;
	asm volatile ("rdtsc" : "=a" (low), "=d" (high));
	val = high;
	val = (val << 32) | low;
	return val;
}

/* Fixed-length busy wait: mlx5_stall_num_loop TSC reads. */
static void mlx5_stall_poll_cq(void)
{
	int i;

	for (i = 0; i < mlx5_stall_num_loop; i++)
		(void)get_cycles();
}
/* Busy-wait until the TSC reaches @cycles (adaptive stall mode). */
static void mlx5_stall_cycles_poll_cq(uint64_t cycles)
{
	while (get_cycles()  <  cycles)
		; /* Nothing */
}
/* Snapshot the current TSC value into *cycles. */
static void mlx5_get_cycles(uint64_t *cycles)
{
	*cycles = get_cycles();
}
#else
/* Non-x86 builds: stall support compiles to no-ops. */
static void mlx5_stall_poll_cq(void)
{
}
static void mlx5_stall_cycles_poll_cq(uint64_t cycles)
{
}
static void mlx5_get_cycles(uint64_t *cycles)
{
}
#endif
343 
/*
 * Resolve the QP that owns a requester completion, caching the lookup in
 * *cur_rsc.  With v1 CQEs @rsn is a user index (uidx table); with v0 it
 * is the QP number.  The lookup may leave *cur_rsc NULL when the
 * resource is not found — callers must check the returned pointer.
 */
static inline struct mlx5_qp *get_req_context(struct mlx5_context *mctx,
					      struct mlx5_resource **cur_rsc,
					      uint32_t rsn, int cqe_ver)
					      ALWAYS_INLINE;
static inline struct mlx5_qp *get_req_context(struct mlx5_context *mctx,
					      struct mlx5_resource **cur_rsc,
					      uint32_t rsn, int cqe_ver)
{
	if (!*cur_rsc || (rsn != (*cur_rsc)->rsn))
		*cur_rsc = cqe_ver ? mlx5_find_uidx(mctx, rsn) :
				      (struct mlx5_resource *)mlx5_find_qp(mctx, rsn);

	return rsc_to_mqp(*cur_rsc);
}
358 
/*
 * v1-CQE responder lookup: resolve @uidx to its resource, caching it in
 * *cur_rsc.  When the resource is (or routes to) an SRQ/XSRQ, also sets
 * *cur_srq and *is_srq so the caller consumes from the SRQ.
 * Returns CQ_OK, or CQ_POLL_ERR when the uidx is unknown or of an
 * unexpected type.
 */
static inline int get_resp_ctx_v1(struct mlx5_context *mctx,
				  struct mlx5_resource **cur_rsc,
				  struct mlx5_srq **cur_srq,
				  uint32_t uidx, uint8_t *is_srq)
				  ALWAYS_INLINE;
static inline int get_resp_ctx_v1(struct mlx5_context *mctx,
				  struct mlx5_resource **cur_rsc,
				  struct mlx5_srq **cur_srq,
				  uint32_t uidx, uint8_t *is_srq)
{
	struct mlx5_qp *mqp;

	if (!*cur_rsc || (uidx != (*cur_rsc)->rsn)) {
		*cur_rsc = mlx5_find_uidx(mctx, uidx);
		if (unlikely(!*cur_rsc))
			return CQ_POLL_ERR;
	}

	switch ((*cur_rsc)->type) {
	case MLX5_RSC_TYPE_QP:
		mqp = rsc_to_mqp(*cur_rsc);
		/* a QP attached to an SRQ receives through the SRQ */
		if (mqp->verbs_qp.qp.srq) {
			*cur_srq = to_msrq(mqp->verbs_qp.qp.srq);
			*is_srq = 1;
		}
		break;
	case MLX5_RSC_TYPE_XSRQ:
		*cur_srq = rsc_to_msrq(*cur_rsc);
		*is_srq = 1;
		break;
	case MLX5_RSC_TYPE_RWQ:
		break;
	default:
		return CQ_POLL_ERR;
	}

	return CQ_OK;
}
397 
/*
 * v0-CQE lookup: resolve @qpn to its QP resource, caching it in
 * *cur_rsc.  Returns CQ_OK, or CQ_POLL_ERR when the QP is unknown.
 */
static inline int get_qp_ctx(struct mlx5_context *mctx,
			     struct mlx5_resource **cur_rsc,
			     uint32_t qpn)
			     ALWAYS_INLINE;
static inline int get_qp_ctx(struct mlx5_context *mctx,
			     struct mlx5_resource **cur_rsc,
			     uint32_t qpn)
{
	if (!*cur_rsc || (qpn != (*cur_rsc)->rsn)) {
		/*
		 * We do not have to take the QP table lock here,
		 * because CQs will be locked while QPs are removed
		 * from the table.
		 */
		*cur_rsc = (struct mlx5_resource *)mlx5_find_qp(mctx, qpn);
		if (unlikely(!*cur_rsc))
			return CQ_POLL_ERR;
	}

	return CQ_OK;
}
419 
420 static inline int get_srq_ctx(struct mlx5_context *mctx,
421 			      struct mlx5_srq **cur_srq,
422 			      uint32_t srqn_uidx)
423 			      ALWAYS_INLINE;
424 static inline int get_srq_ctx(struct mlx5_context *mctx,
425 			      struct mlx5_srq **cur_srq,
426 			      uint32_t srqn)
427 {
428 	if (!*cur_srq || (srqn != (*cur_srq)->srqn)) {
429 		*cur_srq = mlx5_find_srq(mctx, srqn);
430 		if (unlikely(!*cur_srq))
431 			return CQ_POLL_ERR;
432 	}
433 
434 	return CQ_OK;
435 }
436 
/*
 * Resolve the resource (QP/RWQ) and/or SRQ that a responder CQE belongs
 * to.  For v1 CQEs everything routes through the uidx lookup; for v0
 * CQEs a non-zero srqn selects the SRQ table, otherwise the QP table.
 * Returns CQ_OK or CQ_POLL_ERR from the underlying lookup.
 */
static inline int get_cur_rsc(struct mlx5_context *mctx,
			      int cqe_ver,
			      uint32_t qpn,
			      uint32_t srqn_uidx,
			      struct mlx5_resource **cur_rsc,
			      struct mlx5_srq **cur_srq,
			      uint8_t *is_srq)
{
	if (cqe_ver)
		return get_resp_ctx_v1(mctx, cur_rsc, cur_srq, srqn_uidx,
				       is_srq);

	if (srqn_uidx) {
		*is_srq = 1;
		return get_srq_ctx(mctx, cur_srq, srqn_uidx);
	}

	return get_qp_ctx(mctx, cur_rsc, qpn);
}
462 
/*
 * Fetch the next software-owned CQE and advance the consumer index.
 * On success returns CQ_OK with *pcqe64 pointing at the 64-byte
 * hardware CQE and *pcqe at the start of the (possibly larger) entry;
 * returns CQ_EMPTY when no CQE is ready.
 */
static inline int mlx5_get_next_cqe(struct mlx5_cq *cq,
				    struct mlx5_cqe64 **pcqe64,
				    void **pcqe)
				    ALWAYS_INLINE;
static inline int mlx5_get_next_cqe(struct mlx5_cq *cq,
				    struct mlx5_cqe64 **pcqe64,
				    void **pcqe)
{
	void *cqe;
	struct mlx5_cqe64 *cqe64;

	cqe = next_cqe_sw(cq);
	if (!cqe)
		return CQ_EMPTY;

	/* for entries larger than 64 bytes the CQE64 sits at offset 64 */
	cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64;

	++cq->cons_index;

	VALGRIND_MAKE_MEM_DEFINED(cqe64, sizeof *cqe64);

	/*
	 * Make sure we read CQ entry contents after we've checked the
	 * ownership bit.
	 */
	udma_from_device_barrier();

#ifdef MLX5_DEBUG
	{
		struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context);

		if (mlx5_debug_mask & MLX5_DBG_CQ_CQE) {
			FILE *fp = mctx->dbg_fp;

			mlx5_dbg(fp, MLX5_DBG_CQ_CQE, "dump cqe for cqn 0x%x:\n", cq->cqn);
			dump_cqe(fp, cqe64);
		}
	}
#endif
	*pcqe64 = cqe64;
	*pcqe = cqe;

	return CQ_OK;
}
507 
/*
 * Decode one CQE and deliver the completion either into *wc (regular
 * poll) or into the CQ's lazy-poll fields (@lazy != 0, extended-CQ API).
 *
 * @cur_rsc / @cur_srq cache the last QP/RWQ/SRQ looked up so that
 * consecutive completions for the same resource skip the table search.
 * @cqe_ver selects whether responder/error lookups key on the CQE's
 * srqn_uidx field (v1) or on the QP number (v0).
 *
 * Returns CQ_OK, or CQ_POLL_ERR when the owning resource cannot be
 * resolved.
 */
static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
				 struct mlx5_cqe64 *cqe64,
				 void *cqe,
				 struct mlx5_resource **cur_rsc,
				 struct mlx5_srq **cur_srq,
				 struct ibv_wc *wc,
				 int cqe_ver, int lazy)
				 ALWAYS_INLINE;
static inline int mlx5_parse_cqe(struct mlx5_cq *cq,
				 struct mlx5_cqe64 *cqe64,
				 void *cqe,
				 struct mlx5_resource **cur_rsc,
				 struct mlx5_srq **cur_srq,
				 struct ibv_wc *wc,
				 int cqe_ver, int lazy)
{
	struct mlx5_wq *wq;
	uint16_t wqe_ctr;
	uint32_t qpn;
	uint32_t srqn_uidx;
	int idx;
	uint8_t opcode;
	struct mlx5_err_cqe *ecqe;
	int err = 0;
	struct mlx5_qp *mqp;
	struct mlx5_context *mctx;
	uint8_t is_srq = 0;

	mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
	/* QP number lives in the low 24 bits of sop_drop_qpn */
	qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff;
	if (lazy) {
		/* lazy mode: stash the CQE for the read_wc_* accessors */
		cq->cqe64 = cqe64;
		cq->flags &= (~MLX5_CQ_FLAGS_RX_CSUM_VALID);
	} else {
		wc->wc_flags = 0;
		wc->qp_num = qpn;
	}

	opcode = mlx5dv_get_cqe_opcode(cqe64);
	switch (opcode) {
	case MLX5_CQE_REQ:
	{
		mqp = get_req_context(mctx, cur_rsc,
				      (cqe_ver ? (be32toh(cqe64->srqn_uidx) & 0xffffff) : qpn),
				      cqe_ver);
		if (unlikely(!mqp))
			return CQ_POLL_ERR;
		wq = &mqp->sq;
		wqe_ctr = be16toh(cqe64->wqe_counter);
		idx = wqe_ctr & (wq->wqe_cnt - 1);
		if (lazy) {
			uint32_t wc_byte_len;

			switch (be32toh(cqe64->sop_drop_qpn) >> 24) {
			case MLX5_OPCODE_UMR:
				/* opcode recorded when the UMR was posted */
				cq->umr_opcode = wq->wr_data[idx];
				break;

			case MLX5_OPCODE_RDMA_READ:
				wc_byte_len = be32toh(cqe64->byte_cnt);
				goto scatter_out;
			case MLX5_OPCODE_ATOMIC_CS:
			case MLX5_OPCODE_ATOMIC_FA:
				wc_byte_len = 8;

			scatter_out:
				/* copy back data the HW scattered inline into the CQE */
				if (cqe64->op_own & MLX5_INLINE_SCATTER_32)
					err = mlx5_copy_to_send_wqe(
					    mqp, wqe_ctr, cqe, wc_byte_len);
				else if (cqe64->op_own & MLX5_INLINE_SCATTER_64)
					err = mlx5_copy_to_send_wqe(
					    mqp, wqe_ctr, cqe - 1, wc_byte_len);
				break;
			}

			cq->ibv_cq.wr_id = wq->wrid[idx];
			cq->ibv_cq.status = err;
		} else {
			handle_good_req(wc, cqe64, wq, idx);

			if (cqe64->op_own & MLX5_INLINE_SCATTER_32)
				err = mlx5_copy_to_send_wqe(mqp, wqe_ctr, cqe,
							    wc->byte_len);
			else if (cqe64->op_own & MLX5_INLINE_SCATTER_64)
				err = mlx5_copy_to_send_wqe(
				    mqp, wqe_ctr, cqe - 1, wc->byte_len);

			wc->wr_id = wq->wrid[idx];
			wc->status = err;
		}

		/* one CQE may complete several SQ WQEs; resync the tail */
		wq->tail = wq->wqe_head[idx] + 1;
		break;
	}
	case MLX5_CQE_RESP_WR_IMM:
	case MLX5_CQE_RESP_SEND:
	case MLX5_CQE_RESP_SEND_IMM:
	case MLX5_CQE_RESP_SEND_INV:
		srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff;
		err = get_cur_rsc(mctx, cqe_ver, qpn, srqn_uidx, cur_rsc,
				  cur_srq, &is_srq);
		if (unlikely(err))
			return CQ_POLL_ERR;

		if (lazy)
			cq->ibv_cq.status = handle_responder_lazy(cq, cqe64,
							      *cur_rsc,
							      is_srq ? *cur_srq : NULL);
		else
			wc->status = handle_responder(wc, cqe64, *cur_rsc,
					      is_srq ? *cur_srq : NULL);
		break;
	case MLX5_CQE_RESIZE_CQ:
		break;
	case MLX5_CQE_REQ_ERR:
	case MLX5_CQE_RESP_ERR:
		srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff;
		ecqe = (struct mlx5_err_cqe *)cqe64;
		{
			enum ibv_wc_status *pstatus = lazy ? &cq->ibv_cq.status : &wc->status;

			*pstatus = mlx5_handle_error_cqe(ecqe);
		}

		if (!lazy)
			wc->vendor_err = ecqe->vendor_err_synd;

		/* flush/retry-exceeded errors are routine; dump the rest */
		if (unlikely(ecqe->syndrome != MLX5_CQE_SYNDROME_WR_FLUSH_ERR &&
			     ecqe->syndrome != MLX5_CQE_SYNDROME_TRANSPORT_RETRY_EXC_ERR)) {
			FILE *fp = mctx->dbg_fp;
			fprintf(fp, PFX "%s: got completion with error:\n",
				mctx->hostname);
			dump_cqe(fp, ecqe);
			if (mlx5_freeze_on_error_cqe) {
				fprintf(fp, PFX "freezing at poll cq...");
				while (1)
					sleep(10);
			}
		}

		if (opcode == MLX5_CQE_REQ_ERR) {
			/* requester error: consume the failed SQ WQE */
			mqp = get_req_context(mctx, cur_rsc,
					      (cqe_ver ? srqn_uidx : qpn), cqe_ver);
			if (unlikely(!mqp))
				return CQ_POLL_ERR;
			wq = &mqp->sq;
			wqe_ctr = be16toh(cqe64->wqe_counter);
			idx = wqe_ctr & (wq->wqe_cnt - 1);
			if (lazy)
				cq->ibv_cq.wr_id = wq->wrid[idx];
			else
				wc->wr_id = wq->wrid[idx];
			wq->tail = wq->wqe_head[idx] + 1;
		} else {
			/* responder error: free the SRQ slot or advance the RQ */
			err = get_cur_rsc(mctx, cqe_ver, qpn, srqn_uidx,
					  cur_rsc, cur_srq, &is_srq);
			if (unlikely(err))
				return CQ_POLL_ERR;

			if (is_srq) {
				wqe_ctr = be16toh(cqe64->wqe_counter);
				if (lazy)
					cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr];
				else
					wc->wr_id = (*cur_srq)->wrid[wqe_ctr];
				mlx5_free_srq_wqe(*cur_srq, wqe_ctr);
			} else {
				switch ((*cur_rsc)->type) {
				case MLX5_RSC_TYPE_RWQ:
					wq = &(rsc_to_mrwq(*cur_rsc)->rq);
					break;
				default:
					wq = &(rsc_to_mqp(*cur_rsc)->rq);
					break;
				}

				if (lazy)
					cq->ibv_cq.wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
				else
					wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)];
				++wq->tail;
			}
		}
		break;
	}

	return CQ_OK;
}
696 
/*
 * Lazy-poll wrapper around mlx5_parse_cqe(): results land in the CQ's
 * own cur_rsc/cur_srq caches and ibv_cq_ex fields rather than a wc.
 */
static inline int mlx5_parse_lazy_cqe(struct mlx5_cq *cq,
				      struct mlx5_cqe64 *cqe64,
				      void *cqe, int cqe_ver)
				      ALWAYS_INLINE;
static inline int mlx5_parse_lazy_cqe(struct mlx5_cq *cq,
				      struct mlx5_cqe64 *cqe64,
				      void *cqe, int cqe_ver)
{
	return mlx5_parse_cqe(cq, cqe64, cqe, &cq->cur_rsc, &cq->cur_srq, NULL, cqe_ver, 1);
}
707 
708 static inline int mlx5_poll_one(struct mlx5_cq *cq,
709 				struct mlx5_resource **cur_rsc,
710 				struct mlx5_srq **cur_srq,
711 				struct ibv_wc *wc, int cqe_ver)
712 				ALWAYS_INLINE;
713 static inline int mlx5_poll_one(struct mlx5_cq *cq,
714 				struct mlx5_resource **cur_rsc,
715 				struct mlx5_srq **cur_srq,
716 				struct ibv_wc *wc, int cqe_ver)
717 {
718 	struct mlx5_cqe64 *cqe64;
719 	void *cqe;
720 	int err;
721 
722 	err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
723 	if (err == CQ_EMPTY)
724 		return err;
725 
726 	return mlx5_parse_cqe(cq, cqe64, cqe, cur_rsc, cur_srq, wc, cqe_ver, 0);
727 }
728 
/*
 * Core of ibv_poll_cq(): drain up to @ne completions into @wc under the
 * CQ lock, then ring the consumer-index doorbell.
 *
 * Stall handling: in fixed mode, every empty poll arms a short busy-wait
 * before the *next* poll; in adaptive mode, the wait length (in cycles)
 * shrinks when the CQ was empty or fully drained and grows when the
 * caller's budget was exhausted with CQEs still pending.
 *
 * Returns the number of completions polled, or CQ_POLL_ERR on a parse
 * failure.
 */
static inline int poll_cq(struct ibv_cq *ibcq, int ne,
		      struct ibv_wc *wc, int cqe_ver)
		      ALWAYS_INLINE;
static inline int poll_cq(struct ibv_cq *ibcq, int ne,
		      struct ibv_wc *wc, int cqe_ver)
{
	struct mlx5_cq *cq = to_mcq(ibcq);
	struct mlx5_resource *rsc = NULL;
	struct mlx5_srq *srq = NULL;
	int npolled;
	int err = CQ_OK;

	/* honour any stall armed by the previous (empty) poll */
	if (cq->stall_enable) {
		if (cq->stall_adaptive_enable) {
			if (cq->stall_last_count)
				mlx5_stall_cycles_poll_cq(cq->stall_last_count + cq->stall_cycles);
		} else if (cq->stall_next_poll) {
			cq->stall_next_poll = 0;
			mlx5_stall_poll_cq();
		}
	}

	mlx5_spin_lock(&cq->lock);

	for (npolled = 0; npolled < ne; ++npolled) {
		err = mlx5_poll_one(cq, &rsc, &srq, wc + npolled, cqe_ver);
		if (err != CQ_OK)
			break;
	}

	/* publish the new consumer index before releasing the lock */
	update_cons_index(cq);

	mlx5_spin_unlock(&cq->lock);

	if (cq->stall_enable) {
		if (cq->stall_adaptive_enable) {
			if (npolled == 0) {
				/* empty: back off less next time */
				cq->stall_cycles = max(cq->stall_cycles-mlx5_stall_cq_dec_step,
						       mlx5_stall_cq_poll_min);
				mlx5_get_cycles(&cq->stall_last_count);
			} else if (npolled < ne) {
				/* drained mid-budget: stall a bit longer */
				cq->stall_cycles = min(cq->stall_cycles+mlx5_stall_cq_inc_step,
						       mlx5_stall_cq_poll_max);
				mlx5_get_cycles(&cq->stall_last_count);
			} else {
				/* budget filled: CQEs are flowing, no stall */
				cq->stall_cycles = max(cq->stall_cycles-mlx5_stall_cq_dec_step,
						       mlx5_stall_cq_poll_min);
				cq->stall_last_count = 0;
			}
		} else if (err == CQ_EMPTY) {
			cq->stall_next_poll = 1;
		}
	}

	return err == CQ_POLL_ERR ? err : npolled;
}
785 
/* Stall behavior compiled into each extended-CQ poll entry point. */
enum  polling_mode {
	POLLING_MODE_NO_STALL,
	POLLING_MODE_STALL,
	POLLING_MODE_STALL_ADAPTIVE
};
791 
/*
 * Finish an extended-CQ poll session: ring the consumer-index doorbell,
 * drop the lock if one was taken, and update the stall state using the
 * FOUND_CQES / EMPTY_DURING_POLL flags accumulated during the session.
 * @lock and @stall are compile-time constants per instantiation, so the
 * dead branches fold away.
 */
static inline void _mlx5_end_poll(struct ibv_cq_ex *ibcq,
				  int lock, enum polling_mode stall)
				  ALWAYS_INLINE;
static inline void _mlx5_end_poll(struct ibv_cq_ex *ibcq,
				  int lock, enum polling_mode stall)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));

	update_cons_index(cq);

	if (lock)
		mlx5_spin_unlock(&cq->lock);

	if (stall) {
		if (stall == POLLING_MODE_STALL_ADAPTIVE) {
			if (!(cq->flags & MLX5_CQ_FLAGS_FOUND_CQES)) {
				/* found nothing: shrink the stall window */
				cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
						       mlx5_stall_cq_poll_min);
				mlx5_get_cycles(&cq->stall_last_count);
			} else if (cq->flags & MLX5_CQ_FLAGS_EMPTY_DURING_POLL) {
				/* ran dry mid-session: grow the stall window */
				cq->stall_cycles = min(cq->stall_cycles + mlx5_stall_cq_inc_step,
						       mlx5_stall_cq_poll_max);
				mlx5_get_cycles(&cq->stall_last_count);
			} else {
				/* CQEs flowing: no stall before the next poll */
				cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
						       mlx5_stall_cq_poll_min);
				cq->stall_last_count = 0;
			}
		} else if (!(cq->flags & MLX5_CQ_FLAGS_FOUND_CQES)) {
			cq->stall_next_poll = 1;
		}

		cq->flags &= ~(MLX5_CQ_FLAGS_FOUND_CQES | MLX5_CQ_FLAGS_EMPTY_DURING_POLL);
	}
}
827 
/*
 * Begin an extended-CQ poll session: optionally stall and lock, then
 * fetch and parse the first CQE lazily.  Returns 0 on success, ENOENT
 * when the CQ is empty (after undoing lock/stall state), EINVAL on an
 * unsupported attr->comp_mask, or a parse error.  @lock, @stall and
 * @cqe_version are compile-time constants per instantiation.
 */
static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr,
				  int lock, enum polling_mode stall, int cqe_version)
				  ALWAYS_INLINE;
static inline int mlx5_start_poll(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr,
				  int lock, enum polling_mode stall, int cqe_version)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int err;

	/* no attribute extensions are supported */
	if (unlikely(attr->comp_mask))
		return EINVAL;

	/* honour any stall armed by the previous (empty) session */
	if (stall) {
		if (stall == POLLING_MODE_STALL_ADAPTIVE) {
			if (cq->stall_last_count)
				mlx5_stall_cycles_poll_cq(cq->stall_last_count + cq->stall_cycles);
		} else if (cq->stall_next_poll) {
			cq->stall_next_poll = 0;
			mlx5_stall_poll_cq();
		}
	}

	if (lock)
		mlx5_spin_lock(&cq->lock);

	/* reset the per-session resource caches */
	cq->cur_rsc = NULL;
	cq->cur_srq = NULL;

	err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
	if (err == CQ_EMPTY) {
		if (lock)
			mlx5_spin_unlock(&cq->lock);

		if (stall) {
			if (stall == POLLING_MODE_STALL_ADAPTIVE) {
				cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
						mlx5_stall_cq_poll_min);
				mlx5_get_cycles(&cq->stall_last_count);
			} else {
				cq->stall_next_poll = 1;
			}
		}

		return ENOENT;
	}

	if (stall)
		cq->flags |= MLX5_CQ_FLAGS_FOUND_CQES;

	err = mlx5_parse_lazy_cqe(cq, cqe64, cqe, cqe_version);
	/* on parse failure, unwind the lock/stall state taken above */
	if (lock && err)
		mlx5_spin_unlock(&cq->lock);

	if (stall && err) {
		if (stall == POLLING_MODE_STALL_ADAPTIVE) {
			cq->stall_cycles = max(cq->stall_cycles - mlx5_stall_cq_dec_step,
						mlx5_stall_cq_poll_min);
			cq->stall_last_count = 0;
		}

		cq->flags &= ~(MLX5_CQ_FLAGS_FOUND_CQES);
	}

	return err;
}
895 
/*
 * Fetch and lazily parse the next CQE within an open poll session.
 * Returns 0 on success, ENOENT when the CQ ran dry (recorded for the
 * adaptive stall heuristic in _mlx5_end_poll()), or a parse error.
 */
static inline int mlx5_next_poll(struct ibv_cq_ex *ibcq,
				 enum polling_mode stall, int cqe_version)
				 ALWAYS_INLINE;
static inline int mlx5_next_poll(struct ibv_cq_ex *ibcq,
				 enum polling_mode stall,
				 int cqe_version)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
	struct mlx5_cqe64 *cqe64;
	void *cqe;
	int err;

	err = mlx5_get_next_cqe(cq, &cqe64, &cqe);
	if (err == CQ_EMPTY) {
		if (stall == POLLING_MODE_STALL_ADAPTIVE)
			cq->flags |= MLX5_CQ_FLAGS_EMPTY_DURING_POLL;

		return ENOENT;
	}

	return mlx5_parse_lazy_cqe(cq, cqe64, cqe, cqe_version);
}
918 
/*
 * Thin wrappers binding mlx5_next_poll() to a fixed polling mode and
 * CQE version; installed as the ibv_cq_ex next_poll callback at CQ
 * creation time.
 */
static inline int mlx5_next_poll_adaptive_v0(struct ibv_cq_ex *ibcq)
{
	return mlx5_next_poll(ibcq, POLLING_MODE_STALL_ADAPTIVE, 0);
}

static inline int mlx5_next_poll_adaptive_v1(struct ibv_cq_ex *ibcq)
{
	return mlx5_next_poll(ibcq, POLLING_MODE_STALL_ADAPTIVE, 1);
}

static inline int mlx5_next_poll_v0(struct ibv_cq_ex *ibcq)
{
	return mlx5_next_poll(ibcq, 0, 0);
}

static inline int mlx5_next_poll_v1(struct ibv_cq_ex *ibcq)
{
	return mlx5_next_poll(ibcq, 0, 1);
}
938 
/*
 * Thin wrappers binding mlx5_start_poll() to a fixed combination of
 * locking, stall mode and CQE version, so the always-inlined core is
 * specialized per configuration; installed as ibv_cq_ex start_poll
 * callbacks at CQ creation time.
 */
static inline int mlx5_start_poll_v0(struct ibv_cq_ex *ibcq,
				     struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, 0, 0);
}

static inline int mlx5_start_poll_v1(struct ibv_cq_ex *ibcq,
				     struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, 0, 1);
}

static inline int mlx5_start_poll_v0_lock(struct ibv_cq_ex *ibcq,
					  struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, 0, 0);
}

static inline int mlx5_start_poll_v1_lock(struct ibv_cq_ex *ibcq,
					  struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, 0, 1);
}

static inline int mlx5_start_poll_adaptive_stall_v0_lock(struct ibv_cq_ex *ibcq,
							 struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 0);
}

static inline int mlx5_start_poll_stall_v0_lock(struct ibv_cq_ex *ibcq,
						struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 0);
}

static inline int mlx5_start_poll_adaptive_stall_v1_lock(struct ibv_cq_ex *ibcq,
							 struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL_ADAPTIVE, 1);
}

static inline int mlx5_start_poll_stall_v1_lock(struct ibv_cq_ex *ibcq,
						struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 1, POLLING_MODE_STALL, 1);
}

static inline int mlx5_start_poll_stall_v0(struct ibv_cq_ex *ibcq,
					   struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 0);
}

static inline int mlx5_start_poll_adaptive_stall_v0(struct ibv_cq_ex *ibcq,
						    struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 0);
}

static inline int mlx5_start_poll_adaptive_stall_v1(struct ibv_cq_ex *ibcq,
						    struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL_ADAPTIVE, 1);
}

static inline int mlx5_start_poll_stall_v1(struct ibv_cq_ex *ibcq,
					   struct ibv_poll_cq_attr *attr)
{
	return mlx5_start_poll(ibcq, attr, 0, POLLING_MODE_STALL, 1);
}
1010 
/*
 * Thin wrappers binding _mlx5_end_poll() to a fixed locking/stall
 * combination; installed as ibv_cq_ex end_poll callbacks at CQ creation
 * time (paired with the matching start_poll wrapper).
 */
static inline void mlx5_end_poll_adaptive_stall_lock(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 1, POLLING_MODE_STALL_ADAPTIVE);
}

static inline void mlx5_end_poll_stall_lock(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 1, POLLING_MODE_STALL);
}

static inline void mlx5_end_poll_adaptive_stall(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 0, POLLING_MODE_STALL_ADAPTIVE);
}

static inline void mlx5_end_poll_stall(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 0, POLLING_MODE_STALL);
}

static inline void mlx5_end_poll(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 0, 0);
}

static inline void mlx5_end_poll_lock(struct ibv_cq_ex *ibcq)
{
	_mlx5_end_poll(ibcq, 1, 0);
}
1040 
/* ibv_poll_cq() entry point for contexts using v0 CQEs. */
int mlx5_poll_cq(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
{
	return poll_cq(ibcq, ne, wc, 0);
}

/* ibv_poll_cq() entry point for contexts using v1 (uidx-based) CQEs. */
int mlx5_poll_cq_v1(struct ibv_cq *ibcq, int ne, struct ibv_wc *wc)
{
	return poll_cq(ibcq, ne, wc, 1);
}
1050 
/*
 * Extended-CQ accessor: derive the ibv_wc_opcode for the CQE captured
 * by the current lazy-poll session.  For requester completions the WQE
 * opcode in the top byte of sop_drop_qpn selects the result; UMR
 * completions return the opcode stashed by mlx5_parse_cqe().
 * Unrecognized opcodes fall through and return 0.
 */
static inline enum ibv_wc_opcode mlx5_cq_read_wc_opcode(struct ibv_cq_ex *ibcq)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));

	switch (mlx5dv_get_cqe_opcode(cq->cqe64)) {
	case MLX5_CQE_RESP_WR_IMM:
		return IBV_WC_RECV_RDMA_WITH_IMM;
	case MLX5_CQE_RESP_SEND:
	case MLX5_CQE_RESP_SEND_IMM:
	case MLX5_CQE_RESP_SEND_INV:
		return IBV_WC_RECV;
	case MLX5_CQE_REQ:
		switch (be32toh(cq->cqe64->sop_drop_qpn) >> 24) {
		case MLX5_OPCODE_RDMA_WRITE_IMM:
		case MLX5_OPCODE_RDMA_WRITE:
			return IBV_WC_RDMA_WRITE;
		case MLX5_OPCODE_SEND_IMM:
		case MLX5_OPCODE_SEND:
		case MLX5_OPCODE_SEND_INVAL:
			return IBV_WC_SEND;
		case MLX5_OPCODE_RDMA_READ:
			return IBV_WC_RDMA_READ;
		case MLX5_OPCODE_ATOMIC_CS:
			return IBV_WC_COMP_SWAP;
		case MLX5_OPCODE_ATOMIC_FA:
			return IBV_WC_FETCH_ADD;
		case MLX5_OPCODE_UMR:
			return cq->umr_opcode;
		case MLX5_OPCODE_TSO:
			return IBV_WC_TSO;
		}
	}

#ifdef MLX5_DEBUG
{
	struct mlx5_context *ctx = to_mctx(ibcq->context);

	mlx5_dbg(ctx->dbg_fp, MLX5_DBG_CQ_CQE, "un-expected opcode in cqe\n");
}
#endif
	return 0;
}
1093 
1094 static inline uint32_t mlx5_cq_read_wc_qp_num(struct ibv_cq_ex *ibcq)
1095 {
1096 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1097 
1098 	return be32toh(cq->cqe64->sop_drop_qpn) & 0xffffff;
1099 }
1100 
/*
 * Extended-CQ read callback: build the ibv_wc flags for the current
 * CQE — IP checksum validity, immediate/invalidate indication and GRH
 * presence.
 */
static inline int mlx5_cq_read_wc_flags(struct ibv_cq_ex *ibcq)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
	int wc_flags = 0;

	/*
	 * IBV_WC_IP_CSUM_OK is set only when the L4 and L3 checks both
	 * passed and the packet is IPv4.  The bitwise & is intentional:
	 * each operand is first normalized to 0/1.
	 */
	if (cq->flags & MLX5_CQ_FLAGS_RX_CSUM_VALID)
		wc_flags = (!!(cq->cqe64->hds_ip_ext & MLX5_CQE_L4_OK) &
				 !!(cq->cqe64->hds_ip_ext & MLX5_CQE_L3_OK) &
				 (get_cqe_l3_hdr_type(cq->cqe64) ==
				  MLX5_CQE_L3_HDR_TYPE_IPV4)) <<
				IBV_WC_IP_CSUM_OK_SHIFT;

	switch (mlx5dv_get_cqe_opcode(cq->cqe64)) {
	case MLX5_CQE_RESP_WR_IMM:
	case MLX5_CQE_RESP_SEND_IMM:
		wc_flags	|= IBV_WC_WITH_IMM;
		break;
	case MLX5_CQE_RESP_SEND_INV:
		wc_flags |= IBV_WC_WITH_INV;
		break;
	}

	/* Any nonzero value in bits 29:28 of flags_rqpn maps to IBV_WC_GRH. */
	wc_flags |= ((be32toh(cq->cqe64->flags_rqpn) >> 28) & 3) ? IBV_WC_GRH : 0;
	return wc_flags;
}
1126 
1127 static inline uint32_t mlx5_cq_read_wc_byte_len(struct ibv_cq_ex *ibcq)
1128 {
1129 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1130 
1131 	return be32toh(cq->cqe64->byte_cnt);
1132 }
1133 
1134 static inline uint32_t mlx5_cq_read_wc_vendor_err(struct ibv_cq_ex *ibcq)
1135 {
1136 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1137 	struct mlx5_err_cqe *ecqe = (struct mlx5_err_cqe *)cq->cqe64;
1138 
1139 	return ecqe->vendor_err_synd;
1140 }
1141 
/*
 * Extended-CQ read callback: immediate data of the current CQE, or the
 * invalidated rkey for a send-with-invalidate completion.
 */
static inline uint32_t mlx5_cq_read_wc_imm_data(struct ibv_cq_ex *ibcq)
{
	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));

	switch (mlx5dv_get_cqe_opcode(cq->cqe64)) {
	case MLX5_CQE_RESP_SEND_INV:
		/* The invalidated rkey is reported in host byte order. */
		return be32toh(cq->cqe64->imm_inval_pkey);
	default:
		/* Immediate data is passed through unconverted
		 * (network byte order), matching ibv_wc semantics. */
		return cq->cqe64->imm_inval_pkey;
	}
}
1153 
1154 static inline uint32_t mlx5_cq_read_wc_slid(struct ibv_cq_ex *ibcq)
1155 {
1156 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1157 
1158 	return (uint32_t)be16toh(cq->cqe64->slid);
1159 }
1160 
1161 static inline uint8_t mlx5_cq_read_wc_sl(struct ibv_cq_ex *ibcq)
1162 {
1163 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1164 
1165 	return (be32toh(cq->cqe64->flags_rqpn) >> 24) & 0xf;
1166 }
1167 
1168 static inline uint32_t mlx5_cq_read_wc_src_qp(struct ibv_cq_ex *ibcq)
1169 {
1170 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1171 
1172 	return be32toh(cq->cqe64->flags_rqpn) & 0xffffff;
1173 }
1174 
1175 static inline uint8_t mlx5_cq_read_wc_dlid_path_bits(struct ibv_cq_ex *ibcq)
1176 {
1177 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1178 
1179 	return cq->cqe64->ml_path & 0x7f;
1180 }
1181 
1182 static inline uint64_t mlx5_cq_read_wc_completion_ts(struct ibv_cq_ex *ibcq)
1183 {
1184 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1185 
1186 	return be64toh(cq->cqe64->timestamp);
1187 }
1188 
1189 static inline uint16_t mlx5_cq_read_wc_cvlan(struct ibv_cq_ex *ibcq)
1190 {
1191 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1192 
1193 	return be16toh(cq->cqe64->vlan_info);
1194 }
1195 
1196 static inline uint32_t mlx5_cq_read_flow_tag(struct ibv_cq_ex *ibcq)
1197 {
1198 	struct mlx5_cq *cq = to_mcq(ibv_cq_ex_to_cq(ibcq));
1199 
1200 	return be32toh(cq->cqe64->sop_drop_qpn) & MLX5_FLOW_TAG_MASK;
1201 }
1202 
#define BIT(i) (1UL << (i))

/*
 * Flag bits composing the index into the ops[] dispatch table below.
 * Each combination of CQ properties selects a dedicated, pre-specialized
 * set of poll functions so the hot path carries no runtime branches.
 */
#define SINGLE_THREADED BIT(0)
#define STALL BIT(1)
#define V1 BIT(2)
#define ADAPTIVE BIT(3)

/* Token-paste helpers building the specialized function names defined
 * above, e.g. mlx5_start_poll_adaptive_stall_v1. */
#define mlx5_start_poll_name(cqe_ver, lock, stall, adaptive) \
	mlx5_start_poll##adaptive##stall##cqe_ver##lock
#define mlx5_next_poll_name(cqe_ver, adaptive) \
	mlx5_next_poll##adaptive##cqe_ver
#define mlx5_end_poll_name(lock, stall, adaptive) \
	mlx5_end_poll##adaptive##stall##lock

#define POLL_FN_ENTRY(cqe_ver, lock, stall, adaptive) { \
		.start_poll = &mlx5_start_poll_name(cqe_ver, lock, stall, adaptive), \
		.next_poll = &mlx5_next_poll_name(cqe_ver, adaptive), \
		.end_poll = &mlx5_end_poll_name(lock, stall, adaptive), \
	}

/*
 * Dispatch table indexed by the OR of the flag bits above.  Entries
 * without SINGLE_THREADED use the _lock variants; ADAPTIVE entries are
 * only defined in combination with STALL.
 */
static const struct op
{
	int (*start_poll)(struct ibv_cq_ex *ibcq, struct ibv_poll_cq_attr *attr);
	int (*next_poll)(struct ibv_cq_ex *ibcq);
	void (*end_poll)(struct ibv_cq_ex *ibcq);
} ops[ADAPTIVE + V1 + STALL + SINGLE_THREADED + 1] = {
	[V1] =  POLL_FN_ENTRY(_v1, _lock, , ),
	[0] =  POLL_FN_ENTRY(_v0, _lock, , ),
	[V1 | SINGLE_THREADED] =  POLL_FN_ENTRY(_v1, , , ),
	[SINGLE_THREADED] =  POLL_FN_ENTRY(_v0, , , ),
	[V1 | STALL] =  POLL_FN_ENTRY(_v1, _lock, _stall, ),
	[STALL] =  POLL_FN_ENTRY(_v0, _lock, _stall, ),
	[V1 | SINGLE_THREADED | STALL] =  POLL_FN_ENTRY(_v1, , _stall, ),
	[SINGLE_THREADED | STALL] =  POLL_FN_ENTRY(_v0, , _stall, ),
	[V1 | STALL | ADAPTIVE] =  POLL_FN_ENTRY(_v1, _lock, _stall, _adaptive),
	[STALL | ADAPTIVE] =  POLL_FN_ENTRY(_v0, _lock, _stall, _adaptive),
	[V1 | SINGLE_THREADED | STALL | ADAPTIVE] =  POLL_FN_ENTRY(_v1, , _stall, _adaptive),
	[SINGLE_THREADED | STALL | ADAPTIVE] =  POLL_FN_ENTRY(_v0, , _stall, _adaptive),
};
1242 
1243 void mlx5_cq_fill_pfns(struct mlx5_cq *cq, const struct ibv_cq_init_attr_ex *cq_attr)
1244 {
1245 	struct mlx5_context *mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context);
1246 	const struct op *poll_ops = &ops[((cq->stall_enable && cq->stall_adaptive_enable) ? ADAPTIVE : 0) |
1247 					 (mctx->cqe_version ? V1 : 0) |
1248 					 (cq->flags & MLX5_CQ_FLAGS_SINGLE_THREADED ?
1249 						      SINGLE_THREADED : 0) |
1250 					 (cq->stall_enable ? STALL : 0)];
1251 
1252 	cq->ibv_cq.start_poll = poll_ops->start_poll;
1253 	cq->ibv_cq.next_poll = poll_ops->next_poll;
1254 	cq->ibv_cq.end_poll = poll_ops->end_poll;
1255 
1256 	cq->ibv_cq.read_opcode = mlx5_cq_read_wc_opcode;
1257 	cq->ibv_cq.read_vendor_err = mlx5_cq_read_wc_vendor_err;
1258 	cq->ibv_cq.read_wc_flags = mlx5_cq_read_wc_flags;
1259 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
1260 		cq->ibv_cq.read_byte_len = mlx5_cq_read_wc_byte_len;
1261 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM)
1262 		cq->ibv_cq.read_imm_data = mlx5_cq_read_wc_imm_data;
1263 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM)
1264 		cq->ibv_cq.read_qp_num = mlx5_cq_read_wc_qp_num;
1265 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP)
1266 		cq->ibv_cq.read_src_qp = mlx5_cq_read_wc_src_qp;
1267 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID)
1268 		cq->ibv_cq.read_slid = mlx5_cq_read_wc_slid;
1269 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL)
1270 		cq->ibv_cq.read_sl = mlx5_cq_read_wc_sl;
1271 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
1272 		cq->ibv_cq.read_dlid_path_bits = mlx5_cq_read_wc_dlid_path_bits;
1273 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP)
1274 		cq->ibv_cq.read_completion_ts = mlx5_cq_read_wc_completion_ts;
1275 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_CVLAN)
1276 		cq->ibv_cq.read_cvlan = mlx5_cq_read_wc_cvlan;
1277 	if (cq_attr->wc_flags & IBV_WC_EX_WITH_FLOW_TAG)
1278 		cq->ibv_cq.read_flow_tag = mlx5_cq_read_flow_tag;
1279 }
1280 
1281 int mlx5_arm_cq(struct ibv_cq *ibvcq, int solicited)
1282 {
1283 	struct mlx5_cq *cq = to_mcq(ibvcq);
1284 	struct mlx5_context *ctx = to_mctx(ibvcq->context);
1285 	uint32_t doorbell[2];
1286 	uint32_t sn;
1287 	uint32_t ci;
1288 	uint32_t cmd;
1289 
1290 	sn  = cq->arm_sn & 3;
1291 	ci  = cq->cons_index & 0xffffff;
1292 	cmd = solicited ? MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT;
1293 
1294 	cq->dbrec[MLX5_CQ_ARM_DB] = htobe32(sn << 28 | cmd | ci);
1295 
1296 	/*
1297 	 * Make sure that the doorbell record in host memory is
1298 	 * written before ringing the doorbell via PCI WC MMIO.
1299 	 */
1300 	mmio_wc_start();
1301 
1302 	doorbell[0] = htobe32(sn << 28 | cmd | ci);
1303 	doorbell[1] = htobe32(cq->cqn);
1304 
1305 	mlx5_write64(doorbell, ctx->uar[0] + MLX5_CQ_DOORBELL, &ctx->lock32);
1306 
1307 	mmio_flush_writes();
1308 
1309 	return 0;
1310 }
1311 
/* Completion-event hook: advance the arm sequence number so the next
 * mlx5_arm_cq() uses a fresh value. */
void mlx5_cq_event(struct ibv_cq *cq)
{
	to_mcq(cq)->arm_sn++;
}
1316 
1317 static int is_equal_rsn(struct mlx5_cqe64 *cqe64, uint32_t rsn)
1318 {
1319 	return rsn == (be32toh(cqe64->sop_drop_qpn) & 0xffffff);
1320 }
1321 
1322 static inline int is_equal_uidx(struct mlx5_cqe64 *cqe64, uint32_t uidx)
1323 {
1324 	return uidx == (be32toh(cqe64->srqn_uidx) & 0xffffff);
1325 }
1326 
1327 static inline int is_responder(uint8_t opcode)
1328 {
1329 	switch (opcode) {
1330 	case MLX5_CQE_RESP_WR_IMM:
1331 	case MLX5_CQE_RESP_SEND:
1332 	case MLX5_CQE_RESP_SEND_IMM:
1333 	case MLX5_CQE_RESP_SEND_INV:
1334 	case MLX5_CQE_RESP_ERR:
1335 		return 1;
1336 	}
1337 
1338 	return 0;
1339 }
1340 
1341 static inline int free_res_cqe(struct mlx5_cqe64 *cqe64, uint32_t rsn,
1342 			       struct mlx5_srq *srq, int cqe_version)
1343 {
1344 	if (cqe_version) {
1345 		if (is_equal_uidx(cqe64, rsn)) {
1346 			if (srq && is_responder(mlx5dv_get_cqe_opcode(cqe64)))
1347 				mlx5_free_srq_wqe(srq,
1348 						  be16toh(cqe64->wqe_counter));
1349 			return 1;
1350 		}
1351 	} else {
1352 		if (is_equal_rsn(cqe64, rsn)) {
1353 			if (srq && (be32toh(cqe64->srqn_uidx) & 0xffffff))
1354 				mlx5_free_srq_wqe(srq,
1355 						  be16toh(cqe64->wqe_counter));
1356 			return 1;
1357 		}
1358 	}
1359 
1360 	return 0;
1361 }
1362 
/*
 * Purge from the CQ every completion belonging to resource number 'rsn'
 * (QP number, or user index when CQE versioning is enabled), compacting
 * the surviving entries so no holes remain.  SRQ WQEs referenced by
 * discarded responder CQEs are returned to 'srq' when it is non-NULL.
 * Must be called with the CQ lock held (see mlx5_cq_clean()).
 */
void __mlx5_cq_clean(struct mlx5_cq *cq, uint32_t rsn, struct mlx5_srq *srq)
{
	uint32_t prod_index;
	int nfreed = 0;
	struct mlx5_cqe64 *cqe64, *dest64;
	void *cqe, *dest;
	uint8_t owner_bit;
	int cqe_version;

	/* DV-owned CQs are managed by the application, not by us. */
	if (!cq || cq->flags & MLX5_CQ_FLAGS_DV_OWNED)
		return;

	/*
	 * First we need to find the current producer index, so we
	 * know where to start cleaning from.  It doesn't matter if HW
	 * adds new entries after this loop -- the QP we're worried
	 * about is already in RESET, so the new entries won't come
	 * from our QP and therefore don't need to be checked.
	 */
	for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index)
		if (prod_index == cq->cons_index + cq->ibv_cq.cqe)
			break;

	/*
	 * Now sweep backwards through the CQ, removing CQ entries
	 * that match our QP by copying older entries on top of them.
	 */
	cqe_version = (to_mctx(cq->ibv_cq.context))->cqe_version;
	while ((int) --prod_index - (int) cq->cons_index >= 0) {
		/* 128-byte CQEs keep the mlx5_cqe64 in their second half. */
		cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe);
		cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64;
		if (free_res_cqe(cqe64, rsn, srq, cqe_version)) {
			++nfreed;
		} else if (nfreed) {
			/* Slide this CQE up over the freed slots, keeping the
			 * destination slot's owner bit intact. */
			dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe);
			dest64 = (cq->cqe_sz == 64) ? dest : dest + 64;
			owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK;
			memcpy(dest, cqe, cq->cqe_sz);
			dest64->op_own = owner_bit |
				(dest64->op_own & ~MLX5_CQE_OWNER_MASK);
		}
	}

	if (nfreed) {
		cq->cons_index += nfreed;
		/*
		 * Make sure update of buffer contents is done before
		 * updating consumer index.
		 */
		udma_to_device_barrier();
		update_cons_index(cq);
	}
}
1416 
/* Locked wrapper around __mlx5_cq_clean(): purge all CQEs of 'qpn'
 * while holding the CQ spinlock. */
void mlx5_cq_clean(struct mlx5_cq *cq, uint32_t qpn, struct mlx5_srq *srq)
{
	mlx5_spin_lock(&cq->lock);
	__mlx5_cq_clean(cq, qpn, srq);
	mlx5_spin_unlock(&cq->lock);
}
1423 
1424 static uint8_t sw_ownership_bit(int n, int nent)
1425 {
1426 	return (n & nent) ? 1 : 0;
1427 }
1428 
1429 static int is_hw(uint8_t own, int n, int mask)
1430 {
1431 	return (own & MLX5_CQE_OWNER_MASK) ^ !!(n & (mask + 1));
1432 }
1433 
/*
 * During CQ resize, copy all software-owned CQEs from the active (old)
 * buffer into the resize (new) buffer, stopping at the MLX5_CQE_RESIZE_CQ
 * marker CQE written by hardware.  Each copied CQE gets its owner bit
 * rewritten to match the new buffer's wrap parity.  Errors are reported
 * on stderr and abort the copy.
 */
void mlx5_cq_resize_copy_cqes(struct mlx5_cq *cq)
{
	struct mlx5_cqe64 *scqe64;
	struct mlx5_cqe64 *dcqe64;
	void *start_cqe;
	void *scqe;
	void *dcqe;
	int ssize;
	int dsize;
	int i;
	uint8_t sw_own;

	ssize = cq->cqe_sz;
	dsize = cq->resize_cqe_sz;

	i = cq->cons_index;
	/* 128-byte CQEs keep the mlx5_cqe64 in their second half. */
	scqe = get_buf_cqe(cq->active_buf, i & cq->active_cqes, ssize);
	scqe64 = ssize == 64 ? scqe : scqe + 64;
	start_cqe = scqe;
	if (is_hw(scqe64->op_own, i, cq->active_cqes)) {
		fprintf(stderr, "expected cqe in sw ownership\n");
		return;
	}

	while ((scqe64->op_own >> 4) != MLX5_CQE_RESIZE_CQ) {
		/* Destination index is offset by one relative to the source. */
		dcqe = get_buf_cqe(cq->resize_buf, (i + 1) & (cq->resize_cqes - 1), dsize);
		dcqe64 = dsize == 64 ? dcqe : dcqe + 64;
		sw_own = sw_ownership_bit(i + 1, cq->resize_cqes);
		memcpy(dcqe, scqe, ssize);
		/* Re-stamp the owner bit for the new buffer's parity. */
		dcqe64->op_own = (dcqe64->op_own & ~MLX5_CQE_OWNER_MASK) | sw_own;

		++i;
		scqe = get_buf_cqe(cq->active_buf, i & cq->active_cqes, ssize);
		scqe64 = ssize == 64 ? scqe : scqe + 64;
		if (is_hw(scqe64->op_own, i, cq->active_cqes)) {
			fprintf(stderr, "expected cqe in sw ownership\n");
			return;
		}

		/* Wrapped all the way around without seeing the marker. */
		if (scqe == start_cqe) {
			fprintf(stderr, "resize CQ failed to get resize CQE\n");
			return;
		}
	}
	/* Skip past the resize-marker CQE itself. */
	++cq->cons_index;
}
1480 
1481 int mlx5_alloc_cq_buf(struct mlx5_context *mctx, struct mlx5_cq *cq,
1482 		      struct mlx5_buf *buf, int nent, int cqe_sz)
1483 {
1484 	struct mlx5_cqe64 *cqe;
1485 	int i;
1486 	struct mlx5_device *dev = to_mdev(mctx->ibv_ctx.device);
1487 	int ret;
1488 	enum mlx5_alloc_type type;
1489 	enum mlx5_alloc_type default_type = MLX5_ALLOC_TYPE_ANON;
1490 
1491 	if (mlx5_use_huge("HUGE_CQ"))
1492 		default_type = MLX5_ALLOC_TYPE_HUGE;
1493 
1494 	mlx5_get_alloc_type(MLX5_CQ_PREFIX, &type, default_type);
1495 
1496 	ret = mlx5_alloc_prefered_buf(mctx, buf,
1497 				      align(nent * cqe_sz, dev->page_size),
1498 				      dev->page_size,
1499 				      type,
1500 				      MLX5_CQ_PREFIX);
1501 
1502 	if (ret)
1503 		return -1;
1504 
1505 	memset(buf->buf, 0, nent * cqe_sz);
1506 
1507 	for (i = 0; i < nent; ++i) {
1508 		cqe = buf->buf + i * cqe_sz;
1509 		cqe += cqe_sz == 128 ? 1 : 0;
1510 		cqe->op_own = MLX5_CQE_INVALID << 4;
1511 	}
1512 
1513 	return 0;
1514 }
1515 
/* Release a CQ buffer previously allocated by mlx5_alloc_cq_buf(). */
int mlx5_free_cq_buf(struct mlx5_context *ctx, struct mlx5_buf *buf)
{
	return mlx5_free_actual_buf(ctx, buf);
}
1520