xref: /freebsd/sys/dev/mlx5/mlx5_ib/mlx5_ib_qp.c (revision 8a0a413e)
1 /*-
2  * Copyright (c) 2013-2015, Mellanox Technologies, Ltd.  All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include <linux/module.h>
29 #include <rdma/ib_cache.h>
30 #include <rdma/ib_umem.h>
31 #include "mlx5_ib.h"
32 #include "user.h"
33 #include <dev/mlx5/mlx5_core/transobj.h>
34 #include <sys/priv.h>
35 
36 #define	IPV6_DEFAULT_HOPLIMIT 64
37 
38 
39 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
40 			       const struct ib_qp_attr *attr, int attr_mask,
41 			       enum ib_qp_state cur_state, enum ib_qp_state new_state);
42 
43 /* not supported currently */
44 static int workqueue_signature;
45 
46 enum {
47 	MLX5_IB_ACK_REQ_FREQ	= 8,
48 };
49 
50 enum {
51 	MLX5_IB_DEFAULT_SCHED_QUEUE	= 0x83,
52 	MLX5_IB_DEFAULT_QP0_SCHED_QUEUE	= 0x3f,
53 	MLX5_IB_LINK_TYPE_IB		= 0,
54 	MLX5_IB_LINK_TYPE_ETH		= 1
55 };
56 
57 enum {
58 	MLX5_IB_SQ_STRIDE	= 6,
59 	MLX5_IB_CACHE_LINE_SIZE	= 64,
60 };
61 
62 static const u32 mlx5_ib_opcode[] = {
63 	[IB_WR_SEND]				= MLX5_OPCODE_SEND,
64 	[IB_WR_SEND_WITH_IMM]			= MLX5_OPCODE_SEND_IMM,
65 	[IB_WR_RDMA_WRITE]			= MLX5_OPCODE_RDMA_WRITE,
66 	[IB_WR_RDMA_WRITE_WITH_IMM]		= MLX5_OPCODE_RDMA_WRITE_IMM,
67 	[IB_WR_RDMA_READ]			= MLX5_OPCODE_RDMA_READ,
68 	[IB_WR_ATOMIC_CMP_AND_SWP]		= MLX5_OPCODE_ATOMIC_CS,
69 	[IB_WR_ATOMIC_FETCH_AND_ADD]		= MLX5_OPCODE_ATOMIC_FA,
70 	[IB_WR_SEND_WITH_INV]			= MLX5_OPCODE_SEND_INVAL,
71 	[IB_WR_LOCAL_INV]			= MLX5_OPCODE_UMR,
72 	[IB_WR_FAST_REG_MR]			= MLX5_OPCODE_UMR,
73 	[IB_WR_MASKED_ATOMIC_CMP_AND_SWP]	= MLX5_OPCODE_ATOMIC_MASKED_CS,
74 	[IB_WR_MASKED_ATOMIC_FETCH_AND_ADD]	= MLX5_OPCODE_ATOMIC_MASKED_FA,
75 };
76 
77 struct umr_wr {
78 	u64				virt_addr;
79 	struct ib_pd		       *pd;
80 	unsigned int			page_shift;
81 	unsigned int			npages;
82 	u32				length;
83 	int				access_flags;
84 	u32				mkey;
85 };
86 
87 static int is_qp0(enum ib_qp_type qp_type)
88 {
89 	return qp_type == IB_QPT_SMI;
90 }
91 
92 static int is_qp1(enum ib_qp_type qp_type)
93 {
94 	return qp_type == IB_QPT_GSI;
95 }
96 
97 static int is_sqp(enum ib_qp_type qp_type)
98 {
99 	return is_qp0(qp_type) || is_qp1(qp_type);
100 }
101 
102 static void *get_wqe(struct mlx5_ib_qp *qp, int offset)
103 {
104 	return mlx5_buf_offset(&qp->buf, offset);
105 }
106 
107 static void *get_recv_wqe(struct mlx5_ib_qp *qp, int n)
108 {
109 	return get_wqe(qp, qp->rq.offset + (n << qp->rq.wqe_shift));
110 }
111 
112 void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n)
113 {
114 	return get_wqe(qp, qp->sq.offset + (n << MLX5_IB_SQ_STRIDE));
115 }
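/*
 * Worked example of the WQE addressing above (constant values assumed,
 * not re-derived here): MLX5_IB_SQ_STRIDE is 6, so every send WQE slot
 * is 1 << 6 = 64 bytes (one MLX5_SEND_WQE_BB basic block) and
 * mlx5_get_send_wqe(qp, n) resolves to qp->buf + sq.offset + n * 64.
 * The receive queue uses the per-QP rq.wqe_shift instead, e.g. with
 * rq.wqe_shift == 5 the n-th receive WQE lives at rq.offset + n * 32.
 */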
116 
117 
118 static int
119 query_wqe_idx(struct mlx5_ib_qp *qp)
120 {
121 	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
122 	struct mlx5_query_qp_mbox_out *outb;
123 	struct mlx5_qp_context *context;
124 	int ret;
125 
126 	outb = kzalloc(sizeof(*outb), GFP_KERNEL);
127 	if (!outb)
128 		return -ENOMEM;
129 
130 	context = &outb->ctx;
131 
132 	mutex_lock(&qp->mutex);
133 	ret = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb));
134 	if (ret)
135 		goto out_free;
136 
137 	ret = be16_to_cpu(context->hw_sq_wqe_counter) & (qp->sq.wqe_cnt - 1);
138 
139 out_free:
140 	mutex_unlock(&qp->mutex);
141 	kfree(outb);
142 
143 	return ret;
144 }
145 
146 static int mlx5_handle_sig_pipelining(struct mlx5_ib_qp *qp)
147 {
148 	int wqe_idx;
149 
150 	wqe_idx = query_wqe_idx(qp);
151 	if (wqe_idx < 0) {
152 		printf("mlx5_ib: ERR: ""Failed to query QP 0x%x wqe index\n", qp->mqp.qpn);
153 		return wqe_idx;
154 	}
155 
156 	if (qp->sq.swr_ctx[wqe_idx].sig_piped) {
157 		struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
158 		struct mlx5_wqe_ctrl_seg *cwqe;
159 
160 		cwqe = mlx5_get_send_wqe(qp, wqe_idx);
161 		cwqe->opmod_idx_opcode = cpu_to_be32(be32_to_cpu(cwqe->opmod_idx_opcode) & 0xffffff00);
162 		qp->sq.swr_ctx[wqe_idx].w_list.opcode |= MLX5_OPCODE_SIGNATURE_CANCELED;
163 		mlx5_ib_dbg(dev, "Cancel QP 0x%x wqe_index 0x%x\n",
164 			    qp->mqp.qpn, wqe_idx);
165 	}
166 
167 	return 0;
168 }
169 
170 static void mlx5_ib_sqd_work(struct work_struct *work)
171 {
172 	struct mlx5_ib_sqd *sqd;
173 	struct mlx5_ib_qp *qp;
174 	struct ib_qp_attr qp_attr;
175 
176 	sqd = container_of(work, struct mlx5_ib_sqd, work);
177 	qp = sqd->qp;
178 
179 	if (mlx5_handle_sig_pipelining(qp))
180 		goto out;
181 
182 	mutex_lock(&qp->mutex);
183 	if (__mlx5_ib_modify_qp(&qp->ibqp, &qp_attr, 0, IB_QPS_SQD, IB_QPS_RTS))
184 		printf("mlx5_ib: ERR: ""Failed to resume QP 0x%x\n", qp->mqp.qpn);
185 	mutex_unlock(&qp->mutex);
186 out:
187 	kfree(sqd);
188 }
189 
190 static void mlx5_ib_sigerr_sqd_event(struct mlx5_ib_qp *qp)
191 {
192 	struct mlx5_ib_sqd *sqd;
193 
194 	sqd = kzalloc(sizeof(*sqd), GFP_ATOMIC);
195 	if (!sqd)
196 		return;
197 
198 	sqd->qp = qp;
199 	INIT_WORK(&sqd->work, mlx5_ib_sqd_work);
200 	queue_work(mlx5_ib_wq, &sqd->work);
201 }
202 
203 static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type)
204 {
205 	struct ib_qp *ibqp = &to_mibqp(qp)->ibqp;
206 	struct ib_event event;
207 
208 	if (type == MLX5_EVENT_TYPE_SQ_DRAINED &&
209 	    to_mibqp(qp)->state != IB_QPS_SQD) {
210 		mlx5_ib_sigerr_sqd_event(to_mibqp(qp));
211 		return;
212 	}
213 
214 	if (type == MLX5_EVENT_TYPE_PATH_MIG)
215 		to_mibqp(qp)->port = to_mibqp(qp)->alt_port;
216 
217 	if (ibqp->event_handler) {
218 		event.device     = ibqp->device;
219 		event.element.qp = ibqp;
220 		switch (type) {
221 		case MLX5_EVENT_TYPE_PATH_MIG:
222 			event.event = IB_EVENT_PATH_MIG;
223 			break;
224 		case MLX5_EVENT_TYPE_COMM_EST:
225 			event.event = IB_EVENT_COMM_EST;
226 			break;
227 		case MLX5_EVENT_TYPE_SQ_DRAINED:
228 			event.event = IB_EVENT_SQ_DRAINED;
229 			break;
230 		case MLX5_EVENT_TYPE_SRQ_LAST_WQE:
231 			event.event = IB_EVENT_QP_LAST_WQE_REACHED;
232 			break;
233 		case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
234 			event.event = IB_EVENT_QP_FATAL;
235 			break;
236 		case MLX5_EVENT_TYPE_PATH_MIG_FAILED:
237 			event.event = IB_EVENT_PATH_MIG_ERR;
238 			break;
239 		case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
240 			event.event = IB_EVENT_QP_REQ_ERR;
241 			break;
242 		case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
243 			event.event = IB_EVENT_QP_ACCESS_ERR;
244 			break;
245 		default:
246 			printf("mlx5_ib: WARN: ""mlx5_ib: Unexpected event type %d on QP %06x\n", type, qp->qpn);
247 			return;
248 		}
249 
250 		ibqp->event_handler(&event, ibqp->qp_context);
251 	}
252 }
253 
254 static int set_rq_size(struct mlx5_ib_dev *dev, struct ib_qp_cap *cap,
255 		       int has_rq, struct mlx5_ib_qp *qp, struct mlx5_ib_create_qp *ucmd)
256 {
257 	int wqe_size;
258 	int wq_size;
259 
260 	/* Sanity check RQ size before proceeding */
261 	if (cap->max_recv_wr > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz)))
262 		return -EINVAL;
263 
264 	if (!has_rq) {
265 		qp->rq.max_gs = 0;
266 		qp->rq.wqe_cnt = 0;
267 		qp->rq.wqe_shift = 0;
268 		cap->max_recv_wr = 0;
269 		cap->max_recv_sge = 0;
270 	} else {
271 		if (ucmd) {
272 			qp->rq.wqe_cnt = ucmd->rq_wqe_count;
273 			qp->rq.wqe_shift = ucmd->rq_wqe_shift;
274 			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
275 			qp->rq.max_post = qp->rq.wqe_cnt;
276 		} else {
277 			wqe_size = qp->wq_sig ? sizeof(struct mlx5_wqe_signature_seg) : 0;
278 			wqe_size += cap->max_recv_sge * sizeof(struct mlx5_wqe_data_seg);
279 			wqe_size = roundup_pow_of_two(wqe_size);
280 			wq_size = roundup_pow_of_two(cap->max_recv_wr) * wqe_size;
281 			wq_size = max_t(int, wq_size, MLX5_SEND_WQE_BB);
282 			qp->rq.wqe_cnt = wq_size / wqe_size;
283 			if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_rq)) {
284 				mlx5_ib_dbg(dev, "wqe_size %d, max %d\n",
285 					    wqe_size,
286 					    MLX5_CAP_GEN(dev->mdev,
287 							 max_wqe_sz_rq));
288 				return -EINVAL;
289 			}
290 			qp->rq.wqe_shift = ilog2(wqe_size);
291 			qp->rq.max_gs = (1 << qp->rq.wqe_shift) / sizeof(struct mlx5_wqe_data_seg) - qp->wq_sig;
292 			qp->rq.max_post = qp->rq.wqe_cnt;
293 		}
294 	}
295 
296 	return 0;
297 }
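/*
 * Example of the kernel-owned RQ sizing path above, assuming
 * sizeof(struct mlx5_wqe_data_seg) == 16 and no WQE signature:
 * with max_recv_sge = 3 and max_recv_wr = 100,
 *   wqe_size   = 3 * 16 = 48          -> roundup_pow_of_two -> 64
 *   wq_size    = roundup_pow_of_two(100) * 64 = 128 * 64 = 8192
 *   rq.wqe_cnt = 8192 / 64 = 128, rq.wqe_shift = ilog2(64) = 6
 *   rq.max_gs  = 64 / 16 = 4 (one spare scatter entry beyond the request)
 * The user path instead trusts rq_wqe_count/rq_wqe_shift from the ucmd.
 */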
298 
299 static int sq_overhead(enum ib_qp_type qp_type)
300 {
301 	int size = 0;
302 
303 	switch (qp_type) {
304 	case IB_QPT_XRC_INI:
305 		size += sizeof(struct mlx5_wqe_xrc_seg);
306 		/* fall through */
307 	case IB_QPT_RC:
308 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
309 			sizeof(struct mlx5_wqe_atomic_seg) +
310 			sizeof(struct mlx5_wqe_raddr_seg) +
311 			sizeof(struct mlx5_wqe_umr_ctrl_seg) +
312 			sizeof(struct mlx5_mkey_seg);
313 		break;
314 
315 	case IB_QPT_XRC_TGT:
316 		return 0;
317 
318 	case IB_QPT_UC:
319 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
320 			sizeof(struct mlx5_wqe_raddr_seg) +
321 			sizeof(struct mlx5_wqe_umr_ctrl_seg) +
322 			sizeof(struct mlx5_mkey_seg);
323 		break;
324 
325 	case IB_QPT_UD:
326 	case IB_QPT_SMI:
327 	case IB_QPT_GSI:
328 		size += sizeof(struct mlx5_wqe_ctrl_seg) +
329 			sizeof(struct mlx5_wqe_datagram_seg);
330 		break;
331 
332 	default:
333 		return -EINVAL;
334 	}
335 
336 	return size;
337 }
338 
339 static int calc_send_wqe(struct ib_qp_init_attr *attr)
340 {
341 	int inl_size = 0;
342 	int size;
343 
344 	size = sq_overhead(attr->qp_type);
345 	if (size < 0)
346 		return size;
347 
348 	if (attr->cap.max_inline_data) {
349 		inl_size = size + sizeof(struct mlx5_wqe_inline_seg) +
350 			attr->cap.max_inline_data;
351 	}
352 
353 	size += attr->cap.max_send_sge * sizeof(struct mlx5_wqe_data_seg);
354 	return ALIGN(max_t(int, inl_size, size), MLX5_SEND_WQE_BB);
355 }
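/*
 * Sketch of the send WQE size computation above, with segment sizes
 * assumed (16-byte data segment, 4-byte inline header) rather than
 * taken from the headers: for overhead S = sq_overhead(qp_type),
 *   scatter/gather form: S + max_send_sge * 16
 *   inline form:         S + 4 + max_inline_data
 * calc_send_wqe() returns the larger of the two, rounded up to a
 * multiple of MLX5_SEND_WQE_BB (64 bytes).
 */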
356 
357 static int get_send_sge(struct ib_qp_init_attr *attr, int wqe_size)
358 {
359 	int max_sge;
360 
361 	if (attr->qp_type == IB_QPT_RC)
362 		max_sge = (min_t(int, wqe_size, 512) -
363 			   sizeof(struct mlx5_wqe_ctrl_seg) -
364 			   sizeof(struct mlx5_wqe_raddr_seg)) /
365 			sizeof(struct mlx5_wqe_data_seg);
366 	else if (attr->qp_type == IB_QPT_XRC_INI)
367 		max_sge = (min_t(int, wqe_size, 512) -
368 			   sizeof(struct mlx5_wqe_ctrl_seg) -
369 			   sizeof(struct mlx5_wqe_xrc_seg) -
370 			   sizeof(struct mlx5_wqe_raddr_seg)) /
371 			sizeof(struct mlx5_wqe_data_seg);
372 	else
373 		max_sge = (wqe_size - sq_overhead(attr->qp_type)) /
374 			sizeof(struct mlx5_wqe_data_seg);
375 
376 	return min_t(int, max_sge, (wqe_size - sq_overhead(attr->qp_type)) /
377 		     sizeof(struct mlx5_wqe_data_seg));
378 }
379 
380 static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr,
381 			struct mlx5_ib_qp *qp)
382 {
383 	int wqe_size;
384 	int wq_size;
385 
386 	if (!attr->cap.max_send_wr)
387 		return 0;
388 
389 	wqe_size = calc_send_wqe(attr);
390 	mlx5_ib_dbg(dev, "wqe_size %d\n", wqe_size);
391 	if (wqe_size < 0)
392 		return wqe_size;
393 
394 	if (wqe_size > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
395 		mlx5_ib_warn(dev, "wqe_size(%d) > max_sq_desc_sz(%d)\n",
396 			     wqe_size, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
397 		return -EINVAL;
398 	}
399 
400 	qp->max_inline_data = wqe_size - sq_overhead(attr->qp_type) -
401 		sizeof(struct mlx5_wqe_inline_seg);
402 	attr->cap.max_inline_data = qp->max_inline_data;
403 
404 	wq_size = roundup_pow_of_two(attr->cap.max_send_wr * (u64)wqe_size);
405 	qp->sq.wqe_cnt = wq_size / MLX5_SEND_WQE_BB;
406 	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
407 		mlx5_ib_warn(dev, "wqe count(%d) exceeds limits(%d)\n",
408 			     qp->sq.wqe_cnt,
409 			     1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
410 		return -ENOMEM;
411 	}
412 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
413 	qp->sq.max_gs = get_send_sge(attr, wqe_size);
414 	if (qp->sq.max_gs < attr->cap.max_send_sge) {
415 		mlx5_ib_warn(dev, "max sge(%d) exceeds limits(%d)\n",
416 			     qp->sq.max_gs, attr->cap.max_send_sge);
417 		return -ENOMEM;
418 	}
419 
420 	attr->cap.max_send_sge = qp->sq.max_gs;
421 	qp->sq.max_post = wq_size / wqe_size;
422 	attr->cap.max_send_wr = qp->sq.max_post;
423 
424 	return wq_size;
425 }
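/*
 * Example, continuing with an assumed wqe_size of 192 bytes and
 * max_send_wr = 100:
 *   wq_size     = roundup_pow_of_two(100 * 192) = 32768
 *   sq.wqe_cnt  = 32768 / MLX5_SEND_WQE_BB = 32768 / 64 = 512
 *   sq.max_post = 32768 / 192 = 170
 * i.e. the ring is sized in 64-byte basic blocks, while the number of
 * postable WQEs is limited by the largest WQE the caller asked for.
 */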
426 
427 static int set_user_buf_size(struct mlx5_ib_dev *dev,
428 			    struct mlx5_ib_qp *qp,
429 			    struct mlx5_ib_create_qp *ucmd,
430 			    struct ib_qp_init_attr *attr)
431 {
432 	int desc_sz = 1 << qp->sq.wqe_shift;
433 
434 	if (desc_sz > MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq)) {
435 		mlx5_ib_warn(dev, "desc_sz %d, max_sq_desc_sz %d\n",
436 			     desc_sz, MLX5_CAP_GEN(dev->mdev, max_wqe_sz_sq));
437 		return -EINVAL;
438 	}
439 
440 	if (ucmd->sq_wqe_count && ((1 << ilog2(ucmd->sq_wqe_count)) != ucmd->sq_wqe_count)) {
441 		mlx5_ib_warn(dev, "sq_wqe_count %d is not a power of two\n",
442 			     ucmd->sq_wqe_count);
443 		return -EINVAL;
444 	}
445 
446 	qp->sq.wqe_cnt = ucmd->sq_wqe_count;
447 
448 	if (qp->sq.wqe_cnt > (1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz))) {
449 		mlx5_ib_warn(dev, "wqe_cnt %d, max_wqes %d\n",
450 			     qp->sq.wqe_cnt,
451 			     1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz));
452 		return -EINVAL;
453 	}
454 
455 
456 	if (attr->qp_type == IB_QPT_RAW_PACKET) {
457 		qp->buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift;
458 		qp->sq_buf_size = qp->sq.wqe_cnt << 6;
459 	} else {
460 		qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) +
461 			(qp->sq.wqe_cnt << 6);
462 		qp->sq_buf_size = 0;
463 	}
464 
465 	return 0;
466 }
467 
468 static int qp_has_rq(struct ib_qp_init_attr *attr)
469 {
470 	if (attr->qp_type == IB_QPT_XRC_INI ||
471 	    attr->qp_type == IB_QPT_XRC_TGT || attr->srq ||
472 	    !attr->cap.max_recv_wr)
473 		return 0;
474 
475 	return 1;
476 }
477 
478 static int first_med_uuar(void)
479 {
480 	return 1;
481 }
482 
483 static int next_uuar(int n)
484 {
485 	n++;
486 
487 	while (((n % 4) & 2))
488 		n++;
489 
490 	return n;
491 }
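/*
 * next_uuar() skips any index whose (n % 4) is 2 or 3, so starting from
 * first_med_uuar() == 1 it yields 1, 4, 5, 8, 9, 12, 13, ...  Only the
 * first two UUAR slots of each UAR page are walked here; the remaining
 * two per page are presumably left for fast-path use
 * (cf. MLX5_NON_FP_BF_REGS_PER_PAGE below).
 */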
492 
493 static int num_med_uuar(struct mlx5_uuar_info *uuari)
494 {
495 	int n;
496 
497 	n = uuari->num_uars * MLX5_NON_FP_BF_REGS_PER_PAGE -
498 		uuari->num_low_latency_uuars - 1;
499 
500 	return n >= 0 ? n : 0;
501 }
502 
503 static int max_uuari(struct mlx5_uuar_info *uuari)
504 {
505 	return uuari->num_uars * 4;
506 }
507 
508 static int first_hi_uuar(struct mlx5_uuar_info *uuari)
509 {
510 	int med;
511 	int i;
512 	int t;
513 
514 	med = num_med_uuar(uuari);
515 	for (t = 0, i = first_med_uuar();; i = next_uuar(i)) {
516 		t++;
517 		if (t == med)
518 			return next_uuar(i);
519 	}
520 
521 	return 0;
522 }
523 
524 static int alloc_high_class_uuar(struct mlx5_uuar_info *uuari)
525 {
526 	int i;
527 
528 	for (i = first_hi_uuar(uuari); i < max_uuari(uuari); i = next_uuar(i)) {
529 		if (!test_bit(i, uuari->bitmap)) {
530 			set_bit(i, uuari->bitmap);
531 			uuari->count[i]++;
532 			return i;
533 		}
534 	}
535 
536 	return -ENOMEM;
537 }
538 
539 static int alloc_med_class_uuar(struct mlx5_uuar_info *uuari)
540 {
541 	int minidx = first_med_uuar();
542 	int i;
543 
544 	for (i = first_med_uuar(); i < first_hi_uuar(uuari); i = next_uuar(i)) {
545 		if (uuari->count[i] < uuari->count[minidx])
546 			minidx = i;
547 	}
548 
549 	uuari->count[minidx]++;
550 
551 	return minidx;
552 }
553 
554 static int alloc_uuar(struct mlx5_uuar_info *uuari,
555 		      enum mlx5_ib_latency_class lat)
556 {
557 	int uuarn = -EINVAL;
558 
559 	mutex_lock(&uuari->lock);
560 	switch (lat) {
561 	case MLX5_IB_LATENCY_CLASS_LOW:
562 		uuarn = 0;
563 		uuari->count[uuarn]++;
564 		break;
565 
566 	case MLX5_IB_LATENCY_CLASS_MEDIUM:
567 		if (uuari->ver < 2)
568 			uuarn = -ENOMEM;
569 		else
570 			uuarn = alloc_med_class_uuar(uuari);
571 		break;
572 
573 	case MLX5_IB_LATENCY_CLASS_HIGH:
574 		if (uuari->ver < 2)
575 			uuarn = -ENOMEM;
576 		else
577 			uuarn = alloc_high_class_uuar(uuari);
578 		break;
579 
580 	case MLX5_IB_LATENCY_CLASS_FAST_PATH:
581 		uuarn = 2;
582 		break;
583 	}
584 	mutex_unlock(&uuari->lock);
585 
586 	return uuarn;
587 }
588 
589 static void free_med_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
590 {
591 	clear_bit(uuarn, uuari->bitmap);
592 	--uuari->count[uuarn];
593 }
594 
595 static void free_high_class_uuar(struct mlx5_uuar_info *uuari, int uuarn)
596 {
597 	clear_bit(uuarn, uuari->bitmap);
598 	--uuari->count[uuarn];
599 }
600 
601 static void free_uuar(struct mlx5_uuar_info *uuari, int uuarn)
602 {
603 	int nuuars = uuari->num_uars * MLX5_BF_REGS_PER_PAGE;
604 	int high_uuar = nuuars - uuari->num_low_latency_uuars;
605 
606 	mutex_lock(&uuari->lock);
607 	if (uuarn == 0) {
608 		--uuari->count[uuarn];
609 		goto out;
610 	}
611 
612 	if (uuarn < high_uuar) {
613 		free_med_class_uuar(uuari, uuarn);
614 		goto out;
615 	}
616 
617 	free_high_class_uuar(uuari, uuarn);
618 
619 out:
620 	mutex_unlock(&uuari->lock);
621 }
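/*
 * Example of how free_uuar() classifies an index, assuming
 * MLX5_BF_REGS_PER_PAGE == 4, num_uars == 4 and
 * num_low_latency_uuars == 4:
 *   nuuars = 16, high_uuar = 16 - 4 = 12
 *   uuarn 0         -> shared DB-only slot, only the refcount is dropped
 *   uuarn 5  (< 12)  -> medium class: clear bitmap bit, drop refcount
 *   uuarn 13 (>= 12) -> high (low-latency) class, same bookkeeping
 * Callers pair this with alloc_uuar() above, which hands out index 0
 * for LATENCY_CLASS_LOW and index 2 for the fast-path class.
 */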
622 
623 static enum mlx5_qp_state to_mlx5_state(enum ib_qp_state state)
624 {
625 	switch (state) {
626 	case IB_QPS_RESET:	return MLX5_QP_STATE_RST;
627 	case IB_QPS_INIT:	return MLX5_QP_STATE_INIT;
628 	case IB_QPS_RTR:	return MLX5_QP_STATE_RTR;
629 	case IB_QPS_RTS:	return MLX5_QP_STATE_RTS;
630 	case IB_QPS_SQD:	return MLX5_QP_STATE_SQD;
631 	case IB_QPS_SQE:	return MLX5_QP_STATE_SQER;
632 	case IB_QPS_ERR:	return MLX5_QP_STATE_ERR;
633 	default:		return -1;
634 	}
635 }
636 
637 static int to_mlx5_st(enum ib_qp_type type)
638 {
639 	switch (type) {
640 	case IB_QPT_RC:			return MLX5_QP_ST_RC;
641 	case IB_QPT_UC:			return MLX5_QP_ST_UC;
642 	case IB_QPT_UD:			return MLX5_QP_ST_UD;
643 	case IB_QPT_XRC_INI:
644 	case IB_QPT_XRC_TGT:		return MLX5_QP_ST_XRC;
645 	case IB_QPT_SMI:		return MLX5_QP_ST_QP0;
646 	case IB_QPT_GSI:		return MLX5_QP_ST_QP1;
647 	case IB_QPT_RAW_IPV6:		return MLX5_QP_ST_RAW_IPV6;
648 	case IB_QPT_RAW_PACKET:
649 	case IB_QPT_RAW_ETHERTYPE:	return MLX5_QP_ST_RAW_ETHERTYPE;
650 	case IB_QPT_MAX:
651 	default:		return -EINVAL;
652 	}
653 }
654 
655 static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq,
656 			     struct mlx5_ib_cq *recv_cq);
657 static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq,
658 			       struct mlx5_ib_cq *recv_cq);
659 
660 static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn)
661 {
662 	return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index;
663 }
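/*
 * E.g. with MLX5_BF_REGS_PER_PAGE == 4, UUARs 0-3 share uars[0],
 * UUARs 4-7 share uars[1], and so on; the returned value is the
 * hardware UAR index of that page, which ends up in
 * qp_counter_set_usr_page below.
 */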
664 
665 static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd,
666 			  struct mlx5_ib_qp *qp, struct ib_udata *udata,
667 			  struct ib_qp_init_attr *attr,
668 			  struct mlx5_create_qp_mbox_in **in,
669 			  int *inlen,
670 			  struct mlx5_exp_ib_create_qp *ucmd)
671 {
672 	struct mlx5_exp_ib_create_qp_resp resp;
673 	struct mlx5_ib_ucontext *context;
674 	int page_shift = 0;
675 	int uar_index;
676 	int npages;
677 	u32 offset = 0;
678 	int uuarn;
679 	int ncont = 0;
680 	int err;
681 
682 	context = to_mucontext(pd->uobject->context);
683 	memset(&resp, 0, sizeof(resp));
684 	resp.size_of_prefix = offsetof(struct mlx5_exp_ib_create_qp_resp, prefix_reserved);
685 	/*
686 	 * TBD: should come from the verbs when we have the API
687 	 */
688 	if (ucmd->exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_WC_UAR_IDX) {
689 		if (ucmd->exp.wc_uar_index == MLX5_EXP_CREATE_QP_DB_ONLY_UUAR) {
690 			/* Assign LATENCY_CLASS_LOW (DB only UUAR) to this QP */
691 			uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
692 			if (uuarn < 0) {
693 				mlx5_ib_warn(dev, "DB only uuar allocation failed\n");
694 				return uuarn;
695 			}
696 			uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
697 		} else if (ucmd->exp.wc_uar_index >= MLX5_IB_MAX_CTX_DYNAMIC_UARS ||
698 			   context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index] ==
699 			   MLX5_IB_INVALID_UAR_INDEX) {
700 			mlx5_ib_warn(dev, "dynamic uuar allocation failed\n");
701 			return -EINVAL;
702 		} else {
703 			uar_index = context->dynamic_wc_uar_index[ucmd->exp.wc_uar_index];
704 			uuarn = MLX5_EXP_INVALID_UUAR;
705 		}
706 	} else {
707 		uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH);
708 		if (uuarn < 0) {
709 			mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n");
710 			mlx5_ib_dbg(dev, "reverting to medium latency\n");
711 			uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM);
712 			if (uuarn < 0) {
713 				mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n");
714 				mlx5_ib_dbg(dev, "reverting to high latency\n");
715 				uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW);
716 				if (uuarn < 0) {
717 					mlx5_ib_warn(dev, "uuar allocation failed\n");
718 					return uuarn;
719 				}
720 			}
721 		}
722 		uar_index = uuarn_to_uar_index(&context->uuari, uuarn);
723 	}
724 	mlx5_ib_dbg(dev, "uuarn 0x%x, uar_index 0x%x\n", uuarn, uar_index);
725 
726 	qp->rq.offset = 0;
727 	qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB);
728 	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
729 
730 	err = set_user_buf_size(dev, qp, (struct mlx5_ib_create_qp *)ucmd, attr);
731 	if (err)
732 		goto err_uuar;
733 
734 	if (ucmd->buf_addr && qp->buf_size) {
735 		qp->umem = ib_umem_get(pd->uobject->context, ucmd->buf_addr,
736 				       qp->buf_size, 0, 0);
737 		if (IS_ERR(qp->umem)) {
738 			mlx5_ib_warn(dev, "umem_get failed\n");
739 			err = PTR_ERR(qp->umem);
740 			goto err_uuar;
741 		}
742 	} else {
743 		qp->umem = NULL;
744 	}
745 
746 	if (qp->umem) {
747 		mlx5_ib_cont_pages(qp->umem, ucmd->buf_addr, &npages, &page_shift,
748 				   &ncont, NULL);
749 		err = mlx5_ib_get_buf_offset(ucmd->buf_addr, page_shift, &offset);
750 		if (err) {
751 			mlx5_ib_warn(dev, "bad offset\n");
752 			goto err_umem;
753 		}
754 		mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift %d, ncont %d, offset %d\n",
755 			    (unsigned long long)ucmd->buf_addr, qp->buf_size,
756 			    npages, page_shift, ncont, offset);
757 	}
758 
759 	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont;
760 	*in = mlx5_vzalloc(*inlen);
761 	if (!*in) {
762 		err = -ENOMEM;
763 		goto err_umem;
764 	}
765 	if (qp->umem)
766 		mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0);
767 	(*in)->ctx.log_pg_sz_remote_qpn =
768 		cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
769 	(*in)->ctx.params2 = cpu_to_be32(offset << 6);
770 
771 	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
772 	resp.uuar_index = uuarn;
773 	qp->uuarn = uuarn;
774 
775 	err = mlx5_ib_db_map_user(context, ucmd->db_addr, &qp->db);
776 	if (err) {
777 		mlx5_ib_warn(dev, "map failed\n");
778 		goto err_free;
779 	}
780 
781 	err = ib_copy_to_udata(udata, &resp, sizeof(struct mlx5_ib_create_qp_resp));
782 	if (err) {
783 		mlx5_ib_err(dev, "copy failed\n");
784 		goto err_unmap;
785 	}
786 	qp->create_type = MLX5_QP_USER;
787 
788 	return 0;
789 
790 err_unmap:
791 	mlx5_ib_db_unmap_user(context, &qp->db);
792 
793 err_free:
794 	kvfree(*in);
795 
796 err_umem:
797 	if (qp->umem)
798 		ib_umem_release(qp->umem);
799 
800 err_uuar:
801 	free_uuar(&context->uuari, uuarn);
802 	return err;
803 }
804 
805 static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp)
806 {
807 	struct mlx5_ib_ucontext *context;
808 
809 	context = to_mucontext(pd->uobject->context);
810 	mlx5_ib_db_unmap_user(context, &qp->db);
811 	if (qp->umem)
812 		ib_umem_release(qp->umem);
813 	if (qp->sq_umem)
814 		ib_umem_release(qp->sq_umem);
815 	/*
816 	 * Free only the UUARs handled by the kernel.
817 	 * UUARs of UARs allocated dynamically are handled by user.
818 	 */
819 	if (qp->uuarn != MLX5_EXP_INVALID_UUAR)
820 		free_uuar(&context->uuari, qp->uuarn);
821 }
822 
823 static int create_kernel_qp(struct mlx5_ib_dev *dev,
824 			    struct ib_qp_init_attr *init_attr,
825 			    struct mlx5_ib_qp *qp,
826 			    struct mlx5_create_qp_mbox_in **in, int *inlen)
827 {
828 	enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW;
829 	struct mlx5_uuar_info *uuari;
830 	int uar_index;
831 	int uuarn;
832 	int err;
833 
834 	uuari = &dev->mdev->priv.uuari;
835 	if (init_attr->create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK))
836 		return -EINVAL;
837 
838 	uuarn = alloc_uuar(uuari, lc);
839 	if (uuarn < 0) {
840 		mlx5_ib_warn(dev, "\n");
841 		return -ENOMEM;
842 	}
843 
844 	qp->bf = &uuari->bfs[uuarn];
845 	uar_index = qp->bf->uar->index;
846 
847 	err = calc_sq_size(dev, init_attr, qp);
848 	if (err < 0) {
849 		mlx5_ib_warn(dev, "err %d\n", err);
850 		goto err_uuar;
851 	}
852 
853 	qp->rq.offset = 0;
854 	qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift;
855 	qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift);
856 
857 	err = mlx5_buf_alloc(dev->mdev, qp->buf_size, PAGE_SIZE * 2, &qp->buf);
858 	if (err) {
859 		mlx5_ib_warn(dev, "err %d\n", err);
860 		goto err_uuar;
861 	}
862 
863 	qp->sq.qend = mlx5_get_send_wqe(qp, qp->sq.wqe_cnt);
864 	*inlen = sizeof(**in) + sizeof(*(*in)->pas) * qp->buf.npages;
865 	*in = mlx5_vzalloc(*inlen);
866 	if (!*in) {
867 		err = -ENOMEM;
868 		goto err_buf;
869 	}
870 	(*in)->ctx.qp_counter_set_usr_page = cpu_to_be32(uar_index);
871 	(*in)->ctx.log_pg_sz_remote_qpn =
872 		cpu_to_be32((qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24);
873 	/* Set "fast registration enabled" for all kernel QPs */
874 	(*in)->ctx.params1 |= cpu_to_be32(1 << 11);
875 	(*in)->ctx.sq_crq_size |= cpu_to_be16(1 << 4);
876 
877 	mlx5_fill_page_array(&qp->buf, (*in)->pas);
878 
879 	err = mlx5_db_alloc(dev->mdev, &qp->db);
880 	if (err) {
881 		mlx5_ib_warn(dev, "err %d\n", err);
882 		goto err_free;
883 	}
884 
885 	qp->sq.swr_ctx = kcalloc(qp->sq.wqe_cnt, sizeof(*qp->sq.swr_ctx),
886 				 GFP_KERNEL);
887 	qp->rq.rwr_ctx = kcalloc(qp->rq.wqe_cnt, sizeof(*qp->rq.rwr_ctx),
888 				 GFP_KERNEL);
889 	if (!qp->sq.swr_ctx || !qp->rq.rwr_ctx) {
890 		err = -ENOMEM;
891 		goto err_wrid;
892 	}
893 	qp->create_type = MLX5_QP_KERNEL;
894 
895 	return 0;
896 
897 err_wrid:
898 	mlx5_db_free(dev->mdev, &qp->db);
899 	kfree(qp->sq.swr_ctx);
900 	kfree(qp->rq.rwr_ctx);
901 
902 err_free:
903 	kvfree(*in);
904 
905 err_buf:
906 	mlx5_buf_free(dev->mdev, &qp->buf);
907 
908 err_uuar:
909 	free_uuar(&dev->mdev->priv.uuari, uuarn);
910 	return err;
911 }
912 
913 static void destroy_qp_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
914 {
915 	mlx5_db_free(dev->mdev, &qp->db);
916 	kfree(qp->sq.swr_ctx);
917 	kfree(qp->rq.rwr_ctx);
918 	mlx5_buf_free(dev->mdev, &qp->buf);
919 	free_uuar(&dev->mdev->priv.uuari, qp->bf->uuarn);
920 }
921 
922 static __be32 get_rx_type(struct mlx5_ib_qp *qp, struct ib_qp_init_attr *attr)
923 {
924 	enum ib_qp_type qt = attr->qp_type;
925 
926 	if (attr->srq || (qt == IB_QPT_XRC_TGT) || (qt == IB_QPT_XRC_INI))
927 		return cpu_to_be32(MLX5_SRQ_RQ);
928 	else if (!qp->has_rq)
929 		return cpu_to_be32(MLX5_ZERO_LEN_RQ);
930 	else
931 		return cpu_to_be32(MLX5_NON_ZERO_RQ);
932 }
933 
934 static int is_connected(enum ib_qp_type qp_type)
935 {
936 	if (qp_type == IB_QPT_RC || qp_type == IB_QPT_UC)
937 		return 1;
938 
939 	return 0;
940 }
941 
942 static void get_cqs(enum ib_qp_type qp_type,
943 		    struct ib_cq *ib_send_cq, struct ib_cq *ib_recv_cq,
944 		    struct mlx5_ib_cq **send_cq, struct mlx5_ib_cq **recv_cq)
945 {
946 	switch (qp_type) {
947 	case IB_QPT_XRC_TGT:
948 		*send_cq = NULL;
949 		*recv_cq = NULL;
950 		break;
951 	case IB_QPT_XRC_INI:
952 		*send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
953 		*recv_cq = NULL;
954 		break;
955 
956 	case IB_QPT_SMI:
957 	case IB_QPT_GSI:
958 	case IB_QPT_RC:
959 	case IB_QPT_UC:
960 	case IB_QPT_UD:
961 	case IB_QPT_RAW_IPV6:
962 	case IB_QPT_RAW_ETHERTYPE:
963 	case IB_QPT_RAW_PACKET:
964 		*send_cq = ib_send_cq ? to_mcq(ib_send_cq) : NULL;
965 		*recv_cq = ib_recv_cq ? to_mcq(ib_recv_cq) : NULL;
966 		break;
967 
968 	case IB_QPT_MAX:
969 	default:
970 		*send_cq = NULL;
971 		*recv_cq = NULL;
972 		break;
973 	}
974 }
975 
976 enum {
977 	MLX5_QP_END_PAD_MODE_ALIGN	= MLX5_WQ_END_PAD_MODE_ALIGN,
978 	MLX5_QP_END_PAD_MODE_NONE	= MLX5_WQ_END_PAD_MODE_NONE,
979 };
980 
981 static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd,
982 			    struct ib_qp_init_attr *init_attr,
983 			    struct ib_udata *udata, struct mlx5_ib_qp *qp)
984 {
985 	struct mlx5_ib_resources *devr = &dev->devr;
986 	struct mlx5_core_dev *mdev = dev->mdev;
987 	struct mlx5_create_qp_mbox_in *in = NULL;
988 	struct mlx5_exp_ib_create_qp ucmd;
989 	struct mlx5_ib_create_qp *pucmd = NULL;
990 	struct mlx5_ib_cq *send_cq;
991 	struct mlx5_ib_cq *recv_cq;
992 	unsigned long flags;
993 	int inlen = sizeof(*in);
994 	size_t ucmd_size;
995 	int err;
996 	int st;
997 	u32 uidx;
998 	void *qpc;
999 
1000 	mutex_init(&qp->mutex);
1001 	spin_lock_init(&qp->sq.lock);
1002 	spin_lock_init(&qp->rq.lock);
1003 
1004 	if (init_attr->create_flags & IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) {
1005 		if (!MLX5_CAP_GEN(mdev, block_lb_mc)) {
1006 			mlx5_ib_warn(dev, "block multicast loopback isn't supported\n");
1007 			return -EINVAL;
1008 		} else {
1009 			qp->flags |= MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK;
1010 		}
1011 	}
1012 
1013 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
1014 		qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE;
1015 
1016 	if (pd && pd->uobject) {
1017 		memset(&ucmd, 0, sizeof(ucmd));
1018 		ucmd_size = sizeof(struct mlx5_ib_create_qp);
1019 		if (ucmd_size > offsetof(struct mlx5_exp_ib_create_qp, size_of_prefix)) {
1020 			mlx5_ib_warn(dev, "mlx5_ib_create_qp is too big to fit as prefix of mlx5_exp_ib_create_qp\n");
1021 			return -EINVAL;
1022 		}
1023 		err = ib_copy_from_udata(&ucmd, udata, min(udata->inlen, ucmd_size));
1024 		if (err) {
1025 			mlx5_ib_err(dev, "copy failed\n");
1026 			return err;
1027 		}
1028 		pucmd = (struct mlx5_ib_create_qp *)&ucmd;
1029 		if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_UIDX)
1030 			uidx = ucmd.exp.uidx;
1031 		else
1032 			uidx = 0xffffff;
1033 
1034 		qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE);
1035 	} else {
1036 		qp->wq_sig = !!workqueue_signature;
1037 		uidx = 0xffffff;
1038 	}
1039 
1040 	qp->has_rq = qp_has_rq(init_attr);
1041 	err = set_rq_size(dev, &init_attr->cap, qp->has_rq,
1042 			  qp, (pd && pd->uobject) ? pucmd : NULL);
1043 	if (err) {
1044 		mlx5_ib_warn(dev, "err %d\n", err);
1045 		return err;
1046 	}
1047 
1048 	if (pd) {
1049 		if (pd->uobject) {
1050 			__u32 max_wqes =
1051 				1 << MLX5_CAP_GEN(mdev, log_max_qp_sz);
1052 			mlx5_ib_dbg(dev, "requested sq_wqe_count (%d)\n", ucmd.sq_wqe_count);
1053 			if (ucmd.rq_wqe_shift != qp->rq.wqe_shift ||
1054 			    ucmd.rq_wqe_count != qp->rq.wqe_cnt) {
1055 				mlx5_ib_warn(dev, "invalid rq params\n");
1056 				return -EINVAL;
1057 			}
1058 			if (ucmd.sq_wqe_count > max_wqes) {
1059 				mlx5_ib_warn(dev, "requested sq_wqe_count (%d) > max allowed (%d)\n",
1060 					     ucmd.sq_wqe_count, max_wqes);
1061 				return -EINVAL;
1062 			}
1063 			err = create_user_qp(dev, pd, qp, udata, init_attr, &in,
1064 					     &inlen, &ucmd);
1065 			if (err)
1066 				mlx5_ib_warn(dev, "err %d\n", err);
1067 		} else {
1068 			if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
1069 				mlx5_ib_warn(dev, "Raw Eth QP is disabled for Kernel consumers\n");
1070 				return -EINVAL;
1071 			}
1072 			err = create_kernel_qp(dev, init_attr, qp, &in, &inlen);
1073 			if (err)
1074 				mlx5_ib_warn(dev, "err %d\n", err);
1075 			else
1076 				qp->pa_lkey = to_mpd(pd)->pa_lkey;
1077 		}
1078 
1079 		if (err)
1080 			return err;
1081 	} else {
1082 		in = mlx5_vzalloc(sizeof(*in));
1083 		if (!in)
1084 			return -ENOMEM;
1085 
1086 		qp->create_type = MLX5_QP_EMPTY;
1087 	}
1088 
1089 	if (is_sqp(init_attr->qp_type))
1090 		qp->port = init_attr->port_num;
1091 
1092 	st = to_mlx5_st(init_attr->qp_type);
1093 	if (st < 0) {
1094 		mlx5_ib_warn(dev, "invalid service type\n");
1095 		err = st;
1096 		goto err_create;
1097 	}
1098 	in->ctx.flags |= cpu_to_be32(st << 16 | MLX5_QP_PM_MIGRATED << 11);
1099 
1100 	in->ctx.flags_pd = cpu_to_be32(to_mpd(pd ? pd : devr->p0)->pdn);
1101 
1102 	if (qp->wq_sig)
1103 		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_ENABLE_SIG);
1104 
1105 	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
1106 		in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST);
1107 
1108 	if (qp->flags &  MLX5_IB_QP_CAP_RX_END_PADDING)
1109 		in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_ALIGN << 2);
1110 	else
1111 		in->ctx.flags |= cpu_to_be32(MLX5_QP_END_PAD_MODE_NONE << 2);
1112 
1113 	if (qp->scat_cqe && is_connected(init_attr->qp_type)) {
1114 		int rcqe_sz;
1115 		int scqe_sz;
1116 
1117 		rcqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->recv_cq);
1118 		scqe_sz = mlx5_ib_get_cqe_size(dev, init_attr->send_cq);
1119 
1120 		if (rcqe_sz == 128) {
1121 			in->ctx.cs_res = MLX5_RES_SCAT_DATA64_CQE;
1122 		} else {
1123 			in->ctx.cs_res = MLX5_RES_SCAT_DATA32_CQE;
1124 		}
1125 
1126 		if (init_attr->sq_sig_type != IB_SIGNAL_ALL_WR) {
1127 			in->ctx.cs_req = 0;
1128 		} else {
1129 			if (scqe_sz == 128)
1130 				in->ctx.cs_req = MLX5_REQ_SCAT_DATA64_CQE;
1131 			else
1132 				in->ctx.cs_req = MLX5_REQ_SCAT_DATA32_CQE;
1133 		}
1134 	}
1135 
1136 	if (qp->rq.wqe_cnt) {
1137 		in->ctx.rq_size_stride = (qp->rq.wqe_shift - 4);
1138 		in->ctx.rq_size_stride |= ilog2(qp->rq.wqe_cnt) << 3;
1139 	}
1140 
1141 	in->ctx.rq_type_srqn = get_rx_type(qp, init_attr);
1142 
1143 	if (qp->sq.wqe_cnt)
1144 		in->ctx.sq_crq_size |= cpu_to_be16(ilog2(qp->sq.wqe_cnt) << 11);
1145 	else
1146 		in->ctx.sq_crq_size |= cpu_to_be16(0x8000);
1147 
1148 	/* Set default resources */
1149 	switch (init_attr->qp_type) {
1150 	case IB_QPT_XRC_TGT:
1151 		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
1152 		in->ctx.cqn_send = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
1153 		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
1154 		in->ctx.xrcd = cpu_to_be32(to_mxrcd(init_attr->xrcd)->xrcdn);
1155 		break;
1156 	case IB_QPT_XRC_INI:
1157 		in->ctx.cqn_recv = cpu_to_be32(to_mcq(devr->c0)->mcq.cqn);
1158 		in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
1159 		in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s0)->msrq.srqn);
1160 		break;
1161 	default:
1162 		if (init_attr->srq) {
1163 			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x0)->xrcdn);
1164 			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(init_attr->srq)->msrq.srqn);
1165 		} else {
1166 			in->ctx.xrcd = cpu_to_be32(to_mxrcd(devr->x1)->xrcdn);
1167 			in->ctx.rq_type_srqn |= cpu_to_be32(to_msrq(devr->s1)->msrq.srqn);
1168 		}
1169 	}
1170 
1171 	if (init_attr->send_cq)
1172 		in->ctx.cqn_send = cpu_to_be32(to_mcq(init_attr->send_cq)->mcq.cqn);
1173 
1174 	if (init_attr->recv_cq)
1175 		in->ctx.cqn_recv = cpu_to_be32(to_mcq(init_attr->recv_cq)->mcq.cqn);
1176 
1177 	in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma);
1178 
1179 	if (MLX5_CAP_GEN(mdev, cqe_version)) {
1180 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
1181 		/* 0xffffff means we ask to work with cqe version 0 */
1182 		MLX5_SET(qpc, qpc, user_index, uidx);
1183 	}
1184 
1185 	if (init_attr->qp_type == IB_QPT_RAW_PACKET) {
1186 		if (MLX5_CAP_GEN(dev->mdev, port_type) != MLX5_CAP_PORT_TYPE_ETH) {
1187 			mlx5_ib_warn(dev, "Raw Ethernet QP is allowed only for Ethernet link layer\n");
1188 			return -ENOSYS;
1189 		}
1190 		if (ucmd.exp.comp_mask & MLX5_EXP_CREATE_QP_MASK_SQ_BUFF_ADD) {
1191 			qp->sq_buf_addr = ucmd.exp.sq_buf_addr;
1192 		} else {
1193 			mlx5_ib_warn(dev, "Raw Ethernet QP needs SQ buff address\n");
1194 			return -EINVAL;
1195 		}
1196 		err = -EOPNOTSUPP;
1197 	} else {
1198 		err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen);
1199 		qp->mqp.event = mlx5_ib_qp_event;
1200 	}
1201 
1202 	if (err) {
1203 		mlx5_ib_warn(dev, "create qp failed\n");
1204 		goto err_create;
1205 	}
1206 
1207 	kvfree(in);
1208 	/* Hardware wants QPN written in big-endian order (after
1209 	 * shifting) for send doorbell.  Precompute this value to save
1210 	 * a little bit when posting sends.
1211 	 */
1212 	qp->doorbell_qpn = swab32(qp->mqp.qpn << 8);
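	/*
	 * Example: for qpn 0x000abc, qpn << 8 == 0x000abc00 and swab32()
	 * of that is 0x00bc0a00; on a little-endian host the stored value
	 * therefore has the same memory layout as be32 0x000abc00, so it
	 * can be written to the doorbell as-is.
	 */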
1213 
1214 	get_cqs(init_attr->qp_type, init_attr->send_cq, init_attr->recv_cq,
1215 		&send_cq, &recv_cq);
1216 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1217 	mlx5_ib_lock_cqs(send_cq, recv_cq);
1218 	/* Maintain device to QPs access, needed for further handling via reset
1219 	 * flow
1220 	 */
1221 	list_add_tail(&qp->qps_list, &dev->qp_list);
1222 	/* Maintain CQ to QPs access, needed for further handling via reset flow
1223 	 */
1224 	if (send_cq)
1225 		list_add_tail(&qp->cq_send_list, &send_cq->list_send_qp);
1226 	if (recv_cq)
1227 		list_add_tail(&qp->cq_recv_list, &recv_cq->list_recv_qp);
1228 	mlx5_ib_unlock_cqs(send_cq, recv_cq);
1229 	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1230 
1231 	return 0;
1232 
1233 err_create:
1234 	if (qp->create_type == MLX5_QP_USER)
1235 		destroy_qp_user(pd, qp);
1236 	else if (qp->create_type == MLX5_QP_KERNEL)
1237 		destroy_qp_kernel(dev, qp);
1238 
1239 	kvfree(in);
1240 	return err;
1241 }
1242 
1243 static void mlx5_ib_lock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
1244 	__acquires(&send_cq->lock) __acquires(&recv_cq->lock)
1245 {
1246 	if (send_cq) {
1247 		if (recv_cq) {
1248 			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
1249 				spin_lock(&send_cq->lock);
1250 				spin_lock_nested(&recv_cq->lock,
1251 						 SINGLE_DEPTH_NESTING);
1252 			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
1253 				spin_lock(&send_cq->lock);
1254 				__acquire(&recv_cq->lock);
1255 			} else {
1256 				spin_lock(&recv_cq->lock);
1257 				spin_lock_nested(&send_cq->lock,
1258 						 SINGLE_DEPTH_NESTING);
1259 			}
1260 		} else {
1261 			spin_lock(&send_cq->lock);
1262 			__acquire(&recv_cq->lock);
1263 		}
1264 	} else if (recv_cq) {
1265 		spin_lock(&recv_cq->lock);
1266 		__acquire(&send_cq->lock);
1267 	} else {
1268 		__acquire(&send_cq->lock);
1269 		__acquire(&recv_cq->lock);
1270 	}
1271 }
1272 
1273 static void mlx5_ib_unlock_cqs(struct mlx5_ib_cq *send_cq, struct mlx5_ib_cq *recv_cq)
1274 	__releases(&send_cq->lock) __releases(&recv_cq->lock)
1275 {
1276 	if (send_cq) {
1277 		if (recv_cq) {
1278 			if (send_cq->mcq.cqn < recv_cq->mcq.cqn)  {
1279 				spin_unlock(&recv_cq->lock);
1280 				spin_unlock(&send_cq->lock);
1281 			} else if (send_cq->mcq.cqn == recv_cq->mcq.cqn) {
1282 				__release(&recv_cq->lock);
1283 				spin_unlock(&send_cq->lock);
1284 			} else {
1285 				spin_unlock(&send_cq->lock);
1286 				spin_unlock(&recv_cq->lock);
1287 			}
1288 		} else {
1289 			__release(&recv_cq->lock);
1290 			spin_unlock(&send_cq->lock);
1291 		}
1292 	} else if (recv_cq) {
1293 		__release(&send_cq->lock);
1294 		spin_unlock(&recv_cq->lock);
1295 	} else {
1296 		__release(&recv_cq->lock);
1297 		__release(&send_cq->lock);
1298 	}
1299 }
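/*
 * Both helpers above take and release the two CQ spinlocks in a fixed
 * order based on cqn (lower cqn first on lock, reverse on unlock), so
 * concurrent callers with the same send/recv CQ pair cannot deadlock.
 * The __acquire()/__release() calls are sparse-style annotations that
 * keep the lock balance consistent when only one CQ (or none) exists.
 */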
1300 
1301 static struct mlx5_ib_pd *get_pd(struct mlx5_ib_qp *qp)
1302 {
1303 	return to_mpd(qp->ibqp.pd);
1304 }
1305 
1306 static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp)
1307 {
1308 	struct mlx5_ib_cq *send_cq, *recv_cq;
1309 	struct mlx5_modify_qp_mbox_in *in;
1310 	unsigned long flags;
1311 	int err;
1312 
1313 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1314 	if (!in)
1315 		return;
1316 
1317 	if (qp->state != IB_QPS_RESET) {
1318 		if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) {
1319 			if (mlx5_core_qp_modify(dev->mdev, MLX5_CMD_OP_2RST_QP, in, 0,
1320 						&qp->mqp))
1321 			mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n",
1322 				     qp->mqp.qpn);
1323 		}
1324 	}
1325 
1326 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
1327 		&send_cq, &recv_cq);
1328 
1329 	spin_lock_irqsave(&dev->reset_flow_resource_lock, flags);
1330 	mlx5_ib_lock_cqs(send_cq, recv_cq);
1331 	/* del from lists under both locks above to protect reset flow paths */
1332 	list_del(&qp->qps_list);
1333 	if (send_cq)
1334 		list_del(&qp->cq_send_list);
1335 
1336 	if (recv_cq)
1337 		list_del(&qp->cq_recv_list);
1338 
1339 	if (qp->create_type == MLX5_QP_KERNEL) {
1340 		__mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
1341 				   qp->ibqp.srq ? to_msrq(qp->ibqp.srq) : NULL);
1342 		if (send_cq != recv_cq)
1343 			__mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
1344 	}
1345 	mlx5_ib_unlock_cqs(send_cq, recv_cq);
1346 	spin_unlock_irqrestore(&dev->reset_flow_resource_lock, flags);
1347 
1348 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
1349 	} else {
1350 		err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp);
1351 		if (err)
1352 			mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n",
1353 				     qp->mqp.qpn);
1354 	}
1355 
1356 	kfree(in);
1357 
1358 	if (qp->create_type == MLX5_QP_KERNEL)
1359 		destroy_qp_kernel(dev, qp);
1360 	else if (qp->create_type == MLX5_QP_USER)
1361 		destroy_qp_user(&get_pd(qp)->ibpd, qp);
1362 }
1363 
1364 static const char *ib_qp_type_str(enum ib_qp_type type)
1365 {
1366 	switch (type) {
1367 	case IB_QPT_SMI:
1368 		return "IB_QPT_SMI";
1369 	case IB_QPT_GSI:
1370 		return "IB_QPT_GSI";
1371 	case IB_QPT_RC:
1372 		return "IB_QPT_RC";
1373 	case IB_QPT_UC:
1374 		return "IB_QPT_UC";
1375 	case IB_QPT_UD:
1376 		return "IB_QPT_UD";
1377 	case IB_QPT_RAW_IPV6:
1378 		return "IB_QPT_RAW_IPV6";
1379 	case IB_QPT_RAW_ETHERTYPE:
1380 		return "IB_QPT_RAW_ETHERTYPE";
1381 	case IB_QPT_XRC_INI:
1382 		return "IB_QPT_XRC_INI";
1383 	case IB_QPT_XRC_TGT:
1384 		return "IB_QPT_XRC_TGT";
1385 	case IB_QPT_RAW_PACKET:
1386 		return "IB_QPT_RAW_PACKET";
1387 	case IB_QPT_MAX:
1388 	default:
1389 		return "Invalid QP type";
1390 	}
1391 }
1392 
1393 struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd,
1394 				struct ib_qp_init_attr *init_attr,
1395 				struct ib_udata *udata)
1396 {
1397 	struct mlx5_ib_dev *dev;
1398 	struct mlx5_ib_qp *qp;
1399 	u16 xrcdn = 0;
1400 	int err;
1401 	u32 rcqn;
1402 	u32 scqn;
1403 
1404 	init_attr->qpg_type = IB_QPG_NONE;
1405 
1406 	if (pd) {
1407 		dev = to_mdev(pd->device);
1408 	} else {
1409 		/* being cautious here */
1410 		if (init_attr->qp_type != IB_QPT_XRC_TGT) {
1411 			printf("mlx5_ib: WARN: ""%s: no PD for transport %s\n", __func__, ib_qp_type_str(init_attr->qp_type));
1412 			return ERR_PTR(-EINVAL);
1413 		}
1414 		dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device);
1415 	}
1416 
1417 	switch (init_attr->qp_type) {
1418 	case IB_QPT_XRC_TGT:
1419 	case IB_QPT_XRC_INI:
1420 		if (!MLX5_CAP_GEN(dev->mdev, xrc)) {
1421 			mlx5_ib_warn(dev, "XRC not supported\n");
1422 			return ERR_PTR(-ENOSYS);
1423 		}
1424 		init_attr->recv_cq = NULL;
1425 		if (init_attr->qp_type == IB_QPT_XRC_TGT) {
1426 			xrcdn = to_mxrcd(init_attr->xrcd)->xrcdn;
1427 			init_attr->send_cq = NULL;
1428 		}
1429 
1430 		/* fall through */
1431 	case IB_QPT_RC:
1432 	case IB_QPT_UC:
1433 	case IB_QPT_UD:
1434 	case IB_QPT_SMI:
1435 	case IB_QPT_GSI:
1436 	case IB_QPT_RAW_ETHERTYPE:
1437 	case IB_QPT_RAW_PACKET:
1438 		qp = kzalloc(sizeof(*qp), GFP_KERNEL);
1439 		if (!qp)
1440 			return ERR_PTR(-ENOMEM);
1441 
1442 		err = create_qp_common(dev, pd, init_attr, udata, qp);
1443 		if (err) {
1444 			mlx5_ib_warn(dev, "create_qp_common failed\n");
1445 			kfree(qp);
1446 			return ERR_PTR(err);
1447 		}
1448 
1449 		if (is_qp0(init_attr->qp_type))
1450 			qp->ibqp.qp_num = 0;
1451 		else if (is_qp1(init_attr->qp_type))
1452 			qp->ibqp.qp_num = 1;
1453 		else
1454 			qp->ibqp.qp_num = qp->mqp.qpn;
1455 
1456 		rcqn = init_attr->recv_cq ? to_mcq(init_attr->recv_cq)->mcq.cqn : -1;
1457 		scqn = init_attr->send_cq ? to_mcq(init_attr->send_cq)->mcq.cqn : -1;
1458 		mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n",
1459 			    qp->ibqp.qp_num, qp->mqp.qpn, rcqn, scqn);
1460 
1461 		qp->xrcdn = xrcdn;
1462 
1463 		break;
1464 
1465 	case IB_QPT_RAW_IPV6:
1466 	case IB_QPT_MAX:
1467 	default:
1468 		mlx5_ib_warn(dev, "unsupported qp type %d\n",
1469 			     init_attr->qp_type);
1470 		/* Don't support raw QPs */
1471 		return ERR_PTR(-EINVAL);
1472 	}
1473 
1474 	return &qp->ibqp;
1475 }
1476 
1477 int mlx5_ib_destroy_qp(struct ib_qp *qp)
1478 {
1479 	struct mlx5_ib_dev *dev = to_mdev(qp->device);
1480 	struct mlx5_ib_qp *mqp = to_mqp(qp);
1481 
1482 	destroy_qp_common(dev, mqp);
1483 
1484 	kfree(mqp);
1485 
1486 	return 0;
1487 }
1488 
1489 static u32 atomic_mode_qp(struct mlx5_ib_dev *dev)
1490 {
1491 	unsigned long mask;
1492 	unsigned long tmp;
1493 
1494 	mask = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp) &
1495 		MLX5_CAP_ATOMIC(dev->mdev, atomic_size_dc);
1496 
1497 	tmp = find_last_bit(&mask, BITS_PER_LONG);
1498 	if (tmp < 2 || tmp >= BITS_PER_LONG)
1499 		return MLX5_ATOMIC_MODE_NONE;
1500 
1501 	if (tmp == 2)
1502 		return MLX5_ATOMIC_MODE_CX;
1503 
1504 	return tmp << MLX5_ATOMIC_MODE_OFF;
1505 }
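/*
 * Example: if the combined atomic_size_qp & atomic_size_dc mask is 0x48
 * (8-byte and 64-byte operands), find_last_bit() returns 6 and the
 * function yields 6 << MLX5_ATOMIC_MODE_OFF; a mask of exactly 0x4
 * (bit 2) maps to MLX5_ATOMIC_MODE_CX, and an empty mask or one with
 * only bits 0-1 set maps to MLX5_ATOMIC_MODE_NONE.
 */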
1506 
1507 static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_attr *attr,
1508 				   int attr_mask)
1509 {
1510 	struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device);
1511 	u32 hw_access_flags = 0;
1512 	u8 dest_rd_atomic;
1513 	u32 access_flags;
1514 
1515 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
1516 		dest_rd_atomic = attr->max_dest_rd_atomic;
1517 	else
1518 		dest_rd_atomic = qp->resp_depth;
1519 
1520 	if (attr_mask & IB_QP_ACCESS_FLAGS)
1521 		access_flags = attr->qp_access_flags;
1522 	else
1523 		access_flags = qp->atomic_rd_en;
1524 
1525 	if (!dest_rd_atomic)
1526 		access_flags &= IB_ACCESS_REMOTE_WRITE;
1527 
1528 	if (access_flags & IB_ACCESS_REMOTE_READ)
1529 		hw_access_flags |= MLX5_QP_BIT_RRE;
1530 	if (access_flags & IB_ACCESS_REMOTE_ATOMIC)
1531 		hw_access_flags |= (MLX5_QP_BIT_RAE |
1532 				    atomic_mode_qp(dev));
1533 	if (access_flags & IB_ACCESS_REMOTE_WRITE)
1534 		hw_access_flags |= MLX5_QP_BIT_RWE;
1535 
1536 	return cpu_to_be32(hw_access_flags);
1537 }
1538 
1539 enum {
1540 	MLX5_PATH_FLAG_FL	= 1 << 0,
1541 	MLX5_PATH_FLAG_FREE_AR	= 1 << 1,
1542 	MLX5_PATH_FLAG_COUNTER	= 1 << 2,
1543 };
1544 
1545 static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate)
1546 {
1547 	if (rate == IB_RATE_PORT_CURRENT) {
1548 		return 0;
1549 	} else if (rate < IB_RATE_2_5_GBPS || rate > IB_RATE_300_GBPS) {
1550 		return -EINVAL;
1551 	} else {
1552 		while (rate != IB_RATE_2_5_GBPS &&
1553 		       !(1 << (rate + MLX5_STAT_RATE_OFFSET) &
1554 			 MLX5_CAP_GEN(dev->mdev, stat_rate_support)))
1555 			--rate;
1556 	}
1557 
1558 	return rate + MLX5_STAT_RATE_OFFSET;
1559 }
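/*
 * IB_RATE_PORT_CURRENT maps to 0 ("use the port rate"); any other
 * supported rate is translated to rate + MLX5_STAT_RATE_OFFSET, after
 * stepping down towards IB_RATE_2_5_GBPS until the corresponding bit
 * is set in the device's stat_rate_support mask.  Rates outside the
 * IB_RATE_2_5_GBPS..IB_RATE_300_GBPS range are rejected with -EINVAL.
 */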
1560 
1561 static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah,
1562 			 struct mlx5_qp_path *path, u8 port, int attr_mask,
1563 			 u32 path_flags, const struct ib_qp_attr *attr,
1564 			 int alt)
1565 {
1566 	enum rdma_link_layer ll = dev->ib_dev.get_link_layer(&dev->ib_dev,
1567 							     port);
1568 	int err;
1569 	int gid_type;
1570 
1571 	if ((ll == IB_LINK_LAYER_ETHERNET) || (ah->ah_flags & IB_AH_GRH)) {
1572 		int len = dev->mdev->port_caps[port - 1].gid_table_len;
1573 		if (ah->grh.sgid_index >= len) {
1574 			printf("mlx5_ib: ERR: ""sgid_index (%u) too large. max is %d\n", ah->grh.sgid_index, len - 1);
1575 			return -EINVAL;
1576 		}
1577 	}
1578 
1579 	if (ll == IB_LINK_LAYER_ETHERNET) {
1580 		if (!(ah->ah_flags & IB_AH_GRH))
1581 			return -EINVAL;
1582 
1583 		err = mlx5_get_roce_gid_type(dev, port, ah->grh.sgid_index,
1584 					     &gid_type);
1585 		if (err)
1586 			return err;
1587 		memcpy(path->rmac, ah->dmac, sizeof(ah->dmac));
1588 		path->udp_sport = mlx5_get_roce_udp_sport(dev, port,
1589 							  ah->grh.sgid_index,
1590 							  0);
1591 		path->dci_cfi_prio_sl = (ah->sl & 0xf) << 4;
1592 	} else {
1593 		path->fl_free_ar = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0;
1594 		path->grh_mlid	= ah->src_path_bits & 0x7f;
1595 		path->rlid	= cpu_to_be16(ah->dlid);
1596 		if (ah->ah_flags & IB_AH_GRH)
1597 			path->grh_mlid	|= 1 << 7;
1598 		if (attr_mask & IB_QP_PKEY_INDEX)
1599 			path->pkey_index = cpu_to_be16(alt ?
1600 						       attr->alt_pkey_index :
1601 						       attr->pkey_index);
1602 
1603 		path->dci_cfi_prio_sl = ah->sl & 0xf;
1604 	}
1605 
1606 	path->fl_free_ar |= (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x40 : 0;
1607 
1608 	if (ah->ah_flags & IB_AH_GRH) {
1609 		path->mgid_index = ah->grh.sgid_index;
1610 		path->hop_limit  = ah->grh.hop_limit;
1611 		path->tclass_flowlabel =
1612 			cpu_to_be32((ah->grh.traffic_class << 20) |
1613 				    (ah->grh.flow_label));
1614 		memcpy(path->rgid, ah->grh.dgid.raw, 16);
1615 	}
1616 
1617 	err = ib_rate_to_mlx5(dev, ah->static_rate);
1618 	if (err < 0)
1619 		return err;
1620 	path->static_rate = err;
1621 	path->port = port;
1622 
1623 	if (attr_mask & IB_QP_TIMEOUT)
1624 		path->ackto_lt = alt ? attr->alt_timeout << 3 : attr->timeout << 3;
1625 
1626 	return 0;
1627 }
1628 
1629 static enum mlx5_qp_optpar opt_mask[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE][MLX5_QP_ST_MAX] = {
1630 	[MLX5_QP_STATE_INIT] = {
1631 		[MLX5_QP_STATE_INIT] = {
1632 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
1633 					  MLX5_QP_OPTPAR_RAE		|
1634 					  MLX5_QP_OPTPAR_RWE		|
1635 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
1636 					  MLX5_QP_OPTPAR_PRI_PORT,
1637 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
1638 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
1639 					  MLX5_QP_OPTPAR_PRI_PORT,
1640 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX	|
1641 					  MLX5_QP_OPTPAR_Q_KEY		|
1642 					  MLX5_QP_OPTPAR_PRI_PORT,
1643 			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PRI_PORT	|
1644 					  MLX5_QP_OPTPAR_DC_KEY		|
1645 					  MLX5_QP_OPTPAR_PKEY_INDEX	|
1646 					  MLX5_QP_OPTPAR_RAE,
1647 		},
1648 		[MLX5_QP_STATE_RTR] = {
1649 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
1650 					  MLX5_QP_OPTPAR_RRE            |
1651 					  MLX5_QP_OPTPAR_RAE            |
1652 					  MLX5_QP_OPTPAR_RWE            |
1653 					  MLX5_QP_OPTPAR_PKEY_INDEX,
1654 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH  |
1655 					  MLX5_QP_OPTPAR_RWE            |
1656 					  MLX5_QP_OPTPAR_PKEY_INDEX,
1657 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_PKEY_INDEX     |
1658 					  MLX5_QP_OPTPAR_Q_KEY,
1659 			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_PKEY_INDEX	|
1660 					   MLX5_QP_OPTPAR_Q_KEY,
1661 			[MLX5_QP_ST_XRC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH |
1662 					  MLX5_QP_OPTPAR_RRE            |
1663 					  MLX5_QP_OPTPAR_RAE            |
1664 					  MLX5_QP_OPTPAR_RWE            |
1665 					  MLX5_QP_OPTPAR_PKEY_INDEX,
1666 			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_PKEY_INDEX	|
1667 					  MLX5_QP_OPTPAR_RAE		|
1668 					  MLX5_QP_OPTPAR_DC_KEY,
1669 		},
1670 	},
1671 	[MLX5_QP_STATE_RTR] = {
1672 		[MLX5_QP_STATE_RTS] = {
1673 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
1674 					  MLX5_QP_OPTPAR_RRE		|
1675 					  MLX5_QP_OPTPAR_RAE		|
1676 					  MLX5_QP_OPTPAR_RWE		|
1677 					  MLX5_QP_OPTPAR_PM_STATE	|
1678 					  MLX5_QP_OPTPAR_RNR_TIMEOUT,
1679 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_ALT_ADDR_PATH	|
1680 					  MLX5_QP_OPTPAR_RWE		|
1681 					  MLX5_QP_OPTPAR_PM_STATE,
1682 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY,
1683 			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY		|
1684 					  MLX5_QP_OPTPAR_PM_STATE	|
1685 					  MLX5_QP_OPTPAR_RAE,
1686 		},
1687 	},
1688 	[MLX5_QP_STATE_RTS] = {
1689 		[MLX5_QP_STATE_RTS] = {
1690 			[MLX5_QP_ST_RC] = MLX5_QP_OPTPAR_RRE		|
1691 					  MLX5_QP_OPTPAR_RAE		|
1692 					  MLX5_QP_OPTPAR_RWE		|
1693 					  MLX5_QP_OPTPAR_RNR_TIMEOUT	|
1694 					  MLX5_QP_OPTPAR_PM_STATE	|
1695 					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
1696 			[MLX5_QP_ST_UC] = MLX5_QP_OPTPAR_RWE		|
1697 					  MLX5_QP_OPTPAR_PM_STATE	|
1698 					  MLX5_QP_OPTPAR_ALT_ADDR_PATH,
1699 			[MLX5_QP_ST_UD] = MLX5_QP_OPTPAR_Q_KEY		|
1700 					  MLX5_QP_OPTPAR_SRQN		|
1701 					  MLX5_QP_OPTPAR_CQN_RCV,
1702 			[MLX5_QP_ST_DCI] = MLX5_QP_OPTPAR_DC_KEY		|
1703 					  MLX5_QP_OPTPAR_PM_STATE	|
1704 					  MLX5_QP_OPTPAR_RAE,
1705 		},
1706 	},
1707 	[MLX5_QP_STATE_SQER] = {
1708 		[MLX5_QP_STATE_RTS] = {
1709 			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,
1710 			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
1711 			[MLX5_QP_ST_UC]	 = MLX5_QP_OPTPAR_RWE,
1712 			[MLX5_QP_ST_RC]	 = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
1713 					   MLX5_QP_OPTPAR_RWE		|
1714 					   MLX5_QP_OPTPAR_RAE		|
1715 					   MLX5_QP_OPTPAR_RRE,
1716 			[MLX5_QP_ST_DCI]  = MLX5_QP_OPTPAR_DC_KEY	|
1717 					   MLX5_QP_OPTPAR_RAE,
1718 
1719 		},
1720 	},
1721 	[MLX5_QP_STATE_SQD] = {
1722 		[MLX5_QP_STATE_RTS] = {
1723 			[MLX5_QP_ST_UD]	 = MLX5_QP_OPTPAR_Q_KEY,
1724 			[MLX5_QP_ST_MLX] = MLX5_QP_OPTPAR_Q_KEY,
1725 			[MLX5_QP_ST_UC]	 = MLX5_QP_OPTPAR_RWE,
1726 			[MLX5_QP_ST_RC]	 = MLX5_QP_OPTPAR_RNR_TIMEOUT	|
1727 					   MLX5_QP_OPTPAR_RWE		|
1728 					   MLX5_QP_OPTPAR_RAE		|
1729 					   MLX5_QP_OPTPAR_RRE,
1730 		},
1731 	},
1732 };
1733 
1734 static int ib_nr_to_mlx5_nr(int ib_mask)
1735 {
1736 	switch (ib_mask) {
1737 	case IB_QP_STATE:
1738 		return 0;
1739 	case IB_QP_CUR_STATE:
1740 		return 0;
1741 	case IB_QP_EN_SQD_ASYNC_NOTIFY:
1742 		return 0;
1743 	case IB_QP_ACCESS_FLAGS:
1744 		return MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
1745 			MLX5_QP_OPTPAR_RAE;
1746 	case IB_QP_PKEY_INDEX:
1747 		return MLX5_QP_OPTPAR_PKEY_INDEX;
1748 	case IB_QP_PORT:
1749 		return MLX5_QP_OPTPAR_PRI_PORT;
1750 	case IB_QP_QKEY:
1751 		return MLX5_QP_OPTPAR_Q_KEY;
1752 	case IB_QP_AV:
1753 		return MLX5_QP_OPTPAR_PRIMARY_ADDR_PATH |
1754 			MLX5_QP_OPTPAR_PRI_PORT;
1755 	case IB_QP_PATH_MTU:
1756 		return 0;
1757 	case IB_QP_TIMEOUT:
1758 		return MLX5_QP_OPTPAR_ACK_TIMEOUT;
1759 	case IB_QP_RETRY_CNT:
1760 		return MLX5_QP_OPTPAR_RETRY_COUNT;
1761 	case IB_QP_RNR_RETRY:
1762 		return MLX5_QP_OPTPAR_RNR_RETRY;
1763 	case IB_QP_RQ_PSN:
1764 		return 0;
1765 	case IB_QP_MAX_QP_RD_ATOMIC:
1766 		return MLX5_QP_OPTPAR_SRA_MAX;
1767 	case IB_QP_ALT_PATH:
1768 		return MLX5_QP_OPTPAR_ALT_ADDR_PATH;
1769 	case IB_QP_MIN_RNR_TIMER:
1770 		return MLX5_QP_OPTPAR_RNR_TIMEOUT;
1771 	case IB_QP_SQ_PSN:
1772 		return 0;
1773 	case IB_QP_MAX_DEST_RD_ATOMIC:
1774 		return MLX5_QP_OPTPAR_RRA_MAX | MLX5_QP_OPTPAR_RWE |
1775 			MLX5_QP_OPTPAR_RRE | MLX5_QP_OPTPAR_RAE;
1776 	case IB_QP_PATH_MIG_STATE:
1777 		return MLX5_QP_OPTPAR_PM_STATE;
1778 	case IB_QP_CAP:
1779 		return 0;
1780 	case IB_QP_DEST_QPN:
1781 		return 0;
1782 	}
1783 	return 0;
1784 }
1785 
1786 static int ib_mask_to_mlx5_opt(int ib_mask)
1787 {
1788 	int result = 0;
1789 	int i;
1790 
1791 	for (i = 0; i < 8 * sizeof(int); i++) {
1792 		if ((1 << i) & ib_mask)
1793 			result |= ib_nr_to_mlx5_nr(1 << i);
1794 	}
1795 
1796 	return result;
1797 }
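/*
 * Example: attr_mask = IB_QP_STATE | IB_QP_ACCESS_FLAGS | IB_QP_PKEY_INDEX
 * translates to MLX5_QP_OPTPAR_RWE | MLX5_QP_OPTPAR_RRE |
 * MLX5_QP_OPTPAR_RAE | MLX5_QP_OPTPAR_PKEY_INDEX; IB_QP_STATE itself
 * contributes no optional-parameter bit.  The result is then combined
 * with the opt_mask[][][] table above for the specific state transition.
 */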
1798 
1799 static int __mlx5_ib_modify_qp(struct ib_qp *ibqp,
1800 			       const struct ib_qp_attr *attr, int attr_mask,
1801 			       enum ib_qp_state cur_state, enum ib_qp_state new_state)
1802 {
1803 	static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = {
1804 		[MLX5_QP_STATE_RST] = {
1805 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1806 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1807 			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_RST2INIT_QP,
1808 		},
1809 		[MLX5_QP_STATE_INIT]  = {
1810 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1811 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1812 			[MLX5_QP_STATE_INIT]	= MLX5_CMD_OP_INIT2INIT_QP,
1813 			[MLX5_QP_STATE_RTR]	= MLX5_CMD_OP_INIT2RTR_QP,
1814 		},
1815 		[MLX5_QP_STATE_RTR]   = {
1816 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1817 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1818 			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTR2RTS_QP,
1819 		},
1820 		[MLX5_QP_STATE_RTS]   = {
1821 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1822 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1823 			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_RTS2RTS_QP,
1824 		},
1825 		[MLX5_QP_STATE_SQD] = {
1826 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1827 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1828 			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_SQD_RTS_QP,
1829 		},
1830 		[MLX5_QP_STATE_SQER] = {
1831 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1832 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1833 			[MLX5_QP_STATE_RTS]	= MLX5_CMD_OP_SQERR2RTS_QP,
1834 		},
1835 		[MLX5_QP_STATE_ERR] = {
1836 			[MLX5_QP_STATE_RST]	= MLX5_CMD_OP_2RST_QP,
1837 			[MLX5_QP_STATE_ERR]	= MLX5_CMD_OP_2ERR_QP,
1838 		}
1839 	};
1840 
1841 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
1842 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
1843 	struct mlx5_ib_cq *send_cq, *recv_cq;
1844 	struct mlx5_qp_context *context;
1845 	struct mlx5_modify_qp_mbox_in *in;
1846 	struct mlx5_ib_pd *pd;
1847 	enum mlx5_qp_state mlx5_cur, mlx5_new;
1848 	enum mlx5_qp_optpar optpar;
1849 	int sqd_event;
1850 	int mlx5_st;
1851 	int err;
1852 	u16 op;
1853 
1854 	in = kzalloc(sizeof(*in), GFP_KERNEL);
1855 	if (!in)
1856 		return -ENOMEM;
1857 
1858 	context = &in->ctx;
1859 	err = to_mlx5_st(ibqp->qp_type);
1860 	if (err < 0)
1861 		goto out;
1862 
1863 	context->flags = cpu_to_be32(err << 16);
1864 
1865 	if (!(attr_mask & IB_QP_PATH_MIG_STATE)) {
1866 		context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
1867 	} else {
1868 		switch (attr->path_mig_state) {
1869 		case IB_MIG_MIGRATED:
1870 			context->flags |= cpu_to_be32(MLX5_QP_PM_MIGRATED << 11);
1871 			break;
1872 		case IB_MIG_REARM:
1873 			context->flags |= cpu_to_be32(MLX5_QP_PM_REARM << 11);
1874 			break;
1875 		case IB_MIG_ARMED:
1876 			context->flags |= cpu_to_be32(MLX5_QP_PM_ARMED << 11);
1877 			break;
1878 		}
1879 	}
1880 
1881 	if (ibqp->qp_type == IB_QPT_GSI || ibqp->qp_type == IB_QPT_SMI) {
1882 		context->mtu_msgmax = (IB_MTU_256 << 5) | 8;
1883 	} else if (ibqp->qp_type == IB_QPT_UD) {
1884 		context->mtu_msgmax = (IB_MTU_4096 << 5) | 12;
1885 	} else if (attr_mask & IB_QP_PATH_MTU) {
1886 		if (attr->path_mtu < IB_MTU_256 ||
1887 		    attr->path_mtu > IB_MTU_4096) {
1888 			mlx5_ib_warn(dev, "invalid mtu %d\n", attr->path_mtu);
1889 			err = -EINVAL;
1890 			goto out;
1891 		}
1892 		context->mtu_msgmax = (attr->path_mtu << 5) |
1893 				      (u8)MLX5_CAP_GEN(dev->mdev, log_max_msg);
1894 	}
1895 
1896 	if (attr_mask & IB_QP_DEST_QPN)
1897 		context->log_pg_sz_remote_qpn = cpu_to_be32(attr->dest_qp_num);
1898 
1899 	if (attr_mask & IB_QP_PKEY_INDEX)
1900 		context->pri_path.pkey_index = cpu_to_be16(attr->pkey_index);
1901 
1902 	/* TODO: implement counter_index functionality */
1903 
1904 	if (is_sqp(ibqp->qp_type))
1905 		context->pri_path.port = qp->port;
1906 
1907 	if (attr_mask & IB_QP_PORT)
1908 		context->pri_path.port = attr->port_num;
1909 
1910 	if (attr_mask & IB_QP_AV) {
1911 		err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path,
1912 				    attr_mask & IB_QP_PORT ? attr->port_num : qp->port,
1913 				    attr_mask, 0, attr, 0);
1914 		if (err)
1915 			goto out;
1916 	}
1917 
1918 	if (attr_mask & IB_QP_TIMEOUT)
1919 		context->pri_path.ackto_lt |= attr->timeout << 3;
1920 
1921 	if (attr_mask & IB_QP_ALT_PATH) {
1922 		err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path,
1923 				    attr->alt_port_num,
1924 				    attr_mask  | IB_QP_PKEY_INDEX | IB_QP_TIMEOUT,
1925 				    0, attr, 1);
1926 		if (err)
1927 			goto out;
1928 	}
1929 
1930 	pd = get_pd(qp);
1931 	get_cqs(qp->ibqp.qp_type, qp->ibqp.send_cq, qp->ibqp.recv_cq,
1932 		&send_cq, &recv_cq);
1933 
1934 	context->flags_pd = cpu_to_be32(pd ? pd->pdn : to_mpd(dev->devr.p0)->pdn);
1935 	context->cqn_send = send_cq ? cpu_to_be32(send_cq->mcq.cqn) : 0;
1936 	context->cqn_recv = recv_cq ? cpu_to_be32(recv_cq->mcq.cqn) : 0;
1937 	context->params1  = cpu_to_be32(MLX5_IB_ACK_REQ_FREQ << 28);
1938 
1939 	if (attr_mask & IB_QP_RNR_RETRY)
1940 		context->params1 |= cpu_to_be32(attr->rnr_retry << 13);
1941 
1942 	if (attr_mask & IB_QP_RETRY_CNT)
1943 		context->params1 |= cpu_to_be32(attr->retry_cnt << 16);
1944 
1945 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC) {
1946 		if (attr->max_rd_atomic)
1947 			context->params1 |=
1948 				cpu_to_be32(fls(attr->max_rd_atomic - 1) << 21);
1949 	}
1950 
1951 	if (attr_mask & IB_QP_SQ_PSN)
1952 		context->next_send_psn = cpu_to_be32(attr->sq_psn & 0xffffff);
1953 
1954 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) {
1955 		if (attr->max_dest_rd_atomic)
1956 			context->params2 |=
1957 				cpu_to_be32(fls(attr->max_dest_rd_atomic - 1) << 21);
1958 	}
1959 
1960 	if ((attr_mask & IB_QP_ACCESS_FLAGS) &&
1961 	    (attr->qp_access_flags & IB_ACCESS_REMOTE_ATOMIC) &&
1962 	    !dev->enable_atomic_resp) {
1963 		mlx5_ib_warn(dev, "atomic responder is not supported\n");
1964 		err = -EINVAL;
1965 		goto out;
1966 	}
1967 
1968 	if (attr_mask & (IB_QP_ACCESS_FLAGS | IB_QP_MAX_DEST_RD_ATOMIC))
1969 		context->params2 |= to_mlx5_access_flags(qp, attr, attr_mask);
1970 
1971 	if (attr_mask & IB_QP_MIN_RNR_TIMER)
1972 		context->rnr_nextrecvpsn |= cpu_to_be32(attr->min_rnr_timer << 24);
1973 
1974 	if (attr_mask & IB_QP_RQ_PSN)
1975 		context->rnr_nextrecvpsn |= cpu_to_be32(attr->rq_psn & 0xffffff);
1976 
1977 	if (attr_mask & IB_QP_QKEY)
1978 		context->qkey = cpu_to_be32(attr->qkey);
1979 
1980 	if (qp->rq.wqe_cnt && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1981 		context->db_rec_addr = cpu_to_be64(qp->db.dma);
1982 
1983 	if (cur_state == IB_QPS_RTS && new_state == IB_QPS_SQD	&&
1984 	    attr_mask & IB_QP_EN_SQD_ASYNC_NOTIFY && attr->en_sqd_async_notify)
1985 		sqd_event = 1;
1986 	else
1987 		sqd_event = 0;
1988 
1989 	if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT)
1990 		context->sq_crq_size |= cpu_to_be16(1 << 4);
1991 
1992 	if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) {
1993 		u8 port_num = (attr_mask & IB_QP_PORT ? attr->port_num :
1994 			       qp->port) - 1;
1995 		struct mlx5_ib_port *mibport = &dev->port[port_num];
1996 
1997 		context->qp_counter_set_usr_page |=
1998 			cpu_to_be32(mibport->q_cnt_id << 24);
1999 	}
2000 
2001 	mlx5_cur = to_mlx5_state(cur_state);
2002 	mlx5_new = to_mlx5_state(new_state);
2003 	mlx5_st = to_mlx5_st(ibqp->qp_type);
2004 	if (mlx5_st < 0) {
2005 		err = mlx5_st;	/* do not exit with a stale 'err' value */
     		goto out;
     	}
2006 
2007 	if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE ||
2008 	    !optab[mlx5_cur][mlx5_new]) {
2009 		err = -EINVAL;	/* take the common exit path so 'in' is freed */
     		goto out;
     	}
2010 
2011 	op = optab[mlx5_cur][mlx5_new];
2012 	optpar = ib_mask_to_mlx5_opt(attr_mask);
2013 	optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st];
2014 	in->optparam = cpu_to_be32(optpar);
2015 
2016 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET)
2017 		err = -EOPNOTSUPP;
2018 	else
2019 		err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event,
2020 				  &qp->mqp);
2021 	if (err)
2022 		goto out;
2023 
2024 	qp->state = new_state;
2025 
2026 	if (attr_mask & IB_QP_ACCESS_FLAGS)
2027 		qp->atomic_rd_en = attr->qp_access_flags;
2028 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC)
2029 		qp->resp_depth = attr->max_dest_rd_atomic;
2030 	if (attr_mask & IB_QP_PORT)
2031 		qp->port = attr->port_num;
2032 	if (attr_mask & IB_QP_ALT_PATH)
2033 		qp->alt_port = attr->alt_port_num;
2034 
2035 	/*
2036 	 * If we moved a kernel QP to RESET, clean up all old CQ
2037 	 * entries and reinitialize the QP.
2038 	 */
2039 	if (new_state == IB_QPS_RESET && !ibqp->uobject) {
2040 		mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn,
2041 				 ibqp->srq ? to_msrq(ibqp->srq) : NULL);
2042 		if (send_cq != recv_cq)
2043 			mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL);
2044 
2045 		qp->rq.head = 0;
2046 		qp->rq.tail = 0;
2047 		qp->sq.head = 0;
2048 		qp->sq.tail = 0;
2049 		qp->sq.cur_post = 0;
2050 		qp->sq.last_poll = 0;
2051 		if (qp->db.db) {
2052 			qp->db.db[MLX5_RCV_DBR] = 0;
2053 			qp->db.db[MLX5_SND_DBR] = 0;
2054 		}
2055 	}
2056 
2057 out:
2058 	kfree(in);
2059 	return err;
2060 }
2061 
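/*
 * In __mlx5_ib_modify_qp() above, optab[][] maps an (old state, new state)
 * pair to the firmware command that performs the transition; a zero entry
 * means the transition is rejected with -EINVAL before anything is sent to
 * firmware.  A typical RC bring-up therefore issues, in order:
 *
 *	RESET -> INIT : MLX5_CMD_OP_RST2INIT_QP
 *	INIT  -> RTR  : MLX5_CMD_OP_INIT2RTR_QP
 *	RTR   -> RTS  : MLX5_CMD_OP_RTR2RTS_QP
 */
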
2062 static int ignored_ts_check(enum ib_qp_type qp_type)
2063 {
2064 	return 0;
2065 }
2066 
2067 int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
2068 		      int attr_mask, struct ib_udata *udata)
2069 {
2070 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2071 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
2072 	enum ib_qp_state cur_state, new_state;
2073 	int err = -EINVAL;
2074 	int port;
2075 	enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED;
2076 
2077 	mutex_lock(&qp->mutex);
2078 
2079 	cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state;
2080 	new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state;
2081 
2082 	if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) {
2083 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2084 		ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port);
2085 	}
2086 
2087 	if (!ignored_ts_check(ibqp->qp_type) &&
2088 	    !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, ll))
2089 		goto out;
2090 
2091 	if ((attr_mask & IB_QP_PORT) &&
2092 	    (attr->port_num == 0 ||
2093 	     attr->port_num > MLX5_CAP_GEN(dev->mdev, num_ports)))
2094 		goto out;
2095 
2096 	if (attr_mask & IB_QP_PKEY_INDEX) {
2097 		port = attr_mask & IB_QP_PORT ? attr->port_num : qp->port;
2098 		if (attr->pkey_index >=
2099 		    dev->mdev->port_caps[port - 1].pkey_table_len)
2100 			goto out;
2101 	}
2102 
2103 	if (attr_mask & IB_QP_MAX_QP_RD_ATOMIC &&
2104 	    attr->max_rd_atomic >
2105 	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_res_qp)))
2106 		goto out;
2107 
2108 	if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC &&
2109 	    attr->max_dest_rd_atomic >
2110 	    (1 << MLX5_CAP_GEN(dev->mdev, log_max_ra_req_qp)))
2111 		goto out;
2112 
2113 	if (cur_state == new_state && cur_state == IB_QPS_RESET) {
2114 		err = 0;
2115 		goto out;
2116 	}
2117 
2118 	err = __mlx5_ib_modify_qp(ibqp, attr, attr_mask, cur_state, new_state);
2119 
2120 out:
2121 	mutex_unlock(&qp->mutex);
2122 	return err;
2123 }
2124 
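/*
 * For reference, kernel consumers reach mlx5_ib_modify_qp() through
 * ib_modify_qp(); user-space applications use the equivalent verb
 * ibv_modify_qp() from libibverbs.  Minimal sketch (assumes libibverbs and
 * an already created RC QP; 'port' is a placeholder the application fills
 * in):
 *
 *	struct ibv_qp_attr a = {
 *		.qp_state        = IBV_QPS_INIT,
 *		.pkey_index      = 0,
 *		.port_num        = port,
 *		.qp_access_flags = IBV_ACCESS_REMOTE_WRITE,
 *	};
 *	int rc = ibv_modify_qp(qp, &a, IBV_QP_STATE | IBV_QP_PKEY_INDEX |
 *			       IBV_QP_PORT | IBV_QP_ACCESS_FLAGS);
 *
 * followed by similar calls for IBV_QPS_RTR and IBV_QPS_RTS with the
 * attribute masks the IB specification requires for those transitions.
 */
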
2125 static int mlx5_wq_overflow(struct mlx5_ib_wq *wq, int nreq, struct ib_cq *ib_cq)
2126 {
2127 	struct mlx5_ib_cq *cq;
2128 	unsigned cur;
2129 
2130 	cur = wq->head - wq->tail;
2131 	if (likely(cur + nreq < wq->max_post))
2132 		return 0;
2133 
2134 	cq = to_mcq(ib_cq);
2135 	spin_lock(&cq->lock);
2136 	cur = wq->head - wq->tail;
2137 	spin_unlock(&cq->lock);
2138 
2139 	return cur + nreq >= wq->max_post;
2140 }
2141 
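/*
 * Note on mlx5_wq_overflow(): head and tail are free-running unsigned
 * counters, so 'head - tail' is the number of outstanding WQEs even after
 * either counter wraps.  For example, with max_post = 256, head = 260 and
 * tail = 10, cur = 250 and a request to post nreq = 8 more WQEs is
 * reported as overflow (unless polling the CQ has advanced tail by the
 * time it is re-read under the CQ lock).
 */
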
2142 static __always_inline void set_raddr_seg(struct mlx5_wqe_raddr_seg *rseg,
2143 					  u64 remote_addr, u32 rkey)
2144 {
2145 	rseg->raddr    = cpu_to_be64(remote_addr);
2146 	rseg->rkey     = cpu_to_be32(rkey);
2147 	rseg->reserved = 0;
2148 }
2149 
2150 static void set_datagram_seg(struct mlx5_wqe_datagram_seg *dseg,
2151 			     struct ib_send_wr *wr)
2152 {
2153 	memcpy(&dseg->av, &to_mah(wr->wr.ud.ah)->av, sizeof(struct mlx5_av));
2154 	dseg->av.dqp_dct = cpu_to_be32(wr->wr.ud.remote_qpn | MLX5_EXTENDED_UD_AV);
2155 	dseg->av.key.qkey.qkey = cpu_to_be32(wr->wr.ud.remote_qkey);
2156 }
2157 
2158 static void set_data_ptr_seg(struct mlx5_wqe_data_seg *dseg, struct ib_sge *sg)
2159 {
2160 	dseg->byte_count = cpu_to_be32(sg->length);
2161 	dseg->lkey       = cpu_to_be32(sg->lkey);
2162 	dseg->addr       = cpu_to_be64(sg->addr);
2163 }
2164 
2165 static __be16 get_klm_octo(int npages)
2166 {
2167 	return cpu_to_be16(ALIGN(npages, 8) / 2);
2168 }
2169 
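/*
 * Example: get_klm_octo(5) returns cpu_to_be16(4); the page count is
 * rounded up to a multiple of 8 and divided by 2 so the UMR control
 * segment expresses the translation entries in 16-byte octoword units.
 */
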
2170 static __be64 frwr_mkey_mask(void)
2171 {
2172 	u64 result;
2173 
2174 	result = MLX5_MKEY_MASK_LEN		|
2175 		MLX5_MKEY_MASK_PAGE_SIZE	|
2176 		MLX5_MKEY_MASK_START_ADDR	|
2177 		MLX5_MKEY_MASK_EN_RINVAL	|
2178 		MLX5_MKEY_MASK_KEY		|
2179 		MLX5_MKEY_MASK_LR		|
2180 		MLX5_MKEY_MASK_LW		|
2181 		MLX5_MKEY_MASK_RR		|
2182 		MLX5_MKEY_MASK_RW		|
2183 		MLX5_MKEY_MASK_A		|
2184 		MLX5_MKEY_MASK_SMALL_FENCE	|
2185 		MLX5_MKEY_MASK_FREE;
2186 
2187 	return cpu_to_be64(result);
2188 }
2189 
2190 static void set_frwr_umr_segment(struct mlx5_wqe_umr_ctrl_seg *umr,
2191 				 struct ib_send_wr *wr, int li)
2192 {
2193 	memset(umr, 0, sizeof(*umr));
2194 
2195 	if (li) {
2196 		umr->mkey_mask = cpu_to_be64(MLX5_MKEY_MASK_FREE);
2197 		umr->flags = 1 << 7;
2198 		return;
2199 	}
2200 
2201 	umr->flags = (1 << 5); /* fail if not free */
2202 	umr->klm_octowords = get_klm_octo(wr->wr.fast_reg.page_list_len);
2203 	umr->mkey_mask = frwr_mkey_mask();
2204 }
2205 
2206 static u8 get_umr_flags(int acc)
2207 {
2208 	return (acc & IB_ACCESS_REMOTE_ATOMIC ? MLX5_PERM_ATOMIC       : 0) |
2209 	       (acc & IB_ACCESS_REMOTE_WRITE  ? MLX5_PERM_REMOTE_WRITE : 0) |
2210 	       (acc & IB_ACCESS_REMOTE_READ   ? MLX5_PERM_REMOTE_READ  : 0) |
2211 	       (acc & IB_ACCESS_LOCAL_WRITE   ? MLX5_PERM_LOCAL_WRITE  : 0) |
2212 		MLX5_PERM_LOCAL_READ | MLX5_PERM_UMR_EN;
2213 }
2214 
2215 static void set_mkey_segment(struct mlx5_mkey_seg *seg, struct ib_send_wr *wr,
2216 			     int li, int *writ)
2217 {
2218 	memset(seg, 0, sizeof(*seg));
2219 	if (li) {
2220 		seg->status = MLX5_MKEY_STATUS_FREE;
2221 		return;
2222 	}
2223 
2224 	seg->flags = get_umr_flags(wr->wr.fast_reg.access_flags) |
2225 		     MLX5_ACCESS_MODE_MTT;
2226 	*writ = seg->flags & (MLX5_PERM_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE);
2227 	seg->qpn_mkey7_0 = cpu_to_be32((wr->wr.fast_reg.rkey & 0xff) | 0xffffff00);
2228 	seg->flags_pd = cpu_to_be32(MLX5_MKEY_REMOTE_INVAL);
2229 	seg->start_addr = cpu_to_be64(wr->wr.fast_reg.iova_start);
2230 	seg->len = cpu_to_be64(wr->wr.fast_reg.length);
2231 	seg->xlt_oct_size = cpu_to_be32((wr->wr.fast_reg.page_list_len + 1) / 2);
2232 	seg->log2_page_size = wr->wr.fast_reg.page_shift;
2233 }
2234 
2235 static void set_frwr_pages(struct mlx5_wqe_data_seg *dseg,
2236 			   struct ib_send_wr *wr,
2237 			   struct mlx5_core_dev *mdev,
2238 			   struct mlx5_ib_pd *pd,
2239 			   int writ)
2240 {
2241 	struct mlx5_ib_fast_reg_page_list *mfrpl = to_mfrpl(wr->wr.fast_reg.page_list);
2242 	u64 *page_list = wr->wr.fast_reg.page_list->page_list;
2243 	u64 perm = MLX5_EN_RD | (writ ? MLX5_EN_WR : 0);
2244 	int i;
2245 
2246 	for (i = 0; i < wr->wr.fast_reg.page_list_len; i++)
2247 		mfrpl->mapped_page_list[i] = cpu_to_be64(page_list[i] | perm);
2248 	dseg->addr = cpu_to_be64(mfrpl->map);
2249 	dseg->byte_count = cpu_to_be32(ALIGN(sizeof(u64) * wr->wr.fast_reg.page_list_len, 64));
2250 	dseg->lkey = cpu_to_be32(pd->pa_lkey);
2251 }
2252 
2253 static __be32 send_ieth(struct ib_send_wr *wr)
2254 {
2255 	switch (wr->opcode) {
2256 	case IB_WR_SEND_WITH_IMM:
2257 	case IB_WR_RDMA_WRITE_WITH_IMM:
2258 		return wr->ex.imm_data;
2259 
2260 	case IB_WR_SEND_WITH_INV:
2261 		return cpu_to_be32(wr->ex.invalidate_rkey);
2262 
2263 	default:
2264 		return 0;
2265 	}
2266 }
2267 
2268 static u8 calc_sig(void *wqe, int size)
2269 {
2270 	u8 *p = wqe;
2271 	u8 res = 0;
2272 	int i;
2273 
2274 	for (i = 0; i < size; i++)
2275 		res ^= p[i];
2276 
2277 	return ~res;
2278 }
2279 
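/*
 * calc_sig() is a simple integrity check: it XORs 'size' bytes together
 * and returns the bitwise complement; e.g. for the two bytes {0x12, 0x34}
 * the result is ~(0x12 ^ 0x34) == 0xd9.
 */
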
2280 static u8 calc_wq_sig(void *wqe)
2281 {
2282 	return calc_sig(wqe, (*((u8 *)wqe + 8) & 0x3f) << 4);
2283 }
2284 
2285 static int set_data_inl_seg(struct mlx5_ib_qp *qp, struct ib_send_wr *wr,
2286 			    void *wqe, int *sz)
2287 {
2288 	struct mlx5_wqe_inline_seg *seg;
2289 	void *qend = qp->sq.qend;
2290 	void *addr;
2291 	int inl = 0;
2292 	int copy;
2293 	int len;
2294 	int i;
2295 
2296 	seg = wqe;
2297 	wqe += sizeof(*seg);
2298 	for (i = 0; i < wr->num_sge; i++) {
2299 		addr = (void *)(uintptr_t)(wr->sg_list[i].addr);
2300 		len  = wr->sg_list[i].length;
2301 		inl += len;
2302 
2303 		if (unlikely(inl > qp->max_inline_data))
2304 			return -ENOMEM;
2305 
2306 		if (unlikely(wqe + len > qend)) {
2307 			copy = (int)(qend - wqe);
2308 			memcpy(wqe, addr, copy);
2309 			addr += copy;
2310 			len -= copy;
2311 			wqe = mlx5_get_send_wqe(qp, 0);
2312 		}
2313 		memcpy(wqe, addr, len);
2314 		wqe += len;
2315 	}
2316 
2317 	seg->byte_count = cpu_to_be32(inl | MLX5_INLINE_SEG);
2318 
2319 	*sz = ALIGN(inl + sizeof(seg->byte_count), 16) / 16;
2320 
2321 	return 0;
2322 }
2323 
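/*
 * Worked example for set_data_inl_seg(): three SGEs of 20, 30 and 14 bytes
 * give inl = 64; adding the 4-byte byte_count header and rounding up to
 * 16 bytes yields 80, so *sz = 80 / 16 = 5 sixteen-byte units are charged
 * to the WQE.  The copy wraps back to the start of the send queue buffer
 * whenever 'wqe' reaches qp->sq.qend.
 */
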
2324 static int set_frwr_li_wr(void **seg, struct ib_send_wr *wr, int *size,
2325 			  struct mlx5_core_dev *mdev, struct mlx5_ib_pd *pd, struct mlx5_ib_qp *qp)
2326 {
2327 	int writ = 0;
2328 	int li;
2329 
2330 	li = wr->opcode == IB_WR_LOCAL_INV ? 1 : 0;
2331 	if (unlikely(wr->send_flags & IB_SEND_INLINE))
2332 		return -EINVAL;
2333 
2334 	set_frwr_umr_segment(*seg, wr, li);
2335 	*seg += sizeof(struct mlx5_wqe_umr_ctrl_seg);
2336 	*size += sizeof(struct mlx5_wqe_umr_ctrl_seg) / 16;
2337 	if (unlikely((*seg == qp->sq.qend)))
2338 		*seg = mlx5_get_send_wqe(qp, 0);
2339 	set_mkey_segment(*seg, wr, li, &writ);
2340 	*seg += sizeof(struct mlx5_mkey_seg);
2341 	*size += sizeof(struct mlx5_mkey_seg) / 16;
2342 	if (unlikely((*seg == qp->sq.qend)))
2343 		*seg = mlx5_get_send_wqe(qp, 0);
2344 	if (!li) {
2345 		if (unlikely(wr->wr.fast_reg.page_list_len >
2346 			     wr->wr.fast_reg.page_list->max_page_list_len))
2347 			return	-ENOMEM;
2348 
2349 		set_frwr_pages(*seg, wr, mdev, pd, writ);
2350 		*seg += sizeof(struct mlx5_wqe_data_seg);
2351 		*size += (sizeof(struct mlx5_wqe_data_seg) / 16);
2352 	}
2353 	return 0;
2354 }
2355 
2356 static void dump_wqe(struct mlx5_ib_qp *qp, int idx, int size_16)
2357 {
2358 	__be32 *p = NULL;
2359 	int tidx = idx;
2360 	int i, j;
2361 
2362 	pr_debug("dump wqe at %p\n", mlx5_get_send_wqe(qp, tidx));
2363 	for (i = 0, j = 0; i < size_16 * 4; i += 4, j += 4) {
2364 		if ((i & 0xf) == 0) {
2365 			void *buf = mlx5_get_send_wqe(qp, tidx);
2366 			tidx = (tidx + 1) & (qp->sq.wqe_cnt - 1);
2367 			p = buf;
2368 			j = 0;
2369 		}
2370 		pr_debug("%08x %08x %08x %08x\n", be32_to_cpu(p[j]),
2371 			 be32_to_cpu(p[j + 1]), be32_to_cpu(p[j + 2]),
2372 			 be32_to_cpu(p[j + 3]));
2373 	}
2374 }
2375 
2376 static void mlx5_bf_copy(u64 __iomem *dst, u64 *src,
2377 			 unsigned bytecnt, struct mlx5_ib_qp *qp)
2378 {
2379 	while (bytecnt > 0) {
2380 		__iowrite64_copy(dst++, src++, 8);
2381 		__iowrite64_copy(dst++, src++, 8);
2382 		__iowrite64_copy(dst++, src++, 8);
2383 		__iowrite64_copy(dst++, src++, 8);
2384 		__iowrite64_copy(dst++, src++, 8);
2385 		__iowrite64_copy(dst++, src++, 8);
2386 		__iowrite64_copy(dst++, src++, 8);
2387 		__iowrite64_copy(dst++, src++, 8);
2388 		bytecnt -= 64;
2389 		if (unlikely(src == qp->sq.qend))
2390 			src = mlx5_get_send_wqe(qp, 0);
2391 	}
2392 }
2393 
2394 static u8 get_fence(u8 fence, struct ib_send_wr *wr)
2395 {
2396 	if (unlikely(wr->opcode == IB_WR_LOCAL_INV &&
2397 		     wr->send_flags & IB_SEND_FENCE))
2398 		return MLX5_FENCE_MODE_STRONG_ORDERING;
2399 
2400 	if (unlikely(fence)) {
2401 		if (wr->send_flags & IB_SEND_FENCE)
2402 			return MLX5_FENCE_MODE_SMALL_AND_FENCE;
2403 		else
2404 			return fence;
2405 
2406 	} else {
2407 		return 0;
2408 	}
2409 }
2410 
2411 static int begin_wqe(struct mlx5_ib_qp *qp, void **seg,
2412 		     struct mlx5_wqe_ctrl_seg **ctrl,
2413 		     struct ib_send_wr *wr, unsigned *idx,
2414 		     int *size, int nreq)
2415 {
2416 	int err = 0;
2417 
2418 	if (unlikely(mlx5_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq))) {
2419 		mlx5_ib_warn(to_mdev(qp->ibqp.device), "work queue overflow\n");
2420 		err = -ENOMEM;
2421 		return err;
2422 	}
2423 
2424 	*idx = qp->sq.cur_post & (qp->sq.wqe_cnt - 1);
2425 	*seg = mlx5_get_send_wqe(qp, *idx);
2426 	*ctrl = *seg;
2427 	*(u32 *)(*seg + 8) = 0;
2428 	(*ctrl)->imm = send_ieth(wr);
2429 	(*ctrl)->fm_ce_se = qp->sq_signal_bits |
2430 		(wr->send_flags & IB_SEND_SIGNALED ?
2431 		 MLX5_WQE_CTRL_CQ_UPDATE : 0) |
2432 		(wr->send_flags & IB_SEND_SOLICITED ?
2433 		 MLX5_WQE_CTRL_SOLICITED : 0);
2434 
2435 	*seg += sizeof(**ctrl);
2436 	*size = sizeof(**ctrl) / 16;
2437 
2438 	return err;
2439 }
2440 
2441 static void finish_wqe(struct mlx5_ib_qp *qp,
2442 		       struct mlx5_wqe_ctrl_seg *ctrl,
2443 		       u8 size, unsigned idx,
2444 		       struct ib_send_wr *wr,
2445 		       int nreq, u8 fence, u8 next_fence,
2446 		       u32 mlx5_opcode)
2447 {
2448 	u8 opmod = 0;
2449 
2450 	ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) |
2451 					     mlx5_opcode | ((u32)opmod << 24));
2452 	ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8));
2453 	ctrl->fm_ce_se |= fence;
2454 	qp->fm_cache = next_fence;
2455 	if (unlikely(qp->wq_sig))
2456 		ctrl->signature = calc_wq_sig(ctrl);
2457 
2458 	qp->sq.swr_ctx[idx].wrid = wr->wr_id;
2459 	qp->sq.swr_ctx[idx].w_list.opcode = mlx5_opcode;
2460 	qp->sq.swr_ctx[idx].wqe_head = qp->sq.head + nreq;
2461 	qp->sq.cur_post += DIV_ROUND_UP(size * 16, MLX5_SEND_WQE_BB);
2462 	qp->sq.swr_ctx[idx].w_list.next = qp->sq.cur_post;
2463 	qp->sq.swr_ctx[idx].sig_piped = 0;
2464 }
2465 
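/*
 * Note for finish_wqe(): 'size' is in 16-byte units while cur_post
 * advances in basic blocks of MLX5_SEND_WQE_BB (64) bytes, so a WQE of
 * size 5 (80 bytes) consumes DIV_ROUND_UP(80, 64) = 2 basic blocks and
 * the next WQE starts two slots later in the send queue.
 */
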
2466 int mlx5_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
2467 		      struct ib_send_wr **bad_wr)
2468 {
2469 	struct mlx5_wqe_ctrl_seg *ctrl = NULL;  /* initialized only to quiet a compiler warning */
2470 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2471 	struct mlx5_core_dev *mdev = dev->mdev;
2472 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
2473 	struct mlx5_wqe_data_seg *dpseg;
2474 	struct mlx5_wqe_xrc_seg *xrc;
2475 	struct mlx5_bf *bf = qp->bf;
2476 	int uninitialized_var(size);
2477 	void *qend = qp->sq.qend;
2478 	unsigned long flags;
2479 	unsigned idx;
2480 	int err = 0;
2481 	int inl = 0;
2482 	int num_sge;
2483 	void *seg;
2484 	int nreq;
2485 	int i;
2486 	u8 next_fence = 0;
2487 	u8 fence;
2488 
2489 
2490 	spin_lock_irqsave(&qp->sq.lock, flags);
2491 
2492 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
2493 		err = -EIO;
2494 		*bad_wr = wr;
2495 		nreq = 0;
2496 		goto out;
2497 	}
2498 
2499 	for (nreq = 0; wr; nreq++, wr = wr->next) {
2500 		if (unlikely(wr->opcode < 0 || wr->opcode >= ARRAY_SIZE(mlx5_ib_opcode))) {
2501 			mlx5_ib_warn(dev, "Invalid opcode 0x%x\n", wr->opcode);
2502 			err = -EINVAL;
2503 			*bad_wr = wr;
2504 			goto out;
2505 		}
2506 
2507 		fence = qp->fm_cache;
2508 		num_sge = wr->num_sge;
2509 		if (unlikely(num_sge > qp->sq.max_gs)) {
2510 			mlx5_ib_warn(dev, "Max gs exceeded %d (max = %d)\n", wr->num_sge, qp->sq.max_gs);
2511 			err = -ENOMEM;
2512 			*bad_wr = wr;
2513 			goto out;
2514 		}
2515 
2516 		err = begin_wqe(qp, &seg, &ctrl, wr, &idx, &size, nreq);
2517 		if (err) {
2518 			mlx5_ib_warn(dev, "Failed to prepare WQE\n");
2519 			err = -ENOMEM;
2520 			*bad_wr = wr;
2521 			goto out;
2522 		}
2523 
2524 		switch (ibqp->qp_type) {
2525 		case IB_QPT_XRC_INI:
2526 			xrc = seg;
2527 			xrc->xrc_srqn = htonl(wr->xrc_remote_srq_num);
2528 			seg += sizeof(*xrc);
2529 			size += sizeof(*xrc) / 16;
2530 			/* fall through */
2531 		case IB_QPT_RC:
2532 			switch (wr->opcode) {
2533 			case IB_WR_RDMA_READ:
2534 			case IB_WR_RDMA_WRITE:
2535 			case IB_WR_RDMA_WRITE_WITH_IMM:
2536 				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
2537 					      wr->wr.rdma.rkey);
2538 				seg += sizeof(struct mlx5_wqe_raddr_seg);
2539 				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
2540 				break;
2541 
2542 			case IB_WR_ATOMIC_CMP_AND_SWP:
2543 			case IB_WR_ATOMIC_FETCH_AND_ADD:
2544 			case IB_WR_MASKED_ATOMIC_CMP_AND_SWP:
2545 				mlx5_ib_warn(dev, "Atomic operations are not supported yet\n");
2546 				err = -ENOSYS;
2547 				*bad_wr = wr;
2548 				goto out;
2549 
2550 			case IB_WR_LOCAL_INV:
2551 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
2552 				qp->sq.swr_ctx[idx].wr_data = IB_WR_LOCAL_INV;
2553 				ctrl->imm = cpu_to_be32(wr->ex.invalidate_rkey);
2554 				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
2555 				if (err) {
2556 					mlx5_ib_warn(dev, "Failed to prepare LOCAL_INV WQE\n");
2557 					*bad_wr = wr;
2558 					goto out;
2559 				}
2560 				num_sge = 0;
2561 				break;
2562 
2563 			case IB_WR_FAST_REG_MR:
2564 				next_fence = MLX5_FENCE_MODE_INITIATOR_SMALL;
2565 				qp->sq.swr_ctx[idx].wr_data = IB_WR_FAST_REG_MR;
2566 				ctrl->imm = cpu_to_be32(wr->wr.fast_reg.rkey);
2567 				err = set_frwr_li_wr(&seg, wr, &size, mdev, to_mpd(ibqp->pd), qp);
2568 				if (err) {
2569 					mlx5_ib_warn(dev, "Failed to prepare FAST_REG_MR WQE\n");
2570 					*bad_wr = wr;
2571 					goto out;
2572 				}
2573 				num_sge = 0;
2574 				break;
2575 
2576 			default:
2577 				break;
2578 			}
2579 			break;
2580 
2581 		case IB_QPT_UC:
2582 			switch (wr->opcode) {
2583 			case IB_WR_RDMA_WRITE:
2584 			case IB_WR_RDMA_WRITE_WITH_IMM:
2585 				set_raddr_seg(seg, wr->wr.rdma.remote_addr,
2586 					      wr->wr.rdma.rkey);
2587 				seg  += sizeof(struct mlx5_wqe_raddr_seg);
2588 				size += sizeof(struct mlx5_wqe_raddr_seg) / 16;
2589 				break;
2590 
2591 			default:
2592 				break;
2593 			}
2594 			break;
2595 
2596 		case IB_QPT_SMI:
2597 			if (!mlx5_core_is_pf(mdev)) {
2598 				err = -EINVAL;
2599 				mlx5_ib_warn(dev, "Only physical function is allowed to send SMP MADs\n");
2600 				*bad_wr = wr;
2601 				goto out;
2602 			}
     			/* fall through */
2603 		case IB_QPT_GSI:
2604 		case IB_QPT_UD:
2605 			set_datagram_seg(seg, wr);
2606 			seg += sizeof(struct mlx5_wqe_datagram_seg);
2607 			size += sizeof(struct mlx5_wqe_datagram_seg) / 16;
2608 			if (unlikely((seg == qend)))
2609 				seg = mlx5_get_send_wqe(qp, 0);
2610 			break;
2611 		default:
2612 			break;
2613 		}
2614 
2615 		if (wr->send_flags & IB_SEND_INLINE && num_sge) {
2616 			int uninitialized_var(sz);
2617 
2618 			err = set_data_inl_seg(qp, wr, seg, &sz);
2619 			if (unlikely(err)) {
2620 				mlx5_ib_warn(dev, "Failed to prepare inline data segment\n");
2621 				*bad_wr = wr;
2622 				goto out;
2623 			}
2624 			inl = 1;
2625 			size += sz;
2626 		} else {
2627 			dpseg = seg;
2628 			for (i = 0; i < num_sge; i++) {
2629 				if (unlikely(dpseg == qend)) {
2630 					seg = mlx5_get_send_wqe(qp, 0);
2631 					dpseg = seg;
2632 				}
2633 				if (likely(wr->sg_list[i].length)) {
2634 					set_data_ptr_seg(dpseg, wr->sg_list + i);
2635 					size += sizeof(struct mlx5_wqe_data_seg) / 16;
2636 					dpseg++;
2637 				}
2638 			}
2639 		}
2640 
2641 		finish_wqe(qp, ctrl, size, idx, wr, nreq,
2642 			   get_fence(fence, wr), next_fence,
2643 			   mlx5_ib_opcode[wr->opcode]);
2644 		if (0)
2645 			dump_wqe(qp, idx, size);
2646 	}
2647 
2648 out:
2649 	if (likely(nreq)) {
2650 		qp->sq.head += nreq;
2651 
2652 		/* Make sure that descriptors are written before
2653 		 * updating doorbell record and ringing the doorbell
2654 		 */
2655 		wmb();
2656 
2657 		qp->db.db[MLX5_SND_DBR] = cpu_to_be32(qp->sq.cur_post);
2658 
2659 		/* Make sure the doorbell record is visible to the HCA
2660 		 * before we ring the doorbell. */
2661 		wmb();
2662 
2663 		if (bf->need_lock)
2664 			spin_lock(&bf->lock);
2665 		else
2666 			__acquire(&bf->lock);
2667 
2668 		/* TBD enable WC */
2669 		if (BF_ENABLE && nreq == 1 && bf->uuarn && inl && size > 1 &&
2670 		    size <= bf->buf_size / 16) {
2671 			mlx5_bf_copy(bf->reg + bf->offset, (u64 *)ctrl, ALIGN(size * 16, 64), qp);
2672 			/* wc_wmb(); */
2673 		} else {
2674 			mlx5_write64((__be32 *)ctrl, bf->regreg + bf->offset,
2675 				     MLX5_GET_DOORBELL_LOCK(&bf->lock32));
2676 			/* Make sure doorbells don't leak out of SQ spinlock
2677 			 * and reach the HCA out of order.
2678 			 */
2679 			mmiowb();
2680 		}
2681 		bf->offset ^= bf->buf_size;
2682 		if (bf->need_lock)
2683 			spin_unlock(&bf->lock);
2684 		else
2685 			__release(&bf->lock);
2686 	}
2687 
2688 	spin_unlock_irqrestore(&qp->sq.lock, flags);
2689 
2690 	return err;
2691 }
2692 
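/*
 * mlx5_ib_post_send() implements the ib_post_send() verb for kernel
 * consumers; user-space applications use the equivalent ibv_post_send()
 * verb from libibverbs.  Minimal user-space sketch (assumes libibverbs;
 * 'qp', 'buf', 'len', 'mr', 'remote_addr' and 'rkey' are placeholders):
 *
 *	struct ibv_sge sge = { .addr = (uintptr_t)buf, .length = len,
 *			       .lkey = mr->lkey };
 *	struct ibv_send_wr wr = { .wr_id = 1, .sg_list = &sge, .num_sge = 1,
 *				  .opcode = IBV_WR_RDMA_WRITE,
 *				  .send_flags = IBV_SEND_SIGNALED };
 *	struct ibv_send_wr *bad;
 *	wr.wr.rdma.remote_addr = remote_addr;
 *	wr.wr.rdma.rkey = rkey;
 *	int rc = ibv_post_send(qp, &wr, &bad);
 *
 * On failure 'bad' points at the first work request that was not posted,
 * mirroring the *bad_wr handling above.
 */
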
2693 static void set_sig_seg(struct mlx5_rwqe_sig *sig, int size)
2694 {
2695 	sig->signature = calc_sig(sig, size);
2696 }
2697 
2698 int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
2699 		      struct ib_recv_wr **bad_wr)
2700 {
2701 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
2702 	struct mlx5_wqe_data_seg *scat;
2703 	struct mlx5_rwqe_sig *sig;
2704 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2705 	struct mlx5_core_dev *mdev = dev->mdev;
2706 	unsigned long flags;
2707 	int err = 0;
2708 	int nreq;
2709 	int ind;
2710 	int i;
2711 
2712 	spin_lock_irqsave(&qp->rq.lock, flags);
2713 
2714 	if (mdev->state == MLX5_DEVICE_STATE_INTERNAL_ERROR) {
2715 		err = -EIO;
2716 		*bad_wr = wr;
2717 		nreq = 0;
2718 		goto out;
2719 	}
2720 
2721 	ind = qp->rq.head & (qp->rq.wqe_cnt - 1);
2722 
2723 	for (nreq = 0; wr; nreq++, wr = wr->next) {
2724 		if (mlx5_wq_overflow(&qp->rq, nreq, qp->ibqp.recv_cq)) {
2725 			err = -ENOMEM;
2726 			*bad_wr = wr;
2727 			goto out;
2728 		}
2729 
2730 		if (unlikely(wr->num_sge > qp->rq.max_gs)) {
2731 			err = -EINVAL;
2732 			*bad_wr = wr;
2733 			goto out;
2734 		}
2735 
2736 		scat = get_recv_wqe(qp, ind);
2737 		if (qp->wq_sig)
2738 			scat++;
2739 
2740 		for (i = 0; i < wr->num_sge; i++)
2741 			set_data_ptr_seg(scat + i, wr->sg_list + i);
2742 
2743 		if (i < qp->rq.max_gs) {
2744 			scat[i].byte_count = 0;
2745 			scat[i].lkey       = cpu_to_be32(MLX5_INVALID_LKEY);
2746 			scat[i].addr       = 0;
2747 		}
2748 
2749 		if (qp->wq_sig) {
2750 			sig = (struct mlx5_rwqe_sig *)scat;
2751 			set_sig_seg(sig, (qp->rq.max_gs + 1) << 2);
2752 		}
2753 
2754 		qp->rq.rwr_ctx[ind].wrid = wr->wr_id;
2755 
2756 		ind = (ind + 1) & (qp->rq.wqe_cnt - 1);
2757 	}
2758 
2759 out:
2760 	if (likely(nreq)) {
2761 		qp->rq.head += nreq;
2762 
2763 		/* Make sure that descriptors are written before
2764 		 * doorbell record.
2765 		 */
2766 		wmb();
2767 
2768 		*qp->db.db = cpu_to_be32(qp->rq.head & 0xffff);
2769 	}
2770 
2771 	spin_unlock_irqrestore(&qp->rq.lock, flags);
2772 
2773 	return err;
2774 }
2775 
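/*
 * The matching user-space verb is ibv_post_recv().  Minimal sketch
 * (assumes libibverbs; 'qp', 'buf', 'len' and 'mr' are placeholders):
 *
 *	struct ibv_sge sge = { .addr = (uintptr_t)buf, .length = len,
 *			       .lkey = mr->lkey };
 *	struct ibv_recv_wr wr = { .wr_id = 2, .sg_list = &sge, .num_sge = 1 };
 *	struct ibv_recv_wr *bad;
 *	int rc = ibv_post_recv(qp, &wr, &bad);
 *
 * Note that mlx5_ib_post_recv() above additionally terminates the scatter
 * list with MLX5_INVALID_LKEY when fewer than max_gs SGEs are used.
 */
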
2776 static inline enum ib_qp_state to_ib_qp_state(enum mlx5_qp_state mlx5_state)
2777 {
2778 	switch (mlx5_state) {
2779 	case MLX5_QP_STATE_RST:      return IB_QPS_RESET;
2780 	case MLX5_QP_STATE_INIT:     return IB_QPS_INIT;
2781 	case MLX5_QP_STATE_RTR:      return IB_QPS_RTR;
2782 	case MLX5_QP_STATE_RTS:      return IB_QPS_RTS;
2783 	case MLX5_QP_STATE_SQ_DRAINING:
2784 	case MLX5_QP_STATE_SQD:      return IB_QPS_SQD;
2785 	case MLX5_QP_STATE_SQER:     return IB_QPS_SQE;
2786 	case MLX5_QP_STATE_ERR:      return IB_QPS_ERR;
2787 	default:		     return -1;
2788 	}
2789 }
2790 
2791 static inline enum ib_mig_state to_ib_mig_state(int mlx5_mig_state)
2792 {
2793 	switch (mlx5_mig_state) {
2794 	case MLX5_QP_PM_ARMED:		return IB_MIG_ARMED;
2795 	case MLX5_QP_PM_REARM:		return IB_MIG_REARM;
2796 	case MLX5_QP_PM_MIGRATED:	return IB_MIG_MIGRATED;
2797 	default: return -1;
2798 	}
2799 }
2800 
2801 static int to_ib_qp_access_flags(int mlx5_flags)
2802 {
2803 	int ib_flags = 0;
2804 
2805 	if (mlx5_flags & MLX5_QP_BIT_RRE)
2806 		ib_flags |= IB_ACCESS_REMOTE_READ;
2807 	if (mlx5_flags & MLX5_QP_BIT_RWE)
2808 		ib_flags |= IB_ACCESS_REMOTE_WRITE;
2809 	if (mlx5_flags & MLX5_QP_BIT_RAE)
2810 		ib_flags |= IB_ACCESS_REMOTE_ATOMIC;
2811 
2812 	return ib_flags;
2813 }
2814 
2815 static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_attr,
2816 				struct mlx5_qp_path *path)
2817 {
2818 	struct mlx5_core_dev *dev = ibdev->mdev;
2819 
2820 	memset(ib_ah_attr, 0, sizeof(*ib_ah_attr));
2821 	ib_ah_attr->port_num	  = path->port;
2822 
2823 	if (ib_ah_attr->port_num == 0 ||
2824 	    ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports))
2825 		return;
2826 
2827 	ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf;
2828 
2829 	ib_ah_attr->dlid	  = be16_to_cpu(path->rlid);
2830 	ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f;
2831 	ib_ah_attr->static_rate   = path->static_rate ? path->static_rate - 5 : 0;
2832 	ib_ah_attr->ah_flags      = (path->grh_mlid & (1 << 7)) ? IB_AH_GRH : 0;
2833 	if (ib_ah_attr->ah_flags) {
2834 		ib_ah_attr->grh.sgid_index = path->mgid_index;
2835 		ib_ah_attr->grh.hop_limit  = path->hop_limit;
2836 		ib_ah_attr->grh.traffic_class =
2837 			(be32_to_cpu(path->tclass_flowlabel) >> 20) & 0xff;
2838 		ib_ah_attr->grh.flow_label =
2839 			be32_to_cpu(path->tclass_flowlabel) & 0xfffff;
2840 		memcpy(ib_ah_attr->grh.dgid.raw,
2841 		       path->rgid, sizeof(ib_ah_attr->grh.dgid.raw));
2842 	}
2843 }
2844 
2845 int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask,
2846 		     struct ib_qp_init_attr *qp_init_attr)
2847 {
2848 	struct mlx5_ib_dev *dev = to_mdev(ibqp->device);
2849 	struct mlx5_ib_qp *qp = to_mqp(ibqp);
2850 	struct mlx5_query_qp_mbox_out *outb;
2851 	struct mlx5_qp_context *context;
2852 	int mlx5_state;
2853 	int err = 0;
2854 
2855 	mutex_lock(&qp->mutex);
2856 	if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) {
2857 		err = -EOPNOTSUPP;
2858 		goto out;
2859 	} else {
2860 		outb = kzalloc(sizeof(*outb), GFP_KERNEL);
2861 		if (!outb) {
2862 			err = -ENOMEM;
2863 			goto out;
2864 		}
2865 
2866 		context = &outb->ctx;
2867 		err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb,
2868 					 sizeof(*outb));
2869 		if (err) {
2870 			kfree(outb);
2871 			goto out;
2872 		}
2873 
2874 		mlx5_state = be32_to_cpu(context->flags) >> 28;
2875 
2876 		qp->state		     = to_ib_qp_state(mlx5_state);
2877 		qp_attr->path_mtu	     = context->mtu_msgmax >> 5;
2878 		qp_attr->path_mig_state	     =
2879 			to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3);
2880 		qp_attr->qkey		     = be32_to_cpu(context->qkey);
2881 		qp_attr->rq_psn		     = be32_to_cpu(context->rnr_nextrecvpsn) & 0xffffff;
2882 		qp_attr->sq_psn		     = be32_to_cpu(context->next_send_psn) & 0xffffff;
2883 		qp_attr->dest_qp_num	     = be32_to_cpu(context->log_pg_sz_remote_qpn) & 0xffffff;
2884 		qp_attr->qp_access_flags     =
2885 			to_ib_qp_access_flags(be32_to_cpu(context->params2));
2886 
2887 		if (qp->ibqp.qp_type == IB_QPT_RC || qp->ibqp.qp_type == IB_QPT_UC) {
2888 			to_ib_ah_attr(dev, &qp_attr->ah_attr, &context->pri_path);
2889 			to_ib_ah_attr(dev, &qp_attr->alt_ah_attr, &context->alt_path);
2890 			qp_attr->alt_pkey_index = be16_to_cpu(context->alt_path.pkey_index);
2891 			qp_attr->alt_port_num	= qp_attr->alt_ah_attr.port_num;
2892 		}
2893 
2894 		qp_attr->pkey_index = be16_to_cpu(context->pri_path.pkey_index);
2895 		qp_attr->port_num = context->pri_path.port;
2896 
2897 		/* qp_attr->en_sqd_async_notify is only applicable in modify qp */
2898 		qp_attr->sq_draining = mlx5_state == MLX5_QP_STATE_SQ_DRAINING;
2899 
2900 		qp_attr->max_rd_atomic = 1 << ((be32_to_cpu(context->params1) >> 21) & 0x7);
2901 
2902 		qp_attr->max_dest_rd_atomic =
2903 			1 << ((be32_to_cpu(context->params2) >> 21) & 0x7);
2904 		qp_attr->min_rnr_timer	    =
2905 			(be32_to_cpu(context->rnr_nextrecvpsn) >> 24) & 0x1f;
2906 		qp_attr->timeout	    = context->pri_path.ackto_lt >> 3;
2907 		qp_attr->retry_cnt	    = (be32_to_cpu(context->params1) >> 16) & 0x7;
2908 		qp_attr->rnr_retry	    = (be32_to_cpu(context->params1) >> 13) & 0x7;
2909 		qp_attr->alt_timeout	    = context->alt_path.ackto_lt >> 3;
2910 
2911 
2912 		kfree(outb);
2913 	}
2914 
2915 	qp_attr->qp_state	     = qp->state;
2916 	qp_attr->cur_qp_state	     = qp_attr->qp_state;
2917 	qp_attr->cap.max_recv_wr     = qp->rq.wqe_cnt;
2918 	qp_attr->cap.max_recv_sge    = qp->rq.max_gs;
2919 
2920 	if (!ibqp->uobject) {
2921 		qp_attr->cap.max_send_wr  = qp->sq.max_post;
2922 		qp_attr->cap.max_send_sge = qp->sq.max_gs;
2923 		qp_init_attr->qp_context = ibqp->qp_context;
2924 	} else {
2925 		qp_attr->cap.max_send_wr  = 0;
2926 		qp_attr->cap.max_send_sge = 0;
2927 	}
2928 
2929 	qp_init_attr->qp_type = ibqp->qp_type;
2930 	qp_init_attr->recv_cq = ibqp->recv_cq;
2931 	qp_init_attr->send_cq = ibqp->send_cq;
2932 	qp_init_attr->srq = ibqp->srq;
2933 	qp_attr->cap.max_inline_data = qp->max_inline_data;
2934 
2935 	qp_init_attr->cap	     = qp_attr->cap;
2936 
2937 	qp_init_attr->create_flags = 0;
2938 	if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK)
2939 		qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK;
2940 
2941 	qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ?
2942 		IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR;
2943 
2944 out:
2945 	mutex_unlock(&qp->mutex);
2946 	return err;
2947 }
2948 
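/*
 * User-space equivalent for reference: ibv_query_qp() from libibverbs,
 * which reaches mlx5_ib_query_qp() via the uverbs layer.  Sketch (assumes
 * an existing QP):
 *
 *	struct ibv_qp_attr attr;
 *	struct ibv_qp_init_attr init_attr;
 *	if (!ibv_query_qp(qp, &attr, IBV_QP_STATE | IBV_QP_PATH_MTU,
 *			  &init_attr))
 *		printf("state %d, path mtu %d\n", attr.qp_state,
 *		       attr.path_mtu);
 */
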
2949 struct ib_xrcd *mlx5_ib_alloc_xrcd(struct ib_device *ibdev,
2950 					  struct ib_ucontext *context,
2951 					  struct ib_udata *udata)
2952 {
2953 	struct mlx5_ib_dev *dev = to_mdev(ibdev);
2954 	struct mlx5_ib_xrcd *xrcd;
2955 	int err;
2956 
2957 	if (!MLX5_CAP_GEN(dev->mdev, xrc))
2958 		return ERR_PTR(-ENOSYS);
2959 
2960 	xrcd = kmalloc(sizeof(*xrcd), GFP_KERNEL);
2961 	if (!xrcd)
2962 		return ERR_PTR(-ENOMEM);
2963 
2964 	err = mlx5_core_xrcd_alloc(dev->mdev, &xrcd->xrcdn);
2965 	if (err) {
2966 		kfree(xrcd);
2967 		return ERR_PTR(err);	/* propagate the real error code */
2968 	}
2969 
2970 	return &xrcd->ibxrcd;
2971 }
2972 
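/*
 * XRC domains are exposed to user space through ibv_open_xrcd(); sketch
 * (assumes a libibverbs build with XRC support and an open device context
 * 'ctx'; fd = -1 with O_CREAT requests a process-private XRC domain):
 *
 *	struct ibv_xrcd_init_attr xattr = {
 *		.comp_mask = IBV_XRCD_INIT_ATTR_FD | IBV_XRCD_INIT_ATTR_OFLAGS,
 *		.fd        = -1,
 *		.oflags    = O_CREAT,
 *	};
 *	struct ibv_xrcd *xrcd = ibv_open_xrcd(ctx, &xattr);
 */
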
2973 int mlx5_ib_dealloc_xrcd(struct ib_xrcd *xrcd)
2974 {
2975 	struct mlx5_ib_dev *dev = to_mdev(xrcd->device);
2976 	u32 xrcdn = to_mxrcd(xrcd)->xrcdn;
2977 	int err;
2978 
2979 	err = mlx5_core_xrcd_dealloc(dev->mdev, xrcdn);
2980 	if (err) {
2981 		mlx5_ib_warn(dev, "failed to dealloc xrcdn 0x%x\n", xrcdn);
2982 		return err;
2983 	}
2984 
2985 	kfree(xrcd);
2986 
2987 	return 0;
2988 }
2989