xref: /linux/drivers/vdpa/mlx5/net/mlx5_vnet.c (revision 0be3ff0c)
1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2020 Mellanox Technologies Ltd. */
3 
4 #include <linux/module.h>
5 #include <linux/vdpa.h>
6 #include <linux/vringh.h>
7 #include <uapi/linux/virtio_net.h>
8 #include <uapi/linux/virtio_ids.h>
9 #include <uapi/linux/vdpa.h>
10 #include <linux/virtio_config.h>
11 #include <linux/auxiliary_bus.h>
12 #include <linux/mlx5/cq.h>
13 #include <linux/mlx5/qp.h>
14 #include <linux/mlx5/device.h>
15 #include <linux/mlx5/driver.h>
16 #include <linux/mlx5/vport.h>
17 #include <linux/mlx5/fs.h>
18 #include <linux/mlx5/mlx5_ifc_vdpa.h>
19 #include <linux/mlx5/mpfs.h>
20 #include "mlx5_vdpa.h"
21 
22 MODULE_AUTHOR("Eli Cohen <eli@mellanox.com>");
23 MODULE_DESCRIPTION("Mellanox VDPA driver");
24 MODULE_LICENSE("Dual BSD/GPL");
25 
26 #define to_mlx5_vdpa_ndev(__mvdev)                                             \
27 	container_of(__mvdev, struct mlx5_vdpa_net, mvdev)
28 #define to_mvdev(__vdev) container_of((__vdev), struct mlx5_vdpa_dev, vdev)
29 
30 #define VALID_FEATURES_MASK                                                                        \
31 	(BIT_ULL(VIRTIO_NET_F_CSUM) | BIT_ULL(VIRTIO_NET_F_GUEST_CSUM) |                                   \
32 	 BIT_ULL(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS) | BIT_ULL(VIRTIO_NET_F_MTU) | BIT_ULL(VIRTIO_NET_F_MAC) |   \
33 	 BIT_ULL(VIRTIO_NET_F_GUEST_TSO4) | BIT_ULL(VIRTIO_NET_F_GUEST_TSO6) |                             \
34 	 BIT_ULL(VIRTIO_NET_F_GUEST_ECN) | BIT_ULL(VIRTIO_NET_F_GUEST_UFO) | BIT_ULL(VIRTIO_NET_F_HOST_TSO4) | \
35 	 BIT_ULL(VIRTIO_NET_F_HOST_TSO6) | BIT_ULL(VIRTIO_NET_F_HOST_ECN) | BIT_ULL(VIRTIO_NET_F_HOST_UFO) |   \
36 	 BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | BIT_ULL(VIRTIO_NET_F_STATUS) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ) |      \
37 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX) | BIT_ULL(VIRTIO_NET_F_CTRL_VLAN) |                                 \
38 	 BIT_ULL(VIRTIO_NET_F_CTRL_RX_EXTRA) | BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) |                      \
39 	 BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR) | BIT_ULL(VIRTIO_NET_F_HASH_REPORT) |  \
40 	 BIT_ULL(VIRTIO_NET_F_RSS) | BIT_ULL(VIRTIO_NET_F_RSC_EXT) | BIT_ULL(VIRTIO_NET_F_STANDBY) |           \
41 	 BIT_ULL(VIRTIO_NET_F_SPEED_DUPLEX) | BIT_ULL(VIRTIO_F_NOTIFY_ON_EMPTY) |                          \
42 	 BIT_ULL(VIRTIO_F_ANY_LAYOUT) | BIT_ULL(VIRTIO_F_VERSION_1) | BIT_ULL(VIRTIO_F_ACCESS_PLATFORM) |      \
43 	 BIT_ULL(VIRTIO_F_RING_PACKED) | BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | BIT_ULL(VIRTIO_F_SR_IOV))
44 
45 #define VALID_STATUS_MASK                                                                          \
46 	(VIRTIO_CONFIG_S_ACKNOWLEDGE | VIRTIO_CONFIG_S_DRIVER | VIRTIO_CONFIG_S_DRIVER_OK |        \
47 	 VIRTIO_CONFIG_S_FEATURES_OK | VIRTIO_CONFIG_S_NEEDS_RESET | VIRTIO_CONFIG_S_FAILED)
48 
49 #define MLX5_FEATURE(_mvdev, _feature) (!!((_mvdev)->actual_features & BIT_ULL(_feature)))
50 
51 struct mlx5_vdpa_net_resources {
52 	u32 tisn;
53 	u32 tdn;
54 	u32 tirn;
55 	u32 rqtn;
56 	bool valid;
57 };
58 
59 struct mlx5_vdpa_cq_buf {
60 	struct mlx5_frag_buf_ctrl fbc;
61 	struct mlx5_frag_buf frag_buf;
62 	int cqe_size;
63 	int nent;
64 };
65 
66 struct mlx5_vdpa_cq {
67 	struct mlx5_core_cq mcq;
68 	struct mlx5_vdpa_cq_buf buf;
69 	struct mlx5_db db;
70 	int cqe;
71 };
72 
73 struct mlx5_vdpa_umem {
74 	struct mlx5_frag_buf_ctrl fbc;
75 	struct mlx5_frag_buf frag_buf;
76 	int size;
77 	u32 id;
78 };
79 
80 struct mlx5_vdpa_qp {
81 	struct mlx5_core_qp mqp;
82 	struct mlx5_frag_buf frag_buf;
83 	struct mlx5_db db;
84 	u16 head;
85 	bool fw;
86 };
87 
88 struct mlx5_vq_restore_info {
89 	u32 num_ent;
90 	u64 desc_addr;
91 	u64 device_addr;
92 	u64 driver_addr;
93 	u16 avail_index;
94 	u16 used_index;
95 	bool ready;
96 	bool restore;
97 };
98 
99 struct mlx5_vdpa_virtqueue {
100 	bool ready;
101 	u64 desc_addr;
102 	u64 device_addr;
103 	u64 driver_addr;
104 	u32 num_ent;
105 
106 	/* Resources for implementing the notification channel from the device
107 	 * to the driver. fwqp is the firmware end of an RC connection; the
108 	 * other end is vqqp, used by the driver. cq is where completions are
109 	 * reported.
110 	 */
111 	struct mlx5_vdpa_cq cq;
112 	struct mlx5_vdpa_qp fwqp;
113 	struct mlx5_vdpa_qp vqqp;
114 
115 	/* umem resources are required for the virtqueue operation. Their use
116 	 * is internal and they must be provided by the driver.
117 	 */
118 	struct mlx5_vdpa_umem umem1;
119 	struct mlx5_vdpa_umem umem2;
120 	struct mlx5_vdpa_umem umem3;
121 
122 	bool initialized;
123 	int index;
124 	u32 virtq_id;
125 	struct mlx5_vdpa_net *ndev;
126 	u16 avail_idx;
127 	u16 used_idx;
128 	int fw_state;
129 
130 	/* keep last in the struct */
131 	struct mlx5_vq_restore_info ri;
132 };
133 
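/* A virtqueue index is valid if it addresses one of the data VQs or the
 * control VQ. Without VIRTIO_NET_F_MQ there is a single VQ pair, so only
 * indices 0 and 1 are valid, plus index 2 for the control VQ when
 * VIRTIO_NET_F_CTRL_VQ is negotiated. With MQ, any index up to max_idx is
 * valid.
 */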
134 static bool is_index_valid(struct mlx5_vdpa_dev *mvdev, u16 idx)
135 {
136 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ))) {
137 		if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
138 			return idx < 2;
139 		else
140 			return idx < 3;
141 	}
142 
143 	return idx <= mvdev->max_idx;
144 }
145 
146 struct mlx5_vdpa_net {
147 	struct mlx5_vdpa_dev mvdev;
148 	struct mlx5_vdpa_net_resources res;
149 	struct virtio_net_config config;
150 	struct mlx5_vdpa_virtqueue *vqs;
151 	struct vdpa_callback *event_cbs;
152 
153 	/* Serialize vq resource creation and destruction. This is required
154 	 * since the memory map might change and we need to destroy and create
155 	 * resources while the driver is operational.
156 	 */
157 	struct mutex reslock;
158 	struct mlx5_flow_table *rxft;
159 	struct mlx5_fc *rx_counter;
160 	struct mlx5_flow_handle *rx_rule_ucast;
161 	struct mlx5_flow_handle *rx_rule_mcast;
162 	bool setup;
163 	u32 cur_num_vqs;
164 	u32 rqt_size;
165 	struct notifier_block nb;
166 	struct vdpa_callback config_cb;
167 	struct mlx5_vdpa_wq_ent cvq_ent;
168 };
169 
170 static void free_resources(struct mlx5_vdpa_net *ndev);
171 static void init_mvqs(struct mlx5_vdpa_net *ndev);
172 static int setup_driver(struct mlx5_vdpa_dev *mvdev);
173 static void teardown_driver(struct mlx5_vdpa_net *ndev);
174 
175 static bool mlx5_vdpa_debug;
176 
177 #define MLX5_CVQ_MAX_ENT 16
178 
179 #define MLX5_LOG_VIO_FLAG(_feature)                                                                \
180 	do {                                                                                       \
181 		if (features & BIT_ULL(_feature))                                                  \
182 			mlx5_vdpa_info(mvdev, "%s\n", #_feature);                                  \
183 	} while (0)
184 
185 #define MLX5_LOG_VIO_STAT(_status)                                                                 \
186 	do {                                                                                       \
187 		if (status & (_status))                                                            \
188 			mlx5_vdpa_info(mvdev, "%s\n", #_status);                                   \
189 	} while (0)
190 
191 /* TODO: cross-endian support */
192 static inline bool mlx5_vdpa_is_little_endian(struct mlx5_vdpa_dev *mvdev)
193 {
194 	return virtio_legacy_is_little_endian() ||
195 		(mvdev->actual_features & BIT_ULL(VIRTIO_F_VERSION_1));
196 }
197 
198 static u16 mlx5vdpa16_to_cpu(struct mlx5_vdpa_dev *mvdev, __virtio16 val)
199 {
200 	return __virtio16_to_cpu(mlx5_vdpa_is_little_endian(mvdev), val);
201 }
202 
203 static __virtio16 cpu_to_mlx5vdpa16(struct mlx5_vdpa_dev *mvdev, u16 val)
204 {
205 	return __cpu_to_virtio16(mlx5_vdpa_is_little_endian(mvdev), val);
206 }
207 
208 static u16 ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev)
209 {
210 	if (!(mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_MQ)))
211 		return 2;
212 
213 	return mvdev->max_vqs;
214 }
215 
216 static bool is_ctrl_vq_idx(struct mlx5_vdpa_dev *mvdev, u16 idx)
217 {
218 	return idx == ctrl_vq_idx(mvdev);
219 }
220 
221 static void print_status(struct mlx5_vdpa_dev *mvdev, u8 status, bool set)
222 {
223 	if (status & ~VALID_STATUS_MASK)
224 		mlx5_vdpa_warn(mvdev, "Warning: there are invalid status bits 0x%x\n",
225 			       status & ~VALID_STATUS_MASK);
226 
227 	if (!mlx5_vdpa_debug)
228 		return;
229 
230 	mlx5_vdpa_info(mvdev, "driver status %s", set ? "set" : "get");
231 	if (set && !status) {
232 		mlx5_vdpa_info(mvdev, "driver resets the device\n");
233 		return;
234 	}
235 
236 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_ACKNOWLEDGE);
237 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER);
238 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_DRIVER_OK);
239 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FEATURES_OK);
240 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_NEEDS_RESET);
241 	MLX5_LOG_VIO_STAT(VIRTIO_CONFIG_S_FAILED);
242 }
243 
244 static void print_features(struct mlx5_vdpa_dev *mvdev, u64 features, bool set)
245 {
246 	if (features & ~VALID_FEATURES_MASK)
247 		mlx5_vdpa_warn(mvdev, "There are invalid feature bits 0x%llx\n",
248 			       features & ~VALID_FEATURES_MASK);
249 
250 	if (!mlx5_vdpa_debug)
251 		return;
252 
253 	mlx5_vdpa_info(mvdev, "driver %s feature bits:\n", set ? "sets" : "reads");
254 	if (!features)
255 		mlx5_vdpa_info(mvdev, "all feature bits are cleared\n");
256 
257 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CSUM);
258 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_CSUM);
259 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_GUEST_OFFLOADS);
260 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MTU);
261 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MAC);
262 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO4);
263 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_TSO6);
264 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ECN);
265 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_UFO);
266 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO4);
267 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_TSO6);
268 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_ECN);
269 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HOST_UFO);
270 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MRG_RXBUF);
271 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STATUS);
272 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VQ);
273 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX);
274 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_VLAN);
275 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_RX_EXTRA);
276 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_GUEST_ANNOUNCE);
277 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_MQ);
278 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_CTRL_MAC_ADDR);
279 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_HASH_REPORT);
280 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSS);
281 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_RSC_EXT);
282 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_STANDBY);
283 	MLX5_LOG_VIO_FLAG(VIRTIO_NET_F_SPEED_DUPLEX);
284 	MLX5_LOG_VIO_FLAG(VIRTIO_F_NOTIFY_ON_EMPTY);
285 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ANY_LAYOUT);
286 	MLX5_LOG_VIO_FLAG(VIRTIO_F_VERSION_1);
287 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ACCESS_PLATFORM);
288 	MLX5_LOG_VIO_FLAG(VIRTIO_F_RING_PACKED);
289 	MLX5_LOG_VIO_FLAG(VIRTIO_F_ORDER_PLATFORM);
290 	MLX5_LOG_VIO_FLAG(VIRTIO_F_SR_IOV);
291 }
292 
293 static int create_tis(struct mlx5_vdpa_net *ndev)
294 {
295 	struct mlx5_vdpa_dev *mvdev = &ndev->mvdev;
296 	u32 in[MLX5_ST_SZ_DW(create_tis_in)] = {};
297 	void *tisc;
298 	int err;
299 
300 	tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
301 	MLX5_SET(tisc, tisc, transport_domain, ndev->res.tdn);
302 	err = mlx5_vdpa_create_tis(mvdev, in, &ndev->res.tisn);
303 	if (err)
304 		mlx5_vdpa_warn(mvdev, "create TIS (%d)\n", err);
305 
306 	return err;
307 }
308 
309 static void destroy_tis(struct mlx5_vdpa_net *ndev)
310 {
311 	mlx5_vdpa_destroy_tis(&ndev->mvdev, ndev->res.tisn);
312 }
313 
314 #define MLX5_VDPA_CQE_SIZE 64
315 #define MLX5_VDPA_LOG_CQE_SIZE ilog2(MLX5_VDPA_CQE_SIZE)
316 
317 static int cq_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf, int nent)
318 {
319 	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
320 	u8 log_wq_stride = MLX5_VDPA_LOG_CQE_SIZE;
321 	u8 log_wq_sz = MLX5_VDPA_LOG_CQE_SIZE;
322 	int err;
323 
324 	err = mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, nent * MLX5_VDPA_CQE_SIZE, frag_buf,
325 				       ndev->mvdev.mdev->priv.numa_node);
326 	if (err)
327 		return err;
328 
329 	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
330 
331 	buf->cqe_size = MLX5_VDPA_CQE_SIZE;
332 	buf->nent = nent;
333 
334 	return 0;
335 }
336 
337 static int umem_frag_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem, int size)
338 {
339 	struct mlx5_frag_buf *frag_buf = &umem->frag_buf;
340 
341 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev, size, frag_buf,
342 					ndev->mvdev.mdev->priv.numa_node);
343 }
344 
345 static void cq_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_cq_buf *buf)
346 {
347 	mlx5_frag_buf_free(ndev->mvdev.mdev, &buf->frag_buf);
348 }
349 
350 static void *get_cqe(struct mlx5_vdpa_cq *vcq, int n)
351 {
352 	return mlx5_frag_buf_get_wqe(&vcq->buf.fbc, n);
353 }
354 
355 static void cq_frag_buf_init(struct mlx5_vdpa_cq *vcq, struct mlx5_vdpa_cq_buf *buf)
356 {
357 	struct mlx5_cqe64 *cqe64;
358 	void *cqe;
359 	int i;
360 
361 	for (i = 0; i < buf->nent; i++) {
362 		cqe = get_cqe(vcq, i);
363 		cqe64 = cqe;
364 		cqe64->op_own = MLX5_CQE_INVALID << 4;
365 	}
366 }
367 
368 static void *get_sw_cqe(struct mlx5_vdpa_cq *cq, int n)
369 {
370 	struct mlx5_cqe64 *cqe64 = get_cqe(cq, n & (cq->cqe - 1));
371 
372 	if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
373 	    !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & cq->cqe)))
374 		return cqe64;
375 
376 	return NULL;
377 }
378 
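/* Post 'n' receive credits on the driver-side QP by advancing the head and
 * writing it to the receive doorbell record.
 */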
379 static void rx_post(struct mlx5_vdpa_qp *vqp, int n)
380 {
381 	vqp->head += n;
382 	vqp->db.db[0] = cpu_to_be32(vqp->head);
383 }
384 
385 static void qp_prepare(struct mlx5_vdpa_net *ndev, bool fw, void *in,
386 		       struct mlx5_vdpa_virtqueue *mvq, u32 num_ent)
387 {
388 	struct mlx5_vdpa_qp *vqp;
389 	__be64 *pas;
390 	void *qpc;
391 
392 	vqp = fw ? &mvq->fwqp : &mvq->vqqp;
393 	MLX5_SET(create_qp_in, in, uid, ndev->mvdev.res.uid);
394 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
395 	if (vqp->fw) {
396 		/* Firmware QP is allocated by the driver for the firmware's
397 		 * use, so we can skip some of the params as they will be chosen by the firmware
398 		 */
399 		qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
400 		MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
401 		MLX5_SET(qpc, qpc, no_sq, 1);
402 		return;
403 	}
404 
405 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
406 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
407 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
408 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
409 	MLX5_SET(qpc, qpc, uar_page, ndev->mvdev.res.uar->index);
410 	MLX5_SET(qpc, qpc, log_page_size, vqp->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
411 	MLX5_SET(qpc, qpc, no_sq, 1);
412 	MLX5_SET(qpc, qpc, cqn_rcv, mvq->cq.mcq.cqn);
413 	MLX5_SET(qpc, qpc, log_rq_size, ilog2(num_ent));
414 	MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
415 	pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, in, pas);
416 	mlx5_fill_page_frag_array(&vqp->frag_buf, pas);
417 }
418 
419 static int rq_buf_alloc(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp, u32 num_ent)
420 {
421 	return mlx5_frag_buf_alloc_node(ndev->mvdev.mdev,
422 					num_ent * sizeof(struct mlx5_wqe_data_seg), &vqp->frag_buf,
423 					ndev->mvdev.mdev->priv.numa_node);
424 }
425 
426 static void rq_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
427 {
428 	mlx5_frag_buf_free(ndev->mvdev.mdev, &vqp->frag_buf);
429 }
430 
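/* Create one end of the event channel. The firmware-owned QP needs no buffers
 * since the firmware chooses its parameters; the driver QP gets an RQ fragment
 * buffer and a doorbell record allocated here.
 */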
431 static int qp_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
432 		     struct mlx5_vdpa_qp *vqp)
433 {
434 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
435 	int inlen = MLX5_ST_SZ_BYTES(create_qp_in);
436 	u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
437 	void *qpc;
438 	void *in;
439 	int err;
440 
441 	if (!vqp->fw) {
442 		vqp = &mvq->vqqp;
443 		err = rq_buf_alloc(ndev, vqp, mvq->num_ent);
444 		if (err)
445 			return err;
446 
447 		err = mlx5_db_alloc(ndev->mvdev.mdev, &vqp->db);
448 		if (err)
449 			goto err_db;
450 		inlen += vqp->frag_buf.npages * sizeof(__be64);
451 	}
452 
453 	in = kzalloc(inlen, GFP_KERNEL);
454 	if (!in) {
455 		err = -ENOMEM;
456 		goto err_kzalloc;
457 	}
458 
459 	qp_prepare(ndev, vqp->fw, in, mvq, mvq->num_ent);
460 	qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
461 	MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
462 	MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
463 	MLX5_SET(qpc, qpc, pd, ndev->mvdev.res.pdn);
464 	MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
465 	if (!vqp->fw)
466 		MLX5_SET64(qpc, qpc, dbr_addr, vqp->db.dma);
467 	MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
468 	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
469 	kfree(in);
470 	if (err)
471 		goto err_kzalloc;
472 
473 	vqp->mqp.uid = ndev->mvdev.res.uid;
474 	vqp->mqp.qpn = MLX5_GET(create_qp_out, out, qpn);
475 
476 	if (!vqp->fw)
477 		rx_post(vqp, mvq->num_ent);
478 
479 	return 0;
480 
481 err_kzalloc:
482 	if (!vqp->fw)
483 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
484 err_db:
485 	if (!vqp->fw)
486 		rq_buf_free(ndev, vqp);
487 
488 	return err;
489 }
490 
491 static void qp_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_qp *vqp)
492 {
493 	u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};
494 
495 	MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
496 	MLX5_SET(destroy_qp_in, in, qpn, vqp->mqp.qpn);
497 	MLX5_SET(destroy_qp_in, in, uid, ndev->mvdev.res.uid);
498 	if (mlx5_cmd_exec_in(ndev->mvdev.mdev, destroy_qp, in))
499 		mlx5_vdpa_warn(&ndev->mvdev, "destroy qp 0x%x\n", vqp->mqp.qpn);
500 	if (!vqp->fw) {
501 		mlx5_db_free(ndev->mvdev.mdev, &vqp->db);
502 		rq_buf_free(ndev, vqp);
503 	}
504 }
505 
506 static void *next_cqe_sw(struct mlx5_vdpa_cq *cq)
507 {
508 	return get_sw_cqe(cq, cq->mcq.cons_index);
509 }
510 
511 static int mlx5_vdpa_poll_one(struct mlx5_vdpa_cq *vcq)
512 {
513 	struct mlx5_cqe64 *cqe64;
514 
515 	cqe64 = next_cqe_sw(vcq);
516 	if (!cqe64)
517 		return -EAGAIN;
518 
519 	vcq->mcq.cons_index++;
520 	return 0;
521 }
522 
523 static void mlx5_vdpa_handle_completions(struct mlx5_vdpa_virtqueue *mvq, int num)
524 {
525 	struct mlx5_vdpa_net *ndev = mvq->ndev;
526 	struct vdpa_callback *event_cb;
527 
528 	event_cb = &ndev->event_cbs[mvq->index];
529 	mlx5_cq_set_ci(&mvq->cq.mcq);
530 
531 	/* make sure the CQ consumer update is visible to the hardware before updating
532 	 * the RX doorbell record.
533 	 */
534 	dma_wmb();
535 	rx_post(&mvq->vqqp, num);
536 	if (event_cb->callback)
537 		event_cb->callback(event_cb->private);
538 }
539 
540 static void mlx5_vdpa_cq_comp(struct mlx5_core_cq *mcq, struct mlx5_eqe *eqe)
541 {
542 	struct mlx5_vdpa_virtqueue *mvq = container_of(mcq, struct mlx5_vdpa_virtqueue, cq.mcq);
543 	struct mlx5_vdpa_net *ndev = mvq->ndev;
544 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
545 	int num = 0;
546 
547 	while (!mlx5_vdpa_poll_one(&mvq->cq)) {
548 		num++;
549 		if (num > mvq->num_ent / 2) {
550 			/* If completions keep coming while we poll, we want to
551 			 * let the hardware know that we consumed them by
552 			 * updating the doorbell record.  We also let vdpa core
553 			 * know about this so it passes it on to the virtio driver
554 			 * in the guest.
555 			 */
556 			mlx5_vdpa_handle_completions(mvq, num);
557 			num = 0;
558 		}
559 	}
560 
561 	if (num)
562 		mlx5_vdpa_handle_completions(mvq, num);
563 
564 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
565 }
566 
567 static int cq_create(struct mlx5_vdpa_net *ndev, u16 idx, u32 num_ent)
568 {
569 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
570 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
571 	void __iomem *uar_page = ndev->mvdev.res.uar->map;
572 	u32 out[MLX5_ST_SZ_DW(create_cq_out)];
573 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
574 	__be64 *pas;
575 	int inlen;
576 	void *cqc;
577 	void *in;
578 	int err;
579 	int eqn;
580 
581 	err = mlx5_db_alloc(mdev, &vcq->db);
582 	if (err)
583 		return err;
584 
585 	vcq->mcq.set_ci_db = vcq->db.db;
586 	vcq->mcq.arm_db = vcq->db.db + 1;
587 	vcq->mcq.cqe_sz = 64;
588 
589 	err = cq_frag_buf_alloc(ndev, &vcq->buf, num_ent);
590 	if (err)
591 		goto err_db;
592 
593 	cq_frag_buf_init(vcq, &vcq->buf);
594 
595 	inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
596 		MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) * vcq->buf.frag_buf.npages;
597 	in = kzalloc(inlen, GFP_KERNEL);
598 	if (!in) {
599 		err = -ENOMEM;
600 		goto err_vzalloc;
601 	}
602 
603 	MLX5_SET(create_cq_in, in, uid, ndev->mvdev.res.uid);
604 	pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
605 	mlx5_fill_page_frag_array(&vcq->buf.frag_buf, pas);
606 
607 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
608 	MLX5_SET(cqc, cqc, log_page_size, vcq->buf.frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
609 
610 	/* Use vector 0 by default. Consider adding code to choose the least used
611 	 * vector.
612 	 */
613 	err = mlx5_vector2eqn(mdev, 0, &eqn);
614 	if (err)
615 		goto err_vec;
616 
617 	cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
618 	MLX5_SET(cqc, cqc, log_cq_size, ilog2(num_ent));
619 	MLX5_SET(cqc, cqc, uar_page, ndev->mvdev.res.uar->index);
620 	MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
621 	MLX5_SET64(cqc, cqc, dbr_addr, vcq->db.dma);
622 
623 	err = mlx5_core_create_cq(mdev, &vcq->mcq, in, inlen, out, sizeof(out));
624 	if (err)
625 		goto err_vec;
626 
627 	vcq->mcq.comp = mlx5_vdpa_cq_comp;
628 	vcq->cqe = num_ent;
629 	vcq->mcq.set_ci_db = vcq->db.db;
630 	vcq->mcq.arm_db = vcq->db.db + 1;
631 	mlx5_cq_arm(&mvq->cq.mcq, MLX5_CQ_DB_REQ_NOT, uar_page, mvq->cq.mcq.cons_index);
632 	kfree(in);
633 	return 0;
634 
635 err_vec:
636 	kfree(in);
637 err_vzalloc:
638 	cq_frag_buf_free(ndev, &vcq->buf);
639 err_db:
640 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
641 	return err;
642 }
643 
644 static void cq_destroy(struct mlx5_vdpa_net *ndev, u16 idx)
645 {
646 	struct mlx5_vdpa_virtqueue *mvq = &ndev->vqs[idx];
647 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
648 	struct mlx5_vdpa_cq *vcq = &mvq->cq;
649 
650 	if (mlx5_core_destroy_cq(mdev, &vcq->mcq)) {
651 		mlx5_vdpa_warn(&ndev->mvdev, "destroy CQ 0x%x\n", vcq->mcq.cqn);
652 		return;
653 	}
654 	cq_frag_buf_free(ndev, &vcq->buf);
655 	mlx5_db_free(ndev->mvdev.mdev, &vcq->db);
656 }
657 
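/* The required umem size is derived from the device capabilities:
 * size = buffer_param_a * queue_size + buffer_param_b for the selected umem.
 */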
658 static void set_umem_size(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num,
659 			  struct mlx5_vdpa_umem **umemp)
660 {
661 	struct mlx5_core_dev *mdev = ndev->mvdev.mdev;
662 	int p_a;
663 	int p_b;
664 
665 	switch (num) {
666 	case 1:
667 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_a);
668 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_1_buffer_param_b);
669 		*umemp = &mvq->umem1;
670 		break;
671 	case 2:
672 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_a);
673 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_2_buffer_param_b);
674 		*umemp = &mvq->umem2;
675 		break;
676 	case 3:
677 		p_a = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_a);
678 		p_b = MLX5_CAP_DEV_VDPA_EMULATION(mdev, umem_3_buffer_param_b);
679 		*umemp = &mvq->umem3;
680 		break;
681 	}
682 	(*umemp)->size = p_a * mvq->num_ent + p_b;
683 }
684 
685 static void umem_frag_buf_free(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_umem *umem)
686 {
687 	mlx5_frag_buf_free(ndev->mvdev.mdev, &umem->frag_buf);
688 }
689 
690 static int create_umem(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
691 {
692 	int inlen;
693 	u32 out[MLX5_ST_SZ_DW(create_umem_out)] = {};
694 	void *um;
695 	void *in;
696 	int err;
697 	__be64 *pas;
698 	struct mlx5_vdpa_umem *umem;
699 
700 	set_umem_size(ndev, mvq, num, &umem);
701 	err = umem_frag_buf_alloc(ndev, umem, umem->size);
702 	if (err)
703 		return err;
704 
705 	inlen = MLX5_ST_SZ_BYTES(create_umem_in) + MLX5_ST_SZ_BYTES(mtt) * umem->frag_buf.npages;
706 
707 	in = kzalloc(inlen, GFP_KERNEL);
708 	if (!in) {
709 		err = -ENOMEM;
710 		goto err_in;
711 	}
712 
713 	MLX5_SET(create_umem_in, in, opcode, MLX5_CMD_OP_CREATE_UMEM);
714 	MLX5_SET(create_umem_in, in, uid, ndev->mvdev.res.uid);
715 	um = MLX5_ADDR_OF(create_umem_in, in, umem);
716 	MLX5_SET(umem, um, log_page_size, umem->frag_buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
717 	MLX5_SET64(umem, um, num_of_mtt, umem->frag_buf.npages);
718 
719 	pas = (__be64 *)MLX5_ADDR_OF(umem, um, mtt[0]);
720 	mlx5_fill_page_frag_array_perm(&umem->frag_buf, pas, MLX5_MTT_PERM_RW);
721 
722 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
723 	if (err) {
724 		mlx5_vdpa_warn(&ndev->mvdev, "create umem(%d)\n", err);
725 		goto err_cmd;
726 	}
727 
728 	kfree(in);
729 	umem->id = MLX5_GET(create_umem_out, out, umem_id);
730 
731 	return 0;
732 
733 err_cmd:
734 	kfree(in);
735 err_in:
736 	umem_frag_buf_free(ndev, umem);
737 	return err;
738 }
739 
740 static void umem_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int num)
741 {
742 	u32 in[MLX5_ST_SZ_DW(destroy_umem_in)] = {};
743 	u32 out[MLX5_ST_SZ_DW(destroy_umem_out)] = {};
744 	struct mlx5_vdpa_umem *umem;
745 
746 	switch (num) {
747 	case 1:
748 		umem = &mvq->umem1;
749 		break;
750 	case 2:
751 		umem = &mvq->umem2;
752 		break;
753 	case 3:
754 		umem = &mvq->umem3;
755 		break;
756 	}
757 
758 	MLX5_SET(destroy_umem_in, in, opcode, MLX5_CMD_OP_DESTROY_UMEM);
759 	MLX5_SET(destroy_umem_in, in, umem_id, umem->id);
760 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out)))
761 		return;
762 
763 	umem_frag_buf_free(ndev, umem);
764 }
765 
766 static int umems_create(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
767 {
768 	int num;
769 	int err;
770 
771 	for (num = 1; num <= 3; num++) {
772 		err = create_umem(ndev, mvq, num);
773 		if (err)
774 			goto err_umem;
775 	}
776 	return 0;
777 
778 err_umem:
779 	for (num--; num > 0; num--)
780 		umem_destroy(ndev, mvq, num);
781 
782 	return err;
783 }
784 
785 static void umems_destroy(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
786 {
787 	int num;
788 
789 	for (num = 3; num > 0; num--)
790 		umem_destroy(ndev, mvq, num);
791 }
792 
793 static int get_queue_type(struct mlx5_vdpa_net *ndev)
794 {
795 	u32 type_mask;
796 
797 	type_mask = MLX5_CAP_DEV_VDPA_EMULATION(ndev->mvdev.mdev, virtio_queue_type);
798 
799 	/* prefer split queue */
800 	if (type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)
801 		return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_SPLIT;
802 
803 	WARN_ON(!(type_mask & MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_PACKED));
804 
805 	return MLX5_VIRTIO_EMULATION_VIRTIO_QUEUE_TYPE_PACKED;
806 }
807 
808 static bool vq_is_tx(u16 idx)
809 {
810 	return idx % 2;
811 }
812 
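/* Pack the negotiated virtio-net features into the queue_feature_bit_mask_12_3
 * field of the virtio_net_q object: bit 9 HOST_TSO4, bit 8 HOST_TSO6,
 * bit 7 CSUM, bit 6 GUEST_CSUM.
 */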
813 static u16 get_features_12_3(u64 features)
814 {
815 	return (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO4)) << 9) |
816 	       (!!(features & BIT_ULL(VIRTIO_NET_F_HOST_TSO6)) << 8) |
817 	       (!!(features & BIT_ULL(VIRTIO_NET_F_CSUM)) << 7) |
818 	       (!!(features & BIT_ULL(VIRTIO_NET_F_GUEST_CSUM)) << 6);
819 }
820 
821 static int create_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
822 {
823 	int inlen = MLX5_ST_SZ_BYTES(create_virtio_net_q_in);
824 	u32 out[MLX5_ST_SZ_DW(create_virtio_net_q_out)] = {};
825 	void *obj_context;
826 	void *cmd_hdr;
827 	void *vq_ctx;
828 	void *in;
829 	int err;
830 
831 	err = umems_create(ndev, mvq);
832 	if (err)
833 		return err;
834 
835 	in = kzalloc(inlen, GFP_KERNEL);
836 	if (!in) {
837 		err = -ENOMEM;
838 		goto err_alloc;
839 	}
840 
841 	cmd_hdr = MLX5_ADDR_OF(create_virtio_net_q_in, in, general_obj_in_cmd_hdr);
842 
843 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
844 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
845 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
846 
847 	obj_context = MLX5_ADDR_OF(create_virtio_net_q_in, in, obj_context);
848 	MLX5_SET(virtio_net_q_object, obj_context, hw_available_index, mvq->avail_idx);
849 	MLX5_SET(virtio_net_q_object, obj_context, hw_used_index, mvq->used_idx);
850 	MLX5_SET(virtio_net_q_object, obj_context, queue_feature_bit_mask_12_3,
851 		 get_features_12_3(ndev->mvdev.actual_features));
852 	vq_ctx = MLX5_ADDR_OF(virtio_net_q_object, obj_context, virtio_q_context);
853 	MLX5_SET(virtio_q, vq_ctx, virtio_q_type, get_queue_type(ndev));
854 
855 	if (vq_is_tx(mvq->index))
856 		MLX5_SET(virtio_net_q_object, obj_context, tisn_or_qpn, ndev->res.tisn);
857 
858 	MLX5_SET(virtio_q, vq_ctx, event_mode, MLX5_VIRTIO_Q_EVENT_MODE_QP_MODE);
859 	MLX5_SET(virtio_q, vq_ctx, queue_index, mvq->index);
860 	MLX5_SET(virtio_q, vq_ctx, event_qpn_or_msix, mvq->fwqp.mqp.qpn);
861 	MLX5_SET(virtio_q, vq_ctx, queue_size, mvq->num_ent);
862 	MLX5_SET(virtio_q, vq_ctx, virtio_version_1_0,
863 		 !!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_F_VERSION_1)));
864 	MLX5_SET64(virtio_q, vq_ctx, desc_addr, mvq->desc_addr);
865 	MLX5_SET64(virtio_q, vq_ctx, used_addr, mvq->device_addr);
866 	MLX5_SET64(virtio_q, vq_ctx, available_addr, mvq->driver_addr);
867 	MLX5_SET(virtio_q, vq_ctx, virtio_q_mkey, ndev->mvdev.mr.mkey);
868 	MLX5_SET(virtio_q, vq_ctx, umem_1_id, mvq->umem1.id);
869 	MLX5_SET(virtio_q, vq_ctx, umem_1_size, mvq->umem1.size);
870 	MLX5_SET(virtio_q, vq_ctx, umem_2_id, mvq->umem2.id);
871 	MLX5_SET(virtio_q, vq_ctx, umem_2_size, mvq->umem2.size);
872 	MLX5_SET(virtio_q, vq_ctx, umem_3_id, mvq->umem3.id);
873 	MLX5_SET(virtio_q, vq_ctx, umem_3_size, mvq->umem3.size);
874 	MLX5_SET(virtio_q, vq_ctx, pd, ndev->mvdev.res.pdn);
875 
876 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
877 	if (err)
878 		goto err_cmd;
879 
880 	kfree(in);
881 	mvq->virtq_id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
882 
883 	return 0;
884 
885 err_cmd:
886 	kfree(in);
887 err_alloc:
888 	umems_destroy(ndev, mvq);
889 	return err;
890 }
891 
892 static void destroy_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
893 {
894 	u32 in[MLX5_ST_SZ_DW(destroy_virtio_net_q_in)] = {};
895 	u32 out[MLX5_ST_SZ_DW(destroy_virtio_net_q_out)] = {};
896 
897 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.opcode,
898 		 MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
899 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_id, mvq->virtq_id);
900 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.uid, ndev->mvdev.res.uid);
901 	MLX5_SET(destroy_virtio_net_q_in, in, general_obj_out_cmd_hdr.obj_type,
902 		 MLX5_OBJ_TYPE_VIRTIO_NET_Q);
903 	if (mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, sizeof(out))) {
904 		mlx5_vdpa_warn(&ndev->mvdev, "destroy virtqueue 0x%x\n", mvq->virtq_id);
905 		return;
906 	}
907 	umems_destroy(ndev, mvq);
908 }
909 
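/* Each virtqueue uses an RC connection between the firmware QP and the driver
 * QP. These helpers return the local and remote QP numbers depending on which
 * side (fw or driver) is being modified.
 */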
910 static u32 get_rqpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
911 {
912 	return fw ? mvq->vqqp.mqp.qpn : mvq->fwqp.mqp.qpn;
913 }
914 
915 static u32 get_qpn(struct mlx5_vdpa_virtqueue *mvq, bool fw)
916 {
917 	return fw ? mvq->fwqp.mqp.qpn : mvq->vqqp.mqp.qpn;
918 }
919 
920 static void alloc_inout(struct mlx5_vdpa_net *ndev, int cmd, void **in, int *inlen, void **out,
921 			int *outlen, u32 qpn, u32 rqpn)
922 {
923 	void *qpc;
924 	void *pp;
925 
926 	switch (cmd) {
927 	case MLX5_CMD_OP_2RST_QP:
928 		*inlen = MLX5_ST_SZ_BYTES(qp_2rst_in);
929 		*outlen = MLX5_ST_SZ_BYTES(qp_2rst_out);
930 		*in = kzalloc(*inlen, GFP_KERNEL);
931 		*out = kzalloc(*outlen, GFP_KERNEL);
932 		if (!*in || !*out)
933 			goto outerr;
934 
935 		MLX5_SET(qp_2rst_in, *in, opcode, cmd);
936 		MLX5_SET(qp_2rst_in, *in, uid, ndev->mvdev.res.uid);
937 		MLX5_SET(qp_2rst_in, *in, qpn, qpn);
938 		break;
939 	case MLX5_CMD_OP_RST2INIT_QP:
940 		*inlen = MLX5_ST_SZ_BYTES(rst2init_qp_in);
941 		*outlen = MLX5_ST_SZ_BYTES(rst2init_qp_out);
942 		*in = kzalloc(*inlen, GFP_KERNEL);
943 		*out = kzalloc(MLX5_ST_SZ_BYTES(rst2init_qp_out), GFP_KERNEL);
944 		if (!*in || !*out)
945 			goto outerr;
946 
947 		MLX5_SET(rst2init_qp_in, *in, opcode, cmd);
948 		MLX5_SET(rst2init_qp_in, *in, uid, ndev->mvdev.res.uid);
949 		MLX5_SET(rst2init_qp_in, *in, qpn, qpn);
950 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
951 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
952 		MLX5_SET(qpc, qpc, rwe, 1);
953 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
954 		MLX5_SET(ads, pp, vhca_port_num, 1);
955 		break;
956 	case MLX5_CMD_OP_INIT2RTR_QP:
957 		*inlen = MLX5_ST_SZ_BYTES(init2rtr_qp_in);
958 		*outlen = MLX5_ST_SZ_BYTES(init2rtr_qp_out);
959 		*in = kzalloc(*inlen, GFP_KERNEL);
960 		*out = kzalloc(MLX5_ST_SZ_BYTES(init2rtr_qp_out), GFP_KERNEL);
961 		if (!*in || !*out)
962 			goto outerr;
963 
964 		MLX5_SET(init2rtr_qp_in, *in, opcode, cmd);
965 		MLX5_SET(init2rtr_qp_in, *in, uid, ndev->mvdev.res.uid);
966 		MLX5_SET(init2rtr_qp_in, *in, qpn, qpn);
967 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
968 		MLX5_SET(qpc, qpc, mtu, MLX5_QPC_MTU_256_BYTES);
969 		MLX5_SET(qpc, qpc, log_msg_max, 30);
970 		MLX5_SET(qpc, qpc, remote_qpn, rqpn);
971 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
972 		MLX5_SET(ads, pp, fl, 1);
973 		break;
974 	case MLX5_CMD_OP_RTR2RTS_QP:
975 		*inlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_in);
976 		*outlen = MLX5_ST_SZ_BYTES(rtr2rts_qp_out);
977 		*in = kzalloc(*inlen, GFP_KERNEL);
978 		*out = kzalloc(MLX5_ST_SZ_BYTES(rtr2rts_qp_out), GFP_KERNEL);
979 		if (!*in || !*out)
980 			goto outerr;
981 
982 		MLX5_SET(rtr2rts_qp_in, *in, opcode, cmd);
983 		MLX5_SET(rtr2rts_qp_in, *in, uid, ndev->mvdev.res.uid);
984 		MLX5_SET(rtr2rts_qp_in, *in, qpn, qpn);
985 		qpc = MLX5_ADDR_OF(rst2init_qp_in, *in, qpc);
986 		pp = MLX5_ADDR_OF(qpc, qpc, primary_address_path);
987 		MLX5_SET(ads, pp, ack_timeout, 14);
988 		MLX5_SET(qpc, qpc, retry_count, 7);
989 		MLX5_SET(qpc, qpc, rnr_retry, 7);
990 		break;
991 	default:
992 		goto outerr_nullify;
993 	}
994 
995 	return;
996 
997 outerr:
998 	kfree(*in);
999 	kfree(*out);
1000 outerr_nullify:
1001 	*in = NULL;
1002 	*out = NULL;
1003 }
1004 
1005 static void free_inout(void *in, void *out)
1006 {
1007 	kfree(in);
1008 	kfree(out);
1009 }
1010 
1011 /* Two QPs are used by each virtqueue. One is used by the driver and one by
1012  * firmware. The fw argument indicates whether the QP being modified is the one used
1013  * by firmware.
1014  */
1015 static int modify_qp(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, bool fw, int cmd)
1016 {
1017 	int outlen;
1018 	int inlen;
1019 	void *out;
1020 	void *in;
1021 	int err;
1022 
1023 	alloc_inout(ndev, cmd, &in, &inlen, &out, &outlen, get_qpn(mvq, fw), get_rqpn(mvq, fw));
1024 	if (!in || !out)
1025 		return -ENOMEM;
1026 
1027 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, outlen);
1028 	free_inout(in, out);
1029 	return err;
1030 }
1031 
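/* Connect the firmware QP and the driver QP by walking both through the RC
 * state machine: RESET -> INIT -> RTR for both sides, and finally RTS for the
 * firmware QP only.
 */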
1032 static int connect_qps(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1033 {
1034 	int err;
1035 
1036 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_2RST_QP);
1037 	if (err)
1038 		return err;
1039 
1040 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_2RST_QP);
1041 	if (err)
1042 		return err;
1043 
1044 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_RST2INIT_QP);
1045 	if (err)
1046 		return err;
1047 
1048 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_RST2INIT_QP);
1049 	if (err)
1050 		return err;
1051 
1052 	err = modify_qp(ndev, mvq, true, MLX5_CMD_OP_INIT2RTR_QP);
1053 	if (err)
1054 		return err;
1055 
1056 	err = modify_qp(ndev, mvq, false, MLX5_CMD_OP_INIT2RTR_QP);
1057 	if (err)
1058 		return err;
1059 
1060 	return modify_qp(ndev, mvq, true, MLX5_CMD_OP_RTR2RTS_QP);
1061 }
1062 
1063 struct mlx5_virtq_attr {
1064 	u8 state;
1065 	u16 available_index;
1066 	u16 used_index;
1067 };
1068 
1069 static int query_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq,
1070 			   struct mlx5_virtq_attr *attr)
1071 {
1072 	int outlen = MLX5_ST_SZ_BYTES(query_virtio_net_q_out);
1073 	u32 in[MLX5_ST_SZ_DW(query_virtio_net_q_in)] = {};
1074 	void *out;
1075 	void *obj_context;
1076 	void *cmd_hdr;
1077 	int err;
1078 
1079 	out = kzalloc(outlen, GFP_KERNEL);
1080 	if (!out)
1081 		return -ENOMEM;
1082 
1083 	cmd_hdr = MLX5_ADDR_OF(query_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1084 
1085 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
1086 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1087 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1088 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1089 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, sizeof(in), out, outlen);
1090 	if (err)
1091 		goto err_cmd;
1092 
1093 	obj_context = MLX5_ADDR_OF(query_virtio_net_q_out, out, obj_context);
1094 	memset(attr, 0, sizeof(*attr));
1095 	attr->state = MLX5_GET(virtio_net_q_object, obj_context, state);
1096 	attr->available_index = MLX5_GET(virtio_net_q_object, obj_context, hw_available_index);
1097 	attr->used_index = MLX5_GET(virtio_net_q_object, obj_context, hw_used_index);
1098 	kfree(out);
1099 	return 0;
1100 
1101 err_cmd:
1102 	kfree(out);
1103 	return err;
1104 }
1105 
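/* Modify only the state field of the virtio_net_q object; modify_field_select
 * limits the change to MLX5_VIRTQ_MODIFY_MASK_STATE.
 */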
1106 static int modify_virtqueue(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq, int state)
1107 {
1108 	int inlen = MLX5_ST_SZ_BYTES(modify_virtio_net_q_in);
1109 	u32 out[MLX5_ST_SZ_DW(modify_virtio_net_q_out)] = {};
1110 	void *obj_context;
1111 	void *cmd_hdr;
1112 	void *in;
1113 	int err;
1114 
1115 	in = kzalloc(inlen, GFP_KERNEL);
1116 	if (!in)
1117 		return -ENOMEM;
1118 
1119 	cmd_hdr = MLX5_ADDR_OF(modify_virtio_net_q_in, in, general_obj_in_cmd_hdr);
1120 
1121 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
1122 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_VIRTIO_NET_Q);
1123 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, mvq->virtq_id);
1124 	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, uid, ndev->mvdev.res.uid);
1125 
1126 	obj_context = MLX5_ADDR_OF(modify_virtio_net_q_in, in, obj_context);
1127 	MLX5_SET64(virtio_net_q_object, obj_context, modify_field_select,
1128 		   MLX5_VIRTQ_MODIFY_MASK_STATE);
1129 	MLX5_SET(virtio_net_q_object, obj_context, state, state);
1130 	err = mlx5_cmd_exec(ndev->mvdev.mdev, in, inlen, out, sizeof(out));
1131 	kfree(in);
1132 	if (!err)
1133 		mvq->fw_state = state;
1134 
1135 	return err;
1136 }
1137 
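/* Create all per-virtqueue resources: the completion queue, the
 * firmware/driver QP pair forming the event channel, and the virtio_net_q
 * object itself. If the VQ was already marked ready, move the object to the
 * RDY state.
 */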
1138 static int setup_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1139 {
1140 	u16 idx = mvq->index;
1141 	int err;
1142 
1143 	if (!mvq->num_ent)
1144 		return 0;
1145 
1146 	if (mvq->initialized)
1147 		return 0;
1148 
1149 	err = cq_create(ndev, idx, mvq->num_ent);
1150 	if (err)
1151 		return err;
1152 
1153 	err = qp_create(ndev, mvq, &mvq->fwqp);
1154 	if (err)
1155 		goto err_fwqp;
1156 
1157 	err = qp_create(ndev, mvq, &mvq->vqqp);
1158 	if (err)
1159 		goto err_vqqp;
1160 
1161 	err = connect_qps(ndev, mvq);
1162 	if (err)
1163 		goto err_connect;
1164 
1165 	err = create_virtqueue(ndev, mvq);
1166 	if (err)
1167 		goto err_connect;
1168 
1169 	if (mvq->ready) {
1170 		err = modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY);
1171 		if (err) {
1172 			mlx5_vdpa_warn(&ndev->mvdev, "failed to modify to ready vq idx %d(%d)\n",
1173 				       idx, err);
1174 			goto err_connect;
1175 		}
1176 	}
1177 
1178 	mvq->initialized = true;
1179 	return 0;
1180 
1181 err_connect:
1182 	qp_destroy(ndev, &mvq->vqqp);
1183 err_vqqp:
1184 	qp_destroy(ndev, &mvq->fwqp);
1185 err_fwqp:
1186 	cq_destroy(ndev, idx);
1187 	return err;
1188 }
1189 
1190 static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1191 {
1192 	struct mlx5_virtq_attr attr;
1193 
1194 	if (!mvq->initialized)
1195 		return;
1196 
1197 	if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY)
1198 		return;
1199 
1200 	if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND))
1201 		mlx5_vdpa_warn(&ndev->mvdev, "modify to suspend failed\n");
1202 
1203 	if (query_virtqueue(ndev, mvq, &attr)) {
1204 		mlx5_vdpa_warn(&ndev->mvdev, "failed to query virtqueue\n");
1205 		return;
1206 	}
1207 	mvq->avail_idx = attr.available_index;
1208 	mvq->used_idx = attr.used_index;
1209 }
1210 
1211 static void suspend_vqs(struct mlx5_vdpa_net *ndev)
1212 {
1213 	int i;
1214 
1215 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
1216 		suspend_vq(ndev, &ndev->vqs[i]);
1217 }
1218 
1219 static void teardown_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
1220 {
1221 	if (!mvq->initialized)
1222 		return;
1223 
1224 	suspend_vq(ndev, mvq);
1225 	destroy_virtqueue(ndev, mvq);
1226 	qp_destroy(ndev, &mvq->vqqp);
1227 	qp_destroy(ndev, &mvq->fwqp);
1228 	cq_destroy(ndev, mvq->index);
1229 	mvq->initialized = false;
1230 }
1231 
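/* The RQT lists the receive virtqueues used for RSS. Data VQs come in RX/TX
 * pairs with RX on even indices, hence the stride of two; entries wrap modulo
 * the number of currently active VQs.
 */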
1232 static int create_rqt(struct mlx5_vdpa_net *ndev)
1233 {
1234 	__be32 *list;
1235 	void *rqtc;
1236 	int inlen;
1237 	void *in;
1238 	int i, j;
1239 	int err;
1240 
1241 	inlen = MLX5_ST_SZ_BYTES(create_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num);
1242 	in = kzalloc(inlen, GFP_KERNEL);
1243 	if (!in)
1244 		return -ENOMEM;
1245 
1246 	MLX5_SET(create_rqt_in, in, uid, ndev->mvdev.res.uid);
1247 	rqtc = MLX5_ADDR_OF(create_rqt_in, in, rqt_context);
1248 
1249 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1250 	MLX5_SET(rqtc, rqtc, rqt_max_size, ndev->rqt_size);
1251 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1252 	for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2)
1253 		list[i] = cpu_to_be32(ndev->vqs[j % ndev->cur_num_vqs].virtq_id);
1254 
1255 	MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size);
1256 	err = mlx5_vdpa_create_rqt(&ndev->mvdev, in, inlen, &ndev->res.rqtn);
1257 	kfree(in);
1258 	if (err)
1259 		return err;
1260 
1261 	return 0;
1262 }
1263 
1264 #define MLX5_MODIFY_RQT_NUM_RQS ((u64)1)
1265 
1266 static int modify_rqt(struct mlx5_vdpa_net *ndev, int num)
1267 {
1268 	__be32 *list;
1269 	void *rqtc;
1270 	int inlen;
1271 	void *in;
1272 	int i, j;
1273 	int err;
1274 
1275 	inlen = MLX5_ST_SZ_BYTES(modify_rqt_in) + ndev->rqt_size * MLX5_ST_SZ_BYTES(rq_num);
1276 	in = kzalloc(inlen, GFP_KERNEL);
1277 	if (!in)
1278 		return -ENOMEM;
1279 
1280 	MLX5_SET(modify_rqt_in, in, uid, ndev->mvdev.res.uid);
1281 	MLX5_SET64(modify_rqt_in, in, bitmask, MLX5_MODIFY_RQT_NUM_RQS);
1282 	rqtc = MLX5_ADDR_OF(modify_rqt_in, in, ctx);
1283 	MLX5_SET(rqtc, rqtc, list_q_type, MLX5_RQTC_LIST_Q_TYPE_VIRTIO_NET_Q);
1284 
1285 	list = MLX5_ADDR_OF(rqtc, rqtc, rq_num[0]);
1286 	for (i = 0, j = 0; i < ndev->rqt_size; i++, j += 2)
1287 		list[i] = cpu_to_be32(ndev->vqs[j % num].virtq_id);
1288 
1289 	MLX5_SET(rqtc, rqtc, rqt_actual_size, ndev->rqt_size);
1290 	err = mlx5_vdpa_modify_rqt(&ndev->mvdev, in, inlen, ndev->res.rqtn);
1291 	kfree(in);
1292 	if (err)
1293 		return err;
1294 
1295 	return 0;
1296 }
1297 
1298 static void destroy_rqt(struct mlx5_vdpa_net *ndev)
1299 {
1300 	mlx5_vdpa_destroy_rqt(&ndev->mvdev, ndev->res.rqtn);
1301 }
1302 
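/* Create a TIR that hashes the outer IPv4/TCP headers with the Toeplitz
 * function (using a static key) and spreads traffic over the RQT created
 * above.
 */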
1303 static int create_tir(struct mlx5_vdpa_net *ndev)
1304 {
1305 #define HASH_IP_L4PORTS                                                                            \
1306 	(MLX5_HASH_FIELD_SEL_SRC_IP | MLX5_HASH_FIELD_SEL_DST_IP | MLX5_HASH_FIELD_SEL_L4_SPORT |  \
1307 	 MLX5_HASH_FIELD_SEL_L4_DPORT)
1308 	static const u8 rx_hash_toeplitz_key[] = { 0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
1309 						   0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
1310 						   0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
1311 						   0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
1312 						   0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a };
1313 	void *rss_key;
1314 	void *outer;
1315 	void *tirc;
1316 	void *in;
1317 	int err;
1318 
1319 	in = kzalloc(MLX5_ST_SZ_BYTES(create_tir_in), GFP_KERNEL);
1320 	if (!in)
1321 		return -ENOMEM;
1322 
1323 	MLX5_SET(create_tir_in, in, uid, ndev->mvdev.res.uid);
1324 	tirc = MLX5_ADDR_OF(create_tir_in, in, ctx);
1325 	MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_INDIRECT);
1326 
1327 	MLX5_SET(tirc, tirc, rx_hash_symmetric, 1);
1328 	MLX5_SET(tirc, tirc, rx_hash_fn, MLX5_RX_HASH_FN_TOEPLITZ);
1329 	rss_key = MLX5_ADDR_OF(tirc, tirc, rx_hash_toeplitz_key);
1330 	memcpy(rss_key, rx_hash_toeplitz_key, sizeof(rx_hash_toeplitz_key));
1331 
1332 	outer = MLX5_ADDR_OF(tirc, tirc, rx_hash_field_selector_outer);
1333 	MLX5_SET(rx_hash_field_select, outer, l3_prot_type, MLX5_L3_PROT_TYPE_IPV4);
1334 	MLX5_SET(rx_hash_field_select, outer, l4_prot_type, MLX5_L4_PROT_TYPE_TCP);
1335 	MLX5_SET(rx_hash_field_select, outer, selected_fields, HASH_IP_L4PORTS);
1336 
1337 	MLX5_SET(tirc, tirc, indirect_table, ndev->res.rqtn);
1338 	MLX5_SET(tirc, tirc, transport_domain, ndev->res.tdn);
1339 
1340 	err = mlx5_vdpa_create_tir(&ndev->mvdev, in, &ndev->res.tirn);
1341 	kfree(in);
1342 	return err;
1343 }
1344 
1345 static void destroy_tir(struct mlx5_vdpa_net *ndev)
1346 {
1347 	mlx5_vdpa_destroy_tir(&ndev->mvdev, ndev->res.tirn);
1348 }
1349 
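/* Install RX steering: a unicast rule matching the device MAC (also counting
 * packets via a flow counter) and a multicast rule matching the multicast bit
 * of the destination MAC, both forwarding to the TIR.
 */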
1350 static int add_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1351 {
1352 	struct mlx5_flow_destination dest[2] = {};
1353 	struct mlx5_flow_table_attr ft_attr = {};
1354 	struct mlx5_flow_act flow_act = {};
1355 	struct mlx5_flow_namespace *ns;
1356 	struct mlx5_flow_spec *spec;
1357 	void *headers_c;
1358 	void *headers_v;
1359 	u8 *dmac_c;
1360 	u8 *dmac_v;
1361 	int err;
1362 
1363 	spec = kvzalloc(sizeof(*spec), GFP_KERNEL);
1364 	if (!spec)
1365 		return -ENOMEM;
1366 
1367 	spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
1368 	ft_attr.max_fte = 2;
1369 	ft_attr.autogroup.max_num_groups = 2;
1370 
1371 	ns = mlx5_get_flow_namespace(ndev->mvdev.mdev, MLX5_FLOW_NAMESPACE_BYPASS);
1372 	if (!ns) {
1373 		mlx5_vdpa_warn(&ndev->mvdev, "failed to get flow namespace\n");
1374 		err = -EOPNOTSUPP;
1375 		goto err_ns;
1376 	}
1377 
1378 	ndev->rxft = mlx5_create_auto_grouped_flow_table(ns, &ft_attr);
1379 	if (IS_ERR(ndev->rxft)) {
1380 		err = PTR_ERR(ndev->rxft);
1381 		goto err_ns;
1382 	}
1383 
1384 	ndev->rx_counter = mlx5_fc_create(ndev->mvdev.mdev, false);
1385 	if (IS_ERR(ndev->rx_counter)) {
1386 		err = PTR_ERR(ndev->rx_counter);
1387 		goto err_fc;
1388 	}
1389 
1390 	headers_c = MLX5_ADDR_OF(fte_match_param, spec->match_criteria, outer_headers);
1391 	dmac_c = MLX5_ADDR_OF(fte_match_param, headers_c, outer_headers.dmac_47_16);
1392 	memset(dmac_c, 0xff, ETH_ALEN);
1393 	headers_v = MLX5_ADDR_OF(fte_match_param, spec->match_value, outer_headers);
1394 	dmac_v = MLX5_ADDR_OF(fte_match_param, headers_v, outer_headers.dmac_47_16);
1395 	ether_addr_copy(dmac_v, ndev->config.mac);
1396 
1397 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST | MLX5_FLOW_CONTEXT_ACTION_COUNT;
1398 	dest[0].type = MLX5_FLOW_DESTINATION_TYPE_TIR;
1399 	dest[0].tir_num = ndev->res.tirn;
1400 	dest[1].type = MLX5_FLOW_DESTINATION_TYPE_COUNTER;
1401 	dest[1].counter_id = mlx5_fc_id(ndev->rx_counter);
1402 	ndev->rx_rule_ucast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 2);
1403 
1404 	if (IS_ERR(ndev->rx_rule_ucast)) {
1405 		err = PTR_ERR(ndev->rx_rule_ucast);
1406 		ndev->rx_rule_ucast = NULL;
1407 		goto err_rule_ucast;
1408 	}
1409 
1410 	memset(dmac_c, 0, ETH_ALEN);
1411 	memset(dmac_v, 0, ETH_ALEN);
1412 	dmac_c[0] = 1;
1413 	dmac_v[0] = 1;
1414 	flow_act.action = MLX5_FLOW_CONTEXT_ACTION_FWD_DEST;
1415 	ndev->rx_rule_mcast = mlx5_add_flow_rules(ndev->rxft, spec, &flow_act, dest, 1);
1416 	if (IS_ERR(ndev->rx_rule_mcast)) {
1417 		err = PTR_ERR(ndev->rx_rule_mcast);
1418 		ndev->rx_rule_mcast = NULL;
1419 		goto err_rule_mcast;
1420 	}
1421 
1422 	kvfree(spec);
1423 	return 0;
1424 
1425 err_rule_mcast:
1426 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1427 	ndev->rx_rule_ucast = NULL;
1428 err_rule_ucast:
1429 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1430 err_fc:
1431 	mlx5_destroy_flow_table(ndev->rxft);
1432 err_ns:
1433 	kvfree(spec);
1434 	return err;
1435 }
1436 
1437 static void remove_fwd_to_tir(struct mlx5_vdpa_net *ndev)
1438 {
1439 	if (!ndev->rx_rule_ucast)
1440 		return;
1441 
1442 	mlx5_del_flow_rules(ndev->rx_rule_mcast);
1443 	ndev->rx_rule_mcast = NULL;
1444 	mlx5_del_flow_rules(ndev->rx_rule_ucast);
1445 	ndev->rx_rule_ucast = NULL;
1446 	mlx5_fc_destroy(ndev->mvdev.mdev, ndev->rx_counter);
1447 	mlx5_destroy_flow_table(ndev->rxft);
1448 }
1449 
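/* Handle VIRTIO_NET_CTRL_MAC commands from the control VQ. For
 * VIRTIO_NET_CTRL_MAC_ADDR_SET, update the MPFS L2 table entry and recreate
 * the steering rules for the new MAC, rolling everything back on failure.
 */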
1450 static virtio_net_ctrl_ack handle_ctrl_mac(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1451 {
1452 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1453 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1454 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1455 	struct mlx5_core_dev *pfmdev;
1456 	size_t read;
1457 	u8 mac[ETH_ALEN], mac_back[ETH_ALEN];
1458 
1459 	pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
1460 	switch (cmd) {
1461 	case VIRTIO_NET_CTRL_MAC_ADDR_SET:
1462 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)mac, ETH_ALEN);
1463 		if (read != ETH_ALEN)
1464 			break;
1465 
1466 		if (!memcmp(ndev->config.mac, mac, ETH_ALEN)) {
1467 			status = VIRTIO_NET_OK;
1468 			break;
1469 		}
1470 
1471 		if (is_zero_ether_addr(mac))
1472 			break;
1473 
1474 		if (!is_zero_ether_addr(ndev->config.mac)) {
1475 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1476 				mlx5_vdpa_warn(mvdev, "failed to delete old MAC %pM from MPFS table\n",
1477 					       ndev->config.mac);
1478 				break;
1479 			}
1480 		}
1481 
1482 		if (mlx5_mpfs_add_mac(pfmdev, mac)) {
1483 			mlx5_vdpa_warn(mvdev, "failed to insert new MAC %pM into MPFS table\n",
1484 				       mac);
1485 			break;
1486 		}
1487 
1488 		/* back up the original MAC address so that if we fail to add the forward rules
1489 		 * we can restore it
1490 		 */
1491 		memcpy(mac_back, ndev->config.mac, ETH_ALEN);
1492 
1493 		memcpy(ndev->config.mac, mac, ETH_ALEN);
1494 
1495 		/* Need to recreate the flow table entry so that traffic to the new MAC is forwarded
1496 		 */
1497 		remove_fwd_to_tir(ndev);
1498 
1499 		if (add_fwd_to_tir(ndev)) {
1500 			mlx5_vdpa_warn(mvdev, "failed to insert forward rules, try to restore\n");
1501 
1502 			/* Although we hardly ever get here, we still need to double check */
1503 			if (is_zero_ether_addr(mac_back)) {
1504 				mlx5_vdpa_warn(mvdev, "restore mac failed: Original MAC is zero\n");
1505 				break;
1506 			}
1507 
1508 			/* Try to restore the original MAC address to the MPFS table, and try to restore
1509 			 * the forward rule entry.
1510 			 */
1511 			if (mlx5_mpfs_del_mac(pfmdev, ndev->config.mac)) {
1512 				mlx5_vdpa_warn(mvdev, "restore mac failed: delete MAC %pM from MPFS table failed\n",
1513 					       ndev->config.mac);
1514 			}
1515 
1516 			if (mlx5_mpfs_add_mac(pfmdev, mac_back)) {
1517 				mlx5_vdpa_warn(mvdev, "restore mac failed: insert old MAC %pM into MPFS table failed\n",
1518 					       mac_back);
1519 			}
1520 
1521 			memcpy(ndev->config.mac, mac_back, ETH_ALEN);
1522 
1523 			if (add_fwd_to_tir(ndev))
1524 				mlx5_vdpa_warn(mvdev, "restore forward rules failed: insert forward rules failed\n");
1525 
1526 			break;
1527 		}
1528 
1529 		status = VIRTIO_NET_OK;
1530 		break;
1531 
1532 	default:
1533 		break;
1534 	}
1535 
1536 	return status;
1537 }
1538 
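/* Change the number of active data VQ pairs. When shrinking, update the RQT to
 * reference only the remaining VQs and then tear down the excess VQs; when
 * growing, set up the new VQs first and only then update the RQT, undoing the
 * added VQs on failure.
 */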
1539 static int change_num_qps(struct mlx5_vdpa_dev *mvdev, int newqps)
1540 {
1541 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1542 	int cur_qps = ndev->cur_num_vqs / 2;
1543 	int err;
1544 	int i;
1545 
1546 	if (cur_qps > newqps) {
1547 		err = modify_rqt(ndev, 2 * newqps);
1548 		if (err)
1549 			return err;
1550 
1551 		for (i = ndev->cur_num_vqs - 1; i >= 2 * newqps; i--)
1552 			teardown_vq(ndev, &ndev->vqs[i]);
1553 
1554 		ndev->cur_num_vqs = 2 * newqps;
1555 	} else {
1556 		ndev->cur_num_vqs = 2 * newqps;
1557 		for (i = cur_qps * 2; i < 2 * newqps; i++) {
1558 			err = setup_vq(ndev, &ndev->vqs[i]);
1559 			if (err)
1560 				goto clean_added;
1561 		}
1562 		err = modify_rqt(ndev, 2 * newqps);
1563 		if (err)
1564 			goto clean_added;
1565 	}
1566 	return 0;
1567 
1568 clean_added:
1569 	for (--i; i >= 2 * cur_qps; --i)
1570 		teardown_vq(ndev, &ndev->vqs[i]);
1571 
1572 	ndev->cur_num_vqs = 2 * cur_qps;
1573 
1574 	return err;
1575 }
1576 
1577 static virtio_net_ctrl_ack handle_ctrl_mq(struct mlx5_vdpa_dev *mvdev, u8 cmd)
1578 {
1579 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1580 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1581 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1582 	struct virtio_net_ctrl_mq mq;
1583 	size_t read;
1584 	u16 newqps;
1585 
1586 	switch (cmd) {
1587 	case VIRTIO_NET_CTRL_MQ_VQ_PAIRS_SET:
1588 		/* This mq feature check aligns with pre-existing userspace
1589 		 * implementation.
1590 		 *
1591 		 * Without it, an untrusted driver could fake a multiqueue config
1592 		 * request down to a non-mq device, which may cause the kernel to
1593 		 * panic due to uninitialized resources for the extra vqs. Even with
1594 		 * a well-behaved guest driver, it is not expected to allow
1595 		 * changing the number of vqs on a non-mq device.
1596 		 */
1597 		if (!MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ))
1598 			break;
1599 
1600 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, (void *)&mq, sizeof(mq));
1601 		if (read != sizeof(mq))
1602 			break;
1603 
1604 		newqps = mlx5vdpa16_to_cpu(mvdev, mq.virtqueue_pairs);
1605 		if (newqps < VIRTIO_NET_CTRL_MQ_VQ_PAIRS_MIN ||
1606 		    newqps > ndev->rqt_size)
1607 			break;
1608 
1609 		if (ndev->cur_num_vqs == 2 * newqps) {
1610 			status = VIRTIO_NET_OK;
1611 			break;
1612 		}
1613 
1614 		if (!change_num_qps(mvdev, newqps))
1615 			status = VIRTIO_NET_OK;
1616 
1617 		break;
1618 	default:
1619 		break;
1620 	}
1621 
1622 	return status;
1623 }
1624 
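/* Process control virtqueue commands in workqueue context under reslock. One
 * command is handled per invocation; the work requeues itself so remaining
 * descriptors are picked up by a subsequent run.
 */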
1625 static void mlx5_cvq_kick_handler(struct work_struct *work)
1626 {
1627 	virtio_net_ctrl_ack status = VIRTIO_NET_ERR;
1628 	struct virtio_net_ctrl_hdr ctrl;
1629 	struct mlx5_vdpa_wq_ent *wqent;
1630 	struct mlx5_vdpa_dev *mvdev;
1631 	struct mlx5_control_vq *cvq;
1632 	struct mlx5_vdpa_net *ndev;
1633 	size_t read, write;
1634 	int err;
1635 
1636 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
1637 	mvdev = wqent->mvdev;
1638 	ndev = to_mlx5_vdpa_ndev(mvdev);
1639 	cvq = &mvdev->cvq;
1640 
1641 	mutex_lock(&ndev->reslock);
1642 
1643 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
1644 		goto out;
1645 
1646 	if (!(ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)))
1647 		goto out;
1648 
1649 	if (!cvq->ready)
1650 		goto out;
1651 
1652 	while (true) {
1653 		err = vringh_getdesc_iotlb(&cvq->vring, &cvq->riov, &cvq->wiov, &cvq->head,
1654 					   GFP_ATOMIC);
1655 		if (err <= 0)
1656 			break;
1657 
1658 		read = vringh_iov_pull_iotlb(&cvq->vring, &cvq->riov, &ctrl, sizeof(ctrl));
1659 		if (read != sizeof(ctrl))
1660 			break;
1661 
1662 		switch (ctrl.class) {
1663 		case VIRTIO_NET_CTRL_MAC:
1664 			status = handle_ctrl_mac(mvdev, ctrl.cmd);
1665 			break;
1666 		case VIRTIO_NET_CTRL_MQ:
1667 			status = handle_ctrl_mq(mvdev, ctrl.cmd);
1668 			break;
1669 
1670 		default:
1671 			break;
1672 		}
1673 
1674 		/* Make sure data is written before advancing index */
1675 		smp_wmb();
1676 
1677 		write = vringh_iov_push_iotlb(&cvq->vring, &cvq->wiov, &status, sizeof(status));
1678 		vringh_complete_iotlb(&cvq->vring, cvq->head, write);
1679 		vringh_kiov_cleanup(&cvq->riov);
1680 		vringh_kiov_cleanup(&cvq->wiov);
1681 
1682 		if (vringh_need_notify_iotlb(&cvq->vring))
1683 			vringh_notify(&cvq->vring);
1684 
1685 		queue_work(mvdev->wq, &wqent->work);
1686 		break;
1687 	}
1688 
1689 out:
1690 	mutex_unlock(&ndev->reslock);
1691 }
1692 
1693 static void mlx5_vdpa_kick_vq(struct vdpa_device *vdev, u16 idx)
1694 {
1695 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1696 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1697 	struct mlx5_vdpa_virtqueue *mvq;
1698 
1699 	if (!is_index_valid(mvdev, idx))
1700 		return;
1701 
1702 	if (unlikely(is_ctrl_vq_idx(mvdev, idx))) {
1703 		if (!mvdev->wq || !mvdev->cvq.ready)
1704 			return;
1705 
1706 		queue_work(mvdev->wq, &ndev->cvq_ent.work);
1707 		return;
1708 	}
1709 
1710 	mvq = &ndev->vqs[idx];
1711 	if (unlikely(!mvq->ready))
1712 		return;
1713 
1714 	iowrite16(idx, ndev->mvdev.res.kick_addr);
1715 }
1716 
1717 static int mlx5_vdpa_set_vq_address(struct vdpa_device *vdev, u16 idx, u64 desc_area,
1718 				    u64 driver_area, u64 device_area)
1719 {
1720 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1721 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1722 	struct mlx5_vdpa_virtqueue *mvq;
1723 
1724 	if (!is_index_valid(mvdev, idx))
1725 		return -EINVAL;
1726 
1727 	if (is_ctrl_vq_idx(mvdev, idx)) {
1728 		mvdev->cvq.desc_addr = desc_area;
1729 		mvdev->cvq.device_addr = device_area;
1730 		mvdev->cvq.driver_addr = driver_area;
1731 		return 0;
1732 	}
1733 
1734 	mvq = &ndev->vqs[idx];
1735 	mvq->desc_addr = desc_area;
1736 	mvq->device_addr = device_area;
1737 	mvq->driver_addr = driver_area;
1738 	return 0;
1739 }
1740 
1741 static void mlx5_vdpa_set_vq_num(struct vdpa_device *vdev, u16 idx, u32 num)
1742 {
1743 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1744 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1745 	struct mlx5_vdpa_virtqueue *mvq;
1746 
1747 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
1748 		return;
1749 
1750 	mvq = &ndev->vqs[idx];
1751 	mvq->num_ent = num;
1752 }
1753 
1754 static void mlx5_vdpa_set_vq_cb(struct vdpa_device *vdev, u16 idx, struct vdpa_callback *cb)
1755 {
1756 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1757 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1758 
1759 	ndev->event_cbs[idx] = *cb;
1760 }
1761 
1762 static void mlx5_cvq_notify(struct vringh *vring)
1763 {
1764 	struct mlx5_control_vq *cvq = container_of(vring, struct mlx5_control_vq, vring);
1765 
1766 	if (!cvq->event_cb.callback)
1767 		return;
1768 
1769 	cvq->event_cb.callback(cvq->event_cb.private);
1770 }
1771 
1772 static void set_cvq_ready(struct mlx5_vdpa_dev *mvdev, bool ready)
1773 {
1774 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1775 
1776 	cvq->ready = ready;
1777 	if (!ready)
1778 		return;
1779 
1780 	cvq->vring.notify = mlx5_cvq_notify;
1781 }
1782 
1783 static void mlx5_vdpa_set_vq_ready(struct vdpa_device *vdev, u16 idx, bool ready)
1784 {
1785 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1786 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1787 	struct mlx5_vdpa_virtqueue *mvq;
1788 
1789 	if (!mvdev->actual_features)
1790 		return;
1791 
1792 	if (!is_index_valid(mvdev, idx))
1793 		return;
1794 
1795 	if (is_ctrl_vq_idx(mvdev, idx)) {
1796 		set_cvq_ready(mvdev, ready);
1797 		return;
1798 	}
1799 
1800 	mvq = &ndev->vqs[idx];
1801 	if (!ready)
1802 		suspend_vq(ndev, mvq);
1803 
1804 	mvq->ready = ready;
1805 }
1806 
1807 static bool mlx5_vdpa_get_vq_ready(struct vdpa_device *vdev, u16 idx)
1808 {
1809 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1810 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1811 
1812 	if (!is_index_valid(mvdev, idx))
1813 		return false;
1814 
1815 	if (is_ctrl_vq_idx(mvdev, idx))
1816 		return mvdev->cvq.ready;
1817 
1818 	return ndev->vqs[idx].ready;
1819 }
1820 
1821 static int mlx5_vdpa_set_vq_state(struct vdpa_device *vdev, u16 idx,
1822 				  const struct vdpa_vq_state *state)
1823 {
1824 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1825 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1826 	struct mlx5_vdpa_virtqueue *mvq;
1827 
1828 	if (!is_index_valid(mvdev, idx))
1829 		return -EINVAL;
1830 
1831 	if (is_ctrl_vq_idx(mvdev, idx)) {
1832 		mvdev->cvq.vring.last_avail_idx = state->split.avail_index;
1833 		return 0;
1834 	}
1835 
1836 	mvq = &ndev->vqs[idx];
1837 	if (mvq->fw_state == MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) {
1838 		mlx5_vdpa_warn(mvdev, "can't modify available index\n");
1839 		return -EINVAL;
1840 	}
1841 
1842 	mvq->used_idx = state->split.avail_index;
1843 	mvq->avail_idx = state->split.avail_index;
1844 	return 0;
1845 }
1846 
1847 static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa_vq_state *state)
1848 {
1849 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1850 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1851 	struct mlx5_vdpa_virtqueue *mvq;
1852 	struct mlx5_virtq_attr attr;
1853 	int err;
1854 
1855 	if (!is_index_valid(mvdev, idx))
1856 		return -EINVAL;
1857 
1858 	if (is_ctrl_vq_idx(mvdev, idx)) {
1859 		state->split.avail_index = mvdev->cvq.vring.last_avail_idx;
1860 		return 0;
1861 	}
1862 
1863 	mvq = &ndev->vqs[idx];
1864 	/* If the virtq object was destroyed, use the value saved at
1865 	 * the last moment of suspend_vq(). This caters for userspace
1866 	 * that needs the index to emulate the vq after it is stopped.
1867 	 */
1868 	if (!mvq->initialized) {
1869 		/* Firmware returns a wrong value for the available index.
1870 		 * Since both values should be identical, we take the value of
1871 		 * used_idx which is reported correctly.
1872 		 */
1873 		state->split.avail_index = mvq->used_idx;
1874 		return 0;
1875 	}
1876 
1877 	err = query_virtqueue(ndev, mvq, &attr);
1878 	if (err) {
1879 		mlx5_vdpa_warn(mvdev, "failed to query virtqueue\n");
1880 		return err;
1881 	}
1882 	state->split.avail_index = attr.used_index;
1883 	return 0;
1884 }
1885 
1886 static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev)
1887 {
1888 	return PAGE_SIZE;
1889 }
1890 
1891 enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9,
1892 	MLX5_VIRTIO_NET_F_CSUM = 1 << 10,
1893 	MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11,
1894 	MLX5_VIRTIO_NET_F_HOST_TSO4 = 1 << 12,
1895 };
1896 
1897 static u64 mlx_to_virtio_features(u16 dev_features)
1898 {
1899 	u64 result = 0;
1900 
1901 	if (dev_features & MLX5_VIRTIO_NET_F_GUEST_CSUM)
1902 		result |= BIT_ULL(VIRTIO_NET_F_GUEST_CSUM);
1903 	if (dev_features & MLX5_VIRTIO_NET_F_CSUM)
1904 		result |= BIT_ULL(VIRTIO_NET_F_CSUM);
1905 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO6)
1906 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO6);
1907 	if (dev_features & MLX5_VIRTIO_NET_F_HOST_TSO4)
1908 		result |= BIT_ULL(VIRTIO_NET_F_HOST_TSO4);
1909 
1910 	return result;
1911 }
1912 
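/* Build the advertised feature set: offloads reported by the device
 * capabilities plus the features the driver provides itself (CTRL_VQ,
 * CTRL_MAC_ADDR, MQ, STATUS, MTU, ACCESS_PLATFORM and, when supported,
 * VERSION_1).
 */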
1913 static u64 get_supported_features(struct mlx5_core_dev *mdev)
1914 {
1915 	u64 mlx_vdpa_features = 0;
1916 	u16 dev_features;
1917 
1918 	dev_features = MLX5_CAP_DEV_VDPA_EMULATION(mdev, device_features_bits_mask);
1919 	mlx_vdpa_features |= mlx_to_virtio_features(dev_features);
1920 	if (MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_version_1_0))
1921 		mlx_vdpa_features |= BIT_ULL(VIRTIO_F_VERSION_1);
1922 	mlx_vdpa_features |= BIT_ULL(VIRTIO_F_ACCESS_PLATFORM);
1923 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_VQ);
1924 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_CTRL_MAC_ADDR);
1925 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MQ);
1926 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_STATUS);
1927 	mlx_vdpa_features |= BIT_ULL(VIRTIO_NET_F_MTU);
1928 
1929 	return mlx_vdpa_features;
1930 }
1931 
1932 static u64 mlx5_vdpa_get_device_features(struct vdpa_device *vdev)
1933 {
1934 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
1935 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1936 
1937 	print_features(mvdev, ndev->mvdev.mlx_features, false);
1938 	return ndev->mvdev.mlx_features;
1939 }
1940 
1941 static int verify_driver_features(struct mlx5_vdpa_dev *mvdev, u64 features)
1942 {
1943 	/* Minimum features to expect */
1944 	if (!(features & BIT_ULL(VIRTIO_F_ACCESS_PLATFORM)))
1945 		return -EOPNOTSUPP;
1946 
1947 	/* Double check features combination sent down by the driver.
1948 	 * Reject feature combinations that lack a required dependency.
1949 	 *
1950 	 * Per VIRTIO v1.1 specification, section 5.1.3.1 Feature bit
1951 	 * requirements: "VIRTIO_NET_F_MQ Requires VIRTIO_NET_F_CTRL_VQ".
1952 	 * By failing the invalid features sent down by untrusted drivers,
1953 	 * we ensure that the assumptions made by is_index_valid() and
1954 	 * is_ctrl_vq_idx() are not violated.
1955 	 */
1956 	if ((features & (BIT_ULL(VIRTIO_NET_F_MQ) | BIT_ULL(VIRTIO_NET_F_CTRL_VQ))) ==
1957             BIT_ULL(VIRTIO_NET_F_MQ))
1958 		return -EINVAL;
1959 
1960 	return 0;
1961 }
1962 
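/* Create all hardware data virtqueues and, if VIRTIO_NET_F_CTRL_VQ was
 * negotiated, initialize the software vringh for the control queue from
 * the ring addresses programmed by the set_vq_address callback.
 */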
1963 static int setup_virtqueues(struct mlx5_vdpa_dev *mvdev)
1964 {
1965 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
1966 	struct mlx5_control_vq *cvq = &mvdev->cvq;
1967 	int err;
1968 	int i;
1969 
1970 	for (i = 0; i < mvdev->max_vqs; i++) {
1971 		err = setup_vq(ndev, &ndev->vqs[i]);
1972 		if (err)
1973 			goto err_vq;
1974 	}
1975 
1976 	if (mvdev->actual_features & BIT_ULL(VIRTIO_NET_F_CTRL_VQ)) {
1977 		err = vringh_init_iotlb(&cvq->vring, mvdev->actual_features,
1978 					MLX5_CVQ_MAX_ENT, false,
1979 					(struct vring_desc *)(uintptr_t)cvq->desc_addr,
1980 					(struct vring_avail *)(uintptr_t)cvq->driver_addr,
1981 					(struct vring_used *)(uintptr_t)cvq->device_addr);
1982 		if (err)
1983 			goto err_vq;
1984 	}
1985 
1986 	return 0;
1987 
1988 err_vq:
1989 	for (--i; i >= 0; i--)
1990 		teardown_vq(ndev, &ndev->vqs[i]);
1991 
1992 	return err;
1993 }
1994 
1995 static void teardown_virtqueues(struct mlx5_vdpa_net *ndev)
1996 {
1997 	struct mlx5_vdpa_virtqueue *mvq;
1998 	int i;
1999 
2000 	for (i = ndev->mvdev.max_vqs - 1; i >= 0; i--) {
2001 		mvq = &ndev->vqs[i];
2002 		if (!mvq->initialized)
2003 			continue;
2004 
2005 		teardown_vq(ndev, mvq);
2006 	}
2007 }
2008 
2009 static void update_cvq_info(struct mlx5_vdpa_dev *mvdev)
2010 {
2011 	if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_CTRL_VQ)) {
2012 		if (MLX5_FEATURE(mvdev, VIRTIO_NET_F_MQ)) {
2013 			/* MQ supported. CVQ index is right above the last data virtqueue's */
2014 			mvdev->max_idx = mvdev->max_vqs;
2015 		} else {
2016 			/* Only CVQ supported. Data virtqueues occupy indices 0 and 1;
2017 			 * CVQ gets index 2.
2018 			 */
2019 			mvdev->max_idx = 2;
2020 		}
2021 	} else {
2022 		/* Two data virtqueues only: one for rx and one for tx */
2023 		mvdev->max_idx = 1;
2024 	}
2025 }
2026 
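/* Feature negotiation: mask the driver features against what the device
 * offers, size the RQT from max_virtqueue_pairs when MQ was accepted and
 * recompute the highest valid virtqueue index.
 */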
2027 static int mlx5_vdpa_set_driver_features(struct vdpa_device *vdev, u64 features)
2028 {
2029 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2030 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2031 	int err;
2032 
2033 	print_features(mvdev, features, true);
2034 
2035 	err = verify_driver_features(mvdev, features);
2036 	if (err)
2037 		return err;
2038 
2039 	ndev->mvdev.actual_features = features & ndev->mvdev.mlx_features;
2040 	if (ndev->mvdev.actual_features & BIT_ULL(VIRTIO_NET_F_MQ))
2041 		ndev->rqt_size = mlx5vdpa16_to_cpu(mvdev, ndev->config.max_virtqueue_pairs);
2042 	else
2043 		ndev->rqt_size = 1;
2044 
2045 	ndev->cur_num_vqs = 2 * ndev->rqt_size;
2046 
2047 	update_cvq_info(mvdev);
2048 	return err;
2049 }
2050 
2051 static void mlx5_vdpa_set_config_cb(struct vdpa_device *vdev, struct vdpa_callback *cb)
2052 {
2053 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2054 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2055 
2056 	ndev->config_cb = *cb;
2057 }
2058 
2059 #define MLX5_VDPA_MAX_VQ_ENTRIES 256
2060 static u16 mlx5_vdpa_get_vq_num_max(struct vdpa_device *vdev)
2061 {
2062 	return MLX5_VDPA_MAX_VQ_ENTRIES;
2063 }
2064 
2065 static u32 mlx5_vdpa_get_device_id(struct vdpa_device *vdev)
2066 {
2067 	return VIRTIO_ID_NET;
2068 }
2069 
2070 static u32 mlx5_vdpa_get_vendor_id(struct vdpa_device *vdev)
2071 {
2072 	return PCI_VENDOR_ID_MELLANOX;
2073 }
2074 
2075 static u8 mlx5_vdpa_get_status(struct vdpa_device *vdev)
2076 {
2077 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2078 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2079 
2080 	print_status(mvdev, ndev->mvdev.status, false);
2081 	return ndev->mvdev.status;
2082 }
2083 
2084 static int save_channel_info(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *mvq)
2085 {
2086 	struct mlx5_vq_restore_info *ri = &mvq->ri;
2087 	struct mlx5_virtq_attr attr = {};
2088 	int err;
2089 
2090 	if (mvq->initialized) {
2091 		err = query_virtqueue(ndev, mvq, &attr);
2092 		if (err)
2093 			return err;
2094 	}
2095 
2096 	ri->avail_index = attr.available_index;
2097 	ri->used_index = attr.used_index;
2098 	ri->ready = mvq->ready;
2099 	ri->num_ent = mvq->num_ent;
2100 	ri->desc_addr = mvq->desc_addr;
2101 	ri->device_addr = mvq->device_addr;
2102 	ri->driver_addr = mvq->driver_addr;
2103 	ri->restore = true;
2104 	return 0;
2105 }
2106 
2107 static int save_channels_info(struct mlx5_vdpa_net *ndev)
2108 {
2109 	int i;
2110 
2111 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2112 		memset(&ndev->vqs[i].ri, 0, sizeof(ndev->vqs[i].ri));
2113 		save_channel_info(ndev, &ndev->vqs[i]);
2114 	}
2115 	return 0;
2116 }
2117 
2118 static void mlx5_clear_vqs(struct mlx5_vdpa_net *ndev)
2119 {
2120 	int i;
2121 
2122 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2123 		memset(&ndev->vqs[i], 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2124 }
2125 
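/* Re-initialize the virtqueue structures and replay the state captured
 * by save_channels_info() so the queues can be re-created with the same
 * indices and addresses after a memory map change.
 */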
2126 static void restore_channels_info(struct mlx5_vdpa_net *ndev)
2127 {
2128 	struct mlx5_vdpa_virtqueue *mvq;
2129 	struct mlx5_vq_restore_info *ri;
2130 	int i;
2131 
2132 	mlx5_clear_vqs(ndev);
2133 	init_mvqs(ndev);
2134 	for (i = 0; i < ndev->mvdev.max_vqs; i++) {
2135 		mvq = &ndev->vqs[i];
2136 		ri = &mvq->ri;
2137 		if (!ri->restore)
2138 			continue;
2139 
2140 		mvq->avail_idx = ri->avail_index;
2141 		mvq->used_idx = ri->used_index;
2142 		mvq->ready = ri->ready;
2143 		mvq->num_ent = ri->num_ent;
2144 		mvq->desc_addr = ri->desc_addr;
2145 		mvq->device_addr = ri->device_addr;
2146 		mvq->driver_addr = ri->driver_addr;
2147 	}
2148 }
2149 
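/* Handle a change of the guest memory map: suspend the queues, save
 * their state, tear down the driver resources and rebuild the memory key
 * for the new iotlb. If the device is DRIVER_OK, restore the saved queue
 * state and set up the driver resources again.
 */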
2150 static int mlx5_vdpa_change_map(struct mlx5_vdpa_dev *mvdev, struct vhost_iotlb *iotlb)
2151 {
2152 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2153 	int err;
2154 
2155 	suspend_vqs(ndev);
2156 	err = save_channels_info(ndev);
2157 	if (err)
2158 		goto err_mr;
2159 
2160 	teardown_driver(ndev);
2161 	mlx5_vdpa_destroy_mr(mvdev);
2162 	err = mlx5_vdpa_create_mr(mvdev, iotlb);
2163 	if (err)
2164 		goto err_mr;
2165 
2166 	if (!(mvdev->status & VIRTIO_CONFIG_S_DRIVER_OK))
2167 		goto err_mr;
2168 
2169 	restore_channels_info(ndev);
2170 	err = setup_driver(mvdev);
2171 	if (err)
2172 		goto err_setup;
2173 
2174 	return 0;
2175 
2176 err_setup:
2177 	mlx5_vdpa_destroy_mr(mvdev);
2178 err_mr:
2179 	return err;
2180 }
2181 
2182 /* reslock must be held for this function */
2183 static int setup_driver(struct mlx5_vdpa_dev *mvdev)
2184 {
2185 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2186 	int err;
2187 
2188 	WARN_ON(!mutex_is_locked(&ndev->reslock));
2189 
2190 	if (ndev->setup) {
2191 		mlx5_vdpa_warn(mvdev, "setup driver called for already setup driver\n");
2192 		err = 0;
2193 		goto out;
2194 	}
2195 	err = setup_virtqueues(mvdev);
2196 	if (err) {
2197 		mlx5_vdpa_warn(mvdev, "setup_virtqueues\n");
2198 		goto out;
2199 	}
2200 
2201 	err = create_rqt(ndev);
2202 	if (err) {
2203 		mlx5_vdpa_warn(mvdev, "create_rqt\n");
2204 		goto err_rqt;
2205 	}
2206 
2207 	err = create_tir(ndev);
2208 	if (err) {
2209 		mlx5_vdpa_warn(mvdev, "create_tir\n");
2210 		goto err_tir;
2211 	}
2212 
2213 	err = add_fwd_to_tir(ndev);
2214 	if (err) {
2215 		mlx5_vdpa_warn(mvdev, "add_fwd_to_tir\n");
2216 		goto err_fwd;
2217 	}
2218 	ndev->setup = true;
2219 
2220 	return 0;
2221 
2222 err_fwd:
2223 	destroy_tir(ndev);
2224 err_tir:
2225 	destroy_rqt(ndev);
2226 err_rqt:
2227 	teardown_virtqueues(ndev);
2228 out:
2229 	return err;
2230 }
2231 
2232 /* reslock must be held for this function */
2233 static void teardown_driver(struct mlx5_vdpa_net *ndev)
2234 {
2235 
2236 	WARN_ON(!mutex_is_locked(&ndev->reslock));
2237 
2238 	if (!ndev->setup)
2239 		return;
2240 
2241 	remove_fwd_to_tir(ndev);
2242 	destroy_tir(ndev);
2243 	destroy_rqt(ndev);
2244 	teardown_virtqueues(ndev);
2245 	ndev->setup = false;
2246 }
2247 
2248 static void clear_vqs_ready(struct mlx5_vdpa_net *ndev)
2249 {
2250 	int i;
2251 
2252 	for (i = 0; i < ndev->mvdev.max_vqs; i++)
2253 		ndev->vqs[i].ready = false;
2254 
2255 	ndev->mvdev.cvq.ready = false;
2256 }
2257 
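/* Setting DRIVER_OK creates the driver resources (virtqueues, RQT, TIR
 * and steering); a failure marks the device FAILED. Clearing DRIVER_OK
 * without a reset is not expected and only triggers a warning.
 */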
2258 static void mlx5_vdpa_set_status(struct vdpa_device *vdev, u8 status)
2259 {
2260 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2261 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2262 	int err;
2263 
2264 	print_status(mvdev, status, true);
2265 
2266 	mutex_lock(&ndev->reslock);
2267 
2268 	if ((status ^ ndev->mvdev.status) & VIRTIO_CONFIG_S_DRIVER_OK) {
2269 		if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
2270 			err = setup_driver(mvdev);
2271 			if (err) {
2272 				mlx5_vdpa_warn(mvdev, "failed to setup driver\n");
2273 				goto err_setup;
2274 			}
2275 		} else {
2276 			mlx5_vdpa_warn(mvdev, "did not expect DRIVER_OK to be cleared\n");
2277 			goto err_clear;
2278 		}
2279 	}
2280 
2281 	ndev->mvdev.status = status;
2282 	mutex_unlock(&ndev->reslock);
2283 	return;
2284 
2285 err_setup:
2286 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2287 	ndev->mvdev.status |= VIRTIO_CONFIG_S_FAILED;
2288 err_clear:
2289 	mutex_unlock(&ndev->reslock);
2290 }
2291 
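/* Device reset: tear down the driver resources, clear virtqueue state,
 * negotiated features and status, bump the config generation and
 * re-create the default memory key when the firmware supports it.
 */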
2292 static int mlx5_vdpa_reset(struct vdpa_device *vdev)
2293 {
2294 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2295 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2296 
2297 	print_status(mvdev, 0, true);
2298 	mlx5_vdpa_info(mvdev, "performing device reset\n");
2299 
2300 	mutex_lock(&ndev->reslock);
2301 	teardown_driver(ndev);
2302 	clear_vqs_ready(ndev);
2303 	mlx5_vdpa_destroy_mr(&ndev->mvdev);
2304 	ndev->mvdev.status = 0;
2305 	ndev->cur_num_vqs = 0;
2306 	memset(ndev->event_cbs, 0, sizeof(*ndev->event_cbs) * (mvdev->max_vqs + 1));
2307 	ndev->mvdev.actual_features = 0;
2308 	++mvdev->generation;
2309 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2310 		if (mlx5_vdpa_create_mr(mvdev, NULL))
2311 			mlx5_vdpa_warn(mvdev, "create MR failed\n");
2312 	}
2313 	mutex_unlock(&ndev->reslock);
2314 
2315 	return 0;
2316 }
2317 
2318 static size_t mlx5_vdpa_get_config_size(struct vdpa_device *vdev)
2319 {
2320 	return sizeof(struct virtio_net_config);
2321 }
2322 
2323 static void mlx5_vdpa_get_config(struct vdpa_device *vdev, unsigned int offset, void *buf,
2324 				 unsigned int len)
2325 {
2326 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2327 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2328 
2329 	if (offset + len <= sizeof(struct virtio_net_config))
2330 		memcpy(buf, (u8 *)&ndev->config + offset, len);
2331 }
2332 
2333 static void mlx5_vdpa_set_config(struct vdpa_device *vdev, unsigned int offset, const void *buf,
2334 				 unsigned int len)
2335 {
2336 	/* not supported */
2337 }
2338 
2339 static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev)
2340 {
2341 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2342 
2343 	return mvdev->generation;
2344 }
2345 
2346 static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb)
2347 {
2348 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2349 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2350 	bool change_map;
2351 	int err;
2352 
2353 	mutex_lock(&ndev->reslock);
2354 
2355 	err = mlx5_vdpa_handle_set_map(mvdev, iotlb, &change_map);
2356 	if (err) {
2357 		mlx5_vdpa_warn(mvdev, "set map failed(%d)\n", err);
2358 		goto err;
2359 	}
2360 
2361 	if (change_map)
2362 		err = mlx5_vdpa_change_map(mvdev, iotlb);
2363 
2364 err:
2365 	mutex_unlock(&ndev->reslock);
2366 	return err;
2367 }
2368 
2369 static void mlx5_vdpa_free(struct vdpa_device *vdev)
2370 {
2371 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2372 	struct mlx5_core_dev *pfmdev;
2373 	struct mlx5_vdpa_net *ndev;
2374 
2375 	ndev = to_mlx5_vdpa_ndev(mvdev);
2376 
2377 	free_resources(ndev);
2378 	mlx5_vdpa_destroy_mr(mvdev);
2379 	if (!is_zero_ether_addr(ndev->config.mac)) {
2380 		pfmdev = pci_get_drvdata(pci_physfn(mvdev->mdev->pdev));
2381 		mlx5_mpfs_del_mac(pfmdev, ndev->config.mac);
2382 	}
2383 	mlx5_vdpa_free_resources(&ndev->mvdev);
2384 	mutex_destroy(&ndev->reslock);
2385 	kfree(ndev->event_cbs);
2386 	kfree(ndev->vqs);
2387 }
2388 
2389 static struct vdpa_notification_area mlx5_get_vq_notification(struct vdpa_device *vdev, u16 idx)
2390 {
2391 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2392 	struct vdpa_notification_area ret = {};
2393 	struct mlx5_vdpa_net *ndev;
2394 	phys_addr_t addr;
2395 
2396 	if (!is_index_valid(mvdev, idx) || is_ctrl_vq_idx(mvdev, idx))
2397 		return ret;
2398 
2399 	/* If SF BAR size is smaller than PAGE_SIZE, do not use direct
2400 	 * notification to avoid the risk of mapping pages that contain the
2401 	 * BARs of more than one SF.
2402 	 */
2403 	if (MLX5_CAP_GEN(mvdev->mdev, log_min_sf_size) + 12 < PAGE_SHIFT)
2404 		return ret;
2405 
2406 	ndev = to_mlx5_vdpa_ndev(mvdev);
2407 	addr = (phys_addr_t)ndev->mvdev.res.phys_kick_addr;
2408 	ret.addr = addr;
2409 	ret.size = PAGE_SIZE;
2410 	return ret;
2411 }
2412 
2413 static int mlx5_get_vq_irq(struct vdpa_device *vdv, u16 idx)
2414 {
2415 	return -EOPNOTSUPP;
2416 }
2417 
2418 static u64 mlx5_vdpa_get_driver_features(struct vdpa_device *vdev)
2419 {
2420 	struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev);
2421 
2422 	return mvdev->actual_features;
2423 }
2424 
2425 static const struct vdpa_config_ops mlx5_vdpa_ops = {
2426 	.set_vq_address = mlx5_vdpa_set_vq_address,
2427 	.set_vq_num = mlx5_vdpa_set_vq_num,
2428 	.kick_vq = mlx5_vdpa_kick_vq,
2429 	.set_vq_cb = mlx5_vdpa_set_vq_cb,
2430 	.set_vq_ready = mlx5_vdpa_set_vq_ready,
2431 	.get_vq_ready = mlx5_vdpa_get_vq_ready,
2432 	.set_vq_state = mlx5_vdpa_set_vq_state,
2433 	.get_vq_state = mlx5_vdpa_get_vq_state,
2434 	.get_vq_notification = mlx5_get_vq_notification,
2435 	.get_vq_irq = mlx5_get_vq_irq,
2436 	.get_vq_align = mlx5_vdpa_get_vq_align,
2437 	.get_device_features = mlx5_vdpa_get_device_features,
2438 	.set_driver_features = mlx5_vdpa_set_driver_features,
2439 	.get_driver_features = mlx5_vdpa_get_driver_features,
2440 	.set_config_cb = mlx5_vdpa_set_config_cb,
2441 	.get_vq_num_max = mlx5_vdpa_get_vq_num_max,
2442 	.get_device_id = mlx5_vdpa_get_device_id,
2443 	.get_vendor_id = mlx5_vdpa_get_vendor_id,
2444 	.get_status = mlx5_vdpa_get_status,
2445 	.set_status = mlx5_vdpa_set_status,
2446 	.reset = mlx5_vdpa_reset,
2447 	.get_config_size = mlx5_vdpa_get_config_size,
2448 	.get_config = mlx5_vdpa_get_config,
2449 	.set_config = mlx5_vdpa_set_config,
2450 	.get_generation = mlx5_vdpa_get_generation,
2451 	.set_map = mlx5_vdpa_set_map,
2452 	.free = mlx5_vdpa_free,
2453 };
2454 
2455 static int query_mtu(struct mlx5_core_dev *mdev, u16 *mtu)
2456 {
2457 	u16 hw_mtu;
2458 	int err;
2459 
2460 	err = mlx5_query_nic_vport_mtu(mdev, &hw_mtu);
2461 	if (err)
2462 		return err;
2463 
2464 	*mtu = hw_mtu - MLX5V_ETH_HARD_MTU;
2465 	return 0;
2466 }
2467 
2468 static int alloc_resources(struct mlx5_vdpa_net *ndev)
2469 {
2470 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2471 	int err;
2472 
2473 	if (res->valid) {
2474 		mlx5_vdpa_warn(&ndev->mvdev, "resources already allocated\n");
2475 		return -EEXIST;
2476 	}
2477 
2478 	err = mlx5_vdpa_alloc_transport_domain(&ndev->mvdev, &res->tdn);
2479 	if (err)
2480 		return err;
2481 
2482 	err = create_tis(ndev);
2483 	if (err)
2484 		goto err_tis;
2485 
2486 	res->valid = true;
2487 
2488 	return 0;
2489 
2490 err_tis:
2491 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2492 	return err;
2493 }
2494 
2495 static void free_resources(struct mlx5_vdpa_net *ndev)
2496 {
2497 	struct mlx5_vdpa_net_resources *res = &ndev->res;
2498 
2499 	if (!res->valid)
2500 		return;
2501 
2502 	destroy_tis(ndev);
2503 	mlx5_vdpa_dealloc_transport_domain(&ndev->mvdev, res->tdn);
2504 	res->valid = false;
2505 }
2506 
2507 static void init_mvqs(struct mlx5_vdpa_net *ndev)
2508 {
2509 	struct mlx5_vdpa_virtqueue *mvq;
2510 	int i;
2511 
2512 	for (i = 0; i < ndev->mvdev.max_vqs; ++i) {
2513 		mvq = &ndev->vqs[i];
2514 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2515 		mvq->index = i;
2516 		mvq->ndev = ndev;
2517 		mvq->fwqp.fw = true;
2518 	}
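	/* Not reached: the loop above already initialized all max_vqs entries. */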
2519 	for (; i < ndev->mvdev.max_vqs; i++) {
2520 		mvq = &ndev->vqs[i];
2521 		memset(mvq, 0, offsetof(struct mlx5_vdpa_virtqueue, ri));
2522 		mvq->index = i;
2523 		mvq->ndev = ndev;
2524 	}
2525 }
2526 
2527 struct mlx5_vdpa_mgmtdev {
2528 	struct vdpa_mgmt_dev mgtdev;
2529 	struct mlx5_adev *madev;
2530 	struct mlx5_vdpa_net *ndev;
2531 };
2532 
2533 static u8 query_vport_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport)
2534 {
2535 	u32 out[MLX5_ST_SZ_DW(query_vport_state_out)] = {};
2536 	u32 in[MLX5_ST_SZ_DW(query_vport_state_in)] = {};
2537 	int err;
2538 
2539 	MLX5_SET(query_vport_state_in, in, opcode, MLX5_CMD_OP_QUERY_VPORT_STATE);
2540 	MLX5_SET(query_vport_state_in, in, op_mod, opmod);
2541 	MLX5_SET(query_vport_state_in, in, vport_number, vport);
2542 	if (vport)
2543 		MLX5_SET(query_vport_state_in, in, other_vport, 1);
2544 
2545 	err = mlx5_cmd_exec_inout(mdev, query_vport_state, in, out);
2546 	if (err)
2547 		return 0;
2548 
2549 	return MLX5_GET(query_vport_state_out, out, state);
2550 }
2551 
2552 static bool get_link_state(struct mlx5_vdpa_dev *mvdev)
2553 {
2554 	if (query_vport_state(mvdev->mdev, MLX5_VPORT_STATE_OP_MOD_VNIC_VPORT, 0) ==
2555 	    VPORT_STATE_UP)
2556 		return true;
2557 
2558 	return false;
2559 }
2560 
2561 static void update_carrier(struct work_struct *work)
2562 {
2563 	struct mlx5_vdpa_wq_ent *wqent;
2564 	struct mlx5_vdpa_dev *mvdev;
2565 	struct mlx5_vdpa_net *ndev;
2566 
2567 	wqent = container_of(work, struct mlx5_vdpa_wq_ent, work);
2568 	mvdev = wqent->mvdev;
2569 	ndev = to_mlx5_vdpa_ndev(mvdev);
2570 	if (get_link_state(mvdev))
2571 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2572 	else
2573 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2574 
2575 	if (ndev->config_cb.callback)
2576 		ndev->config_cb.callback(ndev->config_cb.private);
2577 
2578 	kfree(wqent);
2579 }
2580 
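/* Notifier for firmware events: port up/down events queue update_carrier()
 * work, which refreshes the virtio link status and fires the config
 * change callback.
 */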
2581 static int event_handler(struct notifier_block *nb, unsigned long event, void *param)
2582 {
2583 	struct mlx5_vdpa_net *ndev = container_of(nb, struct mlx5_vdpa_net, nb);
2584 	struct mlx5_eqe *eqe = param;
2585 	int ret = NOTIFY_DONE;
2586 	struct mlx5_vdpa_wq_ent *wqent;
2587 
2588 	if (event == MLX5_EVENT_TYPE_PORT_CHANGE) {
2589 		switch (eqe->sub_type) {
2590 		case MLX5_PORT_CHANGE_SUBTYPE_DOWN:
2591 		case MLX5_PORT_CHANGE_SUBTYPE_ACTIVE:
2592 			wqent = kzalloc(sizeof(*wqent), GFP_ATOMIC);
2593 			if (!wqent)
2594 				return NOTIFY_DONE;
2595 
2596 			wqent->mvdev = &ndev->mvdev;
2597 			INIT_WORK(&wqent->work, update_carrier);
2598 			queue_work(ndev->mvdev.wq, &wqent->work);
2599 			ret = NOTIFY_OK;
2600 			break;
2601 		default:
2602 			return NOTIFY_DONE;
2603 		}
2604 		return ret;
2605 	}
2606 	return ret;
2607 }
2608 
2609 static int config_func_mtu(struct mlx5_core_dev *mdev, u16 mtu)
2610 {
2611 	int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in);
2612 	void *in;
2613 	int err;
2614 
2615 	in = kvzalloc(inlen, GFP_KERNEL);
2616 	if (!in)
2617 		return -ENOMEM;
2618 
2619 	MLX5_SET(modify_nic_vport_context_in, in, field_select.mtu, 1);
2620 	MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.mtu,
2621 		 mtu + MLX5V_ETH_HARD_MTU);
2622 	MLX5_SET(modify_nic_vport_context_in, in, opcode,
2623 		 MLX5_CMD_OP_MODIFY_NIC_VPORT_CONTEXT);
2624 
2625 	err = mlx5_cmd_exec_in(mdev, modify_nic_vport_context, in);
2626 
2627 	kvfree(in);
2628 	return err;
2629 }
2630 
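/* Management device .dev_add callback: validate split virtqueue support
 * and the requested queue count, allocate the vdpa net device, fill in
 * MTU, MAC and link state, allocate device resources and the memory key,
 * create the CVQ workqueue and register the device.
 */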
2631 static int mlx5_vdpa_dev_add(struct vdpa_mgmt_dev *v_mdev, const char *name,
2632 			     const struct vdpa_dev_set_config *add_config)
2633 {
2634 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2635 	struct virtio_net_config *config;
2636 	struct mlx5_core_dev *pfmdev;
2637 	struct mlx5_vdpa_dev *mvdev;
2638 	struct mlx5_vdpa_net *ndev;
2639 	struct mlx5_core_dev *mdev;
2640 	u32 max_vqs;
2641 	u16 mtu;
2642 	int err;
2643 
2644 	if (mgtdev->ndev)
2645 		return -ENOSPC;
2646 
2647 	mdev = mgtdev->madev->mdev;
2648 	if (!(MLX5_CAP_DEV_VDPA_EMULATION(mdev, virtio_queue_type) &
2649 	    MLX5_VIRTIO_EMULATION_CAP_VIRTIO_QUEUE_TYPE_SPLIT)) {
2650 		dev_warn(mdev->device, "missing support for split virtqueues\n");
2651 		return -EOPNOTSUPP;
2652 	}
2653 
2654 	max_vqs = min_t(int, MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues),
2655 			1 << MLX5_CAP_GEN(mdev, log_max_rqt_size));
2656 	if (max_vqs < 2) {
2657 		dev_warn(mdev->device,
2658 			 "%d virtqueues are supported. At least 2 are required\n",
2659 			 max_vqs);
2660 		return -EAGAIN;
2661 	}
2662 
2663 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP)) {
2664 		if (add_config->net.max_vq_pairs > max_vqs / 2)
2665 			return -EINVAL;
2666 		max_vqs = min_t(u32, max_vqs, 2 * add_config->net.max_vq_pairs);
2667 	} else {
2668 		max_vqs = 2;
2669 	}
2670 
2671 	ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, &mlx5_vdpa_ops,
2672 				 name, false);
2673 	if (IS_ERR(ndev))
2674 		return PTR_ERR(ndev);
2675 
2676 	ndev->mvdev.mlx_features = mgtdev->mgtdev.supported_features;
2677 	ndev->mvdev.max_vqs = max_vqs;
2678 	mvdev = &ndev->mvdev;
2679 	mvdev->mdev = mdev;
2680 
2681 	ndev->vqs = kcalloc(max_vqs, sizeof(*ndev->vqs), GFP_KERNEL);
2682 	ndev->event_cbs = kcalloc(max_vqs + 1, sizeof(*ndev->event_cbs), GFP_KERNEL);
2683 	if (!ndev->vqs || !ndev->event_cbs) {
2684 		err = -ENOMEM;
2685 		goto err_alloc;
2686 	}
2687 
2688 	init_mvqs(ndev);
2689 	mutex_init(&ndev->reslock);
2690 	config = &ndev->config;
2691 
2692 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU)) {
2693 		err = config_func_mtu(mdev, add_config->net.mtu);
2694 		if (err)
2695 			goto err_mtu;
2696 	}
2697 
2698 	err = query_mtu(mdev, &mtu);
2699 	if (err)
2700 		goto err_mtu;
2701 
2702 	ndev->config.mtu = cpu_to_mlx5vdpa16(mvdev, mtu);
2703 
2704 	if (get_link_state(mvdev))
2705 		ndev->config.status |= cpu_to_mlx5vdpa16(mvdev, VIRTIO_NET_S_LINK_UP);
2706 	else
2707 		ndev->config.status &= cpu_to_mlx5vdpa16(mvdev, ~VIRTIO_NET_S_LINK_UP);
2708 
2709 	if (add_config->mask & BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR)) {
2710 		memcpy(ndev->config.mac, add_config->net.mac, ETH_ALEN);
2711 	} else {
2712 		err = mlx5_query_nic_vport_mac_address(mdev, 0, 0, config->mac);
2713 		if (err)
2714 			goto err_mtu;
2715 	}
2716 
2717 	if (!is_zero_ether_addr(config->mac)) {
2718 		pfmdev = pci_get_drvdata(pci_physfn(mdev->pdev));
2719 		err = mlx5_mpfs_add_mac(pfmdev, config->mac);
2720 		if (err)
2721 			goto err_mtu;
2722 
2723 		ndev->mvdev.mlx_features |= BIT_ULL(VIRTIO_NET_F_MAC);
2724 	}
2725 
2726 	config->max_virtqueue_pairs = cpu_to_mlx5vdpa16(mvdev, max_vqs / 2);
2727 	mvdev->vdev.dma_dev = &mdev->pdev->dev;
2728 	err = mlx5_vdpa_alloc_resources(&ndev->mvdev);
2729 	if (err)
2730 		goto err_mpfs;
2731 
2732 	if (MLX5_CAP_GEN(mvdev->mdev, umem_uid_0)) {
2733 		err = mlx5_vdpa_create_mr(mvdev, NULL);
2734 		if (err)
2735 			goto err_res;
2736 	}
2737 
2738 	err = alloc_resources(ndev);
2739 	if (err)
2740 		goto err_mr;
2741 
2742 	ndev->cvq_ent.mvdev = mvdev;
2743 	INIT_WORK(&ndev->cvq_ent.work, mlx5_cvq_kick_handler);
2744 	mvdev->wq = create_singlethread_workqueue("mlx5_vdpa_wq");
2745 	if (!mvdev->wq) {
2746 		err = -ENOMEM;
2747 		goto err_res2;
2748 	}
2749 
2750 	ndev->nb.notifier_call = event_handler;
2751 	mlx5_notifier_register(mdev, &ndev->nb);
2752 	mvdev->vdev.mdev = &mgtdev->mgtdev;
2753 	err = _vdpa_register_device(&mvdev->vdev, max_vqs + 1);
2754 	if (err)
2755 		goto err_reg;
2756 
2757 	mgtdev->ndev = ndev;
2758 	return 0;
2759 
2760 err_reg:
2761 	destroy_workqueue(mvdev->wq);
2762 err_res2:
2763 	free_resources(ndev);
2764 err_mr:
2765 	mlx5_vdpa_destroy_mr(mvdev);
2766 err_res:
2767 	mlx5_vdpa_free_resources(&ndev->mvdev);
2768 err_mpfs:
2769 	if (!is_zero_ether_addr(config->mac))
2770 		mlx5_mpfs_del_mac(pfmdev, config->mac);
2771 err_mtu:
2772 	mutex_destroy(&ndev->reslock);
2773 err_alloc:
2774 	put_device(&mvdev->vdev.dev);
2775 	return err;
2776 }
2777 
2778 static void mlx5_vdpa_dev_del(struct vdpa_mgmt_dev *v_mdev, struct vdpa_device *dev)
2779 {
2780 	struct mlx5_vdpa_mgmtdev *mgtdev = container_of(v_mdev, struct mlx5_vdpa_mgmtdev, mgtdev);
2781 	struct mlx5_vdpa_dev *mvdev = to_mvdev(dev);
2782 	struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev);
2783 	struct workqueue_struct *wq;
2784 
2785 	mlx5_notifier_unregister(mvdev->mdev, &ndev->nb);
2786 	wq = mvdev->wq;
2787 	mvdev->wq = NULL;
2788 	destroy_workqueue(wq);
2789 	_vdpa_unregister_device(dev);
2790 	mgtdev->ndev = NULL;
2791 }
2792 
2793 static const struct vdpa_mgmtdev_ops mdev_ops = {
2794 	.dev_add = mlx5_vdpa_dev_add,
2795 	.dev_del = mlx5_vdpa_dev_del,
2796 };
2797 
2798 static struct virtio_device_id id_table[] = {
2799 	{ VIRTIO_ID_NET, VIRTIO_DEV_ANY_ID },
2800 	{ 0 },
2801 };
2802 
2803 static int mlx5v_probe(struct auxiliary_device *adev,
2804 		       const struct auxiliary_device_id *id)
2805 
2806 {
2807 	struct mlx5_adev *madev = container_of(adev, struct mlx5_adev, adev);
2808 	struct mlx5_core_dev *mdev = madev->mdev;
2809 	struct mlx5_vdpa_mgmtdev *mgtdev;
2810 	int err;
2811 
2812 	mgtdev = kzalloc(sizeof(*mgtdev), GFP_KERNEL);
2813 	if (!mgtdev)
2814 		return -ENOMEM;
2815 
2816 	mgtdev->mgtdev.ops = &mdev_ops;
2817 	mgtdev->mgtdev.device = mdev->device;
2818 	mgtdev->mgtdev.id_table = id_table;
2819 	mgtdev->mgtdev.config_attr_mask = BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MACADDR) |
2820 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MAX_VQP) |
2821 					  BIT_ULL(VDPA_ATTR_DEV_NET_CFG_MTU);
2822 	mgtdev->mgtdev.max_supported_vqs =
2823 		MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues) + 1;
2824 	mgtdev->mgtdev.supported_features = get_supported_features(mdev);
2825 	mgtdev->madev = madev;
2826 
2827 	err = vdpa_mgmtdev_register(&mgtdev->mgtdev);
2828 	if (err)
2829 		goto reg_err;
2830 
2831 	auxiliary_set_drvdata(adev, mgtdev);
2832 
2833 	return 0;
2834 
2835 reg_err:
2836 	kfree(mgtdev);
2837 	return err;
2838 }
2839 
2840 static void mlx5v_remove(struct auxiliary_device *adev)
2841 {
2842 	struct mlx5_vdpa_mgmtdev *mgtdev;
2843 
2844 	mgtdev = auxiliary_get_drvdata(adev);
2845 	vdpa_mgmtdev_unregister(&mgtdev->mgtdev);
2846 	kfree(mgtdev);
2847 }
2848 
2849 static const struct auxiliary_device_id mlx5v_id_table[] = {
2850 	{ .name = MLX5_ADEV_NAME ".vnet", },
2851 	{},
2852 };
2853 
2854 MODULE_DEVICE_TABLE(auxiliary, mlx5v_id_table);
2855 
2856 static struct auxiliary_driver mlx5v_driver = {
2857 	.name = "vnet",
2858 	.probe = mlx5v_probe,
2859 	.remove = mlx5v_remove,
2860 	.id_table = mlx5v_id_table,
2861 };
2862 
2863 module_auxiliary_driver(mlx5v_driver);
2864