/*
 * Copyright (c) 2016, Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include <linux/netdevice.h>
#include <net/bonding.h>
#include <linux/mlx5/driver.h>
#include <linux/mlx5/eswitch.h>
#include <linux/mlx5/vport.h>
#include "lib/devcom.h"
#include "mlx5_core.h"
#include "eswitch.h"
#include "esw/acl/ofld.h"
#include "lag.h"
#include "mp.h"
#include "mpesw.h"

enum {
	MLX5_LAG_EGRESS_PORT_1 = 1,
	MLX5_LAG_EGRESS_PORT_2,
};

/* General purpose, use for short periods of time.
 * Beware of lock dependencies (preferably, no locks should be acquired
 * under it).
 */
static DEFINE_SPINLOCK(lag_lock);

static int get_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
{
	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT;

	if (mode == MLX5_LAG_MODE_MPESW)
		return MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW;

	return MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY;
}

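/* Return a bitmask of the ports that currently carry traffic, based on
 * the bond state recorded in the tracker. Used to program the LAG
 * active_port field when the port selection flow table bypass
 * capability is present.
 */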
static u8 lag_active_port_bits(struct mlx5_lag *ldev)
{
	u8 enabled_ports[MLX5_MAX_PORTS] = {};
	u8 active_port = 0;
	int num_enabled;
	int idx;

	mlx5_infer_tx_enabled(&ldev->tracker, ldev->ports, enabled_ports,
			      &num_enabled);
	for (idx = 0; idx < num_enabled; idx++)
		active_port |= BIT_MASK(enabled_ports[idx]);

	return active_port;
}

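/* Build and execute the CREATE_LAG command: set the FDB selection mode,
 * the port selection mode and, depending on that mode, either the
 * queue-affinity port mapping or the active port bitmask.
 */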
static int mlx5_cmd_create_lag(struct mlx5_core_dev *dev, u8 *ports, int mode,
			       unsigned long flags)
{
	bool fdb_sel_mode = test_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE,
				     &flags);
	int port_sel_mode = get_port_sel_mode(mode, flags);
	u32 in[MLX5_ST_SZ_DW(create_lag_in)] = {};
	void *lag_ctx;

	lag_ctx = MLX5_ADDR_OF(create_lag_in, in, ctx);
	MLX5_SET(create_lag_in, in, opcode, MLX5_CMD_OP_CREATE_LAG);
	MLX5_SET(lagc, lag_ctx, fdb_selection_mode, fdb_sel_mode);

	switch (port_sel_mode) {
	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY:
		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
		MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);
		break;
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT:
		if (!MLX5_CAP_PORT_SELECTION(dev, port_select_flow_table_bypass))
			break;

		MLX5_SET(lagc, lag_ctx, active_port,
			 lag_active_port_bits(mlx5_lag_dev(dev)));
		break;
	default:
		break;
	}
	MLX5_SET(lagc, lag_ctx, port_select_mode, port_sel_mode);

	return mlx5_cmd_exec_in(dev, create_lag, in);
}

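/* Issue MODIFY_LAG to update the queue-affinity port mapping; here
 * field_select 0x1 goes together with the tx_remap_affinity fields.
 */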
static int mlx5_cmd_modify_lag(struct mlx5_core_dev *dev, u8 num_ports,
			       u8 *ports)
{
	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
	void *lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x1);

	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_1, ports[0]);
	MLX5_SET(lagc, lag_ctx, tx_remap_affinity_2, ports[1]);

	return mlx5_cmd_exec_in(dev, modify_lag, in);
}

int mlx5_cmd_create_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(create_vport_lag_in)] = {};

	MLX5_SET(create_vport_lag_in, in, opcode, MLX5_CMD_OP_CREATE_VPORT_LAG);

	return mlx5_cmd_exec_in(dev, create_vport_lag, in);
}
EXPORT_SYMBOL(mlx5_cmd_create_vport_lag);

int mlx5_cmd_destroy_vport_lag(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(destroy_vport_lag_in)] = {};

	MLX5_SET(destroy_vport_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_VPORT_LAG);

	return mlx5_cmd_exec_in(dev, destroy_vport_lag, in);
}
EXPORT_SYMBOL(mlx5_cmd_destroy_vport_lag);

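/* Collect the indices of ports that cannot carry traffic, i.e. ports
 * that are either not tx_enabled or whose link is down.
 */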
static void mlx5_infer_tx_disabled(struct lag_tracker *tracker, u8 num_ports,
				   u8 *ports, int *num_disabled)
{
	int i;

	*num_disabled = 0;
	for (i = 0; i < num_ports; i++) {
		if (!tracker->netdev_state[i].tx_enabled ||
		    !tracker->netdev_state[i].link_up)
			ports[(*num_disabled)++] = i;
	}
}

void mlx5_infer_tx_enabled(struct lag_tracker *tracker, u8 num_ports,
			   u8 *ports, int *num_enabled)
{
	int i;

	*num_enabled = 0;
	for (i = 0; i < num_ports; i++) {
		if (tracker->netdev_state[i].tx_enabled &&
		    tracker->netdev_state[i].link_up)
			ports[(*num_enabled)++] = i;
	}

	if (*num_enabled == 0)
		mlx5_infer_tx_disabled(tracker, num_ports, ports, num_enabled);
}

static void mlx5_lag_print_mapping(struct mlx5_core_dev *dev,
				   struct mlx5_lag *ldev,
				   struct lag_tracker *tracker,
				   unsigned long flags)
{
	char buf[MLX5_MAX_PORTS * 10 + 1] = {};
	u8 enabled_ports[MLX5_MAX_PORTS] = {};
	int written = 0;
	int num_enabled;
	int idx;
	int err;
	int i;
	int j;

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
		mlx5_infer_tx_enabled(tracker, ldev->ports, enabled_ports,
				      &num_enabled);
		for (i = 0; i < num_enabled; i++) {
			err = scnprintf(buf + written, 4, "%d, ", enabled_ports[i] + 1);
			if (err != 3)
				return;
			written += err;
		}
		buf[written - 2] = 0;
		mlx5_core_info(dev, "lag map active ports: %s\n", buf);
	} else {
		for (i = 0; i < ldev->ports; i++) {
			for (j = 0; j < ldev->buckets; j++) {
				idx = i * ldev->buckets + j;
				err = scnprintf(buf + written, 10,
						" port %d:%d", i + 1, ldev->v2p_map[idx]);
				if (err != 9)
					return;
				written += err;
			}
		}
		mlx5_core_info(dev, "lag map:%s\n", buf);
	}
}

static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr);
static void mlx5_do_bond_work(struct work_struct *work);

static void mlx5_ldev_free(struct kref *ref)
{
	struct mlx5_lag *ldev = container_of(ref, struct mlx5_lag, ref);

	if (ldev->nb.notifier_call)
		unregister_netdevice_notifier_net(&init_net, &ldev->nb);
	mlx5_lag_mp_cleanup(ldev);
	mlx5_lag_mpesw_cleanup(ldev);
	cancel_work_sync(&ldev->mpesw_work);
	destroy_workqueue(ldev->wq);
	mutex_destroy(&ldev->lock);
	kfree(ldev);
}

static void mlx5_ldev_put(struct mlx5_lag *ldev)
{
	kref_put(&ldev->ref, mlx5_ldev_free);
}

static void mlx5_ldev_get(struct mlx5_lag *ldev)
{
	kref_get(&ldev->ref);
}

static struct mlx5_lag *mlx5_lag_dev_alloc(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	int err;

	ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
	if (!ldev)
		return NULL;

	ldev->wq = create_singlethread_workqueue("mlx5_lag");
	if (!ldev->wq) {
		kfree(ldev);
		return NULL;
	}

	kref_init(&ldev->ref);
	mutex_init(&ldev->lock);
	INIT_DELAYED_WORK(&ldev->bond_work, mlx5_do_bond_work);

	ldev->nb.notifier_call = mlx5_lag_netdev_event;
	if (register_netdevice_notifier_net(&init_net, &ldev->nb)) {
		ldev->nb.notifier_call = NULL;
		mlx5_core_err(dev, "Failed to register LAG netdev notifier\n");
	}
	ldev->mode = MLX5_LAG_MODE_NONE;

	err = mlx5_lag_mp_init(ldev);
	if (err)
		mlx5_core_err(dev, "Failed to init multipath lag err=%d\n",
			      err);

	mlx5_lag_mpesw_init(ldev);
	ldev->ports = MLX5_CAP_GEN(dev, num_lag_ports);
	ldev->buckets = 1;

	return ldev;
}

int mlx5_lag_dev_get_netdev_idx(struct mlx5_lag *ldev,
				struct net_device *ndev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].netdev == ndev)
			return i;

	return -ENOENT;
}

static bool __mlx5_lag_is_roce(struct mlx5_lag *ldev)
{
	return ldev->mode == MLX5_LAG_MODE_ROCE;
}

static bool __mlx5_lag_is_sriov(struct mlx5_lag *ldev)
{
	return ldev->mode == MLX5_LAG_MODE_SRIOV;
}

/* Create a mapping between steering slots and active ports.
 * As we have ldev->buckets slots per port, first assume the native
 * mapping should be used.
 * If there are ports that are disabled, fill the relevant slots with a
 * mapping that points to active ports.
 */
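/* Example (illustrative): with 4 ports, 2 buckets each and port 3 down,
 * the native map 1 1 2 2 3 3 4 4 becomes 1 1 2 2 x x 4 4, where each x
 * is a randomly picked active port (1, 2 or 4).
 */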
static void mlx5_infer_tx_affinity_mapping(struct lag_tracker *tracker,
					   u8 num_ports,
					   u8 buckets,
					   u8 *ports)
{
	int disabled[MLX5_MAX_PORTS] = {};
	int enabled[MLX5_MAX_PORTS] = {};
	int disabled_ports_num = 0;
	int enabled_ports_num = 0;
	int idx;
	u32 rand;
	int i;
	int j;

	for (i = 0; i < num_ports; i++) {
		if (tracker->netdev_state[i].tx_enabled &&
		    tracker->netdev_state[i].link_up)
			enabled[enabled_ports_num++] = i;
		else
			disabled[disabled_ports_num++] = i;
	}

	/* Use native mapping by default where each port's buckets
	 * point to the native port: 1 1 1 .. 1 2 2 2 ... 2 3 3 3 ... 3 etc
	 */
	for (i = 0; i < num_ports; i++)
		for (j = 0; j < buckets; j++) {
			idx = i * buckets + j;
			ports[idx] = MLX5_LAG_EGRESS_PORT_1 + i;
		}

	/* If all ports are disabled/enabled keep native mapping */
	if (enabled_ports_num == num_ports ||
	    disabled_ports_num == num_ports)
		return;

	/* Go over the disabled ports and for each assign a random active port */
	for (i = 0; i < disabled_ports_num; i++) {
		for (j = 0; j < buckets; j++) {
			get_random_bytes(&rand, 4);
			ports[disabled[i] * buckets + j] = enabled[rand % enabled_ports_num] + 1;
		}
	}
}

static bool mlx5_lag_has_drop_rule(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].has_drop)
			return true;
	return false;
}

static void mlx5_lag_drop_rule_cleanup(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].has_drop)
			continue;

		mlx5_esw_acl_ingress_vport_drop_rule_destroy(ldev->pf[i].dev->priv.eswitch,
							     MLX5_VPORT_UPLINK);
		ldev->pf[i].has_drop = false;
	}
}

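/* Re-create ingress drop rules on the uplink vports of the ports that
 * the bond reports as inactive, so traffic received on an inactive
 * port is dropped. Existing rules are removed first.
 */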
static void mlx5_lag_drop_rule_setup(struct mlx5_lag *ldev,
				     struct lag_tracker *tracker)
{
	u8 disabled_ports[MLX5_MAX_PORTS] = {};
	struct mlx5_core_dev *dev;
	int disabled_index;
	int num_disabled;
	int err;
	int i;

	/* First delete the current drop rule so there won't be any dropped
	 * packets
	 */
	mlx5_lag_drop_rule_cleanup(ldev);

	if (!ldev->tracker.has_inactive)
		return;

	mlx5_infer_tx_disabled(tracker, ldev->ports, disabled_ports, &num_disabled);

	for (i = 0; i < num_disabled; i++) {
		disabled_index = disabled_ports[i];
		dev = ldev->pf[disabled_index].dev;
		err = mlx5_esw_acl_ingress_vport_drop_rule_create(dev->priv.eswitch,
								  MLX5_VPORT_UPLINK);
		if (!err)
			ldev->pf[disabled_index].has_drop = true;
		else
			mlx5_core_err(dev,
				      "Failed to create lag drop rule, error: %d", err);
	}
}

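/* Issue MODIFY_LAG to update only the active port bitmask; here
 * field_select 0x2 goes together with the active_port field. Used by
 * the hash-based port selection path.
 */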
static int mlx5_cmd_modify_active_port(struct mlx5_core_dev *dev, u8 ports)
{
	u32 in[MLX5_ST_SZ_DW(modify_lag_in)] = {};
	void *lag_ctx;

	lag_ctx = MLX5_ADDR_OF(modify_lag_in, in, ctx);

	MLX5_SET(modify_lag_in, in, opcode, MLX5_CMD_OP_MODIFY_LAG);
	MLX5_SET(modify_lag_in, in, field_select, 0x2);

	MLX5_SET(lagc, lag_ctx, active_port, ports);

	return mlx5_cmd_exec_in(dev, modify_lag, in);
}

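/* Apply a new port mapping: in hash-based mode update the port
 * selection flow table (and the active ports, if the bypass capability
 * is set), otherwise fall back to a queue-affinity MODIFY_LAG.
 */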
static int _mlx5_modify_lag(struct mlx5_lag *ldev, u8 *ports)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	u8 active_ports;
	int ret;

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags)) {
		ret = mlx5_lag_port_sel_modify(ldev, ports);
		if (ret ||
		    !MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table_bypass))
			return ret;

		active_ports = lag_active_port_bits(ldev);

		return mlx5_cmd_modify_active_port(dev0, active_ports);
	}
	return mlx5_cmd_modify_lag(dev0, ldev->ports, ports);
}

void mlx5_modify_lag(struct mlx5_lag *ldev,
		     struct lag_tracker *tracker)
{
	u8 ports[MLX5_MAX_PORTS * MLX5_LAG_MAX_HASH_BUCKETS] = {};
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	int idx;
	int err;
	int i;
	int j;

	mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ports);

	for (i = 0; i < ldev->ports; i++) {
		for (j = 0; j < ldev->buckets; j++) {
			idx = i * ldev->buckets + j;
			if (ports[idx] == ldev->v2p_map[idx])
				continue;
			err = _mlx5_modify_lag(ldev, ports);
			if (err) {
				mlx5_core_err(dev0,
					      "Failed to modify LAG (%d)\n",
					      err);
				return;
			}
			memcpy(ldev->v2p_map, ports, sizeof(ports));

			mlx5_lag_print_mapping(dev0, ldev, tracker,
					       ldev->mode_flags);
			break;
		}
	}

	if (tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
	    !(ldev->mode == MLX5_LAG_MODE_ROCE))
		mlx5_lag_drop_rule_setup(ldev, tracker);
}

static int mlx5_lag_set_port_sel_mode_roce(struct mlx5_lag *ldev,
					   unsigned long *flags)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;

	if (!MLX5_CAP_PORT_SELECTION(dev0, port_select_flow_table)) {
		if (ldev->ports > 2)
			return -EINVAL;
		return 0;
	}

	if (ldev->ports > 2)
		ldev->buckets = MLX5_LAG_MAX_HASH_BUCKETS;

	set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);

	return 0;
}

static void mlx5_lag_set_port_sel_mode_offloads(struct mlx5_lag *ldev,
						struct lag_tracker *tracker,
						enum mlx5_lag_mode mode,
						unsigned long *flags)
{
	struct lag_func *dev0 = &ldev->pf[MLX5_LAG_P1];

	if (mode == MLX5_LAG_MODE_MPESW)
		return;

	if (MLX5_CAP_PORT_SELECTION(dev0->dev, port_select_flow_table) &&
	    tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH)
		set_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, flags);
}

static int mlx5_lag_set_flags(struct mlx5_lag *ldev, enum mlx5_lag_mode mode,
			      struct lag_tracker *tracker, bool shared_fdb,
			      unsigned long *flags)
{
	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;

	*flags = 0;
	if (shared_fdb) {
		set_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, flags);
		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);
	}

	if (mode == MLX5_LAG_MODE_MPESW)
		set_bit(MLX5_LAG_MODE_FLAG_FDB_SEL_MODE_NATIVE, flags);

	if (roce_lag)
		return mlx5_lag_set_port_sel_mode_roce(ldev, flags);

	mlx5_lag_set_port_sel_mode_offloads(ldev, tracker, mode, flags);
	return 0;
}

char *mlx5_get_str_port_sel_mode(enum mlx5_lag_mode mode, unsigned long flags)
{
	int port_sel_mode = get_port_sel_mode(mode, flags);

	switch (port_sel_mode) {
	case MLX5_LAG_PORT_SELECT_MODE_QUEUE_AFFINITY: return "queue_affinity";
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_FT: return "hash";
	case MLX5_LAG_PORT_SELECT_MODE_PORT_SELECT_MPESW: return "mpesw";
	default: return "invalid";
	}
}

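/* Create the hardware LAG object and, for shared FDB, point both
 * eswitches at a single FDB. If shared FDB setup fails, the newly
 * created LAG is destroyed again.
 */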
static int mlx5_create_lag(struct mlx5_lag *ldev,
			   struct lag_tracker *tracker,
			   enum mlx5_lag_mode mode,
			   unsigned long flags)
{
	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
	int err;

	if (tracker)
		mlx5_lag_print_mapping(dev0, ldev, tracker, flags);
	mlx5_core_info(dev0, "shared_fdb:%d mode:%s\n",
		       shared_fdb, mlx5_get_str_port_sel_mode(mode, flags));

	err = mlx5_cmd_create_lag(dev0, ldev->v2p_map, mode, flags);
	if (err) {
		mlx5_core_err(dev0,
			      "Failed to create LAG (%d)\n",
			      err);
		return err;
	}

	if (shared_fdb) {
		err = mlx5_eswitch_offloads_config_single_fdb(dev0->priv.eswitch,
							      dev1->priv.eswitch);
		if (err)
			mlx5_core_err(dev0, "Can't enable single FDB mode\n");
		else
			mlx5_core_info(dev0, "Operation mode is single FDB\n");
	}

	if (err) {
		MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
		if (mlx5_cmd_exec_in(dev0, destroy_lag, in))
			mlx5_core_err(dev0,
				      "Failed to deactivate RoCE LAG; driver restart required\n");
	}

	return err;
}

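/* Activate LAG in the requested mode: compute the port mapping, create
 * the port selection flow table when hash-based selection is used,
 * create the LAG object and, for active-backup bonds, install drop
 * rules on inactive ports.
 */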
int mlx5_activate_lag(struct mlx5_lag *ldev,
		      struct lag_tracker *tracker,
		      enum mlx5_lag_mode mode,
		      bool shared_fdb)
{
	bool roce_lag = mode == MLX5_LAG_MODE_ROCE;
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	unsigned long flags = 0;
	int err;

	err = mlx5_lag_set_flags(ldev, mode, tracker, shared_fdb, &flags);
	if (err)
		return err;

	if (mode != MLX5_LAG_MODE_MPESW) {
		mlx5_infer_tx_affinity_mapping(tracker, ldev->ports, ldev->buckets, ldev->v2p_map);
		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags)) {
			err = mlx5_lag_port_sel_create(ldev, tracker->hash_type,
						       ldev->v2p_map);
			if (err) {
				mlx5_core_err(dev0,
					      "Failed to create LAG port selection(%d)\n",
					      err);
				return err;
			}
		}
	}

	err = mlx5_create_lag(ldev, tracker, mode, flags);
	if (err) {
		if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
			mlx5_lag_port_sel_destroy(ldev);
		if (roce_lag)
			mlx5_core_err(dev0,
				      "Failed to activate RoCE LAG\n");
		else
			mlx5_core_err(dev0,
				      "Failed to activate VF LAG\n"
				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
		return err;
	}

	if (tracker && tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP &&
	    !roce_lag)
		mlx5_lag_drop_rule_setup(ldev, tracker);

	ldev->mode = mode;
	ldev->mode_flags = flags;
	return 0;
}

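/* Tear LAG down: restore separate FDBs if shared FDB was used, destroy
 * the LAG object, and clean up the port selection flow table and any
 * drop rules.
 */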
static int mlx5_deactivate_lag(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	u32 in[MLX5_ST_SZ_DW(destroy_lag_in)] = {};
	bool roce_lag = __mlx5_lag_is_roce(ldev);
	unsigned long flags = ldev->mode_flags;
	int err;

	ldev->mode = MLX5_LAG_MODE_NONE;
	ldev->mode_flags = 0;
	mlx5_lag_mp_reset(ldev);

	if (test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags)) {
		mlx5_eswitch_offloads_destroy_single_fdb(dev0->priv.eswitch,
							 dev1->priv.eswitch);
		clear_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &flags);
	}

	MLX5_SET(destroy_lag_in, in, opcode, MLX5_CMD_OP_DESTROY_LAG);
	err = mlx5_cmd_exec_in(dev0, destroy_lag, in);
	if (err) {
		if (roce_lag) {
			mlx5_core_err(dev0,
				      "Failed to deactivate RoCE LAG; driver restart required\n");
		} else {
			mlx5_core_err(dev0,
				      "Failed to deactivate VF LAG; driver restart required\n"
				      "Make sure all VFs are unbound prior to VF LAG activation or deactivation\n");
		}
		return err;
	}

	if (test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &flags))
		mlx5_lag_port_sel_destroy(ldev);
	if (mlx5_lag_has_drop_rule(ldev))
		mlx5_lag_drop_rule_cleanup(ldev);

	return 0;
}

#define MLX5_LAG_OFFLOADS_SUPPORTED_PORTS 2
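/* LAG can only be activated when all PFs are present and their eswitch
 * configuration is compatible: the same eswitch mode everywhere, and
 * offloads mode limited to MLX5_LAG_OFFLOADS_SUPPORTED_PORTS ports.
 */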
static bool mlx5_lag_check_prereq(struct mlx5_lag *ldev)
{
#ifdef CONFIG_MLX5_ESWITCH
	struct mlx5_core_dev *dev;
	u8 mode;
#endif
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (!ldev->pf[i].dev)
			return false;

#ifdef CONFIG_MLX5_ESWITCH
	dev = ldev->pf[MLX5_LAG_P1].dev;
	if ((mlx5_sriov_is_enabled(dev)) && !is_mdev_switchdev_mode(dev))
		return false;

	mode = mlx5_eswitch_mode(dev);
	for (i = 0; i < ldev->ports; i++)
		if (mlx5_eswitch_mode(ldev->pf[i].dev) != mode)
			return false;

	if (mode == MLX5_ESWITCH_OFFLOADS && ldev->ports != MLX5_LAG_OFFLOADS_SUPPORTED_PORTS)
		return false;
#else
	for (i = 0; i < ldev->ports; i++)
		if (mlx5_sriov_is_enabled(ldev->pf[i].dev))
			return false;
#endif
	return true;
}

static void mlx5_lag_add_devices(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].dev)
			continue;

		if (ldev->pf[i].dev->priv.flags &
		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
			continue;

		ldev->pf[i].dev->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
	}
}

static void mlx5_lag_remove_devices(struct mlx5_lag *ldev)
{
	int i;

	for (i = 0; i < ldev->ports; i++) {
		if (!ldev->pf[i].dev)
			continue;

		if (ldev->pf[i].dev->priv.flags &
		    MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)
			continue;

		ldev->pf[i].dev->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
		mlx5_rescan_drivers_locked(ldev->pf[i].dev);
	}
}

void mlx5_disable_lag(struct mlx5_lag *ldev)
{
	bool shared_fdb = test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	bool roce_lag;
	int err;
	int i;

	roce_lag = __mlx5_lag_is_roce(ldev);

	if (shared_fdb) {
		mlx5_lag_remove_devices(ldev);
	} else if (roce_lag) {
		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV)) {
			dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
		}
		for (i = 1; i < ldev->ports; i++)
			mlx5_nic_vport_disable_roce(ldev->pf[i].dev);
	}

	err = mlx5_deactivate_lag(ldev);
	if (err)
		return;

	if (shared_fdb || roce_lag)
		mlx5_lag_add_devices(ldev);

	if (shared_fdb) {
		if (!(dev0->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
			mlx5_eswitch_reload_reps(dev0->priv.eswitch);
		if (!(dev1->priv.flags & MLX5_PRIV_FLAGS_DISABLE_ALL_ADEV))
			mlx5_eswitch_reload_reps(dev1->priv.eswitch);
	}
}

bool mlx5_shared_fdb_supported(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;

	if (is_mdev_switchdev_mode(dev0) &&
	    is_mdev_switchdev_mode(dev1) &&
	    mlx5_eswitch_vport_match_metadata_enabled(dev0->priv.eswitch) &&
	    mlx5_eswitch_vport_match_metadata_enabled(dev1->priv.eswitch) &&
	    mlx5_devcom_is_paired(dev0->priv.devcom,
				  MLX5_DEVCOM_ESW_OFFLOADS) &&
	    MLX5_CAP_GEN(dev1, lag_native_fdb_selection) &&
	    MLX5_CAP_ESW(dev1, root_ft_on_other_esw) &&
	    MLX5_CAP_ESW(dev0, esw_shared_ingress_acl))
		return true;

	return false;
}

static bool mlx5_lag_is_roce_lag(struct mlx5_lag *ldev)
{
	bool roce_lag = true;
	int i;

	for (i = 0; i < ldev->ports; i++)
		roce_lag = roce_lag && !mlx5_sriov_is_enabled(ldev->pf[i].dev);

#ifdef CONFIG_MLX5_ESWITCH
	for (i = 0; i < ldev->ports; i++)
		roce_lag = roce_lag && is_mdev_legacy_mode(ldev->pf[i].dev);
#endif

	return roce_lag;
}

static bool mlx5_lag_should_modify_lag(struct mlx5_lag *ldev, bool do_bond)
{
	return do_bond && __mlx5_lag_is_active(ldev) &&
	       ldev->mode != MLX5_LAG_MODE_MPESW;
}

static bool mlx5_lag_should_disable_lag(struct mlx5_lag *ldev, bool do_bond)
{
	return !do_bond && __mlx5_lag_is_active(ldev) &&
	       ldev->mode != MLX5_LAG_MODE_MPESW;
}

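/* Central bonding state machine, run from the bond workqueue: decide
 * whether LAG should be (de)activated or just have its port mapping
 * updated, based on the tracked netdev state.
 */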
static void mlx5_do_bond(struct mlx5_lag *ldev)
{
	struct mlx5_core_dev *dev0 = ldev->pf[MLX5_LAG_P1].dev;
	struct mlx5_core_dev *dev1 = ldev->pf[MLX5_LAG_P2].dev;
	struct lag_tracker tracker = { };
	bool do_bond, roce_lag;
	int err;
	int i;

	if (!mlx5_lag_is_ready(ldev)) {
		do_bond = false;
	} else {
		/* VF LAG is in multipath mode, ignore bond change requests */
		if (mlx5_lag_is_multipath(dev0))
			return;

		tracker = ldev->tracker;

		do_bond = tracker.is_bonded && mlx5_lag_check_prereq(ldev);
	}

	if (do_bond && !__mlx5_lag_is_active(ldev)) {
		bool shared_fdb = mlx5_shared_fdb_supported(ldev);

		roce_lag = mlx5_lag_is_roce_lag(ldev);

		if (shared_fdb || roce_lag)
			mlx5_lag_remove_devices(ldev);

		err = mlx5_activate_lag(ldev, &tracker,
					roce_lag ? MLX5_LAG_MODE_ROCE :
						   MLX5_LAG_MODE_SRIOV,
					shared_fdb);
		if (err) {
			if (shared_fdb || roce_lag)
				mlx5_lag_add_devices(ldev);

			return;
		} else if (roce_lag) {
			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);
			for (i = 1; i < ldev->ports; i++)
				mlx5_nic_vport_enable_roce(ldev->pf[i].dev);
		} else if (shared_fdb) {
			dev0->priv.flags &= ~MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
			mlx5_rescan_drivers_locked(dev0);

			err = mlx5_eswitch_reload_reps(dev0->priv.eswitch);
			if (!err)
				err = mlx5_eswitch_reload_reps(dev1->priv.eswitch);

			if (err) {
				dev0->priv.flags |= MLX5_PRIV_FLAGS_DISABLE_IB_ADEV;
				mlx5_rescan_drivers_locked(dev0);
				mlx5_deactivate_lag(ldev);
				mlx5_lag_add_devices(ldev);
				mlx5_eswitch_reload_reps(dev0->priv.eswitch);
				mlx5_eswitch_reload_reps(dev1->priv.eswitch);
				mlx5_core_err(dev0, "Failed to enable lag\n");
				return;
			}
		}
	} else if (mlx5_lag_should_modify_lag(ldev, do_bond)) {
		mlx5_modify_lag(ldev, &tracker);
	} else if (mlx5_lag_should_disable_lag(ldev, do_bond)) {
		mlx5_disable_lag(ldev);
	}
}

static void mlx5_queue_bond_work(struct mlx5_lag *ldev, unsigned long delay)
{
	queue_delayed_work(ldev->wq, &ldev->bond_work, delay);
}

static void mlx5_do_bond_work(struct work_struct *work)
{
	struct delayed_work *delayed_work = to_delayed_work(work);
	struct mlx5_lag *ldev = container_of(delayed_work, struct mlx5_lag,
					     bond_work);
	int status;

	status = mlx5_dev_list_trylock();
	if (!status) {
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		mlx5_dev_list_unlock();
		mlx5_queue_bond_work(ldev, HZ);
		return;
	}

	mlx5_do_bond(ldev);
	mutex_unlock(&ldev->lock);
	mlx5_dev_list_unlock();
}

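/* Handle NETDEV_CHANGEUPPER on a bond master: work out how many of our
 * netdevs are enslaved to it and whether the bond mode is supported,
 * and update the tracker accordingly. Returns non-zero if the tracked
 * bonding state changed.
 */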
static int mlx5_handle_changeupper_event(struct mlx5_lag *ldev,
					 struct lag_tracker *tracker,
					 struct netdev_notifier_changeupper_info *info)
{
	struct net_device *upper = info->upper_dev, *ndev_tmp;
	struct netdev_lag_upper_info *lag_upper_info = NULL;
	bool is_bonded, is_in_lag, mode_supported;
	bool has_inactive = 0;
	struct slave *slave;
	u8 bond_status = 0;
	int num_slaves = 0;
	int changed = 0;
	int idx;

	if (!netif_is_lag_master(upper))
		return 0;

	if (info->linking)
		lag_upper_info = info->upper_info;

	/* The event may still be of interest if the slave does not belong to
	 * us, but is enslaved to a master which has one or more of our netdevs
	 * as slaves (e.g., if a new slave is added to a master that bonds two
	 * of our netdevs, we should unbond).
	 */
	rcu_read_lock();
	for_each_netdev_in_bond_rcu(upper, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx >= 0) {
			slave = bond_slave_get_rcu(ndev_tmp);
			if (slave)
				has_inactive |= bond_is_slave_inactive(slave);
			bond_status |= (1 << idx);
		}

		num_slaves++;
	}
	rcu_read_unlock();

	/* None of this lagdev's netdevs are slaves of this master. */
	if (!(bond_status & GENMASK(ldev->ports - 1, 0)))
		return 0;

	if (lag_upper_info) {
		tracker->tx_type = lag_upper_info->tx_type;
		tracker->hash_type = lag_upper_info->hash_type;
	}

	tracker->has_inactive = has_inactive;
	/* Determine bonding status:
	 * A device is considered bonded if both its physical ports are slaves
	 * of the same lag master, and only them.
	 */
	is_in_lag = num_slaves == ldev->ports &&
		bond_status == GENMASK(ldev->ports - 1, 0);

	/* Lag mode must be activebackup or hash. */
	mode_supported = tracker->tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP ||
			 tracker->tx_type == NETDEV_LAG_TX_TYPE_HASH;

	is_bonded = is_in_lag && mode_supported;
	if (tracker->is_bonded != is_bonded) {
		tracker->is_bonded = is_bonded;
		changed = 1;
	}

	if (!is_in_lag)
		return changed;

	if (!mlx5_lag_is_ready(ldev))
		NL_SET_ERR_MSG_MOD(info->info.extack,
				   "Can't activate LAG offload, PF is configured with more than 64 VFs");
	else if (!mode_supported)
		NL_SET_ERR_MSG_MOD(info->info.extack,
				   "Can't activate LAG offload, TX type isn't supported");

	return changed;
}

static int mlx5_handle_changelowerstate_event(struct mlx5_lag *ldev,
					      struct lag_tracker *tracker,
					      struct net_device *ndev,
					      struct netdev_notifier_changelowerstate_info *info)
{
	struct netdev_lag_lower_state_info *lag_lower_info;
	int idx;

	if (!netif_is_lag_port(ndev))
		return 0;

	idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev);
	if (idx < 0)
		return 0;

	/* This information is used to determine virtual to physical
	 * port mapping.
	 */
	lag_lower_info = info->lower_state_info;
	if (!lag_lower_info)
		return 0;

	tracker->netdev_state[idx] = *lag_lower_info;

	return 1;
}

static int mlx5_handle_changeinfodata_event(struct mlx5_lag *ldev,
					    struct lag_tracker *tracker,
					    struct net_device *ndev)
{
	struct net_device *ndev_tmp;
	struct slave *slave;
	bool has_inactive = 0;
	int idx;

	if (!netif_is_lag_master(ndev))
		return 0;

	rcu_read_lock();
	for_each_netdev_in_bond_rcu(ndev, ndev_tmp) {
		idx = mlx5_lag_dev_get_netdev_idx(ldev, ndev_tmp);
		if (idx < 0)
			continue;

		slave = bond_slave_get_rcu(ndev_tmp);
		if (slave)
			has_inactive |= bond_is_slave_inactive(slave);
	}
	rcu_read_unlock();

	if (tracker->has_inactive == has_inactive)
		return 0;

	tracker->has_inactive = has_inactive;

	return 1;
}

/* this handler is always registered to netdev events */
static int mlx5_lag_netdev_event(struct notifier_block *this,
				 unsigned long event, void *ptr)
{
	struct net_device *ndev = netdev_notifier_info_to_dev(ptr);
	struct lag_tracker tracker;
	struct mlx5_lag *ldev;
	int changed = 0;

	if (event != NETDEV_CHANGEUPPER &&
	    event != NETDEV_CHANGELOWERSTATE &&
	    event != NETDEV_CHANGEINFODATA)
		return NOTIFY_DONE;

	ldev    = container_of(this, struct mlx5_lag, nb);

	tracker = ldev->tracker;

	switch (event) {
	case NETDEV_CHANGEUPPER:
		changed = mlx5_handle_changeupper_event(ldev, &tracker, ptr);
		break;
	case NETDEV_CHANGELOWERSTATE:
		changed = mlx5_handle_changelowerstate_event(ldev, &tracker,
							     ndev, ptr);
		break;
	case NETDEV_CHANGEINFODATA:
		changed = mlx5_handle_changeinfodata_event(ldev, &tracker, ndev);
		break;
	}

	ldev->tracker = tracker;

	if (changed)
		mlx5_queue_bond_work(ldev, 0);

	return NOTIFY_DONE;
}

static void mlx5_ldev_add_netdev(struct mlx5_lag *ldev,
				 struct mlx5_core_dev *dev,
				 struct net_device *netdev)
{
	unsigned int fn = mlx5_get_dev_index(dev);
	unsigned long flags;

	if (fn >= ldev->ports)
		return;

	spin_lock_irqsave(&lag_lock, flags);
	ldev->pf[fn].netdev = netdev;
	ldev->tracker.netdev_state[fn].link_up = 0;
	ldev->tracker.netdev_state[fn].tx_enabled = 0;
	spin_unlock_irqrestore(&lag_lock, flags);
}

static void mlx5_ldev_remove_netdev(struct mlx5_lag *ldev,
				    struct net_device *netdev)
{
	unsigned long flags;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	for (i = 0; i < ldev->ports; i++) {
		if (ldev->pf[i].netdev == netdev) {
			ldev->pf[i].netdev = NULL;
			break;
		}
	}
	spin_unlock_irqrestore(&lag_lock, flags);
}

static void mlx5_ldev_add_mdev(struct mlx5_lag *ldev,
			       struct mlx5_core_dev *dev)
{
	unsigned int fn = mlx5_get_dev_index(dev);

	if (fn >= ldev->ports)
		return;

	ldev->pf[fn].dev = dev;
	dev->priv.lag = ldev;
}

static void mlx5_ldev_remove_mdev(struct mlx5_lag *ldev,
				  struct mlx5_core_dev *dev)
{
	int i;

	for (i = 0; i < ldev->ports; i++)
		if (ldev->pf[i].dev == dev)
			break;

	if (i == ldev->ports)
		return;

	ldev->pf[i].dev = NULL;
	dev->priv.lag = NULL;
}

/* Must be called with intf_mutex held */
static int __mlx5_lag_dev_add_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev = NULL;
	struct mlx5_core_dev *tmp_dev;

	tmp_dev = mlx5_get_next_phys_dev_lag(dev);
	if (tmp_dev)
		ldev = tmp_dev->priv.lag;

	if (!ldev) {
		ldev = mlx5_lag_dev_alloc(dev);
		if (!ldev) {
			mlx5_core_err(dev, "Failed to alloc lag dev\n");
			return 0;
		}
		mlx5_ldev_add_mdev(ldev, dev);
		return 0;
	}

	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		return -EAGAIN;
	}
	mlx5_ldev_get(ldev);
	mlx5_ldev_add_mdev(ldev, dev);
	mutex_unlock(&ldev->lock);

	return 0;
}

void mlx5_lag_remove_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	/* mdev is being removed, might as well remove debugfs
	 * as early as possible.
	 */
	mlx5_ldev_remove_debugfs(dev->priv.dbg.lag_debugfs);
recheck:
	mutex_lock(&ldev->lock);
	if (ldev->mode_changes_in_progress) {
		mutex_unlock(&ldev->lock);
		msleep(100);
		goto recheck;
	}
	mlx5_ldev_remove_mdev(ldev, dev);
	mutex_unlock(&ldev->lock);
	mlx5_ldev_put(ldev);
}

void mlx5_lag_add_mdev(struct mlx5_core_dev *dev)
{
	int err;

	if (!MLX5_CAP_GEN(dev, vport_group_manager) ||
	    !MLX5_CAP_GEN(dev, lag_master) ||
	    (MLX5_CAP_GEN(dev, num_lag_ports) > MLX5_MAX_PORTS ||
	     MLX5_CAP_GEN(dev, num_lag_ports) <= 1))
		return;

recheck:
	mlx5_dev_list_lock();
	err = __mlx5_lag_dev_add_mdev(dev);
	mlx5_dev_list_unlock();

	if (err) {
		msleep(100);
		goto recheck;
	}
	mlx5_ldev_add_debugfs(dev);
}

void mlx5_lag_remove_netdev(struct mlx5_core_dev *dev,
			    struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	bool lag_is_active;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	mlx5_ldev_remove_netdev(ldev, netdev);
	clear_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);

	lag_is_active = __mlx5_lag_is_active(ldev);
	mutex_unlock(&ldev->lock);

	if (lag_is_active)
		mlx5_queue_bond_work(ldev, 0);
}

void mlx5_lag_add_netdev(struct mlx5_core_dev *dev,
			 struct net_device *netdev)
{
	struct mlx5_lag *ldev;
	int i;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	mlx5_ldev_add_netdev(ldev, dev, netdev);

	for (i = 0; i < ldev->ports; i++)
		if (!ldev->pf[i].netdev)
			break;

	if (i >= ldev->ports)
		set_bit(MLX5_LAG_FLAG_NDEVS_READY, &ldev->state_flags);
	mutex_unlock(&ldev->lock);
	mlx5_queue_bond_work(ldev, 0);
}

bool mlx5_lag_is_roce(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_roce(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_roce);

bool mlx5_lag_is_active(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_active(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_active);

bool mlx5_lag_mode_is_hash(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res = 0;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (ldev)
		res = test_bit(MLX5_LAG_MODE_FLAG_HASH_BASED, &ldev->mode_flags);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_mode_is_hash);

bool mlx5_lag_is_master(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res = ldev && __mlx5_lag_is_active(ldev) &&
		dev == ldev->pf[MLX5_LAG_P1].dev;
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_master);

bool mlx5_lag_is_sriov(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res  = ldev && __mlx5_lag_is_sriov(ldev);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_sriov);

bool mlx5_lag_is_shared_fdb(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	bool res;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	res = ldev && __mlx5_lag_is_sriov(ldev) &&
	      test_bit(MLX5_LAG_MODE_FLAG_SHARED_FDB, &ldev->mode_flags);
	spin_unlock_irqrestore(&lag_lock, flags);

	return res;
}
EXPORT_SYMBOL(mlx5_lag_is_shared_fdb);

void mlx5_lag_disable_change(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mlx5_dev_list_lock();
	mutex_lock(&ldev->lock);

	ldev->mode_changes_in_progress++;
	if (__mlx5_lag_is_active(ldev))
		mlx5_disable_lag(ldev);

	mutex_unlock(&ldev->lock);
	mlx5_dev_list_unlock();
}

void mlx5_lag_enable_change(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return;

	mutex_lock(&ldev->lock);
	ldev->mode_changes_in_progress--;
	mutex_unlock(&ldev->lock);
	mlx5_queue_bond_work(ldev, 0);
}

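/* Return (with a reference held) the netdev currently carrying RoCE
 * traffic: the tx-enabled port for active-backup bonds, otherwise the
 * first port's netdev.
 */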
struct net_device *mlx5_lag_get_roce_netdev(struct mlx5_core_dev *dev)
{
	struct net_device *ndev = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);

	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;

	if (ldev->tracker.tx_type == NETDEV_LAG_TX_TYPE_ACTIVEBACKUP) {
		for (i = 0; i < ldev->ports; i++)
			if (ldev->tracker.netdev_state[i].tx_enabled)
				ndev = ldev->pf[i].netdev;
		if (!ndev)
			ndev = ldev->pf[ldev->ports - 1].netdev;
	} else {
		ndev = ldev->pf[MLX5_LAG_P1].netdev;
	}
	if (ndev)
		dev_hold(ndev);

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);

	return ndev;
}
EXPORT_SYMBOL(mlx5_lag_get_roce_netdev);

u8 mlx5_lag_get_slave_port(struct mlx5_core_dev *dev,
			   struct net_device *slave)
{
	struct mlx5_lag *ldev;
	unsigned long flags;
	u8 port = 0;
	int i;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (!(ldev && __mlx5_lag_is_roce(ldev)))
		goto unlock;

	for (i = 0; i < ldev->ports; i++) {
		if (ldev->pf[i].netdev == slave) {
			port = i;
			break;
		}
	}

	port = ldev->v2p_map[port * ldev->buckets];

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);
	return port;
}
EXPORT_SYMBOL(mlx5_lag_get_slave_port);

u8 mlx5_lag_get_num_ports(struct mlx5_core_dev *dev)
{
	struct mlx5_lag *ldev;

	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		return 0;

	return ldev->ports;
}
EXPORT_SYMBOL(mlx5_lag_get_num_ports);

struct mlx5_core_dev *mlx5_lag_get_peer_mdev(struct mlx5_core_dev *dev)
{
	struct mlx5_core_dev *peer_dev = NULL;
	struct mlx5_lag *ldev;
	unsigned long flags;

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (!ldev)
		goto unlock;

	peer_dev = ldev->pf[MLX5_LAG_P1].dev == dev ?
			   ldev->pf[MLX5_LAG_P2].dev :
			   ldev->pf[MLX5_LAG_P1].dev;

unlock:
	spin_unlock_irqrestore(&lag_lock, flags);
	return peer_dev;
}
EXPORT_SYMBOL(mlx5_lag_get_peer_mdev);

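/* Sum the requested congestion statistics over all devices in the LAG
 * (or just over @dev when LAG is not active) into @values.
 */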
int mlx5_lag_query_cong_counters(struct mlx5_core_dev *dev,
				 u64 *values,
				 int num_counters,
				 size_t *offsets)
{
	int outlen = MLX5_ST_SZ_BYTES(query_cong_statistics_out);
	struct mlx5_core_dev **mdev;
	struct mlx5_lag *ldev;
	unsigned long flags;
	int num_ports;
	int ret, i, j;
	void *out;

	out = kvzalloc(outlen, GFP_KERNEL);
	if (!out)
		return -ENOMEM;

	mdev = kvzalloc(sizeof(mdev[0]) * MLX5_MAX_PORTS, GFP_KERNEL);
	if (!mdev) {
		ret = -ENOMEM;
		goto free_out;
	}

	memset(values, 0, sizeof(*values) * num_counters);

	spin_lock_irqsave(&lag_lock, flags);
	ldev = mlx5_lag_dev(dev);
	if (ldev && __mlx5_lag_is_active(ldev)) {
		num_ports = ldev->ports;
		for (i = 0; i < ldev->ports; i++)
			mdev[i] = ldev->pf[i].dev;
	} else {
		num_ports = 1;
		mdev[MLX5_LAG_P1] = dev;
	}
	spin_unlock_irqrestore(&lag_lock, flags);

	for (i = 0; i < num_ports; ++i) {
		u32 in[MLX5_ST_SZ_DW(query_cong_statistics_in)] = {};

		MLX5_SET(query_cong_statistics_in, in, opcode,
			 MLX5_CMD_OP_QUERY_CONG_STATISTICS);
		ret = mlx5_cmd_exec_inout(mdev[i], query_cong_statistics, in,
					  out);
		if (ret)
			goto free_mdev;

		for (j = 0; j < num_counters; ++j)
			values[j] += be64_to_cpup((__be64 *)(out + offsets[j]));
	}

free_mdev:
	kvfree(mdev);
free_out:
	kvfree(out);
	return ret;
}
EXPORT_SYMBOL(mlx5_lag_query_cong_counters);