1 // SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2 /* Copyright (c) 2021, NVIDIA CORPORATION & AFFILIATES. All rights reserved. */
3 
4 #include "eswitch.h"
5 #include "esw/qos.h"
6 #include "en/port.h"
7 #define CREATE_TRACE_POINTS
8 #include "diag/qos_tracepoint.h"
9 
/* Minimum supported BW share value by the HW is 1 Mbit/sec */
#define MLX5_MIN_BW_SHARE 1

/* Convert a rate into a bw_share value: divide @rate by @divider rounding up,
 * raise the result to at least MLX5_MIN_BW_SHARE, then cap it at @limit.
 * The cap is applied last, so a @limit below MLX5_MIN_BW_SHARE wins.
 */
#define MLX5_RATE_TO_BW_SHARE(rate, divider, limit) \
	min_t(u32, max_t(u32, DIV_ROUND_UP(rate, divider), MLX5_MIN_BW_SHARE), limit)
15 
/* A rate group: a scheduling element placed under the E-Switch root TSAR
 * that vport scheduling elements can be parented to.
 */
struct mlx5_esw_rate_group {
	/* Scheduling element index filled in when the group element is created */
	u32 tsar_ix;
	/* Last max rate successfully configured for the group */
	u32 max_rate;
	/* Requested min rate; input to the bw_share normalization */
	u32 min_rate;
	/* bw_share value last programmed for the group element */
	u32 bw_share;
	/* Entry in the eswitch's qos.groups list */
	struct list_head list;
};
23 
24 static int esw_qos_tsar_config(struct mlx5_core_dev *dev, u32 *sched_ctx,
25 			       u32 parent_ix, u32 tsar_ix,
26 			       u32 max_rate, u32 bw_share)
27 {
28 	u32 bitmask = 0;
29 
30 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
31 		return -EOPNOTSUPP;
32 
33 	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_ix);
34 	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
35 	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
36 	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
37 	bitmask |= MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_BW_SHARE;
38 
39 	return mlx5_modify_scheduling_element_cmd(dev,
40 						  SCHEDULING_HIERARCHY_E_SWITCH,
41 						  sched_ctx,
42 						  tsar_ix,
43 						  bitmask);
44 }
45 
46 static int esw_qos_group_config(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group,
47 				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
48 {
49 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
50 	struct mlx5_core_dev *dev = esw->dev;
51 	int err;
52 
53 	err = esw_qos_tsar_config(dev, sched_ctx,
54 				  esw->qos.root_tsar_ix, group->tsar_ix,
55 				  max_rate, bw_share);
56 	if (err)
57 		NL_SET_ERR_MSG_MOD(extack, "E-Switch modify group TSAR element failed");
58 
59 	trace_mlx5_esw_group_qos_config(dev, group, group->tsar_ix, bw_share, max_rate);
60 
61 	return err;
62 }
63 
64 static int esw_qos_vport_config(struct mlx5_eswitch *esw,
65 				struct mlx5_vport *vport,
66 				u32 max_rate, u32 bw_share,
67 				struct netlink_ext_ack *extack)
68 {
69 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
70 	struct mlx5_esw_rate_group *group = vport->qos.group;
71 	struct mlx5_core_dev *dev = esw->dev;
72 	u32 parent_tsar_ix;
73 	void *vport_elem;
74 	int err;
75 
76 	if (!vport->qos.enabled)
77 		return -EIO;
78 
79 	parent_tsar_ix = group ? group->tsar_ix : esw->qos.root_tsar_ix;
80 	MLX5_SET(scheduling_context, sched_ctx, element_type,
81 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
82 	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx,
83 				  element_attributes);
84 	MLX5_SET(vport_element, vport_elem, vport_number, vport->vport);
85 
86 	err = esw_qos_tsar_config(dev, sched_ctx, parent_tsar_ix, vport->qos.esw_tsar_ix,
87 				  max_rate, bw_share);
88 	if (err) {
89 		esw_warn(esw->dev,
90 			 "E-Switch modify TSAR vport element failed (vport=%d,err=%d)\n",
91 			 vport->vport, err);
92 		NL_SET_ERR_MSG_MOD(extack, "E-Switch modify TSAR vport element failed");
93 		return err;
94 	}
95 
96 	trace_mlx5_esw_vport_qos_config(vport, bw_share, max_rate);
97 
98 	return 0;
99 }
100 
101 static u32 esw_qos_calculate_min_rate_divider(struct mlx5_eswitch *esw,
102 					      struct mlx5_esw_rate_group *group,
103 					      bool group_level)
104 {
105 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
106 	struct mlx5_vport *evport;
107 	u32 max_guarantee = 0;
108 	unsigned long i;
109 
110 	if (group_level) {
111 		struct mlx5_esw_rate_group *group;
112 
113 		list_for_each_entry(group, &esw->qos.groups, list) {
114 			if (group->min_rate < max_guarantee)
115 				continue;
116 			max_guarantee = group->min_rate;
117 		}
118 	} else {
119 		mlx5_esw_for_each_vport(esw, i, evport) {
120 			if (!evport->enabled || !evport->qos.enabled ||
121 			    evport->qos.group != group || evport->qos.min_rate < max_guarantee)
122 				continue;
123 			max_guarantee = evport->qos.min_rate;
124 		}
125 	}
126 
127 	if (max_guarantee)
128 		return max_t(u32, max_guarantee / fw_max_bw_share, 1);
129 
130 	/* If vports min rate divider is 0 but their group has bw_share configured, then
131 	 * need to set bw_share for vports to minimal value.
132 	 */
133 	if (!group_level && !max_guarantee && group && group->bw_share)
134 		return 1;
135 	return 0;
136 }
137 
138 static u32 esw_qos_calc_bw_share(u32 min_rate, u32 divider, u32 fw_max)
139 {
140 	if (divider)
141 		return MLX5_RATE_TO_BW_SHARE(min_rate, divider, fw_max);
142 
143 	return 0;
144 }
145 
146 static int esw_qos_normalize_vports_min_rate(struct mlx5_eswitch *esw,
147 					     struct mlx5_esw_rate_group *group,
148 					     struct netlink_ext_ack *extack)
149 {
150 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
151 	u32 divider = esw_qos_calculate_min_rate_divider(esw, group, false);
152 	struct mlx5_vport *evport;
153 	unsigned long i;
154 	u32 bw_share;
155 	int err;
156 
157 	mlx5_esw_for_each_vport(esw, i, evport) {
158 		if (!evport->enabled || !evport->qos.enabled || evport->qos.group != group)
159 			continue;
160 		bw_share = esw_qos_calc_bw_share(evport->qos.min_rate, divider, fw_max_bw_share);
161 
162 		if (bw_share == evport->qos.bw_share)
163 			continue;
164 
165 		err = esw_qos_vport_config(esw, evport, evport->qos.max_rate, bw_share, extack);
166 		if (err)
167 			return err;
168 
169 		evport->qos.bw_share = bw_share;
170 	}
171 
172 	return 0;
173 }
174 
175 static int esw_qos_normalize_groups_min_rate(struct mlx5_eswitch *esw, u32 divider,
176 					     struct netlink_ext_ack *extack)
177 {
178 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
179 	struct mlx5_esw_rate_group *group;
180 	u32 bw_share;
181 	int err;
182 
183 	list_for_each_entry(group, &esw->qos.groups, list) {
184 		bw_share = esw_qos_calc_bw_share(group->min_rate, divider, fw_max_bw_share);
185 
186 		if (bw_share == group->bw_share)
187 			continue;
188 
189 		err = esw_qos_group_config(esw, group, group->max_rate, bw_share, extack);
190 		if (err)
191 			return err;
192 
193 		group->bw_share = bw_share;
194 
195 		/* All the group's vports need to be set with default bw_share
196 		 * to enable them with QOS
197 		 */
198 		err = esw_qos_normalize_vports_min_rate(esw, group, extack);
199 
200 		if (err)
201 			return err;
202 	}
203 
204 	return 0;
205 }
206 
207 static int esw_qos_set_vport_min_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport,
208 				      u32 min_rate, struct netlink_ext_ack *extack)
209 {
210 	u32 fw_max_bw_share, previous_min_rate;
211 	bool min_rate_supported;
212 	int err;
213 
214 	lockdep_assert_held(&esw->state_lock);
215 	fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
216 	min_rate_supported = MLX5_CAP_QOS(esw->dev, esw_bw_share) &&
217 				fw_max_bw_share >= MLX5_MIN_BW_SHARE;
218 	if (min_rate && !min_rate_supported)
219 		return -EOPNOTSUPP;
220 	if (min_rate == evport->qos.min_rate)
221 		return 0;
222 
223 	previous_min_rate = evport->qos.min_rate;
224 	evport->qos.min_rate = min_rate;
225 	err = esw_qos_normalize_vports_min_rate(esw, evport->qos.group, extack);
226 	if (err)
227 		evport->qos.min_rate = previous_min_rate;
228 
229 	return err;
230 }
231 
232 static int esw_qos_set_vport_max_rate(struct mlx5_eswitch *esw, struct mlx5_vport *evport,
233 				      u32 max_rate, struct netlink_ext_ack *extack)
234 {
235 	u32 act_max_rate = max_rate;
236 	bool max_rate_supported;
237 	int err;
238 
239 	lockdep_assert_held(&esw->state_lock);
240 	max_rate_supported = MLX5_CAP_QOS(esw->dev, esw_rate_limit);
241 
242 	if (max_rate && !max_rate_supported)
243 		return -EOPNOTSUPP;
244 	if (max_rate == evport->qos.max_rate)
245 		return 0;
246 
247 	/* If parent group has rate limit need to set to group
248 	 * value when new max rate is 0.
249 	 */
250 	if (evport->qos.group && !max_rate)
251 		act_max_rate = evport->qos.group->max_rate;
252 
253 	err = esw_qos_vport_config(esw, evport, act_max_rate, evport->qos.bw_share, extack);
254 
255 	if (!err)
256 		evport->qos.max_rate = max_rate;
257 
258 	return err;
259 }
260 
261 static int esw_qos_set_group_min_rate(struct mlx5_eswitch *esw, struct mlx5_esw_rate_group *group,
262 				      u32 min_rate, struct netlink_ext_ack *extack)
263 {
264 	u32 fw_max_bw_share = MLX5_CAP_QOS(esw->dev, max_tsar_bw_share);
265 	struct mlx5_core_dev *dev = esw->dev;
266 	u32 previous_min_rate, divider;
267 	int err;
268 
269 	if (!(MLX5_CAP_QOS(dev, esw_bw_share) && fw_max_bw_share >= MLX5_MIN_BW_SHARE))
270 		return -EOPNOTSUPP;
271 
272 	if (min_rate == group->min_rate)
273 		return 0;
274 
275 	previous_min_rate = group->min_rate;
276 	group->min_rate = min_rate;
277 	divider = esw_qos_calculate_min_rate_divider(esw, group, true);
278 	err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
279 	if (err) {
280 		group->min_rate = previous_min_rate;
281 		NL_SET_ERR_MSG_MOD(extack, "E-Switch group min rate setting failed");
282 
283 		/* Attempt restoring previous configuration */
284 		divider = esw_qos_calculate_min_rate_divider(esw, group, true);
285 		if (esw_qos_normalize_groups_min_rate(esw, divider, extack))
286 			NL_SET_ERR_MSG_MOD(extack, "E-Switch BW share restore failed");
287 	}
288 
289 	return err;
290 }
291 
292 static int esw_qos_set_group_max_rate(struct mlx5_eswitch *esw,
293 				      struct mlx5_esw_rate_group *group,
294 				      u32 max_rate, struct netlink_ext_ack *extack)
295 {
296 	struct mlx5_vport *vport;
297 	unsigned long i;
298 	int err;
299 
300 	if (group->max_rate == max_rate)
301 		return 0;
302 
303 	err = esw_qos_group_config(esw, group, max_rate, group->bw_share, extack);
304 	if (err)
305 		return err;
306 
307 	group->max_rate = max_rate;
308 
309 	/* Any unlimited vports in the group should be set
310 	 * with the value of the group.
311 	 */
312 	mlx5_esw_for_each_vport(esw, i, vport) {
313 		if (!vport->enabled || !vport->qos.enabled ||
314 		    vport->qos.group != group || vport->qos.max_rate)
315 			continue;
316 
317 		err = esw_qos_vport_config(esw, vport, max_rate, vport->qos.bw_share, extack);
318 		if (err)
319 			NL_SET_ERR_MSG_MOD(extack,
320 					   "E-Switch vport implicit rate limit setting failed");
321 	}
322 
323 	return err;
324 }
325 
326 static int esw_qos_vport_create_sched_element(struct mlx5_eswitch *esw,
327 					      struct mlx5_vport *vport,
328 					      u32 max_rate, u32 bw_share)
329 {
330 	u32 sched_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
331 	struct mlx5_esw_rate_group *group = vport->qos.group;
332 	struct mlx5_core_dev *dev = esw->dev;
333 	u32 parent_tsar_ix;
334 	void *vport_elem;
335 	int err;
336 
337 	parent_tsar_ix = group ? group->tsar_ix : esw->qos.root_tsar_ix;
338 	MLX5_SET(scheduling_context, sched_ctx, element_type,
339 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT);
340 	vport_elem = MLX5_ADDR_OF(scheduling_context, sched_ctx, element_attributes);
341 	MLX5_SET(vport_element, vport_elem, vport_number, vport->vport);
342 	MLX5_SET(scheduling_context, sched_ctx, parent_element_id, parent_tsar_ix);
343 	MLX5_SET(scheduling_context, sched_ctx, max_average_bw, max_rate);
344 	MLX5_SET(scheduling_context, sched_ctx, bw_share, bw_share);
345 
346 	err = mlx5_create_scheduling_element_cmd(dev,
347 						 SCHEDULING_HIERARCHY_E_SWITCH,
348 						 sched_ctx,
349 						 &vport->qos.esw_tsar_ix);
350 	if (err) {
351 		esw_warn(esw->dev, "E-Switch create TSAR vport element failed (vport=%d,err=%d)\n",
352 			 vport->vport, err);
353 		return err;
354 	}
355 
356 	return 0;
357 }
358 
359 static int esw_qos_update_group_scheduling_element(struct mlx5_eswitch *esw,
360 						   struct mlx5_vport *vport,
361 						   struct mlx5_esw_rate_group *curr_group,
362 						   struct mlx5_esw_rate_group *new_group,
363 						   struct netlink_ext_ack *extack)
364 {
365 	u32 max_rate;
366 	int err;
367 
368 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
369 						  SCHEDULING_HIERARCHY_E_SWITCH,
370 						  vport->qos.esw_tsar_ix);
371 	if (err) {
372 		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR vport element failed");
373 		return err;
374 	}
375 
376 	vport->qos.group = new_group;
377 	max_rate = vport->qos.max_rate ? vport->qos.max_rate : new_group->max_rate;
378 
379 	/* If vport is unlimited, we set the group's value.
380 	 * Therefore, if the group is limited it will apply to
381 	 * the vport as well and if not, vport will remain unlimited.
382 	 */
383 	err = esw_qos_vport_create_sched_element(esw, vport, max_rate, vport->qos.bw_share);
384 	if (err) {
385 		NL_SET_ERR_MSG_MOD(extack, "E-Switch vport group set failed.");
386 		goto err_sched;
387 	}
388 
389 	return 0;
390 
391 err_sched:
392 	vport->qos.group = curr_group;
393 	max_rate = vport->qos.max_rate ? vport->qos.max_rate : curr_group->max_rate;
394 	if (esw_qos_vport_create_sched_element(esw, vport, max_rate, vport->qos.bw_share))
395 		esw_warn(esw->dev, "E-Switch vport group restore failed (vport=%d)\n",
396 			 vport->vport);
397 
398 	return err;
399 }
400 
401 static int esw_qos_vport_update_group(struct mlx5_eswitch *esw,
402 				      struct mlx5_vport *vport,
403 				      struct mlx5_esw_rate_group *group,
404 				      struct netlink_ext_ack *extack)
405 {
406 	struct mlx5_esw_rate_group *new_group, *curr_group;
407 	int err;
408 
409 	if (!vport->enabled)
410 		return -EINVAL;
411 
412 	curr_group = vport->qos.group;
413 	new_group = group ?: esw->qos.group0;
414 	if (curr_group == new_group)
415 		return 0;
416 
417 	err = esw_qos_update_group_scheduling_element(esw, vport, curr_group, new_group, extack);
418 	if (err)
419 		return err;
420 
421 	/* Recalculate bw share weights of old and new groups */
422 	if (vport->qos.bw_share || new_group->bw_share) {
423 		esw_qos_normalize_vports_min_rate(esw, curr_group, extack);
424 		esw_qos_normalize_vports_min_rate(esw, new_group, extack);
425 	}
426 
427 	return 0;
428 }
429 
430 static struct mlx5_esw_rate_group *
431 __esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
432 {
433 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
434 	struct mlx5_esw_rate_group *group;
435 	u32 divider;
436 	int err;
437 
438 	group = kzalloc(sizeof(*group), GFP_KERNEL);
439 	if (!group)
440 		return ERR_PTR(-ENOMEM);
441 
442 	MLX5_SET(scheduling_context, tsar_ctx, parent_element_id,
443 		 esw->qos.root_tsar_ix);
444 	err = mlx5_create_scheduling_element_cmd(esw->dev,
445 						 SCHEDULING_HIERARCHY_E_SWITCH,
446 						 tsar_ctx,
447 						 &group->tsar_ix);
448 	if (err) {
449 		NL_SET_ERR_MSG_MOD(extack, "E-Switch create TSAR for group failed");
450 		goto err_sched_elem;
451 	}
452 
453 	list_add_tail(&group->list, &esw->qos.groups);
454 
455 	divider = esw_qos_calculate_min_rate_divider(esw, group, true);
456 	if (divider) {
457 		err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
458 		if (err) {
459 			NL_SET_ERR_MSG_MOD(extack, "E-Switch groups normalization failed");
460 			goto err_min_rate;
461 		}
462 	}
463 	trace_mlx5_esw_group_qos_create(esw->dev, group, group->tsar_ix);
464 
465 	return group;
466 
467 err_min_rate:
468 	list_del(&group->list);
469 	if (mlx5_destroy_scheduling_element_cmd(esw->dev,
470 						SCHEDULING_HIERARCHY_E_SWITCH,
471 						group->tsar_ix))
472 		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR for group failed");
473 err_sched_elem:
474 	kfree(group);
475 	return ERR_PTR(err);
476 }
477 
/* Forward declarations: the QoS reference helpers are defined further down
 * in this file but are used by the rate group create/destroy wrappers below.
 */
static int esw_qos_get(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack);
static void esw_qos_put(struct mlx5_eswitch *esw);
480 
481 static struct mlx5_esw_rate_group *
482 esw_qos_create_rate_group(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
483 {
484 	struct mlx5_esw_rate_group *group;
485 	int err;
486 
487 	if (!MLX5_CAP_QOS(esw->dev, log_esw_max_sched_depth))
488 		return ERR_PTR(-EOPNOTSUPP);
489 
490 	err = esw_qos_get(esw, extack);
491 	if (err)
492 		return ERR_PTR(err);
493 
494 	group = __esw_qos_create_rate_group(esw, extack);
495 	if (IS_ERR(group))
496 		esw_qos_put(esw);
497 
498 	return group;
499 }
500 
501 static int __esw_qos_destroy_rate_group(struct mlx5_eswitch *esw,
502 					struct mlx5_esw_rate_group *group,
503 					struct netlink_ext_ack *extack)
504 {
505 	u32 divider;
506 	int err;
507 
508 	list_del(&group->list);
509 
510 	divider = esw_qos_calculate_min_rate_divider(esw, NULL, true);
511 	err = esw_qos_normalize_groups_min_rate(esw, divider, extack);
512 	if (err)
513 		NL_SET_ERR_MSG_MOD(extack, "E-Switch groups' normalization failed");
514 
515 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
516 						  SCHEDULING_HIERARCHY_E_SWITCH,
517 						  group->tsar_ix);
518 	if (err)
519 		NL_SET_ERR_MSG_MOD(extack, "E-Switch destroy TSAR_ID failed");
520 
521 	trace_mlx5_esw_group_qos_destroy(esw->dev, group, group->tsar_ix);
522 
523 	kfree(group);
524 
525 	return err;
526 }
527 
/* Destroy @group and drop the QoS reference that was taken when it was
 * created. The reference is dropped even if the destroy reports an error.
 */
static int esw_qos_destroy_rate_group(struct mlx5_eswitch *esw,
				      struct mlx5_esw_rate_group *group,
				      struct netlink_ext_ack *extack)
{
	int ret = __esw_qos_destroy_rate_group(esw, group, extack);

	esw_qos_put(esw);

	return ret;
}
539 
540 static bool esw_qos_element_type_supported(struct mlx5_core_dev *dev, int type)
541 {
542 	switch (type) {
543 	case SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR:
544 		return MLX5_CAP_QOS(dev, esw_element_type) &
545 		       ELEMENT_TYPE_CAP_MASK_TASR;
546 	case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT:
547 		return MLX5_CAP_QOS(dev, esw_element_type) &
548 		       ELEMENT_TYPE_CAP_MASK_VPORT;
549 	case SCHEDULING_CONTEXT_ELEMENT_TYPE_VPORT_TC:
550 		return MLX5_CAP_QOS(dev, esw_element_type) &
551 		       ELEMENT_TYPE_CAP_MASK_VPORT_TC;
552 	case SCHEDULING_CONTEXT_ELEMENT_TYPE_PARA_VPORT_TC:
553 		return MLX5_CAP_QOS(dev, esw_element_type) &
554 		       ELEMENT_TYPE_CAP_MASK_PARA_VPORT_TC;
555 	}
556 	return false;
557 }
558 
559 static int esw_qos_create(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
560 {
561 	u32 tsar_ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
562 	struct mlx5_core_dev *dev = esw->dev;
563 	__be32 *attr;
564 	int err;
565 
566 	if (!MLX5_CAP_GEN(dev, qos) || !MLX5_CAP_QOS(dev, esw_scheduling))
567 		return -EOPNOTSUPP;
568 
569 	if (!esw_qos_element_type_supported(dev, SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR))
570 		return -EOPNOTSUPP;
571 
572 	MLX5_SET(scheduling_context, tsar_ctx, element_type,
573 		 SCHEDULING_CONTEXT_ELEMENT_TYPE_TSAR);
574 
575 	attr = MLX5_ADDR_OF(scheduling_context, tsar_ctx, element_attributes);
576 	*attr = cpu_to_be32(TSAR_ELEMENT_TSAR_TYPE_DWRR << 16);
577 
578 	err = mlx5_create_scheduling_element_cmd(dev,
579 						 SCHEDULING_HIERARCHY_E_SWITCH,
580 						 tsar_ctx,
581 						 &esw->qos.root_tsar_ix);
582 	if (err) {
583 		esw_warn(dev, "E-Switch create root TSAR failed (%d)\n", err);
584 		return err;
585 	}
586 
587 	INIT_LIST_HEAD(&esw->qos.groups);
588 	if (MLX5_CAP_QOS(dev, log_esw_max_sched_depth)) {
589 		esw->qos.group0 = __esw_qos_create_rate_group(esw, extack);
590 		if (IS_ERR(esw->qos.group0)) {
591 			esw_warn(dev, "E-Switch create rate group 0 failed (%ld)\n",
592 				 PTR_ERR(esw->qos.group0));
593 			err = PTR_ERR(esw->qos.group0);
594 			goto err_group0;
595 		}
596 	}
597 	refcount_set(&esw->qos.refcnt, 1);
598 
599 	return 0;
600 
601 err_group0:
602 	if (mlx5_destroy_scheduling_element_cmd(esw->dev, SCHEDULING_HIERARCHY_E_SWITCH,
603 						esw->qos.root_tsar_ix))
604 		esw_warn(esw->dev, "E-Switch destroy root TSAR failed.\n");
605 
606 	return err;
607 }
608 
609 static void esw_qos_destroy(struct mlx5_eswitch *esw)
610 {
611 	int err;
612 
613 	if (esw->qos.group0)
614 		__esw_qos_destroy_rate_group(esw, esw->qos.group0, NULL);
615 
616 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
617 						  SCHEDULING_HIERARCHY_E_SWITCH,
618 						  esw->qos.root_tsar_ix);
619 	if (err)
620 		esw_warn(esw->dev, "E-Switch destroy root TSAR failed (%d)\n", err);
621 }
622 
623 static int esw_qos_get(struct mlx5_eswitch *esw, struct netlink_ext_ack *extack)
624 {
625 	int err = 0;
626 
627 	lockdep_assert_held(&esw->state_lock);
628 
629 	if (!refcount_inc_not_zero(&esw->qos.refcnt)) {
630 		/* esw_qos_create() set refcount to 1 only on success.
631 		 * No need to decrement on failure.
632 		 */
633 		err = esw_qos_create(esw, extack);
634 	}
635 
636 	return err;
637 }
638 
639 static void esw_qos_put(struct mlx5_eswitch *esw)
640 {
641 	lockdep_assert_held(&esw->state_lock);
642 	if (refcount_dec_and_test(&esw->qos.refcnt))
643 		esw_qos_destroy(esw);
644 }
645 
646 static int esw_qos_vport_enable(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
647 				u32 max_rate, u32 bw_share, struct netlink_ext_ack *extack)
648 {
649 	int err;
650 
651 	lockdep_assert_held(&esw->state_lock);
652 	if (vport->qos.enabled)
653 		return 0;
654 
655 	err = esw_qos_get(esw, extack);
656 	if (err)
657 		return err;
658 
659 	vport->qos.group = esw->qos.group0;
660 
661 	err = esw_qos_vport_create_sched_element(esw, vport, max_rate, bw_share);
662 	if (err)
663 		goto err_out;
664 
665 	vport->qos.enabled = true;
666 	trace_mlx5_esw_vport_qos_create(vport, bw_share, max_rate);
667 
668 	return 0;
669 
670 err_out:
671 	esw_qos_put(esw);
672 
673 	return err;
674 }
675 
676 void mlx5_esw_qos_vport_disable(struct mlx5_eswitch *esw, struct mlx5_vport *vport)
677 {
678 	int err;
679 
680 	lockdep_assert_held(&esw->state_lock);
681 	if (!vport->qos.enabled)
682 		return;
683 	WARN(vport->qos.group && vport->qos.group != esw->qos.group0,
684 	     "Disabling QoS on port before detaching it from group");
685 
686 	err = mlx5_destroy_scheduling_element_cmd(esw->dev,
687 						  SCHEDULING_HIERARCHY_E_SWITCH,
688 						  vport->qos.esw_tsar_ix);
689 	if (err)
690 		esw_warn(esw->dev, "E-Switch destroy TSAR vport element failed (vport=%d,err=%d)\n",
691 			 vport->vport, err);
692 
693 	memset(&vport->qos, 0, sizeof(vport->qos));
694 	trace_mlx5_esw_vport_qos_destroy(vport);
695 
696 	esw_qos_put(esw);
697 }
698 
699 int mlx5_esw_qos_set_vport_rate(struct mlx5_eswitch *esw, struct mlx5_vport *vport,
700 				u32 max_rate, u32 min_rate)
701 {
702 	int err;
703 
704 	lockdep_assert_held(&esw->state_lock);
705 	err = esw_qos_vport_enable(esw, vport, 0, 0, NULL);
706 	if (err)
707 		return err;
708 
709 	err = esw_qos_set_vport_min_rate(esw, vport, min_rate, NULL);
710 	if (!err)
711 		err = esw_qos_set_vport_max_rate(esw, vport, max_rate, NULL);
712 
713 	return err;
714 }
715 
716 int mlx5_esw_qos_modify_vport_rate(struct mlx5_eswitch *esw, u16 vport_num, u32 rate_mbps)
717 {
718 	u32 ctx[MLX5_ST_SZ_DW(scheduling_context)] = {};
719 	struct mlx5_vport *vport;
720 	u32 bitmask;
721 	int err;
722 
723 	vport = mlx5_eswitch_get_vport(esw, vport_num);
724 	if (IS_ERR(vport))
725 		return PTR_ERR(vport);
726 
727 	mutex_lock(&esw->state_lock);
728 	if (!vport->qos.enabled) {
729 		/* Eswitch QoS wasn't enabled yet. Enable it and vport QoS. */
730 		err = esw_qos_vport_enable(esw, vport, rate_mbps, vport->qos.bw_share, NULL);
731 	} else {
732 		MLX5_SET(scheduling_context, ctx, max_average_bw, rate_mbps);
733 
734 		bitmask = MODIFY_SCHEDULING_ELEMENT_IN_MODIFY_BITMASK_MAX_AVERAGE_BW;
735 		err = mlx5_modify_scheduling_element_cmd(esw->dev,
736 							 SCHEDULING_HIERARCHY_E_SWITCH,
737 							 ctx,
738 							 vport->qos.esw_tsar_ix,
739 							 bitmask);
740 	}
741 	mutex_unlock(&esw->state_lock);
742 
743 	return err;
744 }
745 
746 #define MLX5_LINKSPEED_UNIT 125000 /* 1Mbps in Bps */
747 
748 /* Converts bytes per second value passed in a pointer into megabits per
749  * second, rewriting last. If converted rate exceed link speed or is not a
750  * fraction of Mbps - returns error.
751  */
752 static int esw_qos_devlink_rate_to_mbps(struct mlx5_core_dev *mdev, const char *name,
753 					u64 *rate, struct netlink_ext_ack *extack)
754 {
755 	u32 link_speed_max, reminder;
756 	u64 value;
757 	int err;
758 
759 	err = mlx5e_port_max_linkspeed(mdev, &link_speed_max);
760 	if (err) {
761 		NL_SET_ERR_MSG_MOD(extack, "Failed to get link maximum speed");
762 		return err;
763 	}
764 
765 	value = div_u64_rem(*rate, MLX5_LINKSPEED_UNIT, &reminder);
766 	if (reminder) {
767 		pr_err("%s rate value %lluBps not in link speed units of 1Mbps.\n",
768 		       name, *rate);
769 		NL_SET_ERR_MSG_MOD(extack, "TX rate value not in link speed units of 1Mbps");
770 		return -EINVAL;
771 	}
772 
773 	if (value > link_speed_max) {
774 		pr_err("%s rate value %lluMbps exceed link maximum speed %u.\n",
775 		       name, value, link_speed_max);
776 		NL_SET_ERR_MSG_MOD(extack, "TX rate value exceed link maximum speed");
777 		return -EINVAL;
778 	}
779 
780 	*rate = value;
781 	return 0;
782 }
783 
784 /* Eswitch devlink rate API */
785 
786 int mlx5_esw_devlink_rate_leaf_tx_share_set(struct devlink_rate *rate_leaf, void *priv,
787 					    u64 tx_share, struct netlink_ext_ack *extack)
788 {
789 	struct mlx5_vport *vport = priv;
790 	struct mlx5_eswitch *esw;
791 	int err;
792 
793 	esw = vport->dev->priv.eswitch;
794 	if (!mlx5_esw_allowed(esw))
795 		return -EPERM;
796 
797 	err = esw_qos_devlink_rate_to_mbps(vport->dev, "tx_share", &tx_share, extack);
798 	if (err)
799 		return err;
800 
801 	mutex_lock(&esw->state_lock);
802 	err = esw_qos_vport_enable(esw, vport, 0, 0, extack);
803 	if (err)
804 		goto unlock;
805 
806 	err = esw_qos_set_vport_min_rate(esw, vport, tx_share, extack);
807 unlock:
808 	mutex_unlock(&esw->state_lock);
809 	return err;
810 }
811 
812 int mlx5_esw_devlink_rate_leaf_tx_max_set(struct devlink_rate *rate_leaf, void *priv,
813 					  u64 tx_max, struct netlink_ext_ack *extack)
814 {
815 	struct mlx5_vport *vport = priv;
816 	struct mlx5_eswitch *esw;
817 	int err;
818 
819 	esw = vport->dev->priv.eswitch;
820 	if (!mlx5_esw_allowed(esw))
821 		return -EPERM;
822 
823 	err = esw_qos_devlink_rate_to_mbps(vport->dev, "tx_max", &tx_max, extack);
824 	if (err)
825 		return err;
826 
827 	mutex_lock(&esw->state_lock);
828 	err = esw_qos_vport_enable(esw, vport, 0, 0, extack);
829 	if (err)
830 		goto unlock;
831 
832 	err = esw_qos_set_vport_max_rate(esw, vport, tx_max, extack);
833 unlock:
834 	mutex_unlock(&esw->state_lock);
835 	return err;
836 }
837 
838 int mlx5_esw_devlink_rate_node_tx_share_set(struct devlink_rate *rate_node, void *priv,
839 					    u64 tx_share, struct netlink_ext_ack *extack)
840 {
841 	struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink);
842 	struct mlx5_eswitch *esw = dev->priv.eswitch;
843 	struct mlx5_esw_rate_group *group = priv;
844 	int err;
845 
846 	err = esw_qos_devlink_rate_to_mbps(dev, "tx_share", &tx_share, extack);
847 	if (err)
848 		return err;
849 
850 	mutex_lock(&esw->state_lock);
851 	err = esw_qos_set_group_min_rate(esw, group, tx_share, extack);
852 	mutex_unlock(&esw->state_lock);
853 	return err;
854 }
855 
856 int mlx5_esw_devlink_rate_node_tx_max_set(struct devlink_rate *rate_node, void *priv,
857 					  u64 tx_max, struct netlink_ext_ack *extack)
858 {
859 	struct mlx5_core_dev *dev = devlink_priv(rate_node->devlink);
860 	struct mlx5_eswitch *esw = dev->priv.eswitch;
861 	struct mlx5_esw_rate_group *group = priv;
862 	int err;
863 
864 	err = esw_qos_devlink_rate_to_mbps(dev, "tx_max", &tx_max, extack);
865 	if (err)
866 		return err;
867 
868 	mutex_lock(&esw->state_lock);
869 	err = esw_qos_set_group_max_rate(esw, group, tx_max, extack);
870 	mutex_unlock(&esw->state_lock);
871 	return err;
872 }
873 
874 int mlx5_esw_devlink_rate_node_new(struct devlink_rate *rate_node, void **priv,
875 				   struct netlink_ext_ack *extack)
876 {
877 	struct mlx5_esw_rate_group *group;
878 	struct mlx5_eswitch *esw;
879 	int err = 0;
880 
881 	esw = mlx5_devlink_eswitch_get(rate_node->devlink);
882 	if (IS_ERR(esw))
883 		return PTR_ERR(esw);
884 
885 	mutex_lock(&esw->state_lock);
886 	if (esw->mode != MLX5_ESWITCH_OFFLOADS) {
887 		NL_SET_ERR_MSG_MOD(extack,
888 				   "Rate node creation supported only in switchdev mode");
889 		err = -EOPNOTSUPP;
890 		goto unlock;
891 	}
892 
893 	group = esw_qos_create_rate_group(esw, extack);
894 	if (IS_ERR(group)) {
895 		err = PTR_ERR(group);
896 		goto unlock;
897 	}
898 
899 	*priv = group;
900 unlock:
901 	mutex_unlock(&esw->state_lock);
902 	return err;
903 }
904 
905 int mlx5_esw_devlink_rate_node_del(struct devlink_rate *rate_node, void *priv,
906 				   struct netlink_ext_ack *extack)
907 {
908 	struct mlx5_esw_rate_group *group = priv;
909 	struct mlx5_eswitch *esw;
910 	int err;
911 
912 	esw = mlx5_devlink_eswitch_get(rate_node->devlink);
913 	if (IS_ERR(esw))
914 		return PTR_ERR(esw);
915 
916 	mutex_lock(&esw->state_lock);
917 	err = esw_qos_destroy_rate_group(esw, group, extack);
918 	mutex_unlock(&esw->state_lock);
919 	return err;
920 }
921 
922 int mlx5_esw_qos_vport_update_group(struct mlx5_eswitch *esw,
923 				    struct mlx5_vport *vport,
924 				    struct mlx5_esw_rate_group *group,
925 				    struct netlink_ext_ack *extack)
926 {
927 	int err;
928 
929 	mutex_lock(&esw->state_lock);
930 	err = esw_qos_vport_enable(esw, vport, 0, 0, extack);
931 	if (!err)
932 		err = esw_qos_vport_update_group(esw, vport, group, extack);
933 	mutex_unlock(&esw->state_lock);
934 	return err;
935 }
936 
937 int mlx5_esw_devlink_rate_parent_set(struct devlink_rate *devlink_rate,
938 				     struct devlink_rate *parent,
939 				     void *priv, void *parent_priv,
940 				     struct netlink_ext_ack *extack)
941 {
942 	struct mlx5_esw_rate_group *group;
943 	struct mlx5_vport *vport = priv;
944 
945 	if (!parent)
946 		return mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch,
947 						       vport, NULL, extack);
948 
949 	group = parent_priv;
950 	return mlx5_esw_qos_vport_update_group(vport->dev->priv.eswitch, vport, group, extack);
951 }
952