xref: /freebsd/sys/dev/cxgbe/t4_sched.c (revision 6beb67c7)
/*-
 * Copyright (c) 2017 Chelsio Communications, Inc.
 * All rights reserved.
 * Written by: Navdeep Parhar <np@FreeBSD.org>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ratelimit.h"

#include <sys/types.h>
#include <sys/malloc.h>
#include <sys/queue.h>
#include <sys/sbuf.h>
#include <sys/taskqueue.h>
#include <sys/sysctl.h>

#include "common/common.h"
#include "common/t4_regs.h"
#include "common/t4_regs_values.h"
#include "common/t4_msg.h"

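/*
 * Note that a negative value is treated as "not provided" and passes the
 * check, e.g. in_range(-1, 0, 7) is true while in_range(8, 0, 7) is not.
 * Callers that require a value must reject val < 0 themselves first.
 */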
static int
in_range(int val, int lo, int hi)
{

	return (val < 0 || (val <= hi && val >= lo));
}

static int
set_sched_class_config(struct adapter *sc, int minmax)
{
	int rc;

	if (minmax < 0)
		return (EINVAL);

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4sscc");
	if (rc)
		return (rc);
	if (hw_off_limits(sc))
		rc = ENXIO;
	else
		rc = -t4_sched_config(sc, FW_SCHED_TYPE_PKTSCHED, minmax, 1);
	end_synchronized_op(sc, 0);

	return (rc);
}

static int
set_sched_class_params(struct adapter *sc, struct t4_sched_class_params *p,
    int sleep_ok)
{
	int rc, top_speed, fw_level, fw_mode, fw_rateunit, fw_ratemode;
	struct port_info *pi;
	struct tx_cl_rl_params *tc, old;
	bool check_pktsize = false;

	if (p->level == SCHED_CLASS_LEVEL_CL_RL)
		fw_level = FW_SCHED_PARAMS_LEVEL_CL_RL;
	else if (p->level == SCHED_CLASS_LEVEL_CL_WRR)
		fw_level = FW_SCHED_PARAMS_LEVEL_CL_WRR;
	else if (p->level == SCHED_CLASS_LEVEL_CH_RL)
		fw_level = FW_SCHED_PARAMS_LEVEL_CH_RL;
	else
		return (EINVAL);

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		if (p->mode == SCHED_CLASS_MODE_CLASS)
			fw_mode = FW_SCHED_PARAMS_MODE_CLASS;
		else if (p->mode == SCHED_CLASS_MODE_FLOW) {
			check_pktsize = true;
			fw_mode = FW_SCHED_PARAMS_MODE_FLOW;
		} else
			return (EINVAL);
	} else
		fw_mode = 0;

	/* Valid channel must always be provided. */
	if (p->channel < 0)
		return (EINVAL);
	if (!in_range(p->channel, 0, sc->chip_params->nchan - 1))
		return (ERANGE);

	pi = sc->port[sc->chan_map[p->channel]];
	if (pi == NULL)
		return (ENXIO);
	MPASS(pi->tx_chan == p->channel);
	top_speed = port_top_speed(pi) * 1000000; /* Gbps -> Kbps */

	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
	    p->level == SCHED_CLASS_LEVEL_CH_RL) {
		/*
		 * Valid rate (mode, unit and values) must be provided.
		 */

		if (p->minrate < 0)
			p->minrate = 0;
		if (p->maxrate < 0)
			return (EINVAL);

		if (p->rateunit == SCHED_CLASS_RATEUNIT_BITS) {
			fw_rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
			/* ratemode could be relative (%) or absolute. */
			if (p->ratemode == SCHED_CLASS_RATEMODE_REL) {
				fw_ratemode = FW_SCHED_PARAMS_RATE_REL;
				/* maxrate is % of port bandwidth. */
				if (!in_range(p->minrate, 0, 100) ||
				    !in_range(p->maxrate, 0, 100)) {
					return (ERANGE);
				}
			} else if (p->ratemode == SCHED_CLASS_RATEMODE_ABS) {
				fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
				/* maxrate is absolute value in kbps. */
				if (!in_range(p->minrate, 0, top_speed) ||
				    !in_range(p->maxrate, 0, top_speed)) {
					return (ERANGE);
				}
			} else
				return (EINVAL);
		} else if (p->rateunit == SCHED_CLASS_RATEUNIT_PKTS) {
			/* maxrate is the absolute value in pps. */
			check_pktsize = true;
			fw_rateunit = FW_SCHED_PARAMS_UNIT_PKTRATE;
			fw_ratemode = FW_SCHED_PARAMS_RATE_ABS;
		} else
			return (EINVAL);
	} else {
		MPASS(p->level == SCHED_CLASS_LEVEL_CL_WRR);

		/*
		 * Valid weight must be provided.
		 */
		if (p->weight < 0)
			return (EINVAL);
		if (!in_range(p->weight, 1, 99))
			return (ERANGE);

		fw_rateunit = 0;
		fw_ratemode = 0;
	}

	if (p->level == SCHED_CLASS_LEVEL_CL_RL ||
	    p->level == SCHED_CLASS_LEVEL_CL_WRR) {
		/*
		 * Valid scheduling class must be provided.
		 */
		if (p->cl < 0)
			return (EINVAL);
		if (!in_range(p->cl, 0, sc->params.nsched_cls - 1))
			return (ERANGE);
	}

	if (check_pktsize) {
		if (p->pktsize < 0)
			return (EINVAL);
		if (!in_range(p->pktsize, 64, pi->vi[0].ifp->if_mtu))
			return (ERANGE);
	}

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		tc = &pi->sched_params->cl_rl[p->cl];
		mtx_lock(&sc->tc_lock);
		if (tc->refcount > 0 || tc->flags & (CLRL_SYNC | CLRL_ASYNC))
			rc = EBUSY;
		else {
			/* Snapshot first so a failed op can restore it. */
			old = *tc;

			tc->flags |= CLRL_SYNC | CLRL_USER;
			tc->ratemode = fw_ratemode;
			tc->rateunit = fw_rateunit;
			tc->mode = fw_mode;
			tc->maxrate = p->maxrate;
			tc->pktsize = p->pktsize;
			rc = 0;
		}
		mtx_unlock(&sc->tc_lock);
		if (rc != 0)
			return (rc);
	}

	rc = begin_synchronized_op(sc, NULL,
	    sleep_ok ? (SLEEP_OK | INTR_OK) : HOLD_LOCK, "t4sscp");
	if (rc != 0) {
		if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
			mtx_lock(&sc->tc_lock);
			*tc = old;
			mtx_unlock(&sc->tc_lock);
		}
		return (rc);
	}
	if (!hw_off_limits(sc)) {
		rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED, fw_level,
		    fw_mode, fw_rateunit, fw_ratemode, p->channel, p->cl,
		    p->minrate, p->maxrate, p->weight, p->pktsize, 0, sleep_ok);
	}
	end_synchronized_op(sc, sleep_ok ? 0 : LOCK_HELD);

	if (p->level == SCHED_CLASS_LEVEL_CL_RL) {
		mtx_lock(&sc->tc_lock);
		MPASS(tc->flags & CLRL_SYNC);
		MPASS(tc->flags & CLRL_USER);
		MPASS(tc->refcount == 0);

		tc->flags &= ~CLRL_SYNC;
		if (rc == 0)
			tc->flags &= ~CLRL_ERR;
		else
			tc->flags |= CLRL_ERR;
		mtx_unlock(&sc->tc_lock);
	}

	return (rc);
}

static void
update_tx_sched(void *context, int pending)
{
	int i, j, rc;
	struct port_info *pi;
	struct tx_cl_rl_params *tc;
	struct adapter *sc = context;
	const int n = sc->params.nsched_cls;

	mtx_lock(&sc->tc_lock);
	for_each_port(sc, i) {
		pi = sc->port[i];
		tc = &pi->sched_params->cl_rl[0];
		for (j = 0; j < n; j++, tc++) {
			MPASS(mtx_owned(&sc->tc_lock));
			if ((tc->flags & CLRL_ASYNC) == 0)
				continue;
			mtx_unlock(&sc->tc_lock);

			if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK,
			    "t4utxs") != 0) {
				mtx_lock(&sc->tc_lock);
				continue;
			}
			rc = -t4_sched_params(sc, FW_SCHED_TYPE_PKTSCHED,
			    FW_SCHED_PARAMS_LEVEL_CL_RL, tc->mode, tc->rateunit,
			    tc->ratemode, pi->tx_chan, j, 0, tc->maxrate, 0,
			    tc->pktsize, tc->burstsize, 1);
			end_synchronized_op(sc, 0);

			mtx_lock(&sc->tc_lock);
			MPASS(tc->flags & CLRL_ASYNC);
			tc->flags &= ~CLRL_ASYNC;
			if (rc == 0)
				tc->flags &= ~CLRL_ERR;
			else
				tc->flags |= CLRL_ERR;
		}
	}
	mtx_unlock(&sc->tc_lock);
}

int
t4_set_sched_class(struct adapter *sc, struct t4_sched_params *p)
{

	if (p->type != SCHED_CLASS_TYPE_PACKET)
		return (EINVAL);

	if (p->subcmd == SCHED_CLASS_SUBCMD_CONFIG)
		return (set_sched_class_config(sc, p->u.config.minmax));

	if (p->subcmd == SCHED_CLASS_SUBCMD_PARAMS)
		return (set_sched_class_params(sc, &p->u.params, 1));

	return (EINVAL);
}
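
/*
 * Illustrative sketch (not compiled): a caller might program class 0 on
 * channel 0 as a 500 Mbps aggregate (class-mode) rate limiter like this.
 * The values are examples only.
 */
#if 0
	struct t4_sched_params sp = { 0 };
	int rc;

	sp.type = SCHED_CLASS_TYPE_PACKET;
	sp.subcmd = SCHED_CLASS_SUBCMD_PARAMS;
	sp.u.params.level = SCHED_CLASS_LEVEL_CL_RL;
	sp.u.params.mode = SCHED_CLASS_MODE_CLASS;
	sp.u.params.rateunit = SCHED_CLASS_RATEUNIT_BITS;
	sp.u.params.ratemode = SCHED_CLASS_RATEMODE_ABS;
	sp.u.params.channel = 0;
	sp.u.params.cl = 0;
	sp.u.params.minrate = 0;
	sp.u.params.maxrate = 500 * 1000;	/* kbps, i.e. 500 Mbps */
	sp.u.params.weight = 0;
	sp.u.params.pktsize = 1500;
	rc = t4_set_sched_class(sc, &sp);
#endif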

static int
bind_txq_to_traffic_class(struct adapter *sc, struct sge_txq *txq, int idx)
{
	struct tx_cl_rl_params *tc0, *tc;
	int rc, old_idx;
	uint32_t fw_mnem, fw_class;

	if (!(txq->eq.flags & EQ_HW_ALLOCATED))
		return (ENXIO);

	mtx_lock(&sc->tc_lock);
	if (txq->tc_idx == -2) {
		rc = EBUSY;	/* Another bind/unbind in progress already. */
		goto done;
	}
	if (idx == txq->tc_idx) {
		rc = 0;		/* No change, nothing to do. */
		goto done;
	}

	tc0 = &sc->port[txq->eq.tx_chan]->sched_params->cl_rl[0];
	if (idx != -1) {
		/*
		 * Bind to a different class at index idx.
		 */
		tc = &tc0[idx];
		if (tc->flags & CLRL_ERR) {
			rc = ENXIO;
			goto done;
		} else {
			/*
			 * Ok to proceed.  Place a reference on the new class
			 * while still holding on to the reference on the
			 * previous class, if any.
			 */
			tc->refcount++;
		}
	}
	/* Mark as busy before letting go of the lock. */
	old_idx = txq->tc_idx;
	txq->tc_idx = -2;
	mtx_unlock(&sc->tc_lock);

	rc = begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4btxq");
	if (rc != 0)
		return (rc);
	fw_mnem = (V_FW_PARAMS_MNEM(FW_PARAMS_MNEM_DMAQ) |
	    V_FW_PARAMS_PARAM_X(FW_PARAMS_PARAM_DMAQ_EQ_SCHEDCLASS_ETH) |
	    V_FW_PARAMS_PARAM_YZ(txq->eq.cntxt_id));
	fw_class = idx < 0 ? 0xffffffff : idx;
	rc = -t4_set_params(sc, sc->mbox, sc->pf, 0, 1, &fw_mnem, &fw_class);
	end_synchronized_op(sc, 0);

	mtx_lock(&sc->tc_lock);
	MPASS(txq->tc_idx == -2);
	if (rc == 0) {
		/*
		 * Unbind, bind, or bind to a different class succeeded.  Remove
		 * the reference on the old traffic class, if any.
		 */
		if (old_idx != -1) {
			tc = &tc0[old_idx];
			MPASS(tc->refcount > 0);
			tc->refcount--;
		}
		txq->tc_idx = idx;
	} else {
		/*
		 * Unbind, bind, or bind to a different class failed.  Remove
		 * the anticipatory reference on the new traffic class, if any.
		 */
		if (idx != -1) {
			tc = &tc0[idx];
			MPASS(tc->refcount > 0);
			tc->refcount--;
		}
		txq->tc_idx = old_idx;
	}
done:
	MPASS(txq->tc_idx >= -1 && txq->tc_idx < sc->params.nsched_cls);
	mtx_unlock(&sc->tc_lock);
	return (rc);
}
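
/*
 * Usage sketch: pass a scheduling-class index to bind, or -1 to unbind
 * (the firmware sees 0xffffffff in that case, as above).  For example,
 * bind_txq_to_traffic_class(sc, txq, 3) binds txq to class 3 and
 * bind_txq_to_traffic_class(sc, txq, -1) unbinds it.
 */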

int
t4_set_sched_queue(struct adapter *sc, struct t4_sched_queue *p)
{
	struct port_info *pi = NULL;
	struct vi_info *vi;
	struct sge_txq *txq;
	int i, rc;

	if (p->port >= sc->params.nports)
		return (EINVAL);

	/*
	 * XXX: cxgbetool allows the user to specify the physical port only.  So
	 * we always operate on the main VI.
	 */
	pi = sc->port[p->port];
	vi = &pi->vi[0];

	/* Checking VI_INIT_DONE outside a synch-op is a harmless race here. */
	if (!(vi->flags & VI_INIT_DONE))
		return (EAGAIN);
	MPASS(vi->ntxq > 0);

	if (!in_range(p->queue, 0, vi->ntxq - 1) ||
	    !in_range(p->cl, 0, sc->params.nsched_cls - 1))
		return (EINVAL);

	if (p->queue < 0) {
		/*
		 * Change the scheduling on all the TX queues for the
		 * interface.
		 */
		for_each_txq(vi, i, txq) {
			rc = bind_txq_to_traffic_class(sc, txq, p->cl);
			if (rc != 0)
				break;
		}
	} else {
		/*
		 * If op.queue is non-negative, then we're only changing the
		 * scheduling on a single specified TX queue.
		 */
		txq = &sc->sge.txq[vi->first_txq + p->queue];
		rc = bind_txq_to_traffic_class(sc, txq, p->cl);
	}

	return (rc);
}
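
/*
 * Illustrative sketch (not compiled): bind every tx queue of port 0 to
 * class 3, then undo it.  A negative queue means "all tx queues of the
 * main VI" and a negative class means "unbind", as handled above.
 */
#if 0
	struct t4_sched_queue q = { 0 };
	int rc;

	q.port = 0;
	q.queue = -1;
	q.cl = 3;
	rc = t4_set_sched_queue(sc, &q);

	q.cl = -1;
	rc = t4_set_sched_queue(sc, &q);
#endif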

int
t4_init_tx_sched(struct adapter *sc)
{
	int i, j;
	const int n = sc->params.nsched_cls;
	struct port_info *pi;
	struct tx_cl_rl_params *tc;

	mtx_init(&sc->tc_lock, "tx_sched lock", NULL, MTX_DEF);
	TASK_INIT(&sc->tc_task, 0, update_tx_sched, sc);
	for_each_port(sc, i) {
		pi = sc->port[i];
		pi->sched_params = malloc(sizeof(*pi->sched_params) +
		    n * sizeof(*tc), M_CXGBE, M_ZERO | M_WAITOK);
		tc = &pi->sched_params->cl_rl[0];
		for (j = 0; j < n; j++, tc++) {
			tc->refcount = 0;
			tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
			tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
			tc->mode = FW_SCHED_PARAMS_MODE_CLASS;
			tc->maxrate = 1000 * 1000;	/* 1 Gbps.  Arbitrary */

			if (t4_sched_params_cl_rl_kbps(sc, pi->tx_chan, j,
			    tc->mode, tc->maxrate, tc->pktsize, 1) != 0)
				tc->flags = CLRL_ERR;
		}
	}

	return (0);
}

int
t4_free_tx_sched(struct adapter *sc)
{
	int i;

	taskqueue_drain(taskqueue_thread, &sc->tc_task);

	for_each_port(sc, i) {
		if (sc->port[i] != NULL)
			free(sc->port[i]->sched_params, M_CXGBE);
	}

	if (mtx_initialized(&sc->tc_lock))
		mtx_destroy(&sc->tc_lock);

	return (0);
}

void
t4_update_tx_sched(struct adapter *sc)
{

	taskqueue_enqueue(taskqueue_thread, &sc->tc_task);
}

int
t4_reserve_cl_rl_kbps(struct adapter *sc, int port_id, u_int maxrate,
    int *tc_idx)
{
	int rc = 0, fa = -1, i, pktsize, burstsize;
	bool update;
	struct tx_cl_rl_params *tc;
	struct port_info *pi;

	MPASS(port_id >= 0 && port_id < sc->params.nports);

	pi = sc->port[port_id];
	if (pi->sched_params->pktsize > 0)
		pktsize = pi->sched_params->pktsize;
	else
		pktsize = pi->vi[0].ifp->if_mtu;
	if (pi->sched_params->burstsize > 0)
		burstsize = pi->sched_params->burstsize;
	else
		burstsize = pktsize * 4;
	tc = &pi->sched_params->cl_rl[0];

	update = false;
	mtx_lock(&sc->tc_lock);
	for (i = 0; i < sc->params.nsched_cls; i++, tc++) {
		if (fa < 0 && tc->refcount == 0 && !(tc->flags & CLRL_USER))
			fa = i;		/* first available */

		if (tc->ratemode == FW_SCHED_PARAMS_RATE_ABS &&
		    tc->rateunit == FW_SCHED_PARAMS_UNIT_BITRATE &&
		    tc->mode == FW_SCHED_PARAMS_MODE_FLOW &&
		    tc->maxrate == maxrate && tc->pktsize == pktsize &&
		    tc->burstsize == burstsize) {
			tc->refcount++;
			*tc_idx = i;
			if ((tc->flags & (CLRL_ERR | CLRL_ASYNC | CLRL_SYNC)) ==
			    CLRL_ERR) {
				update = true;
			}
			goto done;
		}
	}
	/* Not found */
	MPASS(i == sc->params.nsched_cls);
	if (fa != -1) {
		tc = &pi->sched_params->cl_rl[fa];
		tc->refcount = 1;
		tc->ratemode = FW_SCHED_PARAMS_RATE_ABS;
		tc->rateunit = FW_SCHED_PARAMS_UNIT_BITRATE;
		tc->mode = FW_SCHED_PARAMS_MODE_FLOW;
		tc->maxrate = maxrate;
		tc->pktsize = pktsize;
		tc->burstsize = burstsize;
		*tc_idx = fa;
		update = true;
	} else {
		*tc_idx = -1;
		rc = ENOSPC;
	}
done:
	mtx_unlock(&sc->tc_lock);
	if (update) {
		tc->flags |= CLRL_ASYNC;
		t4_update_tx_sched(sc);
	}
	return (rc);
}
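
/*
 * Usage sketch (this is the pattern cxgbe_rate_tag_alloc below follows):
 * reserve a flow-mode class for a 100 Mbps limit and release it when the
 * flow goes away.  The reservation may match an existing class or claim a
 * free one and program it asynchronously via tc_task.
 */
#if 0
	int rc, schedcl;

	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id, 100 * 1000, &schedcl);
	if (rc == 0) {
		/* Transmit using traffic class schedcl, then ... */
		t4_release_cl_rl(sc, pi->port_id, schedcl);
	}
#endif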

void
t4_release_cl_rl(struct adapter *sc, int port_id, int tc_idx)
{
	struct tx_cl_rl_params *tc;

	MPASS(port_id >= 0 && port_id < sc->params.nports);
	MPASS(tc_idx >= 0 && tc_idx < sc->params.nsched_cls);

	mtx_lock(&sc->tc_lock);
	tc = &sc->port[port_id]->sched_params->cl_rl[tc_idx];
	MPASS(tc->refcount > 0);
	tc->refcount--;
	mtx_unlock(&sc->tc_lock);
}

int
sysctl_tc(SYSCTL_HANDLER_ARGS)
{
	struct vi_info *vi = arg1;
	struct adapter *sc = vi->adapter;
	struct sge_txq *txq;
	int qidx = arg2, rc, tc_idx;

	MPASS(qidx >= vi->first_txq && qidx < vi->first_txq + vi->ntxq);

	txq = &sc->sge.txq[qidx];
	tc_idx = txq->tc_idx;
	rc = sysctl_handle_int(oidp, &tc_idx, 0, req);
	if (rc != 0 || req->newptr == NULL)
		return (rc);

	if (sc->flags & IS_VF)
		return (EPERM);
	if (!in_range(tc_idx, 0, sc->params.nsched_cls - 1))
		return (EINVAL);

	return (bind_txq_to_traffic_class(sc, txq, tc_idx));
}
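
/*
 * This handler backs the per-txq "tc" sysctl, so an administrator can bind
 * a tx queue to a traffic class from userland, e.g. (the exact OID depends
 * on how the queue's sysctl tree is attached):
 *
 *	# sysctl dev.cxgbe.0.txq.0.tc=2
 *	# sysctl dev.cxgbe.0.txq.0.tc=-1	(unbind)
 */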

int
sysctl_tc_params(SYSCTL_HANDLER_ARGS)
{
	struct adapter *sc = arg1;
	struct tx_cl_rl_params tc;
	struct sbuf *sb;
	int i, rc, port_id, mbps, gbps;

	rc = sysctl_wire_old_buffer(req, 0);
	if (rc != 0)
		return (rc);

	sb = sbuf_new_for_sysctl(NULL, NULL, 4096, req);
	if (sb == NULL)
		return (ENOMEM);

	port_id = arg2 >> 16;
	MPASS(port_id < sc->params.nports);
	MPASS(sc->port[port_id] != NULL);
	i = arg2 & 0xffff;
	MPASS(i < sc->params.nsched_cls);

	mtx_lock(&sc->tc_lock);
	tc = sc->port[port_id]->sched_params->cl_rl[i];
	mtx_unlock(&sc->tc_lock);

	switch (tc.rateunit) {
	case SCHED_CLASS_RATEUNIT_BITS:
		switch (tc.ratemode) {
		case SCHED_CLASS_RATEMODE_REL:
			/* XXX: top speed or actual link speed? */
			gbps = port_top_speed(sc->port[port_id]);
			sbuf_printf(sb, "%u%% of %uGbps", tc.maxrate, gbps);
			break;
		case SCHED_CLASS_RATEMODE_ABS:
			mbps = tc.maxrate / 1000;
			gbps = tc.maxrate / 1000000;
			if (tc.maxrate == gbps * 1000000)
				sbuf_printf(sb, "%uGbps", gbps);
			else if (tc.maxrate == mbps * 1000)
				sbuf_printf(sb, "%uMbps", mbps);
			else
				sbuf_printf(sb, "%uKbps", tc.maxrate);
			break;
		default:
			rc = ENXIO;
			goto done;
		}
		break;
	case SCHED_CLASS_RATEUNIT_PKTS:
		sbuf_printf(sb, "%upps", tc.maxrate);
		break;
	default:
		rc = ENXIO;
		goto done;
	}

	switch (tc.mode) {
	case SCHED_CLASS_MODE_CLASS:
		sbuf_printf(sb, " aggregate");
		break;
	case SCHED_CLASS_MODE_FLOW:
		sbuf_printf(sb, " per-flow");
		if (tc.pktsize > 0)
			sbuf_printf(sb, " pkt-size %u", tc.pktsize);
		if (tc.burstsize > 0)
			sbuf_printf(sb, " burst-size %u", tc.burstsize);
		break;
	default:
		rc = ENXIO;
		goto done;
	}

done:
	if (rc == 0)
		rc = sbuf_finish(sb);
	sbuf_delete(sb);

	return (rc);
}
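
/*
 * Sample strings produced by the handler above: "10Gbps aggregate" for an
 * absolute class-mode limiter, "85% of 10Gbps aggregate" for a relative
 * one, and "500Mbps per-flow pkt-size 1500 burst-size 6000" for a
 * flow-mode class.
 */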

#ifdef RATELIMIT
void
t4_init_etid_table(struct adapter *sc)
{
	int i;
	struct tid_info *t;

	if (!is_ethoffload(sc))
		return;

	t = &sc->tids;
	MPASS(t->netids > 0);

	mtx_init(&t->etid_lock, "etid lock", NULL, MTX_DEF);
	t->etid_tab = malloc(sizeof(*t->etid_tab) * t->netids, M_CXGBE,
			M_ZERO | M_WAITOK);
	t->efree = t->etid_tab;
	t->etids_in_use = 0;
	for (i = 1; i < t->netids; i++)
		t->etid_tab[i - 1].next = &t->etid_tab[i];
	t->etid_tab[t->netids - 1].next = NULL;
}
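
/*
 * The free list is embedded in the table itself: efree points at the first
 * free entry and each entry's next field points at the following one, so
 * right after initialization the layout is
 *
 *	efree -> etid_tab[0] -> etid_tab[1] -> ... -> etid_tab[netids - 1] -> NULL
 *
 * alloc_etid() pops the head and free_etid() pushes onto it, both O(1)
 * under etid_lock.
 */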

void
t4_free_etid_table(struct adapter *sc)
{
	struct tid_info *t;

	if (!is_ethoffload(sc))
		return;

	t = &sc->tids;
	MPASS(t->netids > 0);

	free(t->etid_tab, M_CXGBE);
	t->etid_tab = NULL;

	if (mtx_initialized(&t->etid_lock))
		mtx_destroy(&t->etid_lock);
}

/* etid services */
static int alloc_etid(struct adapter *, struct cxgbe_rate_tag *);
static void free_etid(struct adapter *, int);

static int
alloc_etid(struct adapter *sc, struct cxgbe_rate_tag *cst)
{
	struct tid_info *t = &sc->tids;
	int etid = -1;

	mtx_lock(&t->etid_lock);
	if (t->efree) {
		union etid_entry *p = t->efree;

		etid = p - t->etid_tab + t->etid_base;
		t->efree = p->next;
		p->cst = cst;
		t->etids_in_use++;
	}
	mtx_unlock(&t->etid_lock);
	return (etid);
}

struct cxgbe_rate_tag *
lookup_etid(struct adapter *sc, int etid)
{
	struct tid_info *t = &sc->tids;

	return (t->etid_tab[etid - t->etid_base].cst);
}

static void
free_etid(struct adapter *sc, int etid)
{
	struct tid_info *t = &sc->tids;
	union etid_entry *p = &t->etid_tab[etid - t->etid_base];

	mtx_lock(&t->etid_lock);
	p->next = t->efree;
	t->efree = p;
	t->etids_in_use--;
	mtx_unlock(&t->etid_lock);
}

int
cxgbe_rate_tag_alloc(struct ifnet *ifp, union if_snd_tag_alloc_params *params,
    struct m_snd_tag **pt)
{
	int rc, schedcl;
	struct vi_info *vi = ifp->if_softc;
	struct port_info *pi = vi->pi;
	struct adapter *sc = pi->adapter;
	struct cxgbe_rate_tag *cst;

	MPASS(params->hdr.type == IF_SND_TAG_TYPE_RATE_LIMIT);

	rc = t4_reserve_cl_rl_kbps(sc, pi->port_id,
	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
	if (rc != 0)
		return (rc);
	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);

	cst = malloc(sizeof(*cst), M_CXGBE, M_ZERO | M_NOWAIT);
	if (cst == NULL) {
failed:
		t4_release_cl_rl(sc, pi->port_id, schedcl);
		return (ENOMEM);
	}

	cst->etid = alloc_etid(sc, cst);
	if (cst->etid < 0) {
		free(cst, M_CXGBE);
		goto failed;
	}

	mtx_init(&cst->lock, "cst_lock", NULL, MTX_DEF);
	mbufq_init(&cst->pending_tx, INT_MAX);
	mbufq_init(&cst->pending_fwack, INT_MAX);
	m_snd_tag_init(&cst->com, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
	cst->flags |= EO_FLOWC_PENDING | EO_SND_TAG_REF;
	cst->adapter = sc;
	cst->port_id = pi->port_id;
	cst->schedcl = schedcl;
	cst->max_rate = params->rate_limit.max_rate;
	cst->tx_credits = sc->params.eo_wr_cred;
	cst->tx_total = cst->tx_credits;
	cst->plen = 0;
	cst->ctrl0 = htobe32(V_TXPKT_OPCODE(CPL_TX_PKT_XT) |
	    V_TXPKT_INTF(pi->tx_chan) | V_TXPKT_PF(sc->pf) |
	    V_TXPKT_VF(vi->vin) | V_TXPKT_VF_VLD(vi->vfvld));

	/*
	 * Queues will be selected later when the connection flowid is available.
	 */

	*pt = &cst->com;
	return (0);
}
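
/*
 * The snd_tag rate (params->rate_limit.max_rate) is in bytes per second
 * while the traffic classes are programmed in kbps, hence the
 * "* 8ULL / 1000" conversion above.  For example, a max_rate of 12500000
 * B/s works out to 12500000 * 8 / 1000 = 100000 kbps, i.e. a 100 Mbps
 * class.
 */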

/*
 * Change in parameters, no change in ifp.
 */
int
cxgbe_rate_tag_modify(struct m_snd_tag *mst,
    union if_snd_tag_modify_params *params)
{
	int rc, schedcl;
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);
	struct adapter *sc = cst->adapter;

	/* XXX: is schedcl -1 ok here? */
	MPASS(cst->schedcl >= 0 && cst->schedcl < sc->params.nsched_cls);

	mtx_lock(&cst->lock);
	MPASS(cst->flags & EO_SND_TAG_REF);
	rc = t4_reserve_cl_rl_kbps(sc, cst->port_id,
	    (params->rate_limit.max_rate * 8ULL / 1000), &schedcl);
	if (rc != 0) {
		mtx_unlock(&cst->lock);
		return (rc);
	}
	MPASS(schedcl >= 0 && schedcl < sc->params.nsched_cls);
	t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
	cst->schedcl = schedcl;
	cst->max_rate = params->rate_limit.max_rate;
	mtx_unlock(&cst->lock);

	return (0);
}

int
cxgbe_rate_tag_query(struct m_snd_tag *mst,
    union if_snd_tag_query_params *params)
{
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);

	params->rate_limit.max_rate = cst->max_rate;

#define CST_TO_MST_QLEVEL_SCALE (IF_SND_QUEUE_LEVEL_MAX / cst->tx_total)
	params->rate_limit.queue_level =
		(cst->tx_total - cst->tx_credits) * CST_TO_MST_QLEVEL_SCALE;

	return (0);
}
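
/*
 * Worked example for the scaling above: with tx_total = 1000 credits the
 * scale is IF_SND_QUEUE_LEVEL_MAX / 1000, so a tag that has 250 credits
 * outstanding (tx_credits = 750) reports a queue_level of one quarter of
 * IF_SND_QUEUE_LEVEL_MAX.
 */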

/*
 * Unlocks cst and frees it.
 */
void
cxgbe_rate_tag_free_locked(struct cxgbe_rate_tag *cst)
{
	struct adapter *sc = cst->adapter;

	mtx_assert(&cst->lock, MA_OWNED);
	MPASS((cst->flags & EO_SND_TAG_REF) == 0);
	MPASS(cst->tx_credits == cst->tx_total);
	MPASS(cst->plen == 0);
	MPASS(mbufq_first(&cst->pending_tx) == NULL);
	MPASS(mbufq_first(&cst->pending_fwack) == NULL);

	if (cst->etid >= 0)
		free_etid(sc, cst->etid);
	if (cst->schedcl != -1)
		t4_release_cl_rl(sc, cst->port_id, cst->schedcl);
	mtx_unlock(&cst->lock);
	mtx_destroy(&cst->lock);
	free(cst, M_CXGBE);
}

void
cxgbe_rate_tag_free(struct m_snd_tag *mst)
{
	struct cxgbe_rate_tag *cst = mst_to_crt(mst);

	mtx_lock(&cst->lock);

	/* The kernel is done with the snd_tag.  Remove its reference. */
	MPASS(cst->flags & EO_SND_TAG_REF);
	cst->flags &= ~EO_SND_TAG_REF;

	if (cst->ncompl == 0) {
		/*
		 * No fw4_ack in flight.  Free the tag right away if there are
		 * no outstanding credits.  Request the firmware to return all
		 * credits for the etid otherwise.
		 */
		if (cst->tx_credits == cst->tx_total) {
			cxgbe_rate_tag_free_locked(cst);
			return;	/* cst is gone. */
		}
		send_etid_flush_wr(cst);
	}
	mtx_unlock(&cst->lock);
}

void
cxgbe_ratelimit_query(struct ifnet *ifp, struct if_ratelimit_query_results *q)
{
	struct vi_info *vi = ifp->if_softc;
	struct adapter *sc = vi->adapter;

	q->rate_table = NULL;
	q->flags = RT_IS_SELECTABLE;
	/*
	 * Absolute max limits from the firmware configuration.  Practical
	 * limits depend on the burstsize, pktsize (ifp->if_mtu ultimately) and
	 * the card's cclk.
	 */
	q->max_flows = sc->tids.netids;
	q->number_of_rates = sc->params.nsched_cls;
	q->min_segment_burst = 4; /* matches PKTSCHED_BURST in the firmware. */

#if 1
	if (chip_id(sc) < CHELSIO_T6) {
		/* Based on testing by rrs@ with a T580 at burstsize = 4. */
		MPASS(q->min_segment_burst == 4);
		q->max_flows = min(4000, q->max_flows);
	} else {
		/* XXX: TBD, carried forward from T5 for now. */
		q->max_flows = min(4000, q->max_flows);
	}

	/*
	 * XXX: tcp_ratelimit.c grabs all available rates on link-up before it
	 * even knows whether hw pacing will be used or not.  This prevents
	 * other consumers like SO_MAX_PACING_RATE or those using cxgbetool or
	 * the private ioctls from using any of the traffic classes.
	 *
	 * Underreport the number of rates to tcp_ratelimit so that it doesn't
	 * hog all of them.  This can be removed if/when tcp_ratelimit switches
	 * to making its allocations on first-use rather than link-up.  There is
	 * nothing wrong with one particular consumer reserving all the classes
	 * but it should do so only if it'll actually use hw rate limiting.
	 */
	q->number_of_rates /= 4;
#endif
}
#endif