xref: /freebsd/sys/dev/mlx5/mlx5_en/mlx5_en_rl.c (revision 7c3eff94)
1 /*-
2  * Copyright (c) 2016-2020 Mellanox Technologies. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  * 1. Redistributions of source code must retain the above copyright
8  *    notice, this list of conditions and the following disclaimer.
9  * 2. Redistributions in binary form must reproduce the above copyright
10  *    notice, this list of conditions and the following disclaimer in the
11  *    documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS `AS IS' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  *
25  * $FreeBSD$
26  */
27 
28 #include "en.h"
29 
30 #ifdef RATELIMIT
31 
32 static int mlx5e_rl_open_workers(struct mlx5e_priv *);
33 static void mlx5e_rl_close_workers(struct mlx5e_priv *);
34 static int mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS);
35 static void mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *, unsigned x,
36     struct sysctl_oid *, const char *name, const char *desc);
37 static void mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
38       struct sysctl_oid *node, const char *name, const char *desc);
39 static int mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *, uint64_t value);
40 static int mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *, uint64_t value);
41 
42 static void
43 mlx5e_rl_build_sq_param(struct mlx5e_rl_priv_data *rl,
44     struct mlx5e_sq_param *param)
45 {
46 	void *sqc = param->sqc;
47 	void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
48 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
49 
50 	MLX5_SET(wq, wq, log_wq_sz, log_sq_size);
51 	MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB));
52 	MLX5_SET(wq, wq, pd, rl->priv->pdn);
53 
54 	param->wq.linear = 1;
55 }
56 
57 static void
58 mlx5e_rl_build_cq_param(struct mlx5e_rl_priv_data *rl,
59     struct mlx5e_cq_param *param)
60 {
61 	void *cqc = param->cqc;
62 	uint8_t log_sq_size = order_base_2(rl->param.tx_queue_size);
63 
64 	MLX5_SET(cqc, cqc, log_cq_size, log_sq_size);
65 	MLX5_SET(cqc, cqc, cq_period, rl->param.tx_coalesce_usecs);
66 	MLX5_SET(cqc, cqc, cq_max_count, rl->param.tx_coalesce_pkts);
67 	MLX5_SET(cqc, cqc, uar_page, rl->priv->mdev->priv.uar->index);
68 
69 	switch (rl->param.tx_coalesce_mode) {
70 	case 0:
71 		MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
72 		break;
73 	default:
74 		if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_start_from_cqe))
75 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_CQE);
76 		else
77 			MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
78 		break;
79 	}
80 }
81 
/*
 * Build both the SQ and CQ firmware parameters for a rate limit
 * channel, starting from an all-zero parameter structure.
 */
static void
mlx5e_rl_build_channel_param(struct mlx5e_rl_priv_data *rl,
    struct mlx5e_rl_channel_param *cparam)
{
	memset(cparam, 0, sizeof(*cparam));

	mlx5e_rl_build_sq_param(rl, &cparam->sq);
	mlx5e_rl_build_cq_param(rl, &cparam->cq);
}
91 
/*
 * Allocate the software resources for a rate limited send queue:
 * DMA tag, cyclic work queue and per-entry mbuf bookkeeping.
 *
 * Returns zero on success, or a negative errno value on failure.
 * On failure all partially allocated resources are released via the
 * goto-unwind labels below. NOTE: the "ix" argument is currently
 * unused by this function.
 */
static int
mlx5e_rl_create_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	struct mlx5_core_dev *mdev = priv->mdev;
	void *sqc = param->sqc;
	void *sqc_wq = MLX5_ADDR_OF(sqc, sqc, wq);
	int err;

	/* Create DMA descriptor TAG */
	if ((err = -bus_dma_tag_create(
	    bus_get_dma_tag(mdev->pdev->dev.bsddev),
	    1,				/* any alignment */
	    0,				/* no boundary */
	    BUS_SPACE_MAXADDR,		/* lowaddr */
	    BUS_SPACE_MAXADDR,		/* highaddr */
	    NULL, NULL,			/* filter, filterarg */
	    MLX5E_MAX_TX_PAYLOAD_SIZE,	/* maxsize */
	    MLX5E_MAX_TX_MBUF_FRAGS,	/* nsegments */
	    MLX5E_MAX_TX_MBUF_SIZE,	/* maxsegsize */
	    0,				/* flags */
	    NULL, NULL,			/* lockfunc, lockfuncarg */
	    &sq->dma_tag)))
		goto done;

	/* memory key for the whole TX payload area, in network byte order */
	sq->mkey_be = cpu_to_be32(priv->mr.key);
	sq->ifp = priv->ifp;
	sq->priv = priv;

	err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq,
	    &sq->wq_ctrl);
	if (err)
		goto err_free_dma_tag;

	/* point the doorbell record at the send queue slot */
	sq->wq.db = &sq->wq.db[MLX5_SND_DBR];

	err = mlx5e_alloc_sq_db(sq);
	if (err)
		goto err_sq_wq_destroy;

	mlx5e_update_sq_inline(sq);

	return (0);

err_sq_wq_destroy:
	mlx5_wq_destroy(&sq->wq_ctrl);
err_free_dma_tag:
	bus_dma_tag_destroy(sq->dma_tag);
done:
	return (err);
}
143 
/*
 * Release the software resources of a rate limited send queue in
 * reverse order of allocation in mlx5e_rl_create_sq().
 */
static void
mlx5e_rl_destroy_sq(struct mlx5e_sq *sq)
{

	mlx5e_free_sq_db(sq);
	mlx5_wq_destroy(&sq->wq_ctrl);
	bus_dma_tag_destroy(sq->dma_tag);
}
152 
/*
 * Create, enable and start a rate limited send queue. The SQ is
 * moved from reset to ready state and marked running on success.
 *
 * Returns zero on success, else an error code; on failure the SQ is
 * fully torn down again via the unwind labels.
 */
static int
mlx5e_rl_open_sq(struct mlx5e_priv *priv, struct mlx5e_sq *sq,
    struct mlx5e_sq_param *param, int ix)
{
	int err;

	err = mlx5e_rl_create_sq(priv, sq, param, ix);
	if (err)
		return (err);

	/* use the ratelimit TIS domain, not the regular TX one */
	err = mlx5e_enable_sq(sq, param, &priv->channel[ix].bfreg, priv->rl.tisn);
	if (err)
		goto err_destroy_sq;

	err = mlx5e_modify_sq(sq, MLX5_SQC_STATE_RST, MLX5_SQC_STATE_RDY);
	if (err)
		goto err_disable_sq;

	/* publish the running state to the transmit path */
	WRITE_ONCE(sq->running, 1);

	return (0);

err_disable_sq:
	mlx5e_disable_sq(sq);
err_destroy_sq:
	mlx5e_rl_destroy_sq(sq);

	return (err);
}
182 
/*
 * Initialize the mutexes, completion event callout and completion
 * event factor of a rate limit send queue.
 */
static void
mlx5e_rl_chan_mtx_init(struct mlx5e_priv *priv, struct mlx5e_sq *sq)
{
	mtx_init(&sq->lock, "mlx5tx-rl", NULL, MTX_DEF);
	mtx_init(&sq->comp_lock, "mlx5comp-rl", NULL, MTX_DEF);

	callout_init_mtx(&sq->cev_callout, &sq->lock, 0);

	sq->cev_factor = priv->rl.param.tx_completion_fact;

	/* ensure the TX completion event factor is not zero */
	if (sq->cev_factor == 0)
		sq->cev_factor = 1;
}
197 
/*
 * Allocate and start a complete rate limit TX channel (CQ + SQ) and
 * publish the new SQ through "ppsq". "eq_ix" selects the completion
 * event vector. Returns zero on success, else an error code; the
 * allocation failure statistics counter is bumped on any failure.
 */
static int
mlx5e_rl_open_channel(struct mlx5e_rl_worker *rlw, int eq_ix,
    struct mlx5e_rl_channel_param *cparam,
    struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_priv *priv = rlw->priv;
	struct mlx5e_sq *sq;
	int err;

	sq = malloc(sizeof(*sq), M_MLX5EN, M_WAITOK | M_ZERO);

	/* init mutexes */
	mlx5e_rl_chan_mtx_init(priv, sq);

	/* open TX completion queue */
	err = mlx5e_open_cq(priv, &cparam->cq, &sq->cq,
	    &mlx5e_tx_cq_comp, eq_ix);
	if (err)
		goto err_free;

	err = mlx5e_rl_open_sq(priv, sq, &cparam->sq, eq_ix);
	if (err)
		goto err_close_tx_cq;

	/* store TX channel pointer */
	*ppsq = sq;

	/* poll TX queue initially to process any pending completions */
	sq->cq.mcq.comp(&sq->cq.mcq, NULL);

	return (0);

err_close_tx_cq:
	mlx5e_close_cq(&sq->cq);

err_free:
	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);
	free(sq, M_MLX5EN);
	atomic_add_64(&priv->rl.stats.tx_allocate_resource_failure, 1ULL);
	return (err);
}
241 
/*
 * Stop and free a rate limit TX channel previously created by
 * mlx5e_rl_open_channel(). The published SQ pointer is cleared
 * first so no new users can pick it up; calling this on an already
 * closed channel (NULL pointer) is a no-op.
 */
static void
mlx5e_rl_close_channel(struct mlx5e_sq *volatile *ppsq)
{
	struct mlx5e_sq *sq = *ppsq;

	/* check if channel is already closed */
	if (sq == NULL)
		return;
	/* ensure channel pointer is no longer used */
	*ppsq = NULL;

	/* teardown and destroy SQ */
	mlx5e_drain_sq(sq);
	mlx5e_disable_sq(sq);
	mlx5e_rl_destroy_sq(sq);

	/* close CQ */
	mlx5e_close_cq(&sq->cq);

	/* destroy mutexes */
	mtx_destroy(&sq->lock);
	mtx_destroy(&sq->comp_lock);

	free(sq, M_MLX5EN);
}
267 
268 static void
269 mlx5e_rl_sync_tx_completion_fact(struct mlx5e_rl_priv_data *rl)
270 {
271 	/*
272 	 * Limit the maximum distance between completion events to
273 	 * half of the currently set TX queue size.
274 	 *
275 	 * The maximum number of queue entries a single IP packet can
276 	 * consume is given by MLX5_SEND_WQE_MAX_WQEBBS.
277 	 *
278 	 * The worst case max value is then given as below:
279 	 */
280 	uint64_t max = rl->param.tx_queue_size /
281 	    (2 * MLX5_SEND_WQE_MAX_WQEBBS);
282 
283 	/*
284 	 * Update the maximum completion factor value in case the
285 	 * tx_queue_size field changed. Ensure we don't overflow
286 	 * 16-bits.
287 	 */
288 	if (max < 1)
289 		max = 1;
290 	else if (max > 65535)
291 		max = 65535;
292 	rl->param.tx_completion_fact_max = max;
293 
294 	/*
295 	 * Verify that the current TX completion factor is within the
296 	 * given limits:
297 	 */
298 	if (rl->param.tx_completion_fact < 1)
299 		rl->param.tx_completion_fact = 1;
300 	else if (rl->param.tx_completion_fact > max)
301 		rl->param.tx_completion_fact = max;
302 }
303 
/*
 * Issue a MODIFY_SQ firmware command that updates the packet pacing
 * rate limit index of an already running (RDY state) send queue.
 *
 * Returns zero on success, or a negative errno value on failure
 * (including -ENOMEM when the command buffer cannot be allocated).
 */
static int
mlx5e_rl_modify_sq(struct mlx5e_sq *sq, uint16_t rl_index)
{
	struct mlx5e_priv *priv = sq->priv;
	struct mlx5_core_dev *mdev = priv->mdev;

	void *in;
	void *sqc;
	int inlen;
	int err;

	inlen = MLX5_ST_SZ_BYTES(modify_sq_in);
	in = mlx5_vzalloc(inlen);
	if (in == NULL)
		return (-ENOMEM);

	sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx);

	/* keep the SQ in ready state, only modify the rate index */
	MLX5_SET(modify_sq_in, in, sqn, sq->sqn);
	MLX5_SET(modify_sq_in, in, sq_state, MLX5_SQC_STATE_RDY);
	MLX5_SET64(modify_sq_in, in, modify_bitmask, 1);
	MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RDY);
	MLX5_SET(sqc, sqc, packet_pacing_rate_limit_index, rl_index);

	err = mlx5_core_modify_sq(mdev, in, inlen);

	kvfree(in);

	return (err);
}
334 
335 /*
336  * This function will search the configured rate limit table for the
337  * best match to avoid that a single socket based application can
338  * allocate all the available hardware rates. If the user selected
 * rate deviates too much from the closest rate available in the rate
340  * limit table, unlimited rate will be selected.
341  */
342 static uint64_t
343 mlx5e_rl_find_best_rate_locked(struct mlx5e_rl_priv_data *rl, uint64_t user_rate)
344 {
345 	uint64_t distance = -1ULL;
346 	uint64_t diff;
347 	uint64_t retval = 0;		/* unlimited */
348 	uint64_t x;
349 
350 	/* search for closest rate */
351 	for (x = 0; x != rl->param.tx_rates_def; x++) {
352 		uint64_t rate = rl->rate_limit_table[x];
353 		if (rate == 0)
354 			continue;
355 
356 		if (rate > user_rate)
357 			diff = rate - user_rate;
358 		else
359 			diff = user_rate - rate;
360 
361 		/* check if distance is smaller than previous rate */
362 		if (diff < distance) {
363 			distance = diff;
364 			retval = rate;
365 		}
366 	}
367 
368 	/* range check for multiplication below */
369 	if (user_rate > rl->param.tx_limit_max)
370 		user_rate = rl->param.tx_limit_max;
371 
372 	/* fallback to unlimited, if rate deviates too much */
373 	if (distance > howmany(user_rate *
374 	    rl->param.tx_allowed_deviation, 1000ULL))
375 		retval = 0;
376 
377 	return (retval);
378 }
379 
380 /*
381  * This function sets the requested rate for a rate limit channel, in
382  * bits per second. The requested rate will be filtered through the
383  * find best rate function above.
384  */
/*
 * Set the rate of a rate limit channel, in bits per second, while
 * holding the worker lock. The requested rate is filtered through
 * mlx5e_rl_find_best_rate_locked() above; a rate of zero selects
 * unlimited operation.
 *
 * NOTE: the worker lock is temporarily dropped in several places
 * below (for firmware commands and to take the ratelimit read
 * lock), so channel state may be observed by others in between.
 *
 * Returns zero on success, else a positive error code.
 */
static int
mlx5e_rlw_channel_set_rate_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel, uint64_t rate)
{
	struct mlx5e_rl_priv_data *rl = &rlw->priv->rl;
	struct mlx5e_sq *sq;
	uint64_t temp;
	uint16_t index;
	uint16_t burst;
	int error;

	if (rate != 0) {
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(rl);

		/* get current burst size in bytes */
		temp = rl->param.tx_burst_size *
		    MLX5E_SW2HW_MTU(rlw->priv->ifp->if_mtu);

		/* limit burst size to 64K currently */
		if (temp > 65535)
			temp = 65535;
		burst = temp;

		/* find best rate */
		rate = mlx5e_rl_find_best_rate_locked(rl, rate);

		MLX5E_RL_RUNLOCK(rl);

		if (rate == 0) {
			/* rate doesn't exist, fallback to unlimited */
			index = 0;
			rate = 0;
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
		} else {
			/* get a reference on the new rate */
			error = -mlx5_rl_add_rate(rlw->priv->mdev,
			    howmany(rate, 1000), burst, &index);

			if (error != 0) {
				/* adding rate failed, fallback to unlimited */
				index = 0;
				rate = 0;
				atomic_add_64(&rlw->priv->rl.stats.tx_add_new_rate_failure, 1ULL);
			}
		}
		MLX5E_RL_WORKER_LOCK(rlw);
	} else {
		index = 0;
		burst = 0;	/* default */
	}

	/* atomically swap rates */
	temp = channel->last_rate;
	channel->last_rate = rate;
	rate = temp;

	/* atomically swap burst size */
	temp = channel->last_burst;
	channel->last_burst = burst;
	burst = temp;

	MLX5E_RL_WORKER_UNLOCK(rlw);
	/* put reference on the old rate, if any */
	if (rate != 0) {
		mlx5_rl_remove_rate(rlw->priv->mdev,
		    howmany(rate, 1000), burst);
	}

	/* set new rate, if SQ is running */
	sq = channel->sq;
	if (sq != NULL && READ_ONCE(sq->running) != 0) {
		error = mlx5e_rl_modify_sq(sq, index);
		if (error != 0)
			atomic_add_64(&rlw->priv->rl.stats.tx_modify_rate_failure, 1ULL);
	} else
		error = 0;
	MLX5E_RL_WORKER_LOCK(rlw);

	/* negate: internal errors are negative errno values */
	return (-error);
}
467 
/*
 * Main loop of a rate limit worker thread.
 *
 * The worker first opens the SQs of its channels, then sleeps on
 * its condition variable processing channel state change requests
 * (MODIFY/DESTROY) from the process queue until "worker_done" is
 * set. On teardown all channels are closed again; "worker_done" is
 * reset to zero and the condition variable broadcast to signal the
 * teardown handshake in mlx5e_rl_close_workers().
 */
static void
mlx5e_rl_worker(void *arg)
{
	struct thread *td;
	struct mlx5e_rl_worker *rlw = arg;
	struct mlx5e_rl_channel *channel;
	struct mlx5e_priv *priv;
	unsigned ix;
	uint64_t x;
	int error;

	/* set thread priority */
	td = curthread;

	thread_lock(td);
	sched_prio(td, PI_SWI(SWI_NET));
	thread_unlock(td);

	priv = rlw->priv;

	/* compute completion vector */
	ix = (rlw - priv->rl.workers) %
	    priv->mdev->priv.eq_table.num_comp_vectors;

	/* TODO bind to CPU */

	/* open all the SQs */
	MLX5E_RL_WORKER_LOCK(rlw);
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		/* NOTE: shadows the outer "channel" variable */
		struct mlx5e_rl_channel *channel = rlw->channels + x;

#if !defined(HAVE_RL_PRE_ALLOCATE_CHANNELS)
		/* without pre-allocation only open channels in use */
		if (channel->state == MLX5E_RL_ST_FREE)
			continue;
#endif
		MLX5E_RL_WORKER_UNLOCK(rlw);

		MLX5E_RL_RLOCK(&priv->rl);
		error = mlx5e_rl_open_channel(rlw, ix,
		    &priv->rl.chan_param, &channel->sq);
		MLX5E_RL_RUNLOCK(&priv->rl);

		MLX5E_RL_WORKER_LOCK(rlw);
		if (error != 0) {
			mlx5_en_err(priv->ifp,
			    "mlx5e_rl_open_channel failed: %d\n", error);
			break;
		}
		/* restore the rate the channel had before a restart */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, channel->init_rate);
	}
	/* process channel state change requests until teardown */
	while (1) {
		if (STAILQ_FIRST(&rlw->process_head) == NULL) {
			/* check if we are tearing down */
			if (rlw->worker_done != 0)
				break;
			cv_wait(&rlw->cv, &rlw->mtx);
		}
		/* check if we are tearing down */
		if (rlw->worker_done != 0)
			break;
		channel = STAILQ_FIRST(&rlw->process_head);
		if (channel != NULL) {
			STAILQ_REMOVE_HEAD(&rlw->process_head, entry);

			switch (channel->state) {
			case MLX5E_RL_ST_MODIFY:
				channel->state = MLX5E_RL_ST_USED;
				MLX5E_RL_WORKER_UNLOCK(rlw);

				/* create channel by demand */
				if (channel->sq == NULL) {
					MLX5E_RL_RLOCK(&priv->rl);
					error = mlx5e_rl_open_channel(rlw, ix,
					    &priv->rl.chan_param, &channel->sq);
					MLX5E_RL_RUNLOCK(&priv->rl);

					if (error != 0) {
						mlx5_en_err(priv->ifp,
						    "mlx5e_rl_open_channel failed: %d\n", error);
					} else {
						atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, 1ULL);
					}
				} else {
					mlx5e_resume_sq(channel->sq);
				}

				MLX5E_RL_WORKER_LOCK(rlw);
				/* convert from bytes/s to bits/s and set new rate */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel,
				    channel->new_rate * 8ULL);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				break;

			case MLX5E_RL_ST_DESTROY:
				/* release the firmware rate reference */
				error = mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);
				if (error != 0) {
					mlx5_en_err(priv->ifp,
					    "mlx5e_rlw_channel_set_rate_locked failed: %d\n",
					    error);
				}
				if (channel->sq != NULL) {
					/*
					 * Make sure all packets are
					 * transmitted before SQ is
					 * returned to free list:
					 */
					MLX5E_RL_WORKER_UNLOCK(rlw);
					mlx5e_drain_sq(channel->sq);
					MLX5E_RL_WORKER_LOCK(rlw);
				}
				/* put the channel back into the free list */
				STAILQ_INSERT_HEAD(&rlw->index_list_head, channel, entry);
				channel->state = MLX5E_RL_ST_FREE;
				atomic_add_64(&priv->rl.stats.tx_active_connections, -1ULL);
				break;
			default:
				/* NOP */
				break;
			}
		}
	}

	/* close all the SQs */
	for (x = 0; x < priv->rl.param.tx_channels_per_worker_def; x++) {
		struct mlx5e_rl_channel *channel = rlw->channels + x;

		/* update the initial rate */
		channel->init_rate = channel->last_rate;

		/* make sure we free up the rate resource */
		mlx5e_rlw_channel_set_rate_locked(rlw, channel, 0);

		if (channel->sq != NULL) {
			MLX5E_RL_WORKER_UNLOCK(rlw);
			mlx5e_rl_close_channel(&channel->sq);
			atomic_add_64(&rlw->priv->rl.stats.tx_open_queues, -1ULL);
			MLX5E_RL_WORKER_LOCK(rlw);
		}
	}

	/* signal completion of teardown to mlx5e_rl_close_workers() */
	rlw->worker_done = 0;
	cv_broadcast(&rlw->cv);
	MLX5E_RL_WORKER_UNLOCK(rlw);

	kthread_exit();
}
618 
619 static int
620 mlx5e_rl_open_tis(struct mlx5e_priv *priv)
621 {
622 	struct mlx5_core_dev *mdev = priv->mdev;
623 	u32 in[MLX5_ST_SZ_DW(create_tis_in)];
624 	void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx);
625 
626 	memset(in, 0, sizeof(in));
627 
628 	MLX5_SET(tisc, tisc, prio, 0);
629 	MLX5_SET(tisc, tisc, transport_domain, priv->tdn);
630 
631 	return (mlx5_core_create_tis(mdev, in, sizeof(in), &priv->rl.tisn));
632 }
633 
/*
 * Destroy the TIS object created by mlx5e_rl_open_tis().
 */
static void
mlx5e_rl_close_tis(struct mlx5e_priv *priv)
{
	mlx5_core_destroy_tis(priv->mdev, priv->rl.tisn);
}
639 
/*
 * Fill in the default ratelimit parameters, deriving worker thread
 * count, channel counts and rate table limits from the device's
 * capabilities and applying range checks against the compile time
 * maximum values.
 */
static void
mlx5e_rl_set_default_params(struct mlx5e_rl_params *param,
    struct mlx5_core_dev *mdev)
{
	/* ratelimit workers */
	param->tx_worker_threads_def = mdev->priv.eq_table.num_comp_vectors;
	param->tx_worker_threads_max = MLX5E_RL_MAX_WORKERS;

	/* range check */
	if (param->tx_worker_threads_def == 0 ||
	    param->tx_worker_threads_def > param->tx_worker_threads_max)
		param->tx_worker_threads_def = param->tx_worker_threads_max;

	/* ratelimit channels */
	param->tx_channels_per_worker_def = MLX5E_RL_MAX_SQS /
	    param->tx_worker_threads_def;
	param->tx_channels_per_worker_max = MLX5E_RL_MAX_SQS;

	/* range check */
	if (param->tx_channels_per_worker_def > MLX5E_RL_DEF_SQ_PER_WORKER)
		param->tx_channels_per_worker_def = MLX5E_RL_DEF_SQ_PER_WORKER;

	/* set default burst size */
	param->tx_burst_size = 4;	/* MTUs */

	/*
	 * Set maximum burst size
	 *
	 * The burst size is multiplied by the MTU and clamped to the
	 * range 0 ... 65535 bytes inclusively before fed into the
	 * firmware.
	 *
	 * NOTE: If the burst size or MTU is changed only ratelimit
	 * connections made after the change will use the new burst
	 * size.
	 */
	param->tx_burst_size_max = 255;

	/* get firmware rate limits in 1000bit/s and convert them to bit/s */
	param->tx_limit_min = mdev->priv.rl_table.min_rate * 1000ULL;
	param->tx_limit_max = mdev->priv.rl_table.max_rate * 1000ULL;

	/* ratelimit table size */
	param->tx_rates_max = mdev->priv.rl_table.max_size;

	/* range check */
	if (param->tx_rates_max > MLX5E_RL_MAX_TX_RATES)
		param->tx_rates_max = MLX5E_RL_MAX_TX_RATES;

	/* set default number of rates */
	param->tx_rates_def = param->tx_rates_max;

	/* set maximum allowed rate deviation */
	if (param->tx_limit_max != 0) {
		/*
		 * Make sure the deviation multiplication doesn't
		 * overflow unsigned 64-bit:
		 */
		param->tx_allowed_deviation_max = -1ULL /
		    param->tx_limit_max;
	}
	/* set default rate deviation, in units of 1/1000th */
	param->tx_allowed_deviation = 50;	/* 5.0% */

	/* channel parameters */
	param->tx_queue_size = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
	param->tx_coalesce_usecs = MLX5E_RL_TX_COAL_USEC_DEFAULT;
	param->tx_coalesce_pkts = MLX5E_RL_TX_COAL_PKTS_DEFAULT;
	param->tx_coalesce_mode = MLX5E_RL_TX_COAL_MODE_DEFAULT;
	param->tx_completion_fact = MLX5E_RL_TX_COMP_FACT_DEFAULT;
}
711 
/* sysctl name/description string pairs for the ratelimit parameters */
static const char *mlx5e_rl_params_desc[] = {
	MLX5E_RL_PARAMS(MLX5E_STATS_DESC)
};

/* sysctl name/description string pairs for the rate table parameters */
static const char *mlx5e_rl_table_params_desc[] = {
	MLX5E_RL_TABLE_PARAMS(MLX5E_STATS_DESC)
};

/* sysctl name/description string pairs for the ratelimit statistics */
static const char *mlx5e_rl_stats_desc[] = {
	MLX5E_RL_STATS(MLX5E_STATS_DESC)
};
723 
/*
 * Initialize the ratelimit subsystem of an mlx5en interface:
 * create the TIS domain, install the sysctl tree, allocate the
 * worker and rate table arrays and start the worker threads.
 *
 * Returns zero (also when packet pacing is unsupported, in which
 * case nothing is initialized), or an error code if creating the
 * TIS domain fails.
 */
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct sysctl_oid *node;
	struct sysctl_oid *stats;
	char buf[64];
	uint64_t i;
	uint64_t j;
	int error;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return (0);

	rl->priv = priv;

	sysctl_ctx_init(&rl->ctx);

	sx_init(&rl->rl_sxlock, "ratelimit-sxlock");

	/* open own TIS domain for ratelimit SQs */
	error = mlx5e_rl_open_tis(priv);
	if (error)
		goto done;

	/* setup default value for parameters */
	mlx5e_rl_set_default_params(&rl->param, priv->mdev);

	/* update the completion factor */
	mlx5e_rl_sync_tx_completion_fact(rl);

	/* create root node */
	node = SYSCTL_ADD_NODE(&rl->ctx,
	    SYSCTL_CHILDREN(priv->sysctl_ifnet), OID_AUTO,
	    "rate_limit", CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, "Rate limiting support");

	if (node != NULL) {
		/* create SYSCTLs */
		for (i = 0; i != MLX5E_RL_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(arg[i]),
			    node, mlx5e_rl_params_desc[2 * i],
			    mlx5e_rl_params_desc[2 * i + 1]);
		}

		stats = SYSCTL_ADD_NODE(&rl->ctx, SYSCTL_CHILDREN(node),
		    OID_AUTO, "stats", CTLFLAG_RD | CTLFLAG_MPSAFE, NULL,
		    "Rate limiting statistics");
		if (stats != NULL) {
			/* create SYSCTLs */
			for (i = 0; i != MLX5E_RL_STATS_NUM; i++) {
				mlx5e_rl_sysctl_add_stats_u64_oid(rl, i,
				    stats, mlx5e_rl_stats_desc[2 * i],
				    mlx5e_rl_stats_desc[2 * i + 1]);
			}
		}
	}

	/* allocate workers array */
	rl->workers = malloc(sizeof(rl->workers[0]) *
	    rl->param.tx_worker_threads_def, M_MLX5EN, M_WAITOK | M_ZERO);

	/* allocate rate limit array */
	rl->rate_limit_table = malloc(sizeof(rl->rate_limit_table[0]) *
	    rl->param.tx_rates_def, M_MLX5EN, M_WAITOK | M_ZERO);

	if (node != NULL) {
		/* create more SYSCTLs */
		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
		    "tx_rate_show", CTLTYPE_STRING | CTLFLAG_RD |
		    CTLFLAG_MPSAFE, rl, 0, &mlx5e_rl_sysctl_show_rate_table,
		    "A", "Show table of all configured TX rates");

		/* try to fetch rate table from kernel environment */
		for (i = 0; i != rl->param.tx_rates_def; i++) {
			/* compute path for tunable */
			snprintf(buf, sizeof(buf), "dev.mce.%d.rate_limit.tx_rate_add_%d",
			    device_get_unit(priv->mdev->pdev->dev.bsddev), (int)i);
			if (TUNABLE_QUAD_FETCH(buf, &j))
				mlx5e_rl_tx_limit_add(rl, j);
		}

		/* setup rate table sysctls */
		for (i = 0; i != MLX5E_RL_TABLE_PARAMS_NUM; i++) {
			mlx5e_rl_sysctl_add_u64_oid(rl,
			    MLX5E_RL_PARAMS_INDEX(table_arg[i]),
			    node, mlx5e_rl_table_params_desc[2 * i],
			    mlx5e_rl_table_params_desc[2 * i + 1]);
		}
	}

	/* initialize per-worker state and free channel lists */
	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		rlw->priv = priv;

		cv_init(&rlw->cv, "mlx5-worker-cv");
		mtx_init(&rlw->mtx, "mlx5-worker-mtx", NULL, MTX_DEF);
		STAILQ_INIT(&rlw->index_list_head);
		STAILQ_INIT(&rlw->process_head);

		rlw->channels = malloc(sizeof(rlw->channels[0]) *
		    rl->param.tx_channels_per_worker_def, M_MLX5EN, M_WAITOK | M_ZERO);

		MLX5E_RL_WORKER_LOCK(rlw);
		for (i = 0; i < rl->param.tx_channels_per_worker_def; i++) {
			struct mlx5e_rl_channel *channel = rlw->channels + i;
			channel->worker = rlw;
			channel->tag.type = IF_SND_TAG_TYPE_RATE_LIMIT;
			STAILQ_INSERT_TAIL(&rlw->index_list_head, channel, entry);
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	PRIV_LOCK(priv);
	error = mlx5e_rl_open_workers(priv);
	PRIV_UNLOCK(priv);

	/* worker start failure is logged but not fatal */
	if (error != 0) {
		mlx5_en_err(priv->ifp,
		    "mlx5e_rl_open_workers failed: %d\n", error);
	}

	return (0);

done:
	sysctl_ctx_free(&rl->ctx);
	sx_destroy(&rl->rl_sxlock);
	return (error);
}
855 
/*
 * Start all ratelimit worker threads. Must be called with the
 * private lock held. A failure to start an individual thread is
 * logged and its "worker_done" flag is set so teardown can tell it
 * never ran; the function itself still returns zero in that case.
 *
 * Returns -EINVAL when the device is going away or the workers are
 * already open, else zero.
 */
static int
mlx5e_rl_open_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	struct thread *rl_thread = NULL;
	struct proc *rl_proc = NULL;
	uint64_t j;
	int error;

	if (priv->gone || rl->opened)
		return (-EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* compute channel parameters once */
	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
	MLX5E_RL_WUNLOCK(rl);

	for (j = 0; j < rl->param.tx_worker_threads_def; j++) {
		struct mlx5e_rl_worker *rlw = rl->workers + j;

		/* start worker thread */
		error = kproc_kthread_add(mlx5e_rl_worker, rlw, &rl_proc, &rl_thread,
		    RFHIGHPID, 0, "mlx5-ratelimit", "mlx5-rl-worker-thread-%d", (int)j);
		if (error != 0) {
			mlx5_en_err(rl->priv->ifp,
			    "kproc_kthread_add failed: %d\n", error);
			rlw->worker_done = 1;
		}
	}

	rl->opened = 1;

	return (0);
}
890 
/*
 * Stop all ratelimit worker threads and wait for them to exit.
 *
 * The handshake works through "worker_done": it is set to one here
 * to request teardown, and the worker resets it to zero right
 * before exiting (see mlx5e_rl_worker()). Threads that never
 * started already have worker_done set by mlx5e_rl_open_workers()
 * and are simply cleared here.
 */
static void
mlx5e_rl_close_workers(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	if (rl->opened == 0)
		return;

	/* tear down worker threads simultaneously */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		if (rlw->worker_done == 0) {
			rlw->worker_done = 1;
			cv_broadcast(&rlw->cv);
		} else {
			/* XXX thread not started */
			rlw->worker_done = 0;
		}
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	/* wait for worker threads to exit */
	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		/* tear down worker before freeing SQs */
		MLX5E_RL_WORKER_LOCK(rlw);
		while (rlw->worker_done != 0)
			cv_wait(&rlw->cv, &rlw->mtx);
		MLX5E_RL_WORKER_UNLOCK(rlw);
	}

	rl->opened = 0;
}
929 
930 static void
931 mlx5e_rl_reset_rates(struct mlx5e_rl_priv_data *rl)
932 {
933 	unsigned x;
934 
935 	MLX5E_RL_WLOCK(rl);
936 	for (x = 0; x != rl->param.tx_rates_def; x++)
937 		rl->rate_limit_table[x] = 0;
938 	MLX5E_RL_WUNLOCK(rl);
939 }
940 
/*
 * Tear down the ratelimit subsystem: remove the sysctl tree, stop
 * the worker threads, reset the rate table, destroy the TIS domain
 * and free all per-worker state. A no-op when packet pacing is
 * unsupported, mirroring mlx5e_rl_init().
 */
void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	struct mlx5e_rl_priv_data *rl = &priv->rl;
	uint64_t y;

	/* check if there is support for packet pacing */
	if (!MLX5_CAP_GEN(priv->mdev, qos) || !MLX5_CAP_QOS(priv->mdev, packet_pacing))
		return;

	/* TODO check if there is support for packet pacing */

	/* remove sysctls first so no new requests can come in */
	sysctl_ctx_free(&rl->ctx);

	PRIV_LOCK(priv);
	mlx5e_rl_close_workers(priv);
	PRIV_UNLOCK(priv);

	mlx5e_rl_reset_rates(rl);

	/* close TIS domain */
	mlx5e_rl_close_tis(priv);

	for (y = 0; y < rl->param.tx_worker_threads_def; y++) {
		struct mlx5e_rl_worker *rlw = rl->workers + y;

		cv_destroy(&rlw->cv);
		mtx_destroy(&rlw->mtx);
		free(rlw->channels, M_MLX5EN);
	}
	free(rl->rate_limit_table, M_MLX5EN);
	free(rl->workers, M_MLX5EN);
	sx_destroy(&rl->rl_sxlock);
}
975 
/*
 * Queue a channel on the worker's process list and wake the worker
 * thread. Must be called with the worker lock held.
 */
static void
mlx5e_rlw_queue_channel_locked(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel *channel)
{
	STAILQ_INSERT_TAIL(&rlw->process_head, channel, entry);
	cv_broadcast(&rlw->cv);
}
983 
984 static void
985 mlx5e_rl_free(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel)
986 {
987 	if (channel == NULL)
988 		return;
989 
990 	MLX5E_RL_WORKER_LOCK(rlw);
991 	switch (channel->state) {
992 	case MLX5E_RL_ST_MODIFY:
993 		channel->state = MLX5E_RL_ST_DESTROY;
994 		break;
995 	case MLX5E_RL_ST_USED:
996 		channel->state = MLX5E_RL_ST_DESTROY;
997 		mlx5e_rlw_queue_channel_locked(rlw, channel);
998 		break;
999 	default:
1000 		break;
1001 	}
1002 	MLX5E_RL_WORKER_UNLOCK(rlw);
1003 }
1004 
1005 static int
1006 mlx5e_rl_modify(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel, uint64_t rate)
1007 {
1008 
1009 	MLX5E_RL_WORKER_LOCK(rlw);
1010 	channel->new_rate = rate;
1011 	switch (channel->state) {
1012 	case MLX5E_RL_ST_USED:
1013 		channel->state = MLX5E_RL_ST_MODIFY;
1014 		mlx5e_rlw_queue_channel_locked(rlw, channel);
1015 		break;
1016 	default:
1017 		break;
1018 	}
1019 	MLX5E_RL_WORKER_UNLOCK(rlw);
1020 
1021 	return (0);
1022 }
1023 
1024 static int
1025 mlx5e_rl_query(struct mlx5e_rl_worker *rlw, struct mlx5e_rl_channel *channel,
1026     union if_snd_tag_query_params *params)
1027 {
1028 	int retval;
1029 
1030 	MLX5E_RL_WORKER_LOCK(rlw);
1031 	switch (channel->state) {
1032 	case MLX5E_RL_ST_USED:
1033 		params->rate_limit.max_rate = channel->last_rate;
1034 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1035 		retval = 0;
1036 		break;
1037 	case MLX5E_RL_ST_MODIFY:
1038 		params->rate_limit.max_rate = channel->last_rate;
1039 		params->rate_limit.queue_level = mlx5e_sq_queue_level(channel->sq);
1040 		retval = EBUSY;
1041 		break;
1042 	default:
1043 		retval = EINVAL;
1044 		break;
1045 	}
1046 	MLX5E_RL_WORKER_UNLOCK(rlw);
1047 
1048 	return (retval);
1049 }
1050 
/*
 * Take a free channel from the worker's free list and mark it as
 * used. On success zero is returned and *pchannel points at the
 * channel; otherwise ENOMEM is returned, *pchannel is set to NULL
 * and the resource failure statistics counter is bumped.
 */
static int
mlx5e_find_available_tx_ring_index(struct mlx5e_rl_worker *rlw,
    struct mlx5e_rl_channel **pchannel)
{
	struct mlx5e_rl_channel *channel;
	int retval = ENOMEM;

	MLX5E_RL_WORKER_LOCK(rlw);
	/* Check for available channel in free list */
	if ((channel = STAILQ_FIRST(&rlw->index_list_head)) != NULL) {
		retval = 0;
		/* Remove head index from available list */
		STAILQ_REMOVE_HEAD(&rlw->index_list_head, entry);
		channel->state = MLX5E_RL_ST_USED;
		atomic_add_64(&rlw->priv->rl.stats.tx_active_connections, 1ULL);
	} else {
		atomic_add_64(&rlw->priv->rl.stats.tx_available_resource_failure, 1ULL);
	}
	MLX5E_RL_WORKER_UNLOCK(rlw);

	/* NULL on failure, valid channel pointer on success */
	*pchannel = channel;
#ifdef RATELIMIT_DEBUG
	mlx5_en_info(rlw->priv->ifp,
	    "Channel pointer for rate limit connection is %p\n", channel);
#endif
	return (retval);
}
1078 
/*
 * Allocate a rate limited send tag for one TCP connection.
 *
 * Verifies that the hardware supports QoS packet pacing and that the
 * request really is a rate limit tag, picks a worker thread based on
 * the connection's flow ID, reserves a channel from that worker and
 * queues the initial rate change. On success the channel's embedded
 * m_snd_tag is initialized and returned through "*ppmt". Returns
 * EOPNOTSUPP, ENOMEM or another errno on failure.
 */
int
mlx5e_rl_snd_tag_alloc(struct ifnet *ifp,
    union if_snd_tag_alloc_params *params,
    struct m_snd_tag **ppmt)
{
	struct mlx5e_rl_channel *channel;
	struct mlx5e_rl_worker *rlw;
	struct mlx5e_priv *priv;
	int error;

	priv = ifp->if_softc;

	/* check if there is support for packet pacing or if device is going away */
	if (!MLX5_CAP_GEN(priv->mdev, qos) ||
	    !MLX5_CAP_QOS(priv->mdev, packet_pacing) || priv->gone ||
	    params->rate_limit.hdr.type != IF_SND_TAG_TYPE_RATE_LIMIT)
		return (EOPNOTSUPP);

	/* compute worker thread this TCP connection belongs to */
	/*
	 * NOTE(review): the flow ID is first reduced modulo 128 before
	 * being spread over the worker threads -- presumably a cap on
	 * the distribution domain; confirm the intent of this constant.
	 */
	rlw = priv->rl.workers + ((params->rate_limit.hdr.flowid % 128) %
	    priv->rl.param.tx_worker_threads_def);

	error = mlx5e_find_available_tx_ring_index(rlw, &channel);
	if (error != 0)
		goto done;

	/* apply the requested rate; on failure return the channel */
	error = mlx5e_rl_modify(rlw, channel, params->rate_limit.max_rate);
	if (error != 0) {
		mlx5e_rl_free(rlw, channel);
		goto done;
	}

	/* store pointer to mbuf tag */
	MPASS(channel->tag.refcount == 0);
	m_snd_tag_init(&channel->tag, ifp, IF_SND_TAG_TYPE_RATE_LIMIT);
	*ppmt = &channel->tag;
done:
	return (error);
}
1118 
1119 
1120 int
1121 mlx5e_rl_snd_tag_modify(struct m_snd_tag *pmt, union if_snd_tag_modify_params *params)
1122 {
1123 	struct mlx5e_rl_channel *channel =
1124 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1125 
1126 	return (mlx5e_rl_modify(channel->worker, channel, params->rate_limit.max_rate));
1127 }
1128 
1129 int
1130 mlx5e_rl_snd_tag_query(struct m_snd_tag *pmt, union if_snd_tag_query_params *params)
1131 {
1132 	struct mlx5e_rl_channel *channel =
1133 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1134 
1135 	return (mlx5e_rl_query(channel->worker, channel, params));
1136 }
1137 
1138 void
1139 mlx5e_rl_snd_tag_free(struct m_snd_tag *pmt)
1140 {
1141 	struct mlx5e_rl_channel *channel =
1142 	    container_of(pmt, struct mlx5e_rl_channel, tag);
1143 
1144 	mlx5e_rl_free(channel->worker, channel);
1145 }
1146 
/*
 * Sysctl handler which dumps the currently configured rate limit
 * table in human readable form. Unused entries (rate of zero) are
 * skipped. The sbuf is sized at 128 bytes per possible table entry.
 */
static int
mlx5e_rl_sysctl_show_rate_table(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	struct sbuf sbuf;
	unsigned x;
	int error;

	/* wire the user buffer before taking any locks */
	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);

	PRIV_LOCK(priv);

	sbuf_new_for_sysctl(&sbuf, NULL, 128 * rl->param.tx_rates_def, req);

	sbuf_printf(&sbuf,
	    "\n\n" "\t" "ENTRY" "\t" "BURST" "\t" "RATE [bit/s]\n"
	    "\t" "--------------------------------------------\n");

	/* read lock only while walking the shared rate table */
	MLX5E_RL_RLOCK(rl);
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] == 0)
			continue;

		sbuf_printf(&sbuf, "\t" "%3u" "\t" "%3u" "\t" "%lld\n",
		    x, (unsigned)rl->param.tx_burst_size,
		    (long long)rl->rate_limit_table[x]);
	}
	MLX5E_RL_RUNLOCK(rl);

	error = sbuf_finish(&sbuf);
	sbuf_delete(&sbuf);

	PRIV_UNLOCK(priv);

	return (error);
}
1186 
1187 static int
1188 mlx5e_rl_refresh_channel_params(struct mlx5e_rl_priv_data *rl)
1189 {
1190 	uint64_t x;
1191 	uint64_t y;
1192 
1193 	MLX5E_RL_WLOCK(rl);
1194 	/* compute channel parameters once */
1195 	mlx5e_rl_build_channel_param(rl, &rl->chan_param);
1196 	MLX5E_RL_WUNLOCK(rl);
1197 
1198 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1199 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1200 
1201 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1202 			struct mlx5e_rl_channel *channel;
1203 			struct mlx5e_sq *sq;
1204 
1205 			channel = rlw->channels + x;
1206 			sq = channel->sq;
1207 
1208 			if (sq == NULL)
1209 				continue;
1210 
1211 			if (MLX5_CAP_GEN(rl->priv->mdev, cq_period_mode_modify)) {
1212 				mlx5_core_modify_cq_moderation_mode(rl->priv->mdev, &sq->cq.mcq,
1213 				    rl->param.tx_coalesce_usecs,
1214 				    rl->param.tx_coalesce_pkts,
1215 				    rl->param.tx_coalesce_mode);
1216 			} else {
1217 				mlx5_core_modify_cq_moderation(rl->priv->mdev, &sq->cq.mcq,
1218 				    rl->param.tx_coalesce_usecs,
1219 				    rl->param.tx_coalesce_pkts);
1220 			}
1221 		}
1222 	}
1223 	return (0);
1224 }
1225 
1226 void
1227 mlx5e_rl_refresh_sq_inline(struct mlx5e_rl_priv_data *rl)
1228 {
1229 	uint64_t x;
1230 	uint64_t y;
1231 
1232 	for (y = 0; y != rl->param.tx_worker_threads_def; y++) {
1233 		struct mlx5e_rl_worker *rlw = rl->workers + y;
1234 
1235 		for (x = 0; x != rl->param.tx_channels_per_worker_def; x++) {
1236 			struct mlx5e_rl_channel *channel;
1237 			struct mlx5e_sq *sq;
1238 
1239 			channel = rlw->channels + x;
1240 			sq = channel->sq;
1241 
1242 			if (sq == NULL)
1243 				continue;
1244 
1245 			mtx_lock(&sq->lock);
1246 			mlx5e_update_sq_inline(sq);
1247 			mtx_unlock(&sq->lock);
1248 		}
1249 	}
1250 }
1251 
/*
 * Add a new rate, in bits per second, to the rate limit table.
 *
 * Rates below 1000 bit/s or outside the range supported by the
 * hardware (checked in kbit/s via howmany()) are rejected with
 * EINVAL. Returns EEXIST when the rate is already present, ENOMEM
 * when the table is full, or 0 on success.
 */
static int
mlx5e_rl_tx_limit_add(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value < 1000 ||
	    mlx5_rl_is_in_range(rl->priv->mdev, howmany(value, 1000), 0) == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);
	/* assume the table is full until a free slot is found */
	error = ENOMEM;

	/* check if rate already exists */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		error = EEXIST;
		break;
	}

	/* check if there is a free rate entry */
	if (x == rl->param.tx_rates_def) {
		for (x = 0; x != rl->param.tx_rates_def; x++) {
			if (rl->rate_limit_table[x] != 0)
				continue;
			rl->rate_limit_table[x] = value;
			error = 0;
			break;
		}
	}
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}
1287 
/*
 * Remove a rate, in bits per second, from the rate limit table.
 *
 * A value of zero is rejected with EINVAL because zero marks a free
 * table entry. Returns ENOENT when the rate is not found, else 0.
 */
static int
mlx5e_rl_tx_limit_clr(struct mlx5e_rl_priv_data *rl, uint64_t value)
{
	unsigned x;
	int error;

	if (value == 0)
		return (EINVAL);

	MLX5E_RL_WLOCK(rl);

	/* search for the given rate and free its entry, if any */
	for (x = 0; x != rl->param.tx_rates_def; x++) {
		if (rl->rate_limit_table[x] != value)
			continue;
		/* free up rate */
		rl->rate_limit_table[x] = 0;
		break;
	}

	/* check if the rate was found above */
	if (x == rl->param.tx_rates_def)
		error = ENOENT;
	else
		error = 0;
	MLX5E_RL_WUNLOCK(rl);

	return (error);
}
1317 
/*
 * Common sysctl handler for all 64-bit rate limit configuration
 * parameters. "arg1" is the rate limit private data and "arg2" is the
 * index of the parameter inside "rl->param.arg[]". Each parameter is
 * range checked and stored; some additionally require refreshing the
 * channel parameters or restarting the worker threads, which is why
 * the whole operation runs under the PRIV_LOCK().
 */
static int
mlx5e_rl_sysctl_handler(SYSCTL_HANDLER_ARGS)
{
	struct mlx5e_rl_priv_data *rl = arg1;
	struct mlx5e_priv *priv = rl->priv;
	unsigned mode_modify;
	unsigned was_opened;
	uint64_t value;
	uint64_t old;
	int error;

	PRIV_LOCK(priv);

	/* snapshot the current value under the read lock */
	MLX5E_RL_RLOCK(rl);
	value = rl->param.arg[arg2];
	MLX5E_RL_RUNLOCK(rl);

	if (req != NULL) {
		old = value;
		error = sysctl_handle_64(oidp, &value, 0, req);
		/* bail out on error, read-only access or unchanged value */
		if (error || req->newptr == NULL ||
		    value == rl->param.arg[arg2])
			goto done;
	} else {
		old = 0;
		error = 0;
	}

	/* check if device is gone */
	if (priv->gone) {
		error = ENXIO;
		goto done;
	}
	was_opened = rl->opened;
	mode_modify = MLX5_CAP_GEN(priv->mdev, cq_period_mode_modify);

	/*
	 * NOTE(review): MLX5E_RL_PARAMS_INDEX(foo) presumably maps the
	 * named field of the rate limit parameter structure to its
	 * index within "arg[]", so the expression below effectively
	 * switches on "arg2" -- confirm against the macro definition.
	 */
	switch (MLX5E_RL_PARAMS_INDEX(arg[arg2])) {
	case MLX5E_RL_PARAMS_INDEX(tx_worker_threads_def):
		if (value > rl->param.tx_worker_threads_max)
			value = rl->param.tx_worker_threads_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_channels_per_worker_def):
		if (value > rl->param.tx_channels_per_worker_max)
			value = rl->param.tx_channels_per_worker_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_rates_def):
		if (value > rl->param.tx_rates_max)
			value = rl->param.tx_rates_max;
		else if (value < 1)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_usecs):
		/* range check */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_period))
			value = MLX5E_FLD_MAX(cqc, cq_period);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_pkts):
		/* import TX coal pkts */
		if (value < 1)
			value = 0;
		else if (value > MLX5E_FLD_MAX(cqc, cq_max_count))
			value = MLX5E_FLD_MAX(cqc, cq_max_count);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* check to avoid down and up the network interface */
		if (was_opened)
			error = mlx5e_rl_refresh_channel_params(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_coalesce_mode):
		/* network interface must be down */
		if (was_opened != 0 && mode_modify == 0)
			mlx5e_rl_close_workers(priv);

		/* import TX coalesce mode */
		if (value != 0)
			value = 1;

		/* store new value */
		rl->param.arg[arg2] = value;

		/* restart network interface, if any */
		if (was_opened != 0) {
			if (mode_modify == 0)
				mlx5e_rl_open_workers(priv);
			else
				error = mlx5e_rl_refresh_channel_params(rl);
		}
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_queue_size):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* import TX queue size */
		if (value < (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE))
			value = (1 << MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE);
		else if (value > priv->params_ethtool.tx_queue_size_max)
			value = priv->params_ethtool.tx_queue_size_max;

		/* store actual TX queue size, rounded up to a power of two */
		value = 1ULL << order_base_2(value);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify TX completion factor */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_completion_fact):
		/* network interface must be down */
		if (was_opened)
			mlx5e_rl_close_workers(priv);

		/* store new value */
		rl->param.arg[arg2] = value;

		/* verify parameter */
		mlx5e_rl_sync_tx_completion_fact(rl);

		/* restart network interface, if any */
		if (was_opened)
			mlx5e_rl_open_workers(priv);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_add):
		error = mlx5e_rl_tx_limit_add(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_limit_clr):
		error = mlx5e_rl_tx_limit_clr(rl, value);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_allowed_deviation):
		/* range check */
		if (value > rl->param.tx_allowed_deviation_max)
			value = rl->param.tx_allowed_deviation_max;
		else if (value < rl->param.tx_allowed_deviation_min)
			value = rl->param.tx_allowed_deviation_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	case MLX5E_RL_PARAMS_INDEX(tx_burst_size):
		/* range check */
		if (value > rl->param.tx_burst_size_max)
			value = rl->param.tx_burst_size_max;
		else if (value < rl->param.tx_burst_size_min)
			value = rl->param.tx_burst_size_min;

		MLX5E_RL_WLOCK(rl);
		rl->param.arg[arg2] = value;
		MLX5E_RL_WUNLOCK(rl);
		break;

	default:
		break;
	}
done:
	PRIV_UNLOCK(priv);
	return (error);
}
1516 
1517 static void
1518 mlx5e_rl_sysctl_add_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
1519     struct sysctl_oid *node, const char *name, const char *desc)
1520 {
1521 	/*
1522 	 * NOTE: In FreeBSD-11 and newer the CTLFLAG_RWTUN flag will
1523 	 * take care of loading default sysctl value from the kernel
1524 	 * environment, if any:
1525 	 */
1526 	if (strstr(name, "_max") != 0 || strstr(name, "_min") != 0) {
1527 		/* read-only SYSCTLs */
1528 		SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1529 		    name, CTLTYPE_U64 | CTLFLAG_RD |
1530 		    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1531 	} else {
1532 		if (strstr(name, "_def") != 0) {
1533 #ifdef RATELIMIT_DEBUG
1534 			/* tunable read-only advanced SYSCTLs */
1535 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1536 			    name, CTLTYPE_U64 | CTLFLAG_RDTUN |
1537 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1538 #endif
1539 		} else {
1540 			/* read-write SYSCTLs */
1541 			SYSCTL_ADD_PROC(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO,
1542 			    name, CTLTYPE_U64 | CTLFLAG_RWTUN |
1543 			    CTLFLAG_MPSAFE, rl, x, &mlx5e_rl_sysctl_handler, "QU", desc);
1544 		}
1545 	}
1546 }
1547 
/*
 * Register one read-only 64-bit statistics sysctl backed directly by
 * the counter at index "x" in "rl->stats.arg[]".
 */
static void
mlx5e_rl_sysctl_add_stats_u64_oid(struct mlx5e_rl_priv_data *rl, unsigned x,
    struct sysctl_oid *node, const char *name, const char *desc)
{
	/* read-only SYSCTLs */
	SYSCTL_ADD_U64(&rl->ctx, SYSCTL_CHILDREN(node), OID_AUTO, name,
	    CTLFLAG_RD, &rl->stats.arg[x], 0, desc);
}
1556 
1557 #else
1558 
/* Stub used when the kernel is built without RATELIMIT support. */
int
mlx5e_rl_init(struct mlx5e_priv *priv)
{

	return (0);
}
1565 
/* Stub used when the kernel is built without RATELIMIT support. */
void
mlx5e_rl_cleanup(struct mlx5e_priv *priv)
{
	/* NOP */
}
1571 
1572 #endif		/* RATELIMIT */
1573