1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c	Class-Based Queueing discipline.
4  *
5  * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7 
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18 
19 
20 /*	Class-Based Queueing (CBQ) algorithm.
21 	=======================================
22 
23 	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24 		 Management Models for Packet Networks",
25 		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26 
27 		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28 
29 		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30 		 Parameters", 1996
31 
32 		 [4] Sally Floyd and Michael Speer, "Experimental Results
33 		 for Class-Based Queueing", 1998, not published.
34 
35 	-----------------------------------------------------------------------
36 
37 	Algorithm skeleton was taken from NS simulator cbq.cc.
38 	If someone wants to check this code against the LBL version,
39 	he should take into account that ONLY the skeleton was borrowed,
40 	the implementation is different. Particularly:
41 
42 	--- The WRR algorithm is different. Our version looks more
43 	reasonable (I hope) and works when quanta are allowed to be
44 	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if the class
	per-round allotment is less than the MTU. Namely, if the per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1, then
49 
50 	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51 
52 	In the worst case we have IntServ estimate with D = W*r+k*MTU
53 	and C = MTU*r. The proof (if correct at all) is trivial.
54 
55 
	--- It seems that cbq-2.0 is not very accurate. At least, I cannot
	interpret some places, which look like wrong translations
	from NS. Anyone is welcome to find these differences
	and explain to me why I am wrong 8).
60 
	--- Linux has no EOI event, so we cannot estimate true class
	idle time. The workaround is to consider the next dequeue event
	as a sign that the previous packet has finished. This is wrong because
	of internal device queueing, but on a permanently loaded link it holds.
	Moreover, combined with the clock integrator, this scheme looks
	very close to an ideal solution.  */
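
/* Example (illustrative only, not part of this file): with the iproute2
 * "tc" utility, a minimal CBQ setup on a 10 Mbit device with one
 * rate-limited child class might look roughly like:
 *
 *	tc qdisc add dev eth0 root handle 1: cbq bandwidth 10Mbit \
 *		avpkt 1000 cell 8
 *	tc class add dev eth0 parent 1: classid 1:1 cbq bandwidth 10Mbit \
 *		rate 1Mbit allot 1514 prio 5 avpkt 1000 bounded
 *
 * The exact option set is defined by iproute2, not by this file; treat
 * the commands above as a sketch of how the parameters below are fed in.
 */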
67 
68 struct cbq_sched_data;
69 
70 
71 struct cbq_class {
72 	struct Qdisc_class_common common;
73 	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */
74 
75 /* Parameters */
76 	unsigned char		priority;	/* class priority */
77 	unsigned char		priority2;	/* priority to be used after overlimit */
78 	unsigned char		ewma_log;	/* time constant for idle time calculation */
79 
80 	u32			defmap;
81 
82 	/* Link-sharing scheduler parameters */
83 	long			maxidle;	/* Class parameters: see below. */
84 	long			offtime;
85 	long			minidle;
86 	u32			avpkt;
87 	struct qdisc_rate_table	*R_tab;
88 
89 	/* General scheduler (WRR) parameters */
90 	long			allot;
91 	long			quantum;	/* Allotment per WRR round */
92 	long			weight;		/* Relative allotment: see below */
93 
94 	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
95 	struct cbq_class	*split;		/* Ptr to split node */
96 	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
97 	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
98 	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
99 						   parent otherwise */
100 	struct cbq_class	*sibling;	/* Sibling chain */
101 	struct cbq_class	*children;	/* Pointer to children chain */
102 
103 	struct Qdisc		*q;		/* Elementary queueing discipline */
104 
105 
106 /* Variables */
107 	unsigned char		cpriority;	/* Effective priority */
108 	unsigned char		delayed;
109 	unsigned char		level;		/* level of the class in hierarchy:
110 						   0 for leaf classes, and maximal
111 						   level of children + 1 for nodes.
112 						 */
113 
114 	psched_time_t		last;		/* Last end of service */
115 	psched_time_t		undertime;
116 	long			avgidle;
117 	long			deficit;	/* Saved deficit for WRR */
118 	psched_time_t		penalized;
119 	struct gnet_stats_basic_packed bstats;
120 	struct gnet_stats_queue qstats;
121 	struct net_rate_estimator __rcu *rate_est;
122 	struct tc_cbq_xstats	xstats;
123 
124 	struct tcf_proto __rcu	*filter_list;
125 	struct tcf_block	*block;
126 
127 	int			filters;
128 
129 	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
130 };
131 
132 struct cbq_sched_data {
133 	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
134 	int			nclasses[TC_CBQ_MAXPRIO + 1];
135 	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];
136 
137 	struct cbq_class	link;
138 
139 	unsigned int		activemask;
140 	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
141 								   with backlog */
142 
143 #ifdef CONFIG_NET_CLS_ACT
144 	struct cbq_class	*rx_class;
145 #endif
146 	struct cbq_class	*tx_class;
147 	struct cbq_class	*tx_borrowed;
148 	int			tx_len;
149 	psched_time_t		now;		/* Cached timestamp */
150 	unsigned int		pmask;
151 
152 	struct hrtimer		delay_timer;
153 	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
154 						   started when CBQ has
155 						   backlog, but cannot
156 						   transmit just now */
157 	psched_tdiff_t		wd_expires;
158 	int			toplevel;
159 	u32			hgenerator;
160 };
161 
162 
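/* L2T ("length to time"): transmission time of a 'len' byte packet at the
 * class' configured rate, looked up in its qdisc_rate_table.
 */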
163 #define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
164 
165 static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
167 {
168 	struct Qdisc_class_common *clc;
169 
170 	clc = qdisc_class_find(&q->clhash, classid);
171 	if (clc == NULL)
172 		return NULL;
173 	return container_of(clc, struct cbq_class, common);
174 }
175 
176 #ifdef CONFIG_NET_CLS_ACT
177 
178 static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
180 {
181 	struct cbq_class *cl;
182 
183 	for (cl = this->tparent; cl; cl = cl->tparent) {
184 		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
185 
186 		if (new != NULL && new != this)
187 			return new;
188 	}
189 	return NULL;
190 }
191 
192 #endif
193 
194 /* Classify packet. The procedure is pretty complicated, but
195  * it allows us to combine link sharing and priority scheduling
196  * transparently.
197  *
 * Namely, you can put link-sharing rules (e.g. route based) at the root of
 * CBQ, so that they resolve to split nodes. Then packets are classified
200  * by logical priority, or a more specific classifier may be attached
201  * to the split node.
202  */
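/*
 * For example (illustrative, iproute2 syntax; not taken from this file),
 * per-priority defaults on a split node may be set up roughly like:
 *
 *	tc class change dev eth0 classid 1:2 cbq defmap c0 split 1:0
 *
 * which makes 1:2 the default class at split node 1:0 for the priorities
 * whose bits are set in the 0xc0 mask (see the defaults[] handling below).
 */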
203 
204 static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
206 {
207 	struct cbq_sched_data *q = qdisc_priv(sch);
208 	struct cbq_class *head = &q->link;
209 	struct cbq_class **defmap;
210 	struct cbq_class *cl = NULL;
211 	u32 prio = skb->priority;
212 	struct tcf_proto *fl;
213 	struct tcf_result res;
214 
215 	/*
216 	 *  Step 1. If skb->priority points to one of our classes, use it.
217 	 */
218 	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
219 	    (cl = cbq_class_lookup(q, prio)) != NULL)
220 		return cl;
221 
222 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
223 	for (;;) {
224 		int result = 0;
225 		defmap = head->defaults;
226 
227 		fl = rcu_dereference_bh(head->filter_list);
228 		/*
229 		 * Step 2+n. Apply classifier.
230 		 */
231 		result = tcf_classify(skb, fl, &res, true);
232 		if (!fl || result < 0)
233 			goto fallback;
234 
235 		cl = (void *)res.class;
236 		if (!cl) {
237 			if (TC_H_MAJ(res.classid))
238 				cl = cbq_class_lookup(q, res.classid);
239 			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
240 				cl = defmap[TC_PRIO_BESTEFFORT];
241 
242 			if (cl == NULL)
243 				goto fallback;
244 		}
245 		if (cl->level >= head->level)
246 			goto fallback;
247 #ifdef CONFIG_NET_CLS_ACT
248 		switch (result) {
249 		case TC_ACT_QUEUED:
250 		case TC_ACT_STOLEN:
251 		case TC_ACT_TRAP:
252 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
253 			fallthrough;
254 		case TC_ACT_SHOT:
255 			return NULL;
256 		case TC_ACT_RECLASSIFY:
257 			return cbq_reclassify(skb, cl);
258 		}
259 #endif
260 		if (cl->level == 0)
261 			return cl;
262 
263 		/*
264 		 * Step 3+n. If classifier selected a link sharing class,
265 		 *	   apply agency specific classifier.
266 		 *	   Repeat this procedure until we hit a leaf node.
267 		 */
268 		head = cl;
269 	}
270 
271 fallback:
272 	cl = head;
273 
274 	/*
275 	 * Step 4. No success...
276 	 */
277 	if (TC_H_MAJ(prio) == 0 &&
278 	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
279 	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
280 		return head;
281 
282 	return cl;
283 }
284 
285 /*
 * A packet has just been enqueued to an empty class.
 * cbq_activate_class adds the class to the tail of the active class list
 * of its priority band.
289  */
290 
static inline void cbq_activate_class(struct cbq_class *cl)
292 {
293 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
294 	int prio = cl->cpriority;
295 	struct cbq_class *cl_tail;
296 
297 	cl_tail = q->active[prio];
298 	q->active[prio] = cl;
299 
300 	if (cl_tail != NULL) {
301 		cl->next_alive = cl_tail->next_alive;
302 		cl_tail->next_alive = cl;
303 	} else {
304 		cl->next_alive = cl;
305 		q->activemask |= (1<<prio);
306 	}
307 }
308 
309 /*
 * Unlink the class from the active chain.
 * Note that the same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
313  */
314 
static void cbq_deactivate_class(struct cbq_class *this)
316 {
317 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
318 	int prio = this->cpriority;
319 	struct cbq_class *cl;
320 	struct cbq_class *cl_prev = q->active[prio];
321 
322 	do {
323 		cl = cl_prev->next_alive;
324 		if (cl == this) {
325 			cl_prev->next_alive = cl->next_alive;
326 			cl->next_alive = NULL;
327 
328 			if (cl == q->active[prio]) {
329 				q->active[prio] = cl_prev;
330 				if (cl == q->active[prio]) {
331 					q->active[prio] = NULL;
332 					q->activemask &= ~(1<<prio);
333 					return;
334 				}
335 			}
336 			return;
337 		}
338 	} while ((cl_prev = cl) != q->active[prio]);
339 }
340 
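/* Lower the toplevel borrowing cutoff to the level of the first
 * underlimit class on cl's borrow chain, so that classes may borrow
 * from ancestors up to (and including) that level.
 */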
341 static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
343 {
344 	int toplevel = q->toplevel;
345 
346 	if (toplevel > cl->level) {
347 		psched_time_t now = psched_get_time();
348 
349 		do {
350 			if (cl->undertime < now) {
351 				q->toplevel = cl->level;
352 				return;
353 			}
354 		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
355 	}
356 }
357 
358 static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
360 	    struct sk_buff **to_free)
361 {
362 	struct cbq_sched_data *q = qdisc_priv(sch);
363 	int ret;
364 	struct cbq_class *cl = cbq_classify(skb, sch, &ret);
365 
366 #ifdef CONFIG_NET_CLS_ACT
367 	q->rx_class = cl;
368 #endif
369 	if (cl == NULL) {
370 		if (ret & __NET_XMIT_BYPASS)
371 			qdisc_qstats_drop(sch);
372 		__qdisc_drop(skb, to_free);
373 		return ret;
374 	}
375 
376 	ret = qdisc_enqueue(skb, cl->q, to_free);
377 	if (ret == NET_XMIT_SUCCESS) {
378 		sch->q.qlen++;
379 		cbq_mark_toplevel(q, cl);
380 		if (!cl->next_alive)
381 			cbq_activate_class(cl);
382 		return ret;
383 	}
384 
385 	if (net_xmit_drop_count(ret)) {
386 		qdisc_qstats_drop(sch);
387 		cbq_mark_toplevel(q, cl);
388 		cl->qstats.drops++;
389 	}
390 	return ret;
391 }
392 
393 /* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
395 {
396 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
397 	psched_tdiff_t delay = cl->undertime - q->now;
398 
399 	if (!cl->delayed) {
400 		delay += cl->offtime;
401 
402 		/*
		 * Class goes to sleep, so it will have no
		 * chance to work off its avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug in this
		 * place: apparently they forgot to shift it by cl->ewma_log.
408 		 */
409 		if (cl->avgidle < 0)
410 			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
411 		if (cl->avgidle < cl->minidle)
412 			cl->avgidle = cl->minidle;
413 		if (delay <= 0)
414 			delay = 1;
415 		cl->undertime = q->now + delay;
416 
417 		cl->xstats.overactions++;
418 		cl->delayed = 1;
419 	}
420 	if (q->wd_expires == 0 || q->wd_expires > delay)
421 		q->wd_expires = delay;
422 
423 	/* Dirty work! We must schedule wakeups based on
	 * the real available rate, rather than the leaf rate,
425 	 * which may be tiny (even zero).
426 	 */
427 	if (q->toplevel == TC_CBQ_MAXLEVEL) {
428 		struct cbq_class *b;
429 		psched_tdiff_t base_delay = q->wd_expires;
430 
431 		for (b = cl->borrow; b; b = b->borrow) {
432 			delay = b->undertime - q->now;
433 			if (delay < base_delay) {
434 				if (delay <= 0)
435 					delay = 1;
436 				base_delay = delay;
437 			}
438 		}
439 
440 		q->wd_expires = base_delay;
441 	}
442 }
443 
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
445 				       psched_time_t now)
446 {
447 	struct cbq_class *cl;
448 	struct cbq_class *cl_prev = q->active[prio];
449 	psched_time_t sched = now;
450 
451 	if (cl_prev == NULL)
452 		return 0;
453 
454 	do {
455 		cl = cl_prev->next_alive;
456 		if (now - cl->penalized > 0) {
457 			cl_prev->next_alive = cl->next_alive;
458 			cl->next_alive = NULL;
459 			cl->cpriority = cl->priority;
460 			cl->delayed = 0;
461 			cbq_activate_class(cl);
462 
463 			if (cl == q->active[prio]) {
464 				q->active[prio] = cl_prev;
465 				if (cl == q->active[prio]) {
466 					q->active[prio] = NULL;
467 					return 0;
468 				}
469 			}
470 
471 			cl = cl_prev->next_alive;
472 		} else if (sched - cl->penalized > 0)
473 			sched = cl->penalized;
474 	} while ((cl_prev = cl) != q->active[prio]);
475 
476 	return sched - now;
477 }
478 
static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
480 {
481 	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
482 						delay_timer);
483 	struct Qdisc *sch = q->watchdog.qdisc;
484 	psched_time_t now;
485 	psched_tdiff_t delay = 0;
486 	unsigned int pmask;
487 
488 	now = psched_get_time();
489 
490 	pmask = q->pmask;
491 	q->pmask = 0;
492 
493 	while (pmask) {
494 		int prio = ffz(~pmask);
495 		psched_tdiff_t tmp;
496 
497 		pmask &= ~(1<<prio);
498 
499 		tmp = cbq_undelay_prio(q, prio, now);
500 		if (tmp > 0) {
501 			q->pmask |= 1<<prio;
502 			if (tmp < delay || delay == 0)
503 				delay = tmp;
504 		}
505 	}
506 
507 	if (delay) {
508 		ktime_t time;
509 
510 		time = 0;
511 		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
512 		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
513 	}
514 
515 	__netif_schedule(qdisc_root(sch));
516 	return HRTIMER_NORESTART;
517 }
518 
519 /*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
526  */
527 
528 static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
530 		    struct cbq_class *borrowed)
531 {
532 	if (cl && q->toplevel >= borrowed->level) {
533 		if (cl->q->q.qlen > 1) {
534 			do {
535 				if (borrowed->undertime == PSCHED_PASTPERFECT) {
536 					q->toplevel = borrowed->level;
537 					return;
538 				}
539 			} while ((borrowed = borrowed->borrow) != NULL);
540 		}
541 #if 0
542 	/* It is not necessary now. Uncommenting it
543 	   will save CPU cycles, but decrease fairness.
544 	 */
545 		q->toplevel = TC_CBQ_MAXLEVEL;
546 #endif
547 	}
548 }
549 
550 static void
cbq_update(struct cbq_sched_data *q)
552 {
553 	struct cbq_class *this = q->tx_class;
554 	struct cbq_class *cl = this;
555 	int len = q->tx_len;
556 	psched_time_t now;
557 
558 	q->tx_class = NULL;
559 	/* Time integrator. We calculate EOS time
560 	 * by adding expected packet transmission time.
561 	 */
562 	now = q->now + L2T(&q->link, len);
563 
564 	for ( ; cl; cl = cl->share) {
565 		long avgidle = cl->avgidle;
566 		long idle;
567 
568 		cl->bstats.packets++;
569 		cl->bstats.bytes += len;
570 
571 		/*
572 		 * (now - last) is total time between packet right edges.
573 		 * (last_pktlen/rate) is "virtual" busy time, so that
574 		 *
575 		 *	idle = (now - last) - last_pktlen/rate
576 		 */
577 
578 		idle = now - cl->last;
579 		if ((unsigned long)idle > 128*1024*1024) {
580 			avgidle = cl->maxidle;
581 		} else {
582 			idle -= L2T(cl, len);
583 
584 		/* true_avgidle := (1-W)*true_avgidle + W*idle,
585 		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
586 		 * cl->avgidle == true_avgidle/W,
587 		 * hence:
588 		 */
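		/* For example, with ewma_log == 5 (W == 1/32) the update
		 * below is avgidle += idle - avgidle/32, a standard scaled
		 * exponentially weighted moving average.
		 */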
589 			avgidle += idle - (avgidle>>cl->ewma_log);
590 		}
591 
592 		if (avgidle <= 0) {
593 			/* Overlimit or at-limit */
594 
595 			if (avgidle < cl->minidle)
596 				avgidle = cl->minidle;
597 
598 			cl->avgidle = avgidle;
599 
			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * That occurs when:
603 			 * (1-W)*true_avgidle + W*delay = 0, i.e.
604 			 * idle = (1/W - 1)*(-true_avgidle)
605 			 * or
606 			 * idle = (1 - W)*(-cl->avgidle);
607 			 */
608 			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
609 
610 			/*
611 			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * needed to complete the transmitted packet.
			 * (len/phys_bandwidth has already elapsed by
			 * the moment cbq_update runs.)
617 			 */
618 
619 			idle -= L2T(&q->link, len);
620 			idle += L2T(cl, len);
621 
622 			cl->undertime = now + idle;
623 		} else {
624 			/* Underlimit */
625 
626 			cl->undertime = PSCHED_PASTPERFECT;
627 			if (avgidle > cl->maxidle)
628 				cl->avgidle = cl->maxidle;
629 			else
630 				cl->avgidle = avgidle;
631 		}
632 		if ((s64)(now - cl->last) > 0)
633 			cl->last = now;
634 	}
635 
636 	cbq_update_toplevel(q, this, q->tx_borrowed);
637 }
638 
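/* Return cl if it is underlimit, otherwise the first ancestor on its
 * borrow chain that is underlimit and not above the toplevel cutoff.
 * Returns NULL when nothing may send (the overlimit action is applied
 * only if the whole borrow chain was exhausted).
 */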
639 static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
641 {
642 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
643 	struct cbq_class *this_cl = cl;
644 
645 	if (cl->tparent == NULL)
646 		return cl;
647 
648 	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
649 		cl->delayed = 0;
650 		return cl;
651 	}
652 
653 	do {
		/* This is a very suspicious place. Overlimit
		 * action is now generated for non-bounded classes
		 * only if the link is completely congested.
		 * Though this agrees with the ancestor-only paradigm,
		 * it looks very stupid. In particular,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous and silly, but
		 * no other solution exists.
663 		 */
664 		cl = cl->borrow;
665 		if (!cl) {
666 			this_cl->qstats.overlimits++;
667 			cbq_overlimit(this_cl);
668 			return NULL;
669 		}
670 		if (cl->level > q->toplevel)
671 			return NULL;
672 	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
673 
674 	cl->delayed = 0;
675 	return cl;
676 }
677 
678 static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
680 {
681 	struct cbq_sched_data *q = qdisc_priv(sch);
682 	struct cbq_class *cl_tail, *cl_prev, *cl;
683 	struct sk_buff *skb;
684 	int deficit;
685 
686 	cl_tail = cl_prev = q->active[prio];
687 	cl = cl_prev->next_alive;
688 
689 	do {
690 		deficit = 0;
691 
692 		/* Start round */
693 		do {
694 			struct cbq_class *borrow = cl;
695 
696 			if (cl->q->q.qlen &&
697 			    (borrow = cbq_under_limit(cl)) == NULL)
698 				goto skip_class;
699 
700 			if (cl->deficit <= 0) {
701 				/* Class exhausted its allotment per
702 				 * this round. Switch to the next one.
703 				 */
704 				deficit = 1;
705 				cl->deficit += cl->quantum;
706 				goto next_class;
707 			}
708 
709 			skb = cl->q->dequeue(cl->q);
710 
711 			/* Class did not give us any skb :-(
712 			 * It could occur even if cl->q->q.qlen != 0
			 * e.g. if cl->q == "tbf"
714 			 */
715 			if (skb == NULL)
716 				goto skip_class;
717 
718 			cl->deficit -= qdisc_pkt_len(skb);
719 			q->tx_class = cl;
720 			q->tx_borrowed = borrow;
721 			if (borrow != cl) {
722 #ifndef CBQ_XSTATS_BORROWS_BYTES
723 				borrow->xstats.borrows++;
724 				cl->xstats.borrows++;
725 #else
726 				borrow->xstats.borrows += qdisc_pkt_len(skb);
727 				cl->xstats.borrows += qdisc_pkt_len(skb);
728 #endif
729 			}
730 			q->tx_len = qdisc_pkt_len(skb);
731 
732 			if (cl->deficit <= 0) {
733 				q->active[prio] = cl;
734 				cl = cl->next_alive;
735 				cl->deficit += cl->quantum;
736 			}
737 			return skb;
738 
739 skip_class:
740 			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
741 				/* Class is empty or penalized.
742 				 * Unlink it from active chain.
743 				 */
744 				cl_prev->next_alive = cl->next_alive;
745 				cl->next_alive = NULL;
746 
747 				/* Did cl_tail point to it? */
748 				if (cl == cl_tail) {
749 					/* Repair it! */
750 					cl_tail = cl_prev;
751 
752 					/* Was it the last class in this band? */
753 					if (cl == cl_tail) {
754 						/* Kill the band! */
755 						q->active[prio] = NULL;
756 						q->activemask &= ~(1<<prio);
757 						if (cl->q->q.qlen)
758 							cbq_activate_class(cl);
759 						return NULL;
760 					}
761 
762 					q->active[prio] = cl_tail;
763 				}
764 				if (cl->q->q.qlen)
765 					cbq_activate_class(cl);
766 
767 				cl = cl_prev;
768 			}
769 
770 next_class:
771 			cl_prev = cl;
772 			cl = cl->next_alive;
773 		} while (cl_prev != cl_tail);
774 	} while (deficit);
775 
776 	q->active[prio] = cl_prev;
777 
778 	return NULL;
779 }
780 
781 static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
783 {
784 	struct cbq_sched_data *q = qdisc_priv(sch);
785 	struct sk_buff *skb;
786 	unsigned int activemask;
787 
788 	activemask = q->activemask & 0xFF;
789 	while (activemask) {
790 		int prio = ffz(~activemask);
791 		activemask &= ~(1<<prio);
792 		skb = cbq_dequeue_prio(sch, prio);
793 		if (skb)
794 			return skb;
795 	}
796 	return NULL;
797 }
798 
799 static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
801 {
802 	struct sk_buff *skb;
803 	struct cbq_sched_data *q = qdisc_priv(sch);
804 	psched_time_t now;
805 
806 	now = psched_get_time();
807 
808 	if (q->tx_class)
809 		cbq_update(q);
810 
811 	q->now = now;
812 
813 	for (;;) {
814 		q->wd_expires = 0;
815 
816 		skb = cbq_dequeue_1(sch);
817 		if (skb) {
818 			qdisc_bstats_update(sch, skb);
819 			sch->q.qlen--;
820 			return skb;
821 		}
822 
823 		/* All the classes are overlimit.
824 		 *
825 		 * It is possible, if:
826 		 *
827 		 * 1. Scheduler is empty.
828 		 * 2. Toplevel cutoff inhibited borrowing.
829 		 * 3. Root class is overlimit.
830 		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy: peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the top-level algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; this is unavoidable with top-level sharing.
839 		 */
840 
841 		if (q->toplevel == TC_CBQ_MAXLEVEL &&
842 		    q->link.undertime == PSCHED_PASTPERFECT)
843 			break;
844 
845 		q->toplevel = TC_CBQ_MAXLEVEL;
846 		q->link.undertime = PSCHED_PASTPERFECT;
847 	}
848 
849 	/* No packets in scheduler or nobody wants to give them to us :-(
	 * Sigh... start the watchdog timer in the latter case.
851 	 */
852 
853 	if (sch->q.qlen) {
854 		qdisc_qstats_overlimit(sch);
855 		if (q->wd_expires)
856 			qdisc_watchdog_schedule(&q->watchdog,
857 						now + q->wd_expires);
858 	}
859 	return NULL;
860 }
861 
862 /* CBQ class maintenance routines */
863 
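/* Recompute the hierarchy level of 'this' and of all its ancestors:
 * a node's level is the maximal level of its children plus one.
 */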
static void cbq_adjust_levels(struct cbq_class *this)
865 {
866 	if (this == NULL)
867 		return;
868 
869 	do {
870 		int level = 0;
871 		struct cbq_class *cl;
872 
873 		cl = this->children;
874 		if (cl) {
875 			do {
876 				if (cl->level > level)
877 					level = cl->level;
878 			} while ((cl = cl->sibling) != this->children);
879 		}
880 		this->level = level + 1;
881 	} while ((this = this->tparent) != NULL);
882 }
883 
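/* Rescale the WRR quantum of every class in the given priority band
 * according to its weight, and repair obviously bad values (non-positive
 * or larger than 32 device MTUs).
 */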
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
885 {
886 	struct cbq_class *cl;
887 	unsigned int h;
888 
889 	if (q->quanta[prio] == 0)
890 		return;
891 
892 	for (h = 0; h < q->clhash.hashsize; h++) {
893 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflow!
896 			 */
897 			if (cl->priority == prio) {
898 				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
899 					q->quanta[prio];
900 			}
901 			if (cl->quantum <= 0 ||
902 			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
903 				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
904 					cl->common.classid, cl->quantum);
905 				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
906 			}
907 		}
908 	}
909 }
910 
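/* Rebuild the default-class table of cl's split node: for each priority,
 * pick the lowest-level class attached to that split whose defmap covers
 * the priority.
 */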
static void cbq_sync_defmap(struct cbq_class *cl)
912 {
913 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
914 	struct cbq_class *split = cl->split;
915 	unsigned int h;
916 	int i;
917 
918 	if (split == NULL)
919 		return;
920 
921 	for (i = 0; i <= TC_PRIO_MAX; i++) {
922 		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
923 			split->defaults[i] = NULL;
924 	}
925 
926 	for (i = 0; i <= TC_PRIO_MAX; i++) {
927 		int level = split->level;
928 
929 		if (split->defaults[i])
930 			continue;
931 
932 		for (h = 0; h < q->clhash.hashsize; h++) {
933 			struct cbq_class *c;
934 
935 			hlist_for_each_entry(c, &q->clhash.hash[h],
936 					     common.hnode) {
937 				if (c->split == split && c->level < level &&
938 				    c->defmap & (1<<i)) {
939 					split->defaults[i] = c;
940 					level = c->level;
941 				}
942 			}
943 		}
944 	}
945 }
946 
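/* Attach cl to split node 'splitid' (0 means keep the current split;
 * otherwise the split must be an ancestor of cl) and update the bits of
 * cl's defmap selected by 'mask' with 'def'.
 */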
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
948 {
949 	struct cbq_class *split = NULL;
950 
951 	if (splitid == 0) {
952 		split = cl->split;
953 		if (!split)
954 			return;
955 		splitid = split->common.classid;
956 	}
957 
958 	if (split == NULL || split->common.classid != splitid) {
959 		for (split = cl->tparent; split; split = split->tparent)
960 			if (split->common.classid == splitid)
961 				break;
962 	}
963 
964 	if (split == NULL)
965 		return;
966 
967 	if (cl->split != split) {
968 		cl->defmap = 0;
969 		cbq_sync_defmap(cl);
970 		cl->split = split;
971 		cl->defmap = def & mask;
972 	} else
973 		cl->defmap = (cl->defmap & ~mask) | (def & mask);
974 
975 	cbq_sync_defmap(cl);
976 }
977 
static void cbq_unlink_class(struct cbq_class *this)
979 {
980 	struct cbq_class *cl, **clp;
981 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
982 
983 	qdisc_class_hash_remove(&q->clhash, &this->common);
984 
985 	if (this->tparent) {
986 		clp = &this->sibling;
987 		cl = *clp;
988 		do {
989 			if (cl == this) {
990 				*clp = cl->sibling;
991 				break;
992 			}
993 			clp = &cl->sibling;
994 		} while ((cl = *clp) != this->sibling);
995 
996 		if (this->tparent->children == this) {
997 			this->tparent->children = this->sibling;
998 			if (this->sibling == this)
999 				this->tparent->children = NULL;
1000 		}
1001 	} else {
1002 		WARN_ON(this->sibling != this);
1003 	}
1004 }
1005 
static void cbq_link_class(struct cbq_class *this)
1007 {
1008 	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1009 	struct cbq_class *parent = this->tparent;
1010 
1011 	this->sibling = this;
1012 	qdisc_class_hash_insert(&q->clhash, &this->common);
1013 
1014 	if (parent == NULL)
1015 		return;
1016 
1017 	if (parent->children == NULL) {
1018 		parent->children = this;
1019 	} else {
1020 		this->sibling = parent->children->sibling;
1021 		parent->children->sibling = this;
1022 	}
1023 }
1024 
1025 static void
cbq_reset(struct Qdisc *sch)
1027 {
1028 	struct cbq_sched_data *q = qdisc_priv(sch);
1029 	struct cbq_class *cl;
1030 	int prio;
1031 	unsigned int h;
1032 
1033 	q->activemask = 0;
1034 	q->pmask = 0;
1035 	q->tx_class = NULL;
1036 	q->tx_borrowed = NULL;
1037 	qdisc_watchdog_cancel(&q->watchdog);
1038 	hrtimer_cancel(&q->delay_timer);
1039 	q->toplevel = TC_CBQ_MAXLEVEL;
1040 	q->now = psched_get_time();
1041 
1042 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1043 		q->active[prio] = NULL;
1044 
1045 	for (h = 0; h < q->clhash.hashsize; h++) {
1046 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1047 			qdisc_reset(cl->q);
1048 
1049 			cl->next_alive = NULL;
1050 			cl->undertime = PSCHED_PASTPERFECT;
1051 			cl->avgidle = cl->maxidle;
1052 			cl->deficit = cl->quantum;
1053 			cl->cpriority = cl->priority;
1054 		}
1055 	}
1056 	sch->q.qlen = 0;
1057 }
1058 
1059 
static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1061 {
1062 	if (lss->change & TCF_CBQ_LSS_FLAGS) {
1063 		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1064 		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1065 	}
1066 	if (lss->change & TCF_CBQ_LSS_EWMA)
1067 		cl->ewma_log = lss->ewma_log;
1068 	if (lss->change & TCF_CBQ_LSS_AVPKT)
1069 		cl->avpkt = lss->avpkt;
1070 	if (lss->change & TCF_CBQ_LSS_MINIDLE)
1071 		cl->minidle = -(long)lss->minidle;
1072 	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1073 		cl->maxidle = lss->maxidle;
1074 		cl->avgidle = lss->maxidle;
1075 	}
1076 	if (lss->change & TCF_CBQ_LSS_OFFTIME)
1077 		cl->offtime = lss->offtime;
1078 	return 0;
1079 }
1080 
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1082 {
1083 	q->nclasses[cl->priority]--;
1084 	q->quanta[cl->priority] -= cl->weight;
1085 	cbq_normalize_quanta(q, cl->priority);
1086 }
1087 
static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1089 {
1090 	q->nclasses[cl->priority]++;
1091 	q->quanta[cl->priority] += cl->weight;
1092 	cbq_normalize_quanta(q, cl->priority);
1093 }
1094 
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1096 {
1097 	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1098 
1099 	if (wrr->allot)
1100 		cl->allot = wrr->allot;
1101 	if (wrr->weight)
1102 		cl->weight = wrr->weight;
1103 	if (wrr->priority) {
1104 		cl->priority = wrr->priority - 1;
1105 		cl->cpriority = cl->priority;
1106 		if (cl->priority >= cl->priority2)
1107 			cl->priority2 = TC_CBQ_MAXPRIO - 1;
1108 	}
1109 
1110 	cbq_addprio(q, cl);
1111 	return 0;
1112 }
1113 
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1115 {
1116 	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1117 	return 0;
1118 }
1119 
1120 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1121 	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
1122 	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
1123 	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
1124 	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
1125 	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
1126 	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1127 	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
1128 };
1129 
static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1131 			 struct nlattr *opt,
1132 			 struct netlink_ext_ack *extack)
1133 {
1134 	int err;
1135 
1136 	if (!opt) {
1137 		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1138 		return -EINVAL;
1139 	}
1140 
1141 	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1142 					  cbq_policy, extack);
1143 	if (err < 0)
1144 		return err;
1145 
1146 	if (tb[TCA_CBQ_WRROPT]) {
1147 		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148 
1149 		if (wrr->priority > TC_CBQ_MAXPRIO) {
1150 			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1151 			err = -EINVAL;
1152 		}
1153 	}
1154 	return err;
1155 }
1156 
static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1158 		    struct netlink_ext_ack *extack)
1159 {
1160 	struct cbq_sched_data *q = qdisc_priv(sch);
1161 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1162 	struct tc_ratespec *r;
1163 	int err;
1164 
1165 	qdisc_watchdog_init(&q->watchdog, sch);
1166 	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1167 	q->delay_timer.function = cbq_undelay;
1168 
1169 	err = cbq_opt_parse(tb, opt, extack);
1170 	if (err < 0)
1171 		return err;
1172 
1173 	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1174 		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1175 		return -EINVAL;
1176 	}
1177 
1178 	r = nla_data(tb[TCA_CBQ_RATE]);
1179 
1180 	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1181 	if (!q->link.R_tab)
1182 		return -EINVAL;
1183 
1184 	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1185 	if (err)
1186 		goto put_rtab;
1187 
1188 	err = qdisc_class_hash_init(&q->clhash);
1189 	if (err < 0)
1190 		goto put_block;
1191 
1192 	q->link.sibling = &q->link;
1193 	q->link.common.classid = sch->handle;
1194 	q->link.qdisc = sch;
1195 	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1196 				      sch->handle, NULL);
1197 	if (!q->link.q)
1198 		q->link.q = &noop_qdisc;
1199 	else
1200 		qdisc_hash_add(q->link.q, true);
1201 
1202 	q->link.priority = TC_CBQ_MAXPRIO - 1;
1203 	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1204 	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1205 	q->link.allot = psched_mtu(qdisc_dev(sch));
1206 	q->link.quantum = q->link.allot;
1207 	q->link.weight = q->link.R_tab->rate.rate;
1208 
1209 	q->link.ewma_log = TC_CBQ_DEF_EWMA;
1210 	q->link.avpkt = q->link.allot/2;
1211 	q->link.minidle = -0x7FFFFFFF;
1212 
1213 	q->toplevel = TC_CBQ_MAXLEVEL;
1214 	q->now = psched_get_time();
1215 
1216 	cbq_link_class(&q->link);
1217 
1218 	if (tb[TCA_CBQ_LSSOPT])
1219 		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1220 
1221 	cbq_addprio(q, &q->link);
1222 	return 0;
1223 
1224 put_block:
1225 	tcf_block_put(q->link.block);
1226 
1227 put_rtab:
1228 	qdisc_put_rtab(q->link.R_tab);
1229 	return err;
1230 }
1231 
static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1233 {
1234 	unsigned char *b = skb_tail_pointer(skb);
1235 
1236 	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1237 		goto nla_put_failure;
1238 	return skb->len;
1239 
1240 nla_put_failure:
1241 	nlmsg_trim(skb, b);
1242 	return -1;
1243 }
1244 
static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1246 {
1247 	unsigned char *b = skb_tail_pointer(skb);
1248 	struct tc_cbq_lssopt opt;
1249 
1250 	opt.flags = 0;
1251 	if (cl->borrow == NULL)
1252 		opt.flags |= TCF_CBQ_LSS_BOUNDED;
1253 	if (cl->share == NULL)
1254 		opt.flags |= TCF_CBQ_LSS_ISOLATED;
1255 	opt.ewma_log = cl->ewma_log;
1256 	opt.level = cl->level;
1257 	opt.avpkt = cl->avpkt;
1258 	opt.maxidle = cl->maxidle;
1259 	opt.minidle = (u32)(-cl->minidle);
1260 	opt.offtime = cl->offtime;
1261 	opt.change = ~0;
1262 	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1263 		goto nla_put_failure;
1264 	return skb->len;
1265 
1266 nla_put_failure:
1267 	nlmsg_trim(skb, b);
1268 	return -1;
1269 }
1270 
static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1272 {
1273 	unsigned char *b = skb_tail_pointer(skb);
1274 	struct tc_cbq_wrropt opt;
1275 
1276 	memset(&opt, 0, sizeof(opt));
1277 	opt.flags = 0;
1278 	opt.allot = cl->allot;
1279 	opt.priority = cl->priority + 1;
1280 	opt.cpriority = cl->cpriority + 1;
1281 	opt.weight = cl->weight;
1282 	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1283 		goto nla_put_failure;
1284 	return skb->len;
1285 
1286 nla_put_failure:
1287 	nlmsg_trim(skb, b);
1288 	return -1;
1289 }
1290 
static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1292 {
1293 	unsigned char *b = skb_tail_pointer(skb);
1294 	struct tc_cbq_fopt opt;
1295 
1296 	if (cl->split || cl->defmap) {
1297 		opt.split = cl->split ? cl->split->common.classid : 0;
1298 		opt.defmap = cl->defmap;
1299 		opt.defchange = ~0;
1300 		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1301 			goto nla_put_failure;
1302 	}
1303 	return skb->len;
1304 
1305 nla_put_failure:
1306 	nlmsg_trim(skb, b);
1307 	return -1;
1308 }
1309 
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1311 {
1312 	if (cbq_dump_lss(skb, cl) < 0 ||
1313 	    cbq_dump_rate(skb, cl) < 0 ||
1314 	    cbq_dump_wrr(skb, cl) < 0 ||
1315 	    cbq_dump_fopt(skb, cl) < 0)
1316 		return -1;
1317 	return 0;
1318 }
1319 
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1321 {
1322 	struct cbq_sched_data *q = qdisc_priv(sch);
1323 	struct nlattr *nest;
1324 
1325 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1326 	if (nest == NULL)
1327 		goto nla_put_failure;
1328 	if (cbq_dump_attr(skb, &q->link) < 0)
1329 		goto nla_put_failure;
1330 	return nla_nest_end(skb, nest);
1331 
1332 nla_put_failure:
1333 	nla_nest_cancel(skb, nest);
1334 	return -1;
1335 }
1336 
1337 static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1339 {
1340 	struct cbq_sched_data *q = qdisc_priv(sch);
1341 
1342 	q->link.xstats.avgidle = q->link.avgidle;
1343 	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1344 }
1345 
1346 static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1348 	       struct sk_buff *skb, struct tcmsg *tcm)
1349 {
1350 	struct cbq_class *cl = (struct cbq_class *)arg;
1351 	struct nlattr *nest;
1352 
1353 	if (cl->tparent)
1354 		tcm->tcm_parent = cl->tparent->common.classid;
1355 	else
1356 		tcm->tcm_parent = TC_H_ROOT;
1357 	tcm->tcm_handle = cl->common.classid;
1358 	tcm->tcm_info = cl->q->handle;
1359 
1360 	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1361 	if (nest == NULL)
1362 		goto nla_put_failure;
1363 	if (cbq_dump_attr(skb, cl) < 0)
1364 		goto nla_put_failure;
1365 	return nla_nest_end(skb, nest);
1366 
1367 nla_put_failure:
1368 	nla_nest_cancel(skb, nest);
1369 	return -1;
1370 }
1371 
1372 static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1374 	struct gnet_dump *d)
1375 {
1376 	struct cbq_sched_data *q = qdisc_priv(sch);
1377 	struct cbq_class *cl = (struct cbq_class *)arg;
1378 	__u32 qlen;
1379 
1380 	cl->xstats.avgidle = cl->avgidle;
1381 	cl->xstats.undertime = 0;
1382 	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1383 
1384 	if (cl->undertime != PSCHED_PASTPERFECT)
1385 		cl->xstats.undertime = cl->undertime - q->now;
1386 
1387 	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1388 				  d, NULL, &cl->bstats) < 0 ||
1389 	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1390 	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1391 		return -1;
1392 
1393 	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1394 }
1395 
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1397 		     struct Qdisc **old, struct netlink_ext_ack *extack)
1398 {
1399 	struct cbq_class *cl = (struct cbq_class *)arg;
1400 
1401 	if (new == NULL) {
1402 		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1403 					cl->common.classid, extack);
1404 		if (new == NULL)
1405 			return -ENOBUFS;
1406 	}
1407 
1408 	*old = qdisc_replace(sch, new, &cl->q);
1409 	return 0;
1410 }
1411 
static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1413 {
1414 	struct cbq_class *cl = (struct cbq_class *)arg;
1415 
1416 	return cl->q;
1417 }
1418 
static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1420 {
1421 	struct cbq_class *cl = (struct cbq_class *)arg;
1422 
1423 	cbq_deactivate_class(cl);
1424 }
1425 
static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1427 {
1428 	struct cbq_sched_data *q = qdisc_priv(sch);
1429 
1430 	return (unsigned long)cbq_class_lookup(q, classid);
1431 }
1432 
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1434 {
1435 	struct cbq_sched_data *q = qdisc_priv(sch);
1436 
1437 	WARN_ON(cl->filters);
1438 
1439 	tcf_block_put(cl->block);
1440 	qdisc_put(cl->q);
1441 	qdisc_put_rtab(cl->R_tab);
1442 	gen_kill_estimator(&cl->rate_est);
1443 	if (cl != &q->link)
1444 		kfree(cl);
1445 }
1446 
static void cbq_destroy(struct Qdisc *sch)
1448 {
1449 	struct cbq_sched_data *q = qdisc_priv(sch);
1450 	struct hlist_node *next;
1451 	struct cbq_class *cl;
1452 	unsigned int h;
1453 
1454 #ifdef CONFIG_NET_CLS_ACT
1455 	q->rx_class = NULL;
1456 #endif
1457 	/*
1458 	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leaves, which means that filters can still
1460 	 * be bound to classes which have been destroyed already. --TGR '04
1461 	 */
1462 	for (h = 0; h < q->clhash.hashsize; h++) {
1463 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1464 			tcf_block_put(cl->block);
1465 			cl->block = NULL;
1466 		}
1467 	}
1468 	for (h = 0; h < q->clhash.hashsize; h++) {
1469 		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1470 					  common.hnode)
1471 			cbq_destroy_class(sch, cl);
1472 	}
1473 	qdisc_class_hash_destroy(&q->clhash);
1474 }
1475 
1476 static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1478 		 unsigned long *arg, struct netlink_ext_ack *extack)
1479 {
1480 	int err;
1481 	struct cbq_sched_data *q = qdisc_priv(sch);
1482 	struct cbq_class *cl = (struct cbq_class *)*arg;
1483 	struct nlattr *opt = tca[TCA_OPTIONS];
1484 	struct nlattr *tb[TCA_CBQ_MAX + 1];
1485 	struct cbq_class *parent;
1486 	struct qdisc_rate_table *rtab = NULL;
1487 
1488 	err = cbq_opt_parse(tb, opt, extack);
1489 	if (err < 0)
1490 		return err;
1491 
1492 	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1493 		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1494 		return -EOPNOTSUPP;
1495 	}
1496 
1497 	if (cl) {
1498 		/* Check parent */
1499 		if (parentid) {
1500 			if (cl->tparent &&
1501 			    cl->tparent->common.classid != parentid) {
1502 				NL_SET_ERR_MSG(extack, "Invalid parent id");
1503 				return -EINVAL;
1504 			}
1505 			if (!cl->tparent && parentid != TC_H_ROOT) {
1506 				NL_SET_ERR_MSG(extack, "Parent must be root");
1507 				return -EINVAL;
1508 			}
1509 		}
1510 
1511 		if (tb[TCA_CBQ_RATE]) {
1512 			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1513 					      tb[TCA_CBQ_RTAB], extack);
1514 			if (rtab == NULL)
1515 				return -EINVAL;
1516 		}
1517 
1518 		if (tca[TCA_RATE]) {
1519 			err = gen_replace_estimator(&cl->bstats, NULL,
1520 						    &cl->rate_est,
1521 						    NULL,
1522 						    qdisc_root_sleeping_running(sch),
1523 						    tca[TCA_RATE]);
1524 			if (err) {
1525 				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1526 				qdisc_put_rtab(rtab);
1527 				return err;
1528 			}
1529 		}
1530 
1531 		/* Change class parameters */
1532 		sch_tree_lock(sch);
1533 
1534 		if (cl->next_alive != NULL)
1535 			cbq_deactivate_class(cl);
1536 
1537 		if (rtab) {
1538 			qdisc_put_rtab(cl->R_tab);
1539 			cl->R_tab = rtab;
1540 		}
1541 
1542 		if (tb[TCA_CBQ_LSSOPT])
1543 			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1544 
1545 		if (tb[TCA_CBQ_WRROPT]) {
1546 			cbq_rmprio(q, cl);
1547 			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1548 		}
1549 
1550 		if (tb[TCA_CBQ_FOPT])
1551 			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1552 
1553 		if (cl->q->q.qlen)
1554 			cbq_activate_class(cl);
1555 
1556 		sch_tree_unlock(sch);
1557 
1558 		return 0;
1559 	}
1560 
1561 	if (parentid == TC_H_ROOT)
1562 		return -EINVAL;
1563 
1564 	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
		NL_SET_ERR_MSG(extack, "The following attributes MUST all be specified: WRR, rate and link sharing (LSS)");
1566 		return -EINVAL;
1567 	}
1568 
1569 	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1570 			      extack);
1571 	if (rtab == NULL)
1572 		return -EINVAL;
1573 
1574 	if (classid) {
1575 		err = -EINVAL;
1576 		if (TC_H_MAJ(classid ^ sch->handle) ||
1577 		    cbq_class_lookup(q, classid)) {
1578 			NL_SET_ERR_MSG(extack, "Specified class not found");
1579 			goto failure;
1580 		}
1581 	} else {
1582 		int i;
1583 		classid = TC_H_MAKE(sch->handle, 0x8000);
1584 
1585 		for (i = 0; i < 0x8000; i++) {
1586 			if (++q->hgenerator >= 0x8000)
1587 				q->hgenerator = 1;
1588 			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1589 				break;
1590 		}
1591 		err = -ENOSR;
1592 		if (i >= 0x8000) {
1593 			NL_SET_ERR_MSG(extack, "Unable to generate classid");
1594 			goto failure;
1595 		}
1596 		classid = classid|q->hgenerator;
1597 	}
1598 
1599 	parent = &q->link;
1600 	if (parentid) {
1601 		parent = cbq_class_lookup(q, parentid);
1602 		err = -EINVAL;
1603 		if (!parent) {
1604 			NL_SET_ERR_MSG(extack, "Failed to find parentid");
1605 			goto failure;
1606 		}
1607 	}
1608 
1609 	err = -ENOBUFS;
1610 	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1611 	if (cl == NULL)
1612 		goto failure;
1613 
1614 	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1615 	if (err) {
1616 		kfree(cl);
1617 		return err;
1618 	}
1619 
1620 	if (tca[TCA_RATE]) {
1621 		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1622 					NULL,
1623 					qdisc_root_sleeping_running(sch),
1624 					tca[TCA_RATE]);
1625 		if (err) {
1626 			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1627 			tcf_block_put(cl->block);
1628 			kfree(cl);
1629 			goto failure;
1630 		}
1631 	}
1632 
1633 	cl->R_tab = rtab;
1634 	rtab = NULL;
1635 	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1636 				  NULL);
1637 	if (!cl->q)
1638 		cl->q = &noop_qdisc;
1639 	else
1640 		qdisc_hash_add(cl->q, true);
1641 
1642 	cl->common.classid = classid;
1643 	cl->tparent = parent;
1644 	cl->qdisc = sch;
1645 	cl->allot = parent->allot;
1646 	cl->quantum = cl->allot;
1647 	cl->weight = cl->R_tab->rate.rate;
1648 
1649 	sch_tree_lock(sch);
1650 	cbq_link_class(cl);
1651 	cl->borrow = cl->tparent;
1652 	if (cl->tparent != &q->link)
1653 		cl->share = cl->tparent;
1654 	cbq_adjust_levels(parent);
1655 	cl->minidle = -0x7FFFFFFF;
1656 	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1657 	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1658 	if (cl->ewma_log == 0)
1659 		cl->ewma_log = q->link.ewma_log;
1660 	if (cl->maxidle == 0)
1661 		cl->maxidle = q->link.maxidle;
1662 	if (cl->avpkt == 0)
1663 		cl->avpkt = q->link.avpkt;
1664 	if (tb[TCA_CBQ_FOPT])
1665 		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1666 	sch_tree_unlock(sch);
1667 
1668 	qdisc_class_hash_grow(sch, &q->clhash);
1669 
1670 	*arg = (unsigned long)cl;
1671 	return 0;
1672 
1673 failure:
1674 	qdisc_put_rtab(rtab);
1675 	return err;
1676 }
1677 
static int cbq_delete(struct Qdisc *sch, unsigned long arg,
1679 		      struct netlink_ext_ack *extack)
1680 {
1681 	struct cbq_sched_data *q = qdisc_priv(sch);
1682 	struct cbq_class *cl = (struct cbq_class *)arg;
1683 
1684 	if (cl->filters || cl->children || cl == &q->link)
1685 		return -EBUSY;
1686 
1687 	sch_tree_lock(sch);
1688 
1689 	qdisc_purge_queue(cl->q);
1690 
1691 	if (cl->next_alive)
1692 		cbq_deactivate_class(cl);
1693 
1694 	if (q->tx_borrowed == cl)
1695 		q->tx_borrowed = q->tx_class;
1696 	if (q->tx_class == cl) {
1697 		q->tx_class = NULL;
1698 		q->tx_borrowed = NULL;
1699 	}
1700 #ifdef CONFIG_NET_CLS_ACT
1701 	if (q->rx_class == cl)
1702 		q->rx_class = NULL;
1703 #endif
1704 
1705 	cbq_unlink_class(cl);
1706 	cbq_adjust_levels(cl->tparent);
1707 	cl->defmap = 0;
1708 	cbq_sync_defmap(cl);
1709 
1710 	cbq_rmprio(q, cl);
1711 	sch_tree_unlock(sch);
1712 
1713 	cbq_destroy_class(sch, cl);
1714 	return 0;
1715 }
1716 
static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1718 				       struct netlink_ext_ack *extack)
1719 {
1720 	struct cbq_sched_data *q = qdisc_priv(sch);
1721 	struct cbq_class *cl = (struct cbq_class *)arg;
1722 
1723 	if (cl == NULL)
1724 		cl = &q->link;
1725 
1726 	return cl->block;
1727 }
1728 
static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1730 				     u32 classid)
1731 {
1732 	struct cbq_sched_data *q = qdisc_priv(sch);
1733 	struct cbq_class *p = (struct cbq_class *)parent;
1734 	struct cbq_class *cl = cbq_class_lookup(q, classid);
1735 
1736 	if (cl) {
1737 		if (p && p->level <= cl->level)
1738 			return 0;
1739 		cl->filters++;
1740 		return (unsigned long)cl;
1741 	}
1742 	return 0;
1743 }
1744 
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1746 {
1747 	struct cbq_class *cl = (struct cbq_class *)arg;
1748 
1749 	cl->filters--;
1750 }
1751 
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1753 {
1754 	struct cbq_sched_data *q = qdisc_priv(sch);
1755 	struct cbq_class *cl;
1756 	unsigned int h;
1757 
1758 	if (arg->stop)
1759 		return;
1760 
1761 	for (h = 0; h < q->clhash.hashsize; h++) {
1762 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1763 			if (arg->count < arg->skip) {
1764 				arg->count++;
1765 				continue;
1766 			}
1767 			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1768 				arg->stop = 1;
1769 				return;
1770 			}
1771 			arg->count++;
1772 		}
1773 	}
1774 }
1775 
1776 static const struct Qdisc_class_ops cbq_class_ops = {
1777 	.graft		=	cbq_graft,
1778 	.leaf		=	cbq_leaf,
1779 	.qlen_notify	=	cbq_qlen_notify,
1780 	.find		=	cbq_find,
1781 	.change		=	cbq_change_class,
1782 	.delete		=	cbq_delete,
1783 	.walk		=	cbq_walk,
1784 	.tcf_block	=	cbq_tcf_block,
1785 	.bind_tcf	=	cbq_bind_filter,
1786 	.unbind_tcf	=	cbq_unbind_filter,
1787 	.dump		=	cbq_dump_class,
1788 	.dump_stats	=	cbq_dump_class_stats,
1789 };
1790 
1791 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1792 	.next		=	NULL,
1793 	.cl_ops		=	&cbq_class_ops,
1794 	.id		=	"cbq",
1795 	.priv_size	=	sizeof(struct cbq_sched_data),
1796 	.enqueue	=	cbq_enqueue,
1797 	.dequeue	=	cbq_dequeue,
1798 	.peek		=	qdisc_peek_dequeued,
1799 	.init		=	cbq_init,
1800 	.reset		=	cbq_reset,
1801 	.destroy	=	cbq_destroy,
1802 	.change		=	NULL,
1803 	.dump		=	cbq_dump,
1804 	.dump_stats	=	cbq_dump_stats,
1805 	.owner		=	THIS_MODULE,
1806 };
1807 
static int __init cbq_module_init(void)
1809 {
1810 	return register_qdisc(&cbq_qdisc_ops);
1811 }
static void __exit cbq_module_exit(void)
1813 {
1814 	unregister_qdisc(&cbq_qdisc_ops);
1815 }
1816 module_init(cbq_module_init)
1817 module_exit(cbq_module_exit)
1818 MODULE_LICENSE("GPL");