1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2 
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *		   (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
52 
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/pkt_sched.h>
69 #include <net/pkt_cls.h>
70 #include <net/tcp.h>
71 #include <net/flow_dissector.h>
72 
73 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
74 #include <net/netfilter/nf_conntrack_core.h>
75 #endif
76 
77 #define CAKE_SET_WAYS (8)
78 #define CAKE_MAX_TINS (8)
79 #define CAKE_QUEUES (1024)
80 #define CAKE_FLOW_MASK 63
81 #define CAKE_FLOW_NAT_FLAG 64
82 
83 /* struct cobalt_params - contains codel and blue parameters
84  * @interval:	codel initial drop rate
85  * @target:     maximum persistent sojourn time & blue update rate
86  * @mtu_time:   serialisation delay of maximum-size packet
87  * @p_inc:      increment of blue drop probability (0.32 fxp)
88  * @p_dec:      decrement of blue drop probability (0.32 fxp)
89  */
90 struct cobalt_params {
91 	u64	interval;
92 	u64	target;
93 	u64	mtu_time;
94 	u32	p_inc;
95 	u32	p_dec;
96 };
97 
98 /* struct cobalt_vars - contains codel and blue variables
99  * @count:		codel dropping frequency
100  * @rec_inv_sqrt:	reciprocal value of sqrt(count) >> 1
101  * @drop_next:		time to drop next packet, or when we dropped last
102  * @blue_timer:		Blue time to next drop
103  * @p_drop:		BLUE drop probability (0.32 fxp)
104  * @dropping:		set if in dropping state
105  * @ecn_marked:		set if marked
106  */
107 struct cobalt_vars {
108 	u32	count;
109 	u32	rec_inv_sqrt;
110 	ktime_t	drop_next;
111 	ktime_t	blue_timer;
112 	u32     p_drop;
113 	bool	dropping;
114 	bool    ecn_marked;
115 };
116 
117 enum {
118 	CAKE_SET_NONE = 0,
119 	CAKE_SET_SPARSE,
120 	CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
121 	CAKE_SET_BULK,
122 	CAKE_SET_DECAYING
123 };
124 
125 struct cake_flow {
126 	/* this stuff is all needed per-flow at dequeue time */
127 	struct sk_buff	  *head;
128 	struct sk_buff	  *tail;
129 	struct list_head  flowchain;
130 	s32		  deficit;
131 	u32		  dropped;
132 	struct cobalt_vars cvars;
133 	u16		  srchost; /* index into cake_host table */
134 	u16		  dsthost;
135 	u8		  set;
136 }; /* please try to keep this structure <= 64 bytes */
137 
138 struct cake_host {
139 	u32 srchost_tag;
140 	u32 dsthost_tag;
141 	u16 srchost_bulk_flow_count;
142 	u16 dsthost_bulk_flow_count;
143 };
144 
145 struct cake_heap_entry {
146 	u16 t:3, b:10;
147 };
148 
149 struct cake_tin_data {
150 	struct cake_flow flows[CAKE_QUEUES];
151 	u32	backlogs[CAKE_QUEUES];
152 	u32	tags[CAKE_QUEUES]; /* for set association */
153 	u16	overflow_idx[CAKE_QUEUES];
154 	struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
155 	u16	flow_quantum;
156 
157 	struct cobalt_params cparams;
158 	u32	drop_overlimit;
159 	u16	bulk_flow_count;
160 	u16	sparse_flow_count;
161 	u16	decaying_flow_count;
162 	u16	unresponsive_flow_count;
163 
164 	u32	max_skblen;
165 
166 	struct list_head new_flows;
167 	struct list_head old_flows;
168 	struct list_head decaying_flows;
169 
170 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
171 	ktime_t	time_next_packet;
172 	u64	tin_rate_ns;
173 	u64	tin_rate_bps;
174 	u16	tin_rate_shft;
175 
176 	u16	tin_quantum;
177 	s32	tin_deficit;
178 	u32	tin_backlog;
179 	u32	tin_dropped;
180 	u32	tin_ecn_mark;
181 
182 	u32	packets;
183 	u64	bytes;
184 
185 	u32	ack_drops;
186 
187 	/* moving averages */
188 	u64 avge_delay;
189 	u64 peak_delay;
190 	u64 base_delay;
191 
192 	/* hash function stats */
193 	u32	way_directs;
194 	u32	way_hits;
195 	u32	way_misses;
196 	u32	way_collisions;
197 }; /* number of tins is small, so size of this struct doesn't matter much */
198 
199 struct cake_sched_data {
200 	struct tcf_proto __rcu *filter_list; /* optional external classifier */
201 	struct tcf_block *block;
202 	struct cake_tin_data *tins;
203 
204 	struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
205 	u16		overflow_timeout;
206 
207 	u16		tin_cnt;
208 	u8		tin_mode;
209 	u8		flow_mode;
210 	u8		ack_filter;
211 	u8		atm_mode;
212 
213 	u32		fwmark_mask;
214 	u16		fwmark_shft;
215 
216 	/* time_next = time_this + ((len * rate_ns) >> rate_shft) */
217 	u16		rate_shft;
218 	ktime_t		time_next_packet;
219 	ktime_t		failsafe_next_packet;
220 	u64		rate_ns;
221 	u64		rate_bps;
222 	u16		rate_flags;
223 	s16		rate_overhead;
224 	u16		rate_mpu;
225 	u64		interval;
226 	u64		target;
227 
228 	/* resource tracking */
229 	u32		buffer_used;
230 	u32		buffer_max_used;
231 	u32		buffer_limit;
232 	u32		buffer_config_limit;
233 
234 	/* indices for dequeue */
235 	u16		cur_tin;
236 	u16		cur_flow;
237 
238 	struct qdisc_watchdog watchdog;
239 	const u8	*tin_index;
240 	const u8	*tin_order;
241 
242 	/* bandwidth capacity estimate */
243 	ktime_t		last_packet_time;
244 	ktime_t		avg_window_begin;
245 	u64		avg_packet_interval;
246 	u64		avg_window_bytes;
247 	u64		avg_peak_bandwidth;
248 	ktime_t		last_reconfig_time;
249 
250 	/* packet length stats */
251 	u32		avg_netoff;
252 	u16		max_netlen;
253 	u16		max_adjlen;
254 	u16		min_netlen;
255 	u16		min_adjlen;
256 };
257 
258 enum {
259 	CAKE_FLAG_OVERHEAD	   = BIT(0),
260 	CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
261 	CAKE_FLAG_INGRESS	   = BIT(2),
262 	CAKE_FLAG_WASH		   = BIT(3),
263 	CAKE_FLAG_SPLIT_GSO	   = BIT(4)
264 };
265 
266 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
267  * obtain the best features of each.  Codel is excellent on flows which
268  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
269  * unresponsive flows.
270  */
271 
272 struct cobalt_skb_cb {
273 	ktime_t enqueue_time;
274 	u32     adjusted_len;
275 };
276 
us_to_ns(u64 us)277 static u64 us_to_ns(u64 us)
278 {
279 	return us * NSEC_PER_USEC;
280 }
281 
get_cobalt_cb(const struct sk_buff * skb)282 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
283 {
284 	qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
285 	return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
286 }
287 
cobalt_get_enqueue_time(const struct sk_buff * skb)288 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
289 {
290 	return get_cobalt_cb(skb)->enqueue_time;
291 }
292 
cobalt_set_enqueue_time(struct sk_buff * skb,ktime_t now)293 static void cobalt_set_enqueue_time(struct sk_buff *skb,
294 				    ktime_t now)
295 {
296 	get_cobalt_cb(skb)->enqueue_time = now;
297 }
298 
299 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
300 
301 /* Diffserv lookup tables */
302 
303 static const u8 precedence[] = {
304 	0, 0, 0, 0, 0, 0, 0, 0,
305 	1, 1, 1, 1, 1, 1, 1, 1,
306 	2, 2, 2, 2, 2, 2, 2, 2,
307 	3, 3, 3, 3, 3, 3, 3, 3,
308 	4, 4, 4, 4, 4, 4, 4, 4,
309 	5, 5, 5, 5, 5, 5, 5, 5,
310 	6, 6, 6, 6, 6, 6, 6, 6,
311 	7, 7, 7, 7, 7, 7, 7, 7,
312 };
313 
314 static const u8 diffserv8[] = {
315 	2, 0, 1, 2, 4, 2, 2, 2,
316 	1, 2, 1, 2, 1, 2, 1, 2,
317 	5, 2, 4, 2, 4, 2, 4, 2,
318 	3, 2, 3, 2, 3, 2, 3, 2,
319 	6, 2, 3, 2, 3, 2, 3, 2,
320 	6, 2, 2, 2, 6, 2, 6, 2,
321 	7, 2, 2, 2, 2, 2, 2, 2,
322 	7, 2, 2, 2, 2, 2, 2, 2,
323 };
324 
325 static const u8 diffserv4[] = {
326 	0, 1, 0, 0, 2, 0, 0, 0,
327 	1, 0, 0, 0, 0, 0, 0, 0,
328 	2, 0, 2, 0, 2, 0, 2, 0,
329 	2, 0, 2, 0, 2, 0, 2, 0,
330 	3, 0, 2, 0, 2, 0, 2, 0,
331 	3, 0, 0, 0, 3, 0, 3, 0,
332 	3, 0, 0, 0, 0, 0, 0, 0,
333 	3, 0, 0, 0, 0, 0, 0, 0,
334 };
335 
336 static const u8 diffserv3[] = {
337 	0, 1, 0, 0, 2, 0, 0, 0,
338 	1, 0, 0, 0, 0, 0, 0, 0,
339 	0, 0, 0, 0, 0, 0, 0, 0,
340 	0, 0, 0, 0, 0, 0, 0, 0,
341 	0, 0, 0, 0, 0, 0, 0, 0,
342 	0, 0, 0, 0, 2, 0, 2, 0,
343 	2, 0, 0, 0, 0, 0, 0, 0,
344 	2, 0, 0, 0, 0, 0, 0, 0,
345 };
346 
347 static const u8 besteffort[] = {
348 	0, 0, 0, 0, 0, 0, 0, 0,
349 	0, 0, 0, 0, 0, 0, 0, 0,
350 	0, 0, 0, 0, 0, 0, 0, 0,
351 	0, 0, 0, 0, 0, 0, 0, 0,
352 	0, 0, 0, 0, 0, 0, 0, 0,
353 	0, 0, 0, 0, 0, 0, 0, 0,
354 	0, 0, 0, 0, 0, 0, 0, 0,
355 	0, 0, 0, 0, 0, 0, 0, 0,
356 };
357 
358 /* tin priority order for stats dumping */
359 
360 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
361 static const u8 bulk_order[] = {1, 0, 2, 3};
362 
363 #define REC_INV_SQRT_CACHE (16)
364 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
365 
366 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
367  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
368  *
369  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
370  */
371 
cobalt_newton_step(struct cobalt_vars * vars)372 static void cobalt_newton_step(struct cobalt_vars *vars)
373 {
374 	u32 invsqrt, invsqrt2;
375 	u64 val;
376 
377 	invsqrt = vars->rec_inv_sqrt;
378 	invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
379 	val = (3LL << 32) - ((u64)vars->count * invsqrt2);
380 
381 	val >>= 2; /* avoid overflow in following multiply */
382 	val = (val * invsqrt) >> (32 - 2 + 1);
383 
384 	vars->rec_inv_sqrt = val;
385 }
386 
cobalt_invsqrt(struct cobalt_vars * vars)387 static void cobalt_invsqrt(struct cobalt_vars *vars)
388 {
389 	if (vars->count < REC_INV_SQRT_CACHE)
390 		vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
391 	else
392 		cobalt_newton_step(vars);
393 }
394 
395 /* There is a big difference in timing between the accurate values placed in
396  * the cache and the approximations given by a single Newton step for small
397  * count values, particularly when stepping from count 1 to 2 or vice versa.
398  * Above 16, a single Newton step gives sufficient accuracy in either
399  * direction, given the precision stored.
400  *
401  * The magnitude of the error when stepping up to count 2 is such as to give
402  * the value that *should* have been produced at count 4.
403  */
404 
cobalt_cache_init(void)405 static void cobalt_cache_init(void)
406 {
407 	struct cobalt_vars v;
408 
409 	memset(&v, 0, sizeof(v));
410 	v.rec_inv_sqrt = ~0U;
411 	cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
412 
413 	for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
414 		cobalt_newton_step(&v);
415 		cobalt_newton_step(&v);
416 		cobalt_newton_step(&v);
417 		cobalt_newton_step(&v);
418 
419 		cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
420 	}
421 }
422 
cobalt_vars_init(struct cobalt_vars * vars)423 static void cobalt_vars_init(struct cobalt_vars *vars)
424 {
425 	memset(vars, 0, sizeof(*vars));
426 
427 	if (!cobalt_rec_inv_sqrt_cache[0]) {
428 		cobalt_cache_init();
429 		cobalt_rec_inv_sqrt_cache[0] = ~0;
430 	}
431 }
432 
433 /* CoDel control_law is t + interval/sqrt(count)
434  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
435  * both sqrt() and divide operation.
436  */
cobalt_control(ktime_t t,u64 interval,u32 rec_inv_sqrt)437 static ktime_t cobalt_control(ktime_t t,
438 			      u64 interval,
439 			      u32 rec_inv_sqrt)
440 {
441 	return ktime_add_ns(t, reciprocal_scale(interval,
442 						rec_inv_sqrt));
443 }
444 
445 /* Call this when a packet had to be dropped due to queue overflow.  Returns
446  * true if the BLUE state was quiescent before but active after this call.
447  */
cobalt_queue_full(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)448 static bool cobalt_queue_full(struct cobalt_vars *vars,
449 			      struct cobalt_params *p,
450 			      ktime_t now)
451 {
452 	bool up = false;
453 
454 	if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
455 		up = !vars->p_drop;
456 		vars->p_drop += p->p_inc;
457 		if (vars->p_drop < p->p_inc)
458 			vars->p_drop = ~0;
459 		vars->blue_timer = now;
460 	}
461 	vars->dropping = true;
462 	vars->drop_next = now;
463 	if (!vars->count)
464 		vars->count = 1;
465 
466 	return up;
467 }
468 
469 /* Call this when the queue was serviced but turned out to be empty.  Returns
470  * true if the BLUE state was active before but quiescent after this call.
471  */
cobalt_queue_empty(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now)472 static bool cobalt_queue_empty(struct cobalt_vars *vars,
473 			       struct cobalt_params *p,
474 			       ktime_t now)
475 {
476 	bool down = false;
477 
478 	if (vars->p_drop &&
479 	    ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
480 		if (vars->p_drop < p->p_dec)
481 			vars->p_drop = 0;
482 		else
483 			vars->p_drop -= p->p_dec;
484 		vars->blue_timer = now;
485 		down = !vars->p_drop;
486 	}
487 	vars->dropping = false;
488 
489 	if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
490 		vars->count--;
491 		cobalt_invsqrt(vars);
492 		vars->drop_next = cobalt_control(vars->drop_next,
493 						 p->interval,
494 						 vars->rec_inv_sqrt);
495 	}
496 
497 	return down;
498 }
499 
500 /* Call this with a freshly dequeued packet for possible congestion marking.
501  * Returns true as an instruction to drop the packet, false for delivery.
502  */
cobalt_should_drop(struct cobalt_vars * vars,struct cobalt_params * p,ktime_t now,struct sk_buff * skb,u32 bulk_flows)503 static bool cobalt_should_drop(struct cobalt_vars *vars,
504 			       struct cobalt_params *p,
505 			       ktime_t now,
506 			       struct sk_buff *skb,
507 			       u32 bulk_flows)
508 {
509 	bool next_due, over_target, drop = false;
510 	ktime_t schedule;
511 	u64 sojourn;
512 
513 /* The 'schedule' variable records, in its sign, whether 'now' is before or
514  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
515  * scheduling decision is actually branched, without destroying that
516  * information.  Similarly, the first 'schedule' value calculated is preserved
517  * in the boolean 'next_due'.
518  *
519  * As for 'drop_next', we take advantage of the fact that 'interval' is both
520  * the delay between first exceeding 'target' and the first signalling event,
521  * *and* the scaling factor for the signalling frequency.  It's therefore very
522  * natural to use a single mechanism for both purposes, and eliminates a
523  * significant amount of reference Codel's spaghetti code.  To help with this,
524  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
525  * as possible to 1.0 in fixed-point.
526  */
527 
528 	sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
529 	schedule = ktime_sub(now, vars->drop_next);
530 	over_target = sojourn > p->target &&
531 		      sojourn > p->mtu_time * bulk_flows * 2 &&
532 		      sojourn > p->mtu_time * 4;
533 	next_due = vars->count && ktime_to_ns(schedule) >= 0;
534 
535 	vars->ecn_marked = false;
536 
537 	if (over_target) {
538 		if (!vars->dropping) {
539 			vars->dropping = true;
540 			vars->drop_next = cobalt_control(now,
541 							 p->interval,
542 							 vars->rec_inv_sqrt);
543 		}
544 		if (!vars->count)
545 			vars->count = 1;
546 	} else if (vars->dropping) {
547 		vars->dropping = false;
548 	}
549 
550 	if (next_due && vars->dropping) {
551 		/* Use ECN mark if possible, otherwise drop */
552 		drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
553 
554 		vars->count++;
555 		if (!vars->count)
556 			vars->count--;
557 		cobalt_invsqrt(vars);
558 		vars->drop_next = cobalt_control(vars->drop_next,
559 						 p->interval,
560 						 vars->rec_inv_sqrt);
561 		schedule = ktime_sub(now, vars->drop_next);
562 	} else {
563 		while (next_due) {
564 			vars->count--;
565 			cobalt_invsqrt(vars);
566 			vars->drop_next = cobalt_control(vars->drop_next,
567 							 p->interval,
568 							 vars->rec_inv_sqrt);
569 			schedule = ktime_sub(now, vars->drop_next);
570 			next_due = vars->count && ktime_to_ns(schedule) >= 0;
571 		}
572 	}
573 
574 	/* Simple BLUE implementation.  Lack of ECN is deliberate. */
575 	if (vars->p_drop)
576 		drop |= (prandom_u32() < vars->p_drop);
577 
578 	/* Overload the drop_next field as an activity timeout */
579 	if (!vars->count)
580 		vars->drop_next = ktime_add_ns(now, p->interval);
581 	else if (ktime_to_ns(schedule) > 0 && !drop)
582 		vars->drop_next = now;
583 
584 	return drop;
585 }
586 
cake_update_flowkeys(struct flow_keys * keys,const struct sk_buff * skb)587 static bool cake_update_flowkeys(struct flow_keys *keys,
588 				 const struct sk_buff *skb)
589 {
590 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
591 	struct nf_conntrack_tuple tuple = {};
592 	bool rev = !skb->_nfct, upd = false;
593 	__be32 ip;
594 
595 	if (skb_protocol(skb, true) != htons(ETH_P_IP))
596 		return false;
597 
598 	if (!nf_ct_get_tuple_skb(&tuple, skb))
599 		return false;
600 
601 	ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
602 	if (ip != keys->addrs.v4addrs.src) {
603 		keys->addrs.v4addrs.src = ip;
604 		upd = true;
605 	}
606 	ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
607 	if (ip != keys->addrs.v4addrs.dst) {
608 		keys->addrs.v4addrs.dst = ip;
609 		upd = true;
610 	}
611 
612 	if (keys->ports.ports) {
613 		__be16 port;
614 
615 		port = rev ? tuple.dst.u.all : tuple.src.u.all;
616 		if (port != keys->ports.src) {
617 			keys->ports.src = port;
618 			upd = true;
619 		}
620 		port = rev ? tuple.src.u.all : tuple.dst.u.all;
621 		if (port != keys->ports.dst) {
622 			port = keys->ports.dst;
623 			upd = true;
624 		}
625 	}
626 	return upd;
627 #else
628 	return false;
629 #endif
630 }
631 
632 /* Cake has several subtle multiple bit settings. In these cases you
633  *  would be matching triple isolate mode as well.
634  */
635 
cake_dsrc(int flow_mode)636 static bool cake_dsrc(int flow_mode)
637 {
638 	return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
639 }
640 
cake_ddst(int flow_mode)641 static bool cake_ddst(int flow_mode)
642 {
643 	return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
644 }
645 
cake_hash(struct cake_tin_data * q,const struct sk_buff * skb,int flow_mode,u16 flow_override,u16 host_override)646 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
647 		     int flow_mode, u16 flow_override, u16 host_override)
648 {
649 	bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
650 	bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
651 	bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
652 	u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
653 	u16 reduced_hash, srchost_idx, dsthost_idx;
654 	struct flow_keys keys, host_keys;
655 	bool use_skbhash = skb->l4_hash;
656 
657 	if (unlikely(flow_mode == CAKE_FLOW_NONE))
658 		return 0;
659 
660 	/* If both overrides are set, or we can use the SKB hash and nat mode is
661 	 * disabled, we can skip packet dissection entirely. If nat mode is
662 	 * enabled there's another check below after doing the conntrack lookup.
663 	 */
664 	if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
665 		goto skip_hash;
666 
667 	skb_flow_dissect_flow_keys(skb, &keys,
668 				   FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
669 
670 	/* Don't use the SKB hash if we change the lookup keys from conntrack */
671 	if (nat_enabled && cake_update_flowkeys(&keys, skb))
672 		use_skbhash = false;
673 
674 	/* If we can still use the SKB hash and don't need the host hash, we can
675 	 * skip the rest of the hashing procedure
676 	 */
677 	if (use_skbhash && !hash_hosts)
678 		goto skip_hash;
679 
680 	/* flow_hash_from_keys() sorts the addresses by value, so we have
681 	 * to preserve their order in a separate data structure to treat
682 	 * src and dst host addresses as independently selectable.
683 	 */
684 	host_keys = keys;
685 	host_keys.ports.ports     = 0;
686 	host_keys.basic.ip_proto  = 0;
687 	host_keys.keyid.keyid     = 0;
688 	host_keys.tags.flow_label = 0;
689 
690 	switch (host_keys.control.addr_type) {
691 	case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
692 		host_keys.addrs.v4addrs.src = 0;
693 		dsthost_hash = flow_hash_from_keys(&host_keys);
694 		host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
695 		host_keys.addrs.v4addrs.dst = 0;
696 		srchost_hash = flow_hash_from_keys(&host_keys);
697 		break;
698 
699 	case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
700 		memset(&host_keys.addrs.v6addrs.src, 0,
701 		       sizeof(host_keys.addrs.v6addrs.src));
702 		dsthost_hash = flow_hash_from_keys(&host_keys);
703 		host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
704 		memset(&host_keys.addrs.v6addrs.dst, 0,
705 		       sizeof(host_keys.addrs.v6addrs.dst));
706 		srchost_hash = flow_hash_from_keys(&host_keys);
707 		break;
708 
709 	default:
710 		dsthost_hash = 0;
711 		srchost_hash = 0;
712 	}
713 
714 	/* This *must* be after the above switch, since as a
715 	 * side-effect it sorts the src and dst addresses.
716 	 */
717 	if (hash_flows && !use_skbhash)
718 		flow_hash = flow_hash_from_keys(&keys);
719 
720 skip_hash:
721 	if (flow_override)
722 		flow_hash = flow_override - 1;
723 	else if (use_skbhash)
724 		flow_hash = skb->hash;
725 	if (host_override) {
726 		dsthost_hash = host_override - 1;
727 		srchost_hash = host_override - 1;
728 	}
729 
730 	if (!(flow_mode & CAKE_FLOW_FLOWS)) {
731 		if (flow_mode & CAKE_FLOW_SRC_IP)
732 			flow_hash ^= srchost_hash;
733 
734 		if (flow_mode & CAKE_FLOW_DST_IP)
735 			flow_hash ^= dsthost_hash;
736 	}
737 
738 	reduced_hash = flow_hash % CAKE_QUEUES;
739 
740 	/* set-associative hashing */
741 	/* fast path if no hash collision (direct lookup succeeds) */
742 	if (likely(q->tags[reduced_hash] == flow_hash &&
743 		   q->flows[reduced_hash].set)) {
744 		q->way_directs++;
745 	} else {
746 		u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
747 		u32 outer_hash = reduced_hash - inner_hash;
748 		bool allocate_src = false;
749 		bool allocate_dst = false;
750 		u32 i, k;
751 
752 		/* check if any active queue in the set is reserved for
753 		 * this flow.
754 		 */
755 		for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
756 		     i++, k = (k + 1) % CAKE_SET_WAYS) {
757 			if (q->tags[outer_hash + k] == flow_hash) {
758 				if (i)
759 					q->way_hits++;
760 
761 				if (!q->flows[outer_hash + k].set) {
762 					/* need to increment host refcnts */
763 					allocate_src = cake_dsrc(flow_mode);
764 					allocate_dst = cake_ddst(flow_mode);
765 				}
766 
767 				goto found;
768 			}
769 		}
770 
771 		/* no queue is reserved for this flow, look for an
772 		 * empty one.
773 		 */
774 		for (i = 0; i < CAKE_SET_WAYS;
775 			 i++, k = (k + 1) % CAKE_SET_WAYS) {
776 			if (!q->flows[outer_hash + k].set) {
777 				q->way_misses++;
778 				allocate_src = cake_dsrc(flow_mode);
779 				allocate_dst = cake_ddst(flow_mode);
780 				goto found;
781 			}
782 		}
783 
784 		/* With no empty queues, default to the original
785 		 * queue, accept the collision, update the host tags.
786 		 */
787 		q->way_collisions++;
788 		if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
789 			q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
790 			q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
791 		}
792 		allocate_src = cake_dsrc(flow_mode);
793 		allocate_dst = cake_ddst(flow_mode);
794 found:
795 		/* reserve queue for future packets in same flow */
796 		reduced_hash = outer_hash + k;
797 		q->tags[reduced_hash] = flow_hash;
798 
799 		if (allocate_src) {
800 			srchost_idx = srchost_hash % CAKE_QUEUES;
801 			inner_hash = srchost_idx % CAKE_SET_WAYS;
802 			outer_hash = srchost_idx - inner_hash;
803 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
804 				i++, k = (k + 1) % CAKE_SET_WAYS) {
805 				if (q->hosts[outer_hash + k].srchost_tag ==
806 				    srchost_hash)
807 					goto found_src;
808 			}
809 			for (i = 0; i < CAKE_SET_WAYS;
810 				i++, k = (k + 1) % CAKE_SET_WAYS) {
811 				if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
812 					break;
813 			}
814 			q->hosts[outer_hash + k].srchost_tag = srchost_hash;
815 found_src:
816 			srchost_idx = outer_hash + k;
817 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
818 				q->hosts[srchost_idx].srchost_bulk_flow_count++;
819 			q->flows[reduced_hash].srchost = srchost_idx;
820 		}
821 
822 		if (allocate_dst) {
823 			dsthost_idx = dsthost_hash % CAKE_QUEUES;
824 			inner_hash = dsthost_idx % CAKE_SET_WAYS;
825 			outer_hash = dsthost_idx - inner_hash;
826 			for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
827 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
828 				if (q->hosts[outer_hash + k].dsthost_tag ==
829 				    dsthost_hash)
830 					goto found_dst;
831 			}
832 			for (i = 0; i < CAKE_SET_WAYS;
833 			     i++, k = (k + 1) % CAKE_SET_WAYS) {
834 				if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
835 					break;
836 			}
837 			q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
838 found_dst:
839 			dsthost_idx = outer_hash + k;
840 			if (q->flows[reduced_hash].set == CAKE_SET_BULK)
841 				q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
842 			q->flows[reduced_hash].dsthost = dsthost_idx;
843 		}
844 	}
845 
846 	return reduced_hash;
847 }
848 
849 /* helper functions : might be changed when/if skb use a standard list_head */
850 /* remove one skb from head of slot queue */
851 
dequeue_head(struct cake_flow * flow)852 static struct sk_buff *dequeue_head(struct cake_flow *flow)
853 {
854 	struct sk_buff *skb = flow->head;
855 
856 	if (skb) {
857 		flow->head = skb->next;
858 		skb_mark_not_on_list(skb);
859 	}
860 
861 	return skb;
862 }
863 
864 /* add skb to flow queue (tail add) */
865 
flow_queue_add(struct cake_flow * flow,struct sk_buff * skb)866 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
867 {
868 	if (!flow->head)
869 		flow->head = skb;
870 	else
871 		flow->tail->next = skb;
872 	flow->tail = skb;
873 	skb->next = NULL;
874 }
875 
cake_get_iphdr(const struct sk_buff * skb,struct ipv6hdr * buf)876 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
877 				    struct ipv6hdr *buf)
878 {
879 	unsigned int offset = skb_network_offset(skb);
880 	struct iphdr *iph;
881 
882 	iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
883 
884 	if (!iph)
885 		return NULL;
886 
887 	if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
888 		return skb_header_pointer(skb, offset + iph->ihl * 4,
889 					  sizeof(struct ipv6hdr), buf);
890 
891 	else if (iph->version == 4)
892 		return iph;
893 
894 	else if (iph->version == 6)
895 		return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
896 					  buf);
897 
898 	return NULL;
899 }
900 
cake_get_tcphdr(const struct sk_buff * skb,void * buf,unsigned int bufsize)901 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
902 				      void *buf, unsigned int bufsize)
903 {
904 	unsigned int offset = skb_network_offset(skb);
905 	const struct ipv6hdr *ipv6h;
906 	const struct tcphdr *tcph;
907 	const struct iphdr *iph;
908 	struct ipv6hdr _ipv6h;
909 	struct tcphdr _tcph;
910 
911 	ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
912 
913 	if (!ipv6h)
914 		return NULL;
915 
916 	if (ipv6h->version == 4) {
917 		iph = (struct iphdr *)ipv6h;
918 		offset += iph->ihl * 4;
919 
920 		/* special-case 6in4 tunnelling, as that is a common way to get
921 		 * v6 connectivity in the home
922 		 */
923 		if (iph->protocol == IPPROTO_IPV6) {
924 			ipv6h = skb_header_pointer(skb, offset,
925 						   sizeof(_ipv6h), &_ipv6h);
926 
927 			if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
928 				return NULL;
929 
930 			offset += sizeof(struct ipv6hdr);
931 
932 		} else if (iph->protocol != IPPROTO_TCP) {
933 			return NULL;
934 		}
935 
936 	} else if (ipv6h->version == 6) {
937 		if (ipv6h->nexthdr != IPPROTO_TCP)
938 			return NULL;
939 
940 		offset += sizeof(struct ipv6hdr);
941 	} else {
942 		return NULL;
943 	}
944 
945 	tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
946 	if (!tcph)
947 		return NULL;
948 
949 	return skb_header_pointer(skb, offset,
950 				  min(__tcp_hdrlen(tcph), bufsize), buf);
951 }
952 
cake_get_tcpopt(const struct tcphdr * tcph,int code,int * oplen)953 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
954 				   int code, int *oplen)
955 {
956 	/* inspired by tcp_parse_options in tcp_input.c */
957 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
958 	const u8 *ptr = (const u8 *)(tcph + 1);
959 
960 	while (length > 0) {
961 		int opcode = *ptr++;
962 		int opsize;
963 
964 		if (opcode == TCPOPT_EOL)
965 			break;
966 		if (opcode == TCPOPT_NOP) {
967 			length--;
968 			continue;
969 		}
970 		opsize = *ptr++;
971 		if (opsize < 2 || opsize > length)
972 			break;
973 
974 		if (opcode == code) {
975 			*oplen = opsize;
976 			return ptr;
977 		}
978 
979 		ptr += opsize - 2;
980 		length -= opsize;
981 	}
982 
983 	return NULL;
984 }
985 
986 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
987  * bytes than the other. In the case where both sequences ACKs bytes that the
988  * other doesn't, A is considered greater. DSACKs in A also makes A be
989  * considered greater.
990  *
991  * @return -1, 0 or 1 as normal compare functions
992  */
cake_tcph_sack_compare(const struct tcphdr * tcph_a,const struct tcphdr * tcph_b)993 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
994 				  const struct tcphdr *tcph_b)
995 {
996 	const struct tcp_sack_block_wire *sack_a, *sack_b;
997 	u32 ack_seq_a = ntohl(tcph_a->ack_seq);
998 	u32 bytes_a = 0, bytes_b = 0;
999 	int oplen_a, oplen_b;
1000 	bool first = true;
1001 
1002 	sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1003 	sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1004 
1005 	/* pointers point to option contents */
1006 	oplen_a -= TCPOLEN_SACK_BASE;
1007 	oplen_b -= TCPOLEN_SACK_BASE;
1008 
1009 	if (sack_a && oplen_a >= sizeof(*sack_a) &&
1010 	    (!sack_b || oplen_b < sizeof(*sack_b)))
1011 		return -1;
1012 	else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1013 		 (!sack_a || oplen_a < sizeof(*sack_a)))
1014 		return 1;
1015 	else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1016 		 (!sack_b || oplen_b < sizeof(*sack_b)))
1017 		return 0;
1018 
1019 	while (oplen_a >= sizeof(*sack_a)) {
1020 		const struct tcp_sack_block_wire *sack_tmp = sack_b;
1021 		u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1022 		u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1023 		int oplen_tmp = oplen_b;
1024 		bool found = false;
1025 
1026 		/* DSACK; always considered greater to prevent dropping */
1027 		if (before(start_a, ack_seq_a))
1028 			return -1;
1029 
1030 		bytes_a += end_a - start_a;
1031 
1032 		while (oplen_tmp >= sizeof(*sack_tmp)) {
1033 			u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1034 			u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1035 
1036 			/* first time through we count the total size */
1037 			if (first)
1038 				bytes_b += end_b - start_b;
1039 
1040 			if (!after(start_b, start_a) && !before(end_b, end_a)) {
1041 				found = true;
1042 				if (!first)
1043 					break;
1044 			}
1045 			oplen_tmp -= sizeof(*sack_tmp);
1046 			sack_tmp++;
1047 		}
1048 
1049 		if (!found)
1050 			return -1;
1051 
1052 		oplen_a -= sizeof(*sack_a);
1053 		sack_a++;
1054 		first = false;
1055 	}
1056 
1057 	/* If we made it this far, all ranges SACKed by A are covered by B, so
1058 	 * either the SACKs are equal, or B SACKs more bytes.
1059 	 */
1060 	return bytes_b > bytes_a ? 1 : 0;
1061 }
1062 
cake_tcph_get_tstamp(const struct tcphdr * tcph,u32 * tsval,u32 * tsecr)1063 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1064 				 u32 *tsval, u32 *tsecr)
1065 {
1066 	const u8 *ptr;
1067 	int opsize;
1068 
1069 	ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1070 
1071 	if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1072 		*tsval = get_unaligned_be32(ptr);
1073 		*tsecr = get_unaligned_be32(ptr + 4);
1074 	}
1075 }
1076 
cake_tcph_may_drop(const struct tcphdr * tcph,u32 tstamp_new,u32 tsecr_new)1077 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1078 			       u32 tstamp_new, u32 tsecr_new)
1079 {
1080 	/* inspired by tcp_parse_options in tcp_input.c */
1081 	int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1082 	const u8 *ptr = (const u8 *)(tcph + 1);
1083 	u32 tstamp, tsecr;
1084 
1085 	/* 3 reserved flags must be unset to avoid future breakage
1086 	 * ACK must be set
1087 	 * ECE/CWR are handled separately
1088 	 * All other flags URG/PSH/RST/SYN/FIN must be unset
1089 	 * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1090 	 * 0x00C00000 = CWR/ECE (handled separately)
1091 	 * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1092 	 */
1093 	if (((tcp_flag_word(tcph) &
1094 	      cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1095 		return false;
1096 
1097 	while (length > 0) {
1098 		int opcode = *ptr++;
1099 		int opsize;
1100 
1101 		if (opcode == TCPOPT_EOL)
1102 			break;
1103 		if (opcode == TCPOPT_NOP) {
1104 			length--;
1105 			continue;
1106 		}
1107 		opsize = *ptr++;
1108 		if (opsize < 2 || opsize > length)
1109 			break;
1110 
1111 		switch (opcode) {
1112 		case TCPOPT_MD5SIG: /* doesn't influence state */
1113 			break;
1114 
1115 		case TCPOPT_SACK: /* stricter checking performed later */
1116 			if (opsize % 8 != 2)
1117 				return false;
1118 			break;
1119 
1120 		case TCPOPT_TIMESTAMP:
1121 			/* only drop timestamps lower than new */
1122 			if (opsize != TCPOLEN_TIMESTAMP)
1123 				return false;
1124 			tstamp = get_unaligned_be32(ptr);
1125 			tsecr = get_unaligned_be32(ptr + 4);
1126 			if (after(tstamp, tstamp_new) ||
1127 			    after(tsecr, tsecr_new))
1128 				return false;
1129 			break;
1130 
1131 		case TCPOPT_MSS:  /* these should only be set on SYN */
1132 		case TCPOPT_WINDOW:
1133 		case TCPOPT_SACK_PERM:
1134 		case TCPOPT_FASTOPEN:
1135 		case TCPOPT_EXP:
1136 		default: /* don't drop if any unknown options are present */
1137 			return false;
1138 		}
1139 
1140 		ptr += opsize - 2;
1141 		length -= opsize;
1142 	}
1143 
1144 	return true;
1145 }
1146 
cake_ack_filter(struct cake_sched_data * q,struct cake_flow * flow)1147 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1148 				       struct cake_flow *flow)
1149 {
1150 	bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1151 	struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1152 	struct sk_buff *skb_check, *skb_prev = NULL;
1153 	const struct ipv6hdr *ipv6h, *ipv6h_check;
1154 	unsigned char _tcph[64], _tcph_check[64];
1155 	const struct tcphdr *tcph, *tcph_check;
1156 	const struct iphdr *iph, *iph_check;
1157 	struct ipv6hdr _iph, _iph_check;
1158 	const struct sk_buff *skb;
1159 	int seglen, num_found = 0;
1160 	u32 tstamp = 0, tsecr = 0;
1161 	__be32 elig_flags = 0;
1162 	int sack_comp;
1163 
1164 	/* no other possible ACKs to filter */
1165 	if (flow->head == flow->tail)
1166 		return NULL;
1167 
1168 	skb = flow->tail;
1169 	tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1170 	iph = cake_get_iphdr(skb, &_iph);
1171 	if (!tcph)
1172 		return NULL;
1173 
1174 	cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1175 
1176 	/* the 'triggering' packet need only have the ACK flag set.
1177 	 * also check that SYN is not set, as there won't be any previous ACKs.
1178 	 */
1179 	if ((tcp_flag_word(tcph) &
1180 	     (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1181 		return NULL;
1182 
1183 	/* the 'triggering' ACK is at the tail of the queue, we have already
1184 	 * returned if it is the only packet in the flow. loop through the rest
1185 	 * of the queue looking for pure ACKs with the same 5-tuple as the
1186 	 * triggering one.
1187 	 */
1188 	for (skb_check = flow->head;
1189 	     skb_check && skb_check != skb;
1190 	     skb_prev = skb_check, skb_check = skb_check->next) {
1191 		iph_check = cake_get_iphdr(skb_check, &_iph_check);
1192 		tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1193 					     sizeof(_tcph_check));
1194 
1195 		/* only TCP packets with matching 5-tuple are eligible, and only
1196 		 * drop safe headers
1197 		 */
1198 		if (!tcph_check || iph->version != iph_check->version ||
1199 		    tcph_check->source != tcph->source ||
1200 		    tcph_check->dest != tcph->dest)
1201 			continue;
1202 
1203 		if (iph_check->version == 4) {
1204 			if (iph_check->saddr != iph->saddr ||
1205 			    iph_check->daddr != iph->daddr)
1206 				continue;
1207 
1208 			seglen = ntohs(iph_check->tot_len) -
1209 				       (4 * iph_check->ihl);
1210 		} else if (iph_check->version == 6) {
1211 			ipv6h = (struct ipv6hdr *)iph;
1212 			ipv6h_check = (struct ipv6hdr *)iph_check;
1213 
1214 			if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1215 			    ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1216 				continue;
1217 
1218 			seglen = ntohs(ipv6h_check->payload_len);
1219 		} else {
1220 			WARN_ON(1);  /* shouldn't happen */
1221 			continue;
1222 		}
1223 
1224 		/* If the ECE/CWR flags changed from the previous eligible
1225 		 * packet in the same flow, we should no longer be dropping that
1226 		 * previous packet as this would lose information.
1227 		 */
1228 		if (elig_ack && (tcp_flag_word(tcph_check) &
1229 				 (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1230 			elig_ack = NULL;
1231 			elig_ack_prev = NULL;
1232 			num_found--;
1233 		}
1234 
1235 		/* Check TCP options and flags, don't drop ACKs with segment
1236 		 * data, and don't drop ACKs with a higher cumulative ACK
1237 		 * counter than the triggering packet. Check ACK seqno here to
1238 		 * avoid parsing SACK options of packets we are going to exclude
1239 		 * anyway.
1240 		 */
1241 		if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1242 		    (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1243 		    after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1244 			continue;
1245 
1246 		/* Check SACK options. The triggering packet must SACK more data
1247 		 * than the ACK under consideration, or SACK the same range but
1248 		 * have a larger cumulative ACK counter. The latter is a
1249 		 * pathological case, but is contained in the following check
1250 		 * anyway, just to be safe.
1251 		 */
1252 		sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1253 
1254 		if (sack_comp < 0 ||
1255 		    (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1256 		     sack_comp == 0))
1257 			continue;
1258 
1259 		/* At this point we have found an eligible pure ACK to drop; if
1260 		 * we are in aggressive mode, we are done. Otherwise, keep
1261 		 * searching unless this is the second eligible ACK we
1262 		 * found.
1263 		 *
1264 		 * Since we want to drop ACK closest to the head of the queue,
1265 		 * save the first eligible ACK we find, even if we need to loop
1266 		 * again.
1267 		 */
1268 		if (!elig_ack) {
1269 			elig_ack = skb_check;
1270 			elig_ack_prev = skb_prev;
1271 			elig_flags = (tcp_flag_word(tcph_check)
1272 				      & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1273 		}
1274 
1275 		if (num_found++ > 0)
1276 			goto found;
1277 	}
1278 
1279 	/* We made it through the queue without finding two eligible ACKs . If
1280 	 * we found a single eligible ACK we can drop it in aggressive mode if
1281 	 * we can guarantee that this does not interfere with ECN flag
1282 	 * information. We ensure this by dropping it only if the enqueued
1283 	 * packet is consecutive with the eligible ACK, and their flags match.
1284 	 */
1285 	if (elig_ack && aggressive && elig_ack->next == skb &&
1286 	    (elig_flags == (tcp_flag_word(tcph) &
1287 			    (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1288 		goto found;
1289 
1290 	return NULL;
1291 
1292 found:
1293 	if (elig_ack_prev)
1294 		elig_ack_prev->next = elig_ack->next;
1295 	else
1296 		flow->head = elig_ack->next;
1297 
1298 	skb_mark_not_on_list(elig_ack);
1299 
1300 	return elig_ack;
1301 }
1302 
cake_ewma(u64 avg,u64 sample,u32 shift)1303 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1304 {
1305 	avg -= avg >> shift;
1306 	avg += sample >> shift;
1307 	return avg;
1308 }
1309 
cake_calc_overhead(struct cake_sched_data * q,u32 len,u32 off)1310 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1311 {
1312 	if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1313 		len -= off;
1314 
1315 	if (q->max_netlen < len)
1316 		q->max_netlen = len;
1317 	if (q->min_netlen > len)
1318 		q->min_netlen = len;
1319 
1320 	len += q->rate_overhead;
1321 
1322 	if (len < q->rate_mpu)
1323 		len = q->rate_mpu;
1324 
1325 	if (q->atm_mode == CAKE_ATM_ATM) {
1326 		len += 47;
1327 		len /= 48;
1328 		len *= 53;
1329 	} else if (q->atm_mode == CAKE_ATM_PTM) {
1330 		/* Add one byte per 64 bytes or part thereof.
1331 		 * This is conservative and easier to calculate than the
1332 		 * precise value.
1333 		 */
1334 		len += (len + 63) / 64;
1335 	}
1336 
1337 	if (q->max_adjlen < len)
1338 		q->max_adjlen = len;
1339 	if (q->min_adjlen > len)
1340 		q->min_adjlen = len;
1341 
1342 	return len;
1343 }
1344 
cake_overhead(struct cake_sched_data * q,const struct sk_buff * skb)1345 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1346 {
1347 	const struct skb_shared_info *shinfo = skb_shinfo(skb);
1348 	unsigned int hdr_len, last_len = 0;
1349 	u32 off = skb_network_offset(skb);
1350 	u32 len = qdisc_pkt_len(skb);
1351 	u16 segs = 1;
1352 
1353 	q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1354 
1355 	if (!shinfo->gso_size)
1356 		return cake_calc_overhead(q, len, off);
1357 
1358 	/* borrowed from qdisc_pkt_len_init() */
1359 	hdr_len = skb_transport_header(skb) - skb_mac_header(skb);
1360 
1361 	/* + transport layer */
1362 	if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1363 						SKB_GSO_TCPV6))) {
1364 		const struct tcphdr *th;
1365 		struct tcphdr _tcphdr;
1366 
1367 		th = skb_header_pointer(skb, skb_transport_offset(skb),
1368 					sizeof(_tcphdr), &_tcphdr);
1369 		if (likely(th))
1370 			hdr_len += __tcp_hdrlen(th);
1371 	} else {
1372 		struct udphdr _udphdr;
1373 
1374 		if (skb_header_pointer(skb, skb_transport_offset(skb),
1375 				       sizeof(_udphdr), &_udphdr))
1376 			hdr_len += sizeof(struct udphdr);
1377 	}
1378 
1379 	if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1380 		segs = DIV_ROUND_UP(skb->len - hdr_len,
1381 				    shinfo->gso_size);
1382 	else
1383 		segs = shinfo->gso_segs;
1384 
1385 	len = shinfo->gso_size + hdr_len;
1386 	last_len = skb->len - shinfo->gso_size * (segs - 1);
1387 
1388 	return (cake_calc_overhead(q, len, off) * (segs - 1) +
1389 		cake_calc_overhead(q, last_len, off));
1390 }
1391 
cake_heap_swap(struct cake_sched_data * q,u16 i,u16 j)1392 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1393 {
1394 	struct cake_heap_entry ii = q->overflow_heap[i];
1395 	struct cake_heap_entry jj = q->overflow_heap[j];
1396 
1397 	q->overflow_heap[i] = jj;
1398 	q->overflow_heap[j] = ii;
1399 
1400 	q->tins[ii.t].overflow_idx[ii.b] = j;
1401 	q->tins[jj.t].overflow_idx[jj.b] = i;
1402 }
1403 
cake_heap_get_backlog(const struct cake_sched_data * q,u16 i)1404 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1405 {
1406 	struct cake_heap_entry ii = q->overflow_heap[i];
1407 
1408 	return q->tins[ii.t].backlogs[ii.b];
1409 }
1410 
cake_heapify(struct cake_sched_data * q,u16 i)1411 static void cake_heapify(struct cake_sched_data *q, u16 i)
1412 {
1413 	static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1414 	u32 mb = cake_heap_get_backlog(q, i);
1415 	u32 m = i;
1416 
1417 	while (m < a) {
1418 		u32 l = m + m + 1;
1419 		u32 r = l + 1;
1420 
1421 		if (l < a) {
1422 			u32 lb = cake_heap_get_backlog(q, l);
1423 
1424 			if (lb > mb) {
1425 				m  = l;
1426 				mb = lb;
1427 			}
1428 		}
1429 
1430 		if (r < a) {
1431 			u32 rb = cake_heap_get_backlog(q, r);
1432 
1433 			if (rb > mb) {
1434 				m  = r;
1435 				mb = rb;
1436 			}
1437 		}
1438 
1439 		if (m != i) {
1440 			cake_heap_swap(q, i, m);
1441 			i = m;
1442 		} else {
1443 			break;
1444 		}
1445 	}
1446 }
1447 
cake_heapify_up(struct cake_sched_data * q,u16 i)1448 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1449 {
1450 	while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1451 		u16 p = (i - 1) >> 1;
1452 		u32 ib = cake_heap_get_backlog(q, i);
1453 		u32 pb = cake_heap_get_backlog(q, p);
1454 
1455 		if (ib > pb) {
1456 			cake_heap_swap(q, i, p);
1457 			i = p;
1458 		} else {
1459 			break;
1460 		}
1461 	}
1462 }
1463 
cake_advance_shaper(struct cake_sched_data * q,struct cake_tin_data * b,struct sk_buff * skb,ktime_t now,bool drop)1464 static int cake_advance_shaper(struct cake_sched_data *q,
1465 			       struct cake_tin_data *b,
1466 			       struct sk_buff *skb,
1467 			       ktime_t now, bool drop)
1468 {
1469 	u32 len = get_cobalt_cb(skb)->adjusted_len;
1470 
1471 	/* charge packet bandwidth to this tin
1472 	 * and to the global shaper.
1473 	 */
1474 	if (q->rate_ns) {
1475 		u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1476 		u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1477 		u64 failsafe_dur = global_dur + (global_dur >> 1);
1478 
1479 		if (ktime_before(b->time_next_packet, now))
1480 			b->time_next_packet = ktime_add_ns(b->time_next_packet,
1481 							   tin_dur);
1482 
1483 		else if (ktime_before(b->time_next_packet,
1484 				      ktime_add_ns(now, tin_dur)))
1485 			b->time_next_packet = ktime_add_ns(now, tin_dur);
1486 
1487 		q->time_next_packet = ktime_add_ns(q->time_next_packet,
1488 						   global_dur);
1489 		if (!drop)
1490 			q->failsafe_next_packet = \
1491 				ktime_add_ns(q->failsafe_next_packet,
1492 					     failsafe_dur);
1493 	}
1494 	return len;
1495 }
1496 
cake_drop(struct Qdisc * sch,struct sk_buff ** to_free)1497 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1498 {
1499 	struct cake_sched_data *q = qdisc_priv(sch);
1500 	ktime_t now = ktime_get();
1501 	u32 idx = 0, tin = 0, len;
1502 	struct cake_heap_entry qq;
1503 	struct cake_tin_data *b;
1504 	struct cake_flow *flow;
1505 	struct sk_buff *skb;
1506 
1507 	if (!q->overflow_timeout) {
1508 		int i;
1509 		/* Build fresh max-heap */
1510 		for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1511 			cake_heapify(q, i);
1512 	}
1513 	q->overflow_timeout = 65535;
1514 
1515 	/* select longest queue for pruning */
1516 	qq  = q->overflow_heap[0];
1517 	tin = qq.t;
1518 	idx = qq.b;
1519 
1520 	b = &q->tins[tin];
1521 	flow = &b->flows[idx];
1522 	skb = dequeue_head(flow);
1523 	if (unlikely(!skb)) {
1524 		/* heap has gone wrong, rebuild it next time */
1525 		q->overflow_timeout = 0;
1526 		return idx + (tin << 16);
1527 	}
1528 
1529 	if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1530 		b->unresponsive_flow_count++;
1531 
1532 	len = qdisc_pkt_len(skb);
1533 	q->buffer_used      -= skb->truesize;
1534 	b->backlogs[idx]    -= len;
1535 	b->tin_backlog      -= len;
1536 	sch->qstats.backlog -= len;
1537 	qdisc_tree_reduce_backlog(sch, 1, len);
1538 
1539 	flow->dropped++;
1540 	b->tin_dropped++;
1541 	sch->qstats.drops++;
1542 
1543 	if (q->rate_flags & CAKE_FLAG_INGRESS)
1544 		cake_advance_shaper(q, b, skb, now, true);
1545 
1546 	__qdisc_drop(skb, to_free);
1547 	sch->q.qlen--;
1548 
1549 	cake_heapify(q, 0);
1550 
1551 	return idx + (tin << 16);
1552 }
1553 
cake_handle_diffserv(struct sk_buff * skb,bool wash)1554 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1555 {
1556 	const int offset = skb_network_offset(skb);
1557 	u16 *buf, buf_;
1558 	u8 dscp;
1559 
1560 	switch (skb_protocol(skb, true)) {
1561 	case htons(ETH_P_IP):
1562 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1563 		if (unlikely(!buf))
1564 			return 0;
1565 
1566 		/* ToS is in the second byte of iphdr */
1567 		dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1568 
1569 		if (wash && dscp) {
1570 			const int wlen = offset + sizeof(struct iphdr);
1571 
1572 			if (!pskb_may_pull(skb, wlen) ||
1573 			    skb_try_make_writable(skb, wlen))
1574 				return 0;
1575 
1576 			ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1577 		}
1578 
1579 		return dscp;
1580 
1581 	case htons(ETH_P_IPV6):
1582 		buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1583 		if (unlikely(!buf))
1584 			return 0;
1585 
1586 		/* Traffic class is in the first and second bytes of ipv6hdr */
1587 		dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1588 
1589 		if (wash && dscp) {
1590 			const int wlen = offset + sizeof(struct ipv6hdr);
1591 
1592 			if (!pskb_may_pull(skb, wlen) ||
1593 			    skb_try_make_writable(skb, wlen))
1594 				return 0;
1595 
1596 			ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1597 		}
1598 
1599 		return dscp;
1600 
1601 	case htons(ETH_P_ARP):
1602 		return 0x38;  /* CS7 - Net Control */
1603 
1604 	default:
1605 		/* If there is no Diffserv field, treat as best-effort */
1606 		return 0;
1607 	}
1608 }
1609 
cake_select_tin(struct Qdisc * sch,struct sk_buff * skb)1610 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1611 					     struct sk_buff *skb)
1612 {
1613 	struct cake_sched_data *q = qdisc_priv(sch);
1614 	u32 tin, mark;
1615 	bool wash;
1616 	u8 dscp;
1617 
1618 	/* Tin selection: Default to diffserv-based selection, allow overriding
1619 	 * using firewall marks or skb->priority. Call DSCP parsing early if
1620 	 * wash is enabled, otherwise defer to below to skip unneeded parsing.
1621 	 */
1622 	mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1623 	wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1624 	if (wash)
1625 		dscp = cake_handle_diffserv(skb, wash);
1626 
1627 	if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1628 		tin = 0;
1629 
1630 	else if (mark && mark <= q->tin_cnt)
1631 		tin = q->tin_order[mark - 1];
1632 
1633 	else if (TC_H_MAJ(skb->priority) == sch->handle &&
1634 		 TC_H_MIN(skb->priority) > 0 &&
1635 		 TC_H_MIN(skb->priority) <= q->tin_cnt)
1636 		tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1637 
1638 	else {
1639 		if (!wash)
1640 			dscp = cake_handle_diffserv(skb, wash);
1641 		tin = q->tin_index[dscp];
1642 
1643 		if (unlikely(tin >= q->tin_cnt))
1644 			tin = 0;
1645 	}
1646 
1647 	return &q->tins[tin];
1648 }
1649 
cake_classify(struct Qdisc * sch,struct cake_tin_data ** t,struct sk_buff * skb,int flow_mode,int * qerr)1650 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1651 			 struct sk_buff *skb, int flow_mode, int *qerr)
1652 {
1653 	struct cake_sched_data *q = qdisc_priv(sch);
1654 	struct tcf_proto *filter;
1655 	struct tcf_result res;
1656 	u16 flow = 0, host = 0;
1657 	int result;
1658 
1659 	filter = rcu_dereference_bh(q->filter_list);
1660 	if (!filter)
1661 		goto hash;
1662 
1663 	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1664 	result = tcf_classify(skb, filter, &res, false);
1665 
1666 	if (result >= 0) {
1667 #ifdef CONFIG_NET_CLS_ACT
1668 		switch (result) {
1669 		case TC_ACT_STOLEN:
1670 		case TC_ACT_QUEUED:
1671 		case TC_ACT_TRAP:
1672 			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1673 			fallthrough;
1674 		case TC_ACT_SHOT:
1675 			return 0;
1676 		}
1677 #endif
1678 		if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1679 			flow = TC_H_MIN(res.classid);
1680 		if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1681 			host = TC_H_MAJ(res.classid) >> 16;
1682 	}
1683 hash:
1684 	*t = cake_select_tin(sch, skb);
1685 	return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1686 }
1687 
1688 static void cake_reconfigure(struct Qdisc *sch);
1689 
cake_enqueue(struct sk_buff * skb,struct Qdisc * sch,struct sk_buff ** to_free)1690 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1691 			struct sk_buff **to_free)
1692 {
1693 	struct cake_sched_data *q = qdisc_priv(sch);
1694 	int len = qdisc_pkt_len(skb);
1695 	int ret;
1696 	struct sk_buff *ack = NULL;
1697 	ktime_t now = ktime_get();
1698 	struct cake_tin_data *b;
1699 	struct cake_flow *flow;
1700 	u32 idx;
1701 
1702 	/* choose flow to insert into */
1703 	idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1704 	if (idx == 0) {
1705 		if (ret & __NET_XMIT_BYPASS)
1706 			qdisc_qstats_drop(sch);
1707 		__qdisc_drop(skb, to_free);
1708 		return ret;
1709 	}
1710 	idx--;
1711 	flow = &b->flows[idx];
1712 
1713 	/* ensure shaper state isn't stale */
1714 	if (!b->tin_backlog) {
1715 		if (ktime_before(b->time_next_packet, now))
1716 			b->time_next_packet = now;
1717 
1718 		if (!sch->q.qlen) {
1719 			if (ktime_before(q->time_next_packet, now)) {
1720 				q->failsafe_next_packet = now;
1721 				q->time_next_packet = now;
1722 			} else if (ktime_after(q->time_next_packet, now) &&
1723 				   ktime_after(q->failsafe_next_packet, now)) {
1724 				u64 next = \
1725 					min(ktime_to_ns(q->time_next_packet),
1726 					    ktime_to_ns(
1727 						   q->failsafe_next_packet));
1728 				sch->qstats.overlimits++;
1729 				qdisc_watchdog_schedule_ns(&q->watchdog, next);
1730 			}
1731 		}
1732 	}
1733 
1734 	if (unlikely(len > b->max_skblen))
1735 		b->max_skblen = len;
1736 
1737 	if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1738 		struct sk_buff *segs, *nskb;
1739 		netdev_features_t features = netif_skb_features(skb);
1740 		unsigned int slen = 0, numsegs = 0;
1741 
1742 		segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1743 		if (IS_ERR_OR_NULL(segs))
1744 			return qdisc_drop(skb, sch, to_free);
1745 
1746 		skb_list_walk_safe(segs, segs, nskb) {
1747 			skb_mark_not_on_list(segs);
1748 			qdisc_skb_cb(segs)->pkt_len = segs->len;
1749 			cobalt_set_enqueue_time(segs, now);
1750 			get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1751 									  segs);
1752 			flow_queue_add(flow, segs);
1753 
1754 			sch->q.qlen++;
1755 			numsegs++;
1756 			slen += segs->len;
1757 			q->buffer_used += segs->truesize;
1758 			b->packets++;
1759 		}
1760 
1761 		/* stats */
1762 		b->bytes	    += slen;
1763 		b->backlogs[idx]    += slen;
1764 		b->tin_backlog      += slen;
1765 		sch->qstats.backlog += slen;
1766 		q->avg_window_bytes += slen;
1767 
1768 		qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
1769 		consume_skb(skb);
1770 	} else {
1771 		/* not splitting */
1772 		cobalt_set_enqueue_time(skb, now);
1773 		get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1774 		flow_queue_add(flow, skb);
1775 
1776 		if (q->ack_filter)
1777 			ack = cake_ack_filter(q, flow);
1778 
1779 		if (ack) {
1780 			b->ack_drops++;
1781 			sch->qstats.drops++;
1782 			b->bytes += qdisc_pkt_len(ack);
1783 			len -= qdisc_pkt_len(ack);
1784 			q->buffer_used += skb->truesize - ack->truesize;
1785 			if (q->rate_flags & CAKE_FLAG_INGRESS)
1786 				cake_advance_shaper(q, b, ack, now, true);
1787 
1788 			qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1789 			consume_skb(ack);
1790 		} else {
1791 			sch->q.qlen++;
1792 			q->buffer_used      += skb->truesize;
1793 		}
1794 
1795 		/* stats */
1796 		b->packets++;
1797 		b->bytes	    += len;
1798 		b->backlogs[idx]    += len;
1799 		b->tin_backlog      += len;
1800 		sch->qstats.backlog += len;
1801 		q->avg_window_bytes += len;
1802 	}
1803 
1804 	if (q->overflow_timeout)
1805 		cake_heapify_up(q, b->overflow_idx[idx]);
1806 
1807 	/* incoming bandwidth capacity estimate */
1808 	if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1809 		u64 packet_interval = \
1810 			ktime_to_ns(ktime_sub(now, q->last_packet_time));
1811 
1812 		if (packet_interval > NSEC_PER_SEC)
1813 			packet_interval = NSEC_PER_SEC;
1814 
1815 		/* filter out short-term bursts, eg. wifi aggregation */
1816 		q->avg_packet_interval = \
1817 			cake_ewma(q->avg_packet_interval,
1818 				  packet_interval,
1819 				  (packet_interval > q->avg_packet_interval ?
1820 					  2 : 8));
1821 
1822 		q->last_packet_time = now;
1823 
1824 		if (packet_interval > q->avg_packet_interval) {
1825 			u64 window_interval = \
1826 				ktime_to_ns(ktime_sub(now,
1827 						      q->avg_window_begin));
1828 			u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1829 
1830 			b = div64_u64(b, window_interval);
1831 			q->avg_peak_bandwidth =
1832 				cake_ewma(q->avg_peak_bandwidth, b,
1833 					  b > q->avg_peak_bandwidth ? 2 : 8);
1834 			q->avg_window_bytes = 0;
1835 			q->avg_window_begin = now;
1836 
1837 			if (ktime_after(now,
1838 					ktime_add_ms(q->last_reconfig_time,
1839 						     250))) {
1840 				q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1841 				cake_reconfigure(sch);
1842 			}
1843 		}
1844 	} else {
1845 		q->avg_window_bytes = 0;
1846 		q->last_packet_time = now;
1847 	}
1848 
1849 	/* flowchain */
1850 	if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1851 		struct cake_host *srchost = &b->hosts[flow->srchost];
1852 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1853 		u16 host_load = 1;
1854 
1855 		if (!flow->set) {
1856 			list_add_tail(&flow->flowchain, &b->new_flows);
1857 		} else {
1858 			b->decaying_flow_count--;
1859 			list_move_tail(&flow->flowchain, &b->new_flows);
1860 		}
1861 		flow->set = CAKE_SET_SPARSE;
1862 		b->sparse_flow_count++;
1863 
1864 		if (cake_dsrc(q->flow_mode))
1865 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
1866 
1867 		if (cake_ddst(q->flow_mode))
1868 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1869 
1870 		flow->deficit = (b->flow_quantum *
1871 				 quantum_div[host_load]) >> 16;
1872 	} else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1873 		struct cake_host *srchost = &b->hosts[flow->srchost];
1874 		struct cake_host *dsthost = &b->hosts[flow->dsthost];
1875 
1876 		/* this flow was empty, accounted as a sparse flow, but actually
1877 		 * in the bulk rotation.
1878 		 */
1879 		flow->set = CAKE_SET_BULK;
1880 		b->sparse_flow_count--;
1881 		b->bulk_flow_count++;
1882 
1883 		if (cake_dsrc(q->flow_mode))
1884 			srchost->srchost_bulk_flow_count++;
1885 
1886 		if (cake_ddst(q->flow_mode))
1887 			dsthost->dsthost_bulk_flow_count++;
1888 
1889 	}
1890 
1891 	if (q->buffer_used > q->buffer_max_used)
1892 		q->buffer_max_used = q->buffer_used;
1893 
1894 	if (q->buffer_used > q->buffer_limit) {
1895 		u32 dropped = 0;
1896 
1897 		while (q->buffer_used > q->buffer_limit) {
1898 			dropped++;
1899 			cake_drop(sch, to_free);
1900 		}
1901 		b->drop_overlimit += dropped;
1902 	}
1903 	return NET_XMIT_SUCCESS;
1904 }
1905 
cake_dequeue_one(struct Qdisc * sch)1906 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1907 {
1908 	struct cake_sched_data *q = qdisc_priv(sch);
1909 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1910 	struct cake_flow *flow = &b->flows[q->cur_flow];
1911 	struct sk_buff *skb = NULL;
1912 	u32 len;
1913 
1914 	if (flow->head) {
1915 		skb = dequeue_head(flow);
1916 		len = qdisc_pkt_len(skb);
1917 		b->backlogs[q->cur_flow] -= len;
1918 		b->tin_backlog		 -= len;
1919 		sch->qstats.backlog      -= len;
1920 		q->buffer_used		 -= skb->truesize;
1921 		sch->q.qlen--;
1922 
1923 		if (q->overflow_timeout)
1924 			cake_heapify(q, b->overflow_idx[q->cur_flow]);
1925 	}
1926 	return skb;
1927 }
1928 
1929 /* Discard leftover packets from a tin no longer in use. */
cake_clear_tin(struct Qdisc * sch,u16 tin)1930 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1931 {
1932 	struct cake_sched_data *q = qdisc_priv(sch);
1933 	struct sk_buff *skb;
1934 
1935 	q->cur_tin = tin;
1936 	for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1937 		while (!!(skb = cake_dequeue_one(sch)))
1938 			kfree_skb(skb);
1939 }
1940 
cake_dequeue(struct Qdisc * sch)1941 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1942 {
1943 	struct cake_sched_data *q = qdisc_priv(sch);
1944 	struct cake_tin_data *b = &q->tins[q->cur_tin];
1945 	struct cake_host *srchost, *dsthost;
1946 	ktime_t now = ktime_get();
1947 	struct cake_flow *flow;
1948 	struct list_head *head;
1949 	bool first_flow = true;
1950 	struct sk_buff *skb;
1951 	u16 host_load;
1952 	u64 delay;
1953 	u32 len;
1954 
1955 begin:
1956 	if (!sch->q.qlen)
1957 		return NULL;
1958 
1959 	/* global hard shaper */
1960 	if (ktime_after(q->time_next_packet, now) &&
1961 	    ktime_after(q->failsafe_next_packet, now)) {
1962 		u64 next = min(ktime_to_ns(q->time_next_packet),
1963 			       ktime_to_ns(q->failsafe_next_packet));
1964 
1965 		sch->qstats.overlimits++;
1966 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
1967 		return NULL;
1968 	}
1969 
1970 	/* Choose a class to work on. */
1971 	if (!q->rate_ns) {
1972 		/* In unlimited mode, can't rely on shaper timings, just balance
1973 		 * with DRR
1974 		 */
1975 		bool wrapped = false, empty = true;
1976 
1977 		while (b->tin_deficit < 0 ||
1978 		       !(b->sparse_flow_count + b->bulk_flow_count)) {
1979 			if (b->tin_deficit <= 0)
1980 				b->tin_deficit += b->tin_quantum;
1981 			if (b->sparse_flow_count + b->bulk_flow_count)
1982 				empty = false;
1983 
1984 			q->cur_tin++;
1985 			b++;
1986 			if (q->cur_tin >= q->tin_cnt) {
1987 				q->cur_tin = 0;
1988 				b = q->tins;
1989 
1990 				if (wrapped) {
1991 					/* It's possible for q->qlen to be
1992 					 * nonzero when we actually have no
1993 					 * packets anywhere.
1994 					 */
1995 					if (empty)
1996 						return NULL;
1997 				} else {
1998 					wrapped = true;
1999 				}
2000 			}
2001 		}
2002 	} else {
2003 		/* In shaped mode, choose:
2004 		 * - Highest-priority tin with queue and meeting schedule, or
2005 		 * - The earliest-scheduled tin with queue.
2006 		 */
2007 		ktime_t best_time = KTIME_MAX;
2008 		int tin, best_tin = 0;
2009 
2010 		for (tin = 0; tin < q->tin_cnt; tin++) {
2011 			b = q->tins + tin;
2012 			if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2013 				ktime_t time_to_pkt = \
2014 					ktime_sub(b->time_next_packet, now);
2015 
2016 				if (ktime_to_ns(time_to_pkt) <= 0 ||
2017 				    ktime_compare(time_to_pkt,
2018 						  best_time) <= 0) {
2019 					best_time = time_to_pkt;
2020 					best_tin = tin;
2021 				}
2022 			}
2023 		}
2024 
2025 		q->cur_tin = best_tin;
2026 		b = q->tins + best_tin;
2027 
2028 		/* No point in going further if no packets to deliver. */
2029 		if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2030 			return NULL;
2031 	}
2032 
2033 retry:
2034 	/* service this class */
2035 	head = &b->decaying_flows;
2036 	if (!first_flow || list_empty(head)) {
2037 		head = &b->new_flows;
2038 		if (list_empty(head)) {
2039 			head = &b->old_flows;
2040 			if (unlikely(list_empty(head))) {
2041 				head = &b->decaying_flows;
2042 				if (unlikely(list_empty(head)))
2043 					goto begin;
2044 			}
2045 		}
2046 	}
2047 	flow = list_first_entry(head, struct cake_flow, flowchain);
2048 	q->cur_flow = flow - b->flows;
2049 	first_flow = false;
2050 
2051 	/* triple isolation (modified DRR++) */
2052 	srchost = &b->hosts[flow->srchost];
2053 	dsthost = &b->hosts[flow->dsthost];
2054 	host_load = 1;
2055 
2056 	/* flow isolation (DRR++) */
2057 	if (flow->deficit <= 0) {
2058 		/* Keep all flows with deficits out of the sparse and decaying
2059 		 * rotations.  No non-empty flow can go into the decaying
2060 		 * rotation, so they can't get deficits
2061 		 */
2062 		if (flow->set == CAKE_SET_SPARSE) {
2063 			if (flow->head) {
2064 				b->sparse_flow_count--;
2065 				b->bulk_flow_count++;
2066 
2067 				if (cake_dsrc(q->flow_mode))
2068 					srchost->srchost_bulk_flow_count++;
2069 
2070 				if (cake_ddst(q->flow_mode))
2071 					dsthost->dsthost_bulk_flow_count++;
2072 
2073 				flow->set = CAKE_SET_BULK;
2074 			} else {
2075 				/* we've moved it to the bulk rotation for
2076 				 * correct deficit accounting but we still want
2077 				 * to count it as a sparse flow, not a bulk one.
2078 				 */
2079 				flow->set = CAKE_SET_SPARSE_WAIT;
2080 			}
2081 		}
2082 
2083 		if (cake_dsrc(q->flow_mode))
2084 			host_load = max(host_load, srchost->srchost_bulk_flow_count);
2085 
2086 		if (cake_ddst(q->flow_mode))
2087 			host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2088 
2089 		WARN_ON(host_load > CAKE_QUEUES);
2090 
2091 		/* The shifted prandom_u32() is a way to apply dithering to
2092 		 * avoid accumulating roundoff errors
2093 		 */
2094 		flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2095 				  (prandom_u32() >> 16)) >> 16;
2096 		list_move_tail(&flow->flowchain, &b->old_flows);
2097 
2098 		goto retry;
2099 	}
2100 
2101 	/* Retrieve a packet via the AQM */
2102 	while (1) {
2103 		skb = cake_dequeue_one(sch);
2104 		if (!skb) {
2105 			/* this queue was actually empty */
2106 			if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2107 				b->unresponsive_flow_count--;
2108 
2109 			if (flow->cvars.p_drop || flow->cvars.count ||
2110 			    ktime_before(now, flow->cvars.drop_next)) {
2111 				/* keep in the flowchain until the state has
2112 				 * decayed to rest
2113 				 */
2114 				list_move_tail(&flow->flowchain,
2115 					       &b->decaying_flows);
2116 				if (flow->set == CAKE_SET_BULK) {
2117 					b->bulk_flow_count--;
2118 
2119 					if (cake_dsrc(q->flow_mode))
2120 						srchost->srchost_bulk_flow_count--;
2121 
2122 					if (cake_ddst(q->flow_mode))
2123 						dsthost->dsthost_bulk_flow_count--;
2124 
2125 					b->decaying_flow_count++;
2126 				} else if (flow->set == CAKE_SET_SPARSE ||
2127 					   flow->set == CAKE_SET_SPARSE_WAIT) {
2128 					b->sparse_flow_count--;
2129 					b->decaying_flow_count++;
2130 				}
2131 				flow->set = CAKE_SET_DECAYING;
2132 			} else {
2133 				/* remove empty queue from the flowchain */
2134 				list_del_init(&flow->flowchain);
2135 				if (flow->set == CAKE_SET_SPARSE ||
2136 				    flow->set == CAKE_SET_SPARSE_WAIT)
2137 					b->sparse_flow_count--;
2138 				else if (flow->set == CAKE_SET_BULK) {
2139 					b->bulk_flow_count--;
2140 
2141 					if (cake_dsrc(q->flow_mode))
2142 						srchost->srchost_bulk_flow_count--;
2143 
2144 					if (cake_ddst(q->flow_mode))
2145 						dsthost->dsthost_bulk_flow_count--;
2146 
2147 				} else
2148 					b->decaying_flow_count--;
2149 
2150 				flow->set = CAKE_SET_NONE;
2151 			}
2152 			goto begin;
2153 		}
2154 
2155 		/* Last packet in queue may be marked, shouldn't be dropped */
2156 		if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2157 					(b->bulk_flow_count *
2158 					 !!(q->rate_flags &
2159 					    CAKE_FLAG_INGRESS))) ||
2160 		    !flow->head)
2161 			break;
2162 
2163 		/* drop this packet, get another one */
2164 		if (q->rate_flags & CAKE_FLAG_INGRESS) {
2165 			len = cake_advance_shaper(q, b, skb,
2166 						  now, true);
2167 			flow->deficit -= len;
2168 			b->tin_deficit -= len;
2169 		}
2170 		flow->dropped++;
2171 		b->tin_dropped++;
2172 		qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2173 		qdisc_qstats_drop(sch);
2174 		kfree_skb(skb);
2175 		if (q->rate_flags & CAKE_FLAG_INGRESS)
2176 			goto retry;
2177 	}
2178 
2179 	b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2180 	qdisc_bstats_update(sch, skb);
2181 
2182 	/* collect delay stats */
2183 	delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2184 	b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2185 	b->peak_delay = cake_ewma(b->peak_delay, delay,
2186 				  delay > b->peak_delay ? 2 : 8);
2187 	b->base_delay = cake_ewma(b->base_delay, delay,
2188 				  delay < b->base_delay ? 2 : 8);
2189 
2190 	len = cake_advance_shaper(q, b, skb, now, false);
2191 	flow->deficit -= len;
2192 	b->tin_deficit -= len;
2193 
2194 	if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2195 		u64 next = min(ktime_to_ns(q->time_next_packet),
2196 			       ktime_to_ns(q->failsafe_next_packet));
2197 
2198 		qdisc_watchdog_schedule_ns(&q->watchdog, next);
2199 	} else if (!sch->q.qlen) {
2200 		int i;
2201 
2202 		for (i = 0; i < q->tin_cnt; i++) {
2203 			if (q->tins[i].decaying_flow_count) {
2204 				ktime_t next = \
2205 					ktime_add_ns(now,
2206 						     q->tins[i].cparams.target);
2207 
2208 				qdisc_watchdog_schedule_ns(&q->watchdog,
2209 							   ktime_to_ns(next));
2210 				break;
2211 			}
2212 		}
2213 	}
2214 
2215 	if (q->overflow_timeout)
2216 		q->overflow_timeout--;
2217 
2218 	return skb;
2219 }
2220 
cake_reset(struct Qdisc * sch)2221 static void cake_reset(struct Qdisc *sch)
2222 {
2223 	u32 c;
2224 
2225 	for (c = 0; c < CAKE_MAX_TINS; c++)
2226 		cake_clear_tin(sch, c);
2227 }
2228 
2229 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2230 	[TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2231 	[TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2232 	[TCA_CAKE_ATM]		 = { .type = NLA_U32 },
2233 	[TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2234 	[TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2235 	[TCA_CAKE_RTT]		 = { .type = NLA_U32 },
2236 	[TCA_CAKE_TARGET]	 = { .type = NLA_U32 },
2237 	[TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2238 	[TCA_CAKE_MEMORY]	 = { .type = NLA_U32 },
2239 	[TCA_CAKE_NAT]		 = { .type = NLA_U32 },
2240 	[TCA_CAKE_RAW]		 = { .type = NLA_U32 },
2241 	[TCA_CAKE_WASH]		 = { .type = NLA_U32 },
2242 	[TCA_CAKE_MPU]		 = { .type = NLA_U32 },
2243 	[TCA_CAKE_INGRESS]	 = { .type = NLA_U32 },
2244 	[TCA_CAKE_ACK_FILTER]	 = { .type = NLA_U32 },
2245 	[TCA_CAKE_SPLIT_GSO]	 = { .type = NLA_U32 },
2246 	[TCA_CAKE_FWMARK]	 = { .type = NLA_U32 },
2247 };
2248 
cake_set_rate(struct cake_tin_data * b,u64 rate,u32 mtu,u64 target_ns,u64 rtt_est_ns)2249 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2250 			  u64 target_ns, u64 rtt_est_ns)
2251 {
2252 	/* convert byte-rate into time-per-byte
2253 	 * so it will always unwedge in reasonable time.
2254 	 */
2255 	static const u64 MIN_RATE = 64;
2256 	u32 byte_target = mtu;
2257 	u64 byte_target_ns;
2258 	u8  rate_shft = 0;
2259 	u64 rate_ns = 0;
2260 
2261 	b->flow_quantum = 1514;
2262 	if (rate) {
2263 		b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2264 		rate_shft = 34;
2265 		rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2266 		rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2267 		while (!!(rate_ns >> 34)) {
2268 			rate_ns >>= 1;
2269 			rate_shft--;
2270 		}
2271 	} /* else unlimited, ie. zero delay */
2272 
2273 	b->tin_rate_bps  = rate;
2274 	b->tin_rate_ns   = rate_ns;
2275 	b->tin_rate_shft = rate_shft;
2276 
2277 	byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2278 
2279 	b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2280 	b->cparams.interval = max(rtt_est_ns +
2281 				     b->cparams.target - target_ns,
2282 				     b->cparams.target * 2);
2283 	b->cparams.mtu_time = byte_target_ns;
2284 	b->cparams.p_inc = 1 << 24; /* 1/256 */
2285 	b->cparams.p_dec = 1 << 20; /* 1/4096 */
2286 }
2287 
cake_config_besteffort(struct Qdisc * sch)2288 static int cake_config_besteffort(struct Qdisc *sch)
2289 {
2290 	struct cake_sched_data *q = qdisc_priv(sch);
2291 	struct cake_tin_data *b = &q->tins[0];
2292 	u32 mtu = psched_mtu(qdisc_dev(sch));
2293 	u64 rate = q->rate_bps;
2294 
2295 	q->tin_cnt = 1;
2296 
2297 	q->tin_index = besteffort;
2298 	q->tin_order = normal_order;
2299 
2300 	cake_set_rate(b, rate, mtu,
2301 		      us_to_ns(q->target), us_to_ns(q->interval));
2302 	b->tin_quantum = 65535;
2303 
2304 	return 0;
2305 }
2306 
cake_config_precedence(struct Qdisc * sch)2307 static int cake_config_precedence(struct Qdisc *sch)
2308 {
2309 	/* convert high-level (user visible) parameters into internal format */
2310 	struct cake_sched_data *q = qdisc_priv(sch);
2311 	u32 mtu = psched_mtu(qdisc_dev(sch));
2312 	u64 rate = q->rate_bps;
2313 	u32 quantum = 256;
2314 	u32 i;
2315 
2316 	q->tin_cnt = 8;
2317 	q->tin_index = precedence;
2318 	q->tin_order = normal_order;
2319 
2320 	for (i = 0; i < q->tin_cnt; i++) {
2321 		struct cake_tin_data *b = &q->tins[i];
2322 
2323 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2324 			      us_to_ns(q->interval));
2325 
2326 		b->tin_quantum = max_t(u16, 1U, quantum);
2327 
2328 		/* calculate next class's parameters */
2329 		rate  *= 7;
2330 		rate >>= 3;
2331 
2332 		quantum  *= 7;
2333 		quantum >>= 3;
2334 	}
2335 
2336 	return 0;
2337 }
2338 
2339 /*	List of known Diffserv codepoints:
2340  *
2341  *	Least Effort (CS1)
2342  *	Best Effort (CS0)
2343  *	Max Reliability & LLT "Lo" (TOS1)
2344  *	Max Throughput (TOS2)
2345  *	Min Delay (TOS4)
2346  *	LLT "La" (TOS5)
2347  *	Assured Forwarding 1 (AF1x) - x3
2348  *	Assured Forwarding 2 (AF2x) - x3
2349  *	Assured Forwarding 3 (AF3x) - x3
2350  *	Assured Forwarding 4 (AF4x) - x3
2351  *	Precedence Class 2 (CS2)
2352  *	Precedence Class 3 (CS3)
2353  *	Precedence Class 4 (CS4)
2354  *	Precedence Class 5 (CS5)
2355  *	Precedence Class 6 (CS6)
2356  *	Precedence Class 7 (CS7)
2357  *	Voice Admit (VA)
2358  *	Expedited Forwarding (EF)
2359 
2360  *	Total 25 codepoints.
2361  */
2362 
2363 /*	List of traffic classes in RFC 4594:
2364  *		(roughly descending order of contended priority)
2365  *		(roughly ascending order of uncontended throughput)
2366  *
2367  *	Network Control (CS6,CS7)      - routing traffic
2368  *	Telephony (EF,VA)         - aka. VoIP streams
2369  *	Signalling (CS5)               - VoIP setup
2370  *	Multimedia Conferencing (AF4x) - aka. video calls
2371  *	Realtime Interactive (CS4)     - eg. games
2372  *	Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2373  *	Broadcast Video (CS3)
2374  *	Low Latency Data (AF2x,TOS4)      - eg. database
2375  *	Ops, Admin, Management (CS2,TOS1) - eg. ssh
2376  *	Standard Service (CS0 & unrecognised codepoints)
2377  *	High Throughput Data (AF1x,TOS2)  - eg. web traffic
2378  *	Low Priority Data (CS1)           - eg. BitTorrent
2379 
2380  *	Total 12 traffic classes.
2381  */
2382 
cake_config_diffserv8(struct Qdisc * sch)2383 static int cake_config_diffserv8(struct Qdisc *sch)
2384 {
2385 /*	Pruned list of traffic classes for typical applications:
2386  *
2387  *		Network Control          (CS6, CS7)
2388  *		Minimum Latency          (EF, VA, CS5, CS4)
2389  *		Interactive Shell        (CS2, TOS1)
2390  *		Low Latency Transactions (AF2x, TOS4)
2391  *		Video Streaming          (AF4x, AF3x, CS3)
2392  *		Bog Standard             (CS0 etc.)
2393  *		High Throughput          (AF1x, TOS2)
2394  *		Background Traffic       (CS1)
2395  *
2396  *		Total 8 traffic classes.
2397  */
2398 
2399 	struct cake_sched_data *q = qdisc_priv(sch);
2400 	u32 mtu = psched_mtu(qdisc_dev(sch));
2401 	u64 rate = q->rate_bps;
2402 	u32 quantum = 256;
2403 	u32 i;
2404 
2405 	q->tin_cnt = 8;
2406 
2407 	/* codepoint to class mapping */
2408 	q->tin_index = diffserv8;
2409 	q->tin_order = normal_order;
2410 
2411 	/* class characteristics */
2412 	for (i = 0; i < q->tin_cnt; i++) {
2413 		struct cake_tin_data *b = &q->tins[i];
2414 
2415 		cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2416 			      us_to_ns(q->interval));
2417 
2418 		b->tin_quantum = max_t(u16, 1U, quantum);
2419 
2420 		/* calculate next class's parameters */
2421 		rate  *= 7;
2422 		rate >>= 3;
2423 
2424 		quantum  *= 7;
2425 		quantum >>= 3;
2426 	}
2427 
2428 	return 0;
2429 }
2430 
cake_config_diffserv4(struct Qdisc * sch)2431 static int cake_config_diffserv4(struct Qdisc *sch)
2432 {
2433 /*  Further pruned list of traffic classes for four-class system:
2434  *
2435  *	    Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2436  *	    Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2, TOS1)
2437  *	    Best Effort        (CS0, AF1x, TOS2, and those not specified)
2438  *	    Background Traffic (CS1)
2439  *
2440  *		Total 4 traffic classes.
2441  */
2442 
2443 	struct cake_sched_data *q = qdisc_priv(sch);
2444 	u32 mtu = psched_mtu(qdisc_dev(sch));
2445 	u64 rate = q->rate_bps;
2446 	u32 quantum = 1024;
2447 
2448 	q->tin_cnt = 4;
2449 
2450 	/* codepoint to class mapping */
2451 	q->tin_index = diffserv4;
2452 	q->tin_order = bulk_order;
2453 
2454 	/* class characteristics */
2455 	cake_set_rate(&q->tins[0], rate, mtu,
2456 		      us_to_ns(q->target), us_to_ns(q->interval));
2457 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2458 		      us_to_ns(q->target), us_to_ns(q->interval));
2459 	cake_set_rate(&q->tins[2], rate >> 1, mtu,
2460 		      us_to_ns(q->target), us_to_ns(q->interval));
2461 	cake_set_rate(&q->tins[3], rate >> 2, mtu,
2462 		      us_to_ns(q->target), us_to_ns(q->interval));
2463 
2464 	/* bandwidth-sharing weights */
2465 	q->tins[0].tin_quantum = quantum;
2466 	q->tins[1].tin_quantum = quantum >> 4;
2467 	q->tins[2].tin_quantum = quantum >> 1;
2468 	q->tins[3].tin_quantum = quantum >> 2;
2469 
2470 	return 0;
2471 }
2472 
cake_config_diffserv3(struct Qdisc * sch)2473 static int cake_config_diffserv3(struct Qdisc *sch)
2474 {
2475 /*  Simplified Diffserv structure with 3 tins.
2476  *		Low Priority		(CS1)
2477  *		Best Effort
2478  *		Latency Sensitive	(TOS4, VA, EF, CS6, CS7)
2479  */
2480 	struct cake_sched_data *q = qdisc_priv(sch);
2481 	u32 mtu = psched_mtu(qdisc_dev(sch));
2482 	u64 rate = q->rate_bps;
2483 	u32 quantum = 1024;
2484 
2485 	q->tin_cnt = 3;
2486 
2487 	/* codepoint to class mapping */
2488 	q->tin_index = diffserv3;
2489 	q->tin_order = bulk_order;
2490 
2491 	/* class characteristics */
2492 	cake_set_rate(&q->tins[0], rate, mtu,
2493 		      us_to_ns(q->target), us_to_ns(q->interval));
2494 	cake_set_rate(&q->tins[1], rate >> 4, mtu,
2495 		      us_to_ns(q->target), us_to_ns(q->interval));
2496 	cake_set_rate(&q->tins[2], rate >> 2, mtu,
2497 		      us_to_ns(q->target), us_to_ns(q->interval));
2498 
2499 	/* bandwidth-sharing weights */
2500 	q->tins[0].tin_quantum = quantum;
2501 	q->tins[1].tin_quantum = quantum >> 4;
2502 	q->tins[2].tin_quantum = quantum >> 2;
2503 
2504 	return 0;
2505 }
2506 
cake_reconfigure(struct Qdisc * sch)2507 static void cake_reconfigure(struct Qdisc *sch)
2508 {
2509 	struct cake_sched_data *q = qdisc_priv(sch);
2510 	int c, ft;
2511 
2512 	switch (q->tin_mode) {
2513 	case CAKE_DIFFSERV_BESTEFFORT:
2514 		ft = cake_config_besteffort(sch);
2515 		break;
2516 
2517 	case CAKE_DIFFSERV_PRECEDENCE:
2518 		ft = cake_config_precedence(sch);
2519 		break;
2520 
2521 	case CAKE_DIFFSERV_DIFFSERV8:
2522 		ft = cake_config_diffserv8(sch);
2523 		break;
2524 
2525 	case CAKE_DIFFSERV_DIFFSERV4:
2526 		ft = cake_config_diffserv4(sch);
2527 		break;
2528 
2529 	case CAKE_DIFFSERV_DIFFSERV3:
2530 	default:
2531 		ft = cake_config_diffserv3(sch);
2532 		break;
2533 	}
2534 
2535 	for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2536 		cake_clear_tin(sch, c);
2537 		q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2538 	}
2539 
2540 	q->rate_ns   = q->tins[ft].tin_rate_ns;
2541 	q->rate_shft = q->tins[ft].tin_rate_shft;
2542 
2543 	if (q->buffer_config_limit) {
2544 		q->buffer_limit = q->buffer_config_limit;
2545 	} else if (q->rate_bps) {
2546 		u64 t = q->rate_bps * q->interval;
2547 
2548 		do_div(t, USEC_PER_SEC / 4);
2549 		q->buffer_limit = max_t(u32, t, 4U << 20);
2550 	} else {
2551 		q->buffer_limit = ~0;
2552 	}
2553 
2554 	sch->flags &= ~TCQ_F_CAN_BYPASS;
2555 
2556 	q->buffer_limit = min(q->buffer_limit,
2557 			      max(sch->limit * psched_mtu(qdisc_dev(sch)),
2558 				  q->buffer_config_limit));
2559 }
2560 
cake_change(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2561 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2562 		       struct netlink_ext_ack *extack)
2563 {
2564 	struct cake_sched_data *q = qdisc_priv(sch);
2565 	struct nlattr *tb[TCA_CAKE_MAX + 1];
2566 	int err;
2567 
2568 	if (!opt)
2569 		return -EINVAL;
2570 
2571 	err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2572 					  extack);
2573 	if (err < 0)
2574 		return err;
2575 
2576 	if (tb[TCA_CAKE_NAT]) {
2577 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2578 		q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2579 		q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2580 			!!nla_get_u32(tb[TCA_CAKE_NAT]);
2581 #else
2582 		NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2583 				    "No conntrack support in kernel");
2584 		return -EOPNOTSUPP;
2585 #endif
2586 	}
2587 
2588 	if (tb[TCA_CAKE_BASE_RATE64])
2589 		q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2590 
2591 	if (tb[TCA_CAKE_DIFFSERV_MODE])
2592 		q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2593 
2594 	if (tb[TCA_CAKE_WASH]) {
2595 		if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2596 			q->rate_flags |= CAKE_FLAG_WASH;
2597 		else
2598 			q->rate_flags &= ~CAKE_FLAG_WASH;
2599 	}
2600 
2601 	if (tb[TCA_CAKE_FLOW_MODE])
2602 		q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2603 				(nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2604 					CAKE_FLOW_MASK));
2605 
2606 	if (tb[TCA_CAKE_ATM])
2607 		q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2608 
2609 	if (tb[TCA_CAKE_OVERHEAD]) {
2610 		q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2611 		q->rate_flags |= CAKE_FLAG_OVERHEAD;
2612 
2613 		q->max_netlen = 0;
2614 		q->max_adjlen = 0;
2615 		q->min_netlen = ~0;
2616 		q->min_adjlen = ~0;
2617 	}
2618 
2619 	if (tb[TCA_CAKE_RAW]) {
2620 		q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2621 
2622 		q->max_netlen = 0;
2623 		q->max_adjlen = 0;
2624 		q->min_netlen = ~0;
2625 		q->min_adjlen = ~0;
2626 	}
2627 
2628 	if (tb[TCA_CAKE_MPU])
2629 		q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2630 
2631 	if (tb[TCA_CAKE_RTT]) {
2632 		q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2633 
2634 		if (!q->interval)
2635 			q->interval = 1;
2636 	}
2637 
2638 	if (tb[TCA_CAKE_TARGET]) {
2639 		q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2640 
2641 		if (!q->target)
2642 			q->target = 1;
2643 	}
2644 
2645 	if (tb[TCA_CAKE_AUTORATE]) {
2646 		if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2647 			q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2648 		else
2649 			q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2650 	}
2651 
2652 	if (tb[TCA_CAKE_INGRESS]) {
2653 		if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2654 			q->rate_flags |= CAKE_FLAG_INGRESS;
2655 		else
2656 			q->rate_flags &= ~CAKE_FLAG_INGRESS;
2657 	}
2658 
2659 	if (tb[TCA_CAKE_ACK_FILTER])
2660 		q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2661 
2662 	if (tb[TCA_CAKE_MEMORY])
2663 		q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2664 
2665 	if (tb[TCA_CAKE_SPLIT_GSO]) {
2666 		if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2667 			q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2668 		else
2669 			q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2670 	}
2671 
2672 	if (tb[TCA_CAKE_FWMARK]) {
2673 		q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2674 		q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2675 	}
2676 
2677 	if (q->tins) {
2678 		sch_tree_lock(sch);
2679 		cake_reconfigure(sch);
2680 		sch_tree_unlock(sch);
2681 	}
2682 
2683 	return 0;
2684 }
2685 
cake_destroy(struct Qdisc * sch)2686 static void cake_destroy(struct Qdisc *sch)
2687 {
2688 	struct cake_sched_data *q = qdisc_priv(sch);
2689 
2690 	qdisc_watchdog_cancel(&q->watchdog);
2691 	tcf_block_put(q->block);
2692 	kvfree(q->tins);
2693 }
2694 
cake_init(struct Qdisc * sch,struct nlattr * opt,struct netlink_ext_ack * extack)2695 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2696 		     struct netlink_ext_ack *extack)
2697 {
2698 	struct cake_sched_data *q = qdisc_priv(sch);
2699 	int i, j, err;
2700 
2701 	sch->limit = 10240;
2702 	q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2703 	q->flow_mode  = CAKE_FLOW_TRIPLE;
2704 
2705 	q->rate_bps = 0; /* unlimited by default */
2706 
2707 	q->interval = 100000; /* 100ms default */
2708 	q->target   =   5000; /* 5ms: codel RFC argues
2709 			       * for 5 to 10% of interval
2710 			       */
2711 	q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2712 	q->cur_tin = 0;
2713 	q->cur_flow  = 0;
2714 
2715 	qdisc_watchdog_init(&q->watchdog, sch);
2716 
2717 	if (opt) {
2718 		err = cake_change(sch, opt, extack);
2719 
2720 		if (err)
2721 			return err;
2722 	}
2723 
2724 	err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2725 	if (err)
2726 		return err;
2727 
2728 	quantum_div[0] = ~0;
2729 	for (i = 1; i <= CAKE_QUEUES; i++)
2730 		quantum_div[i] = 65535 / i;
2731 
2732 	q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2733 			   GFP_KERNEL);
2734 	if (!q->tins)
2735 		goto nomem;
2736 
2737 	for (i = 0; i < CAKE_MAX_TINS; i++) {
2738 		struct cake_tin_data *b = q->tins + i;
2739 
2740 		INIT_LIST_HEAD(&b->new_flows);
2741 		INIT_LIST_HEAD(&b->old_flows);
2742 		INIT_LIST_HEAD(&b->decaying_flows);
2743 		b->sparse_flow_count = 0;
2744 		b->bulk_flow_count = 0;
2745 		b->decaying_flow_count = 0;
2746 
2747 		for (j = 0; j < CAKE_QUEUES; j++) {
2748 			struct cake_flow *flow = b->flows + j;
2749 			u32 k = j * CAKE_MAX_TINS + i;
2750 
2751 			INIT_LIST_HEAD(&flow->flowchain);
2752 			cobalt_vars_init(&flow->cvars);
2753 
2754 			q->overflow_heap[k].t = i;
2755 			q->overflow_heap[k].b = j;
2756 			b->overflow_idx[j] = k;
2757 		}
2758 	}
2759 
2760 	cake_reconfigure(sch);
2761 	q->avg_peak_bandwidth = q->rate_bps;
2762 	q->min_netlen = ~0;
2763 	q->min_adjlen = ~0;
2764 	return 0;
2765 
2766 nomem:
2767 	cake_destroy(sch);
2768 	return -ENOMEM;
2769 }
2770 
cake_dump(struct Qdisc * sch,struct sk_buff * skb)2771 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2772 {
2773 	struct cake_sched_data *q = qdisc_priv(sch);
2774 	struct nlattr *opts;
2775 
2776 	opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2777 	if (!opts)
2778 		goto nla_put_failure;
2779 
2780 	if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2781 			      TCA_CAKE_PAD))
2782 		goto nla_put_failure;
2783 
2784 	if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2785 			q->flow_mode & CAKE_FLOW_MASK))
2786 		goto nla_put_failure;
2787 
2788 	if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2789 		goto nla_put_failure;
2790 
2791 	if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2792 		goto nla_put_failure;
2793 
2794 	if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2795 		goto nla_put_failure;
2796 
2797 	if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2798 			!!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2799 		goto nla_put_failure;
2800 
2801 	if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2802 			!!(q->rate_flags & CAKE_FLAG_INGRESS)))
2803 		goto nla_put_failure;
2804 
2805 	if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2806 		goto nla_put_failure;
2807 
2808 	if (nla_put_u32(skb, TCA_CAKE_NAT,
2809 			!!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2810 		goto nla_put_failure;
2811 
2812 	if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2813 		goto nla_put_failure;
2814 
2815 	if (nla_put_u32(skb, TCA_CAKE_WASH,
2816 			!!(q->rate_flags & CAKE_FLAG_WASH)))
2817 		goto nla_put_failure;
2818 
2819 	if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2820 		goto nla_put_failure;
2821 
2822 	if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2823 		if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2824 			goto nla_put_failure;
2825 
2826 	if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2827 		goto nla_put_failure;
2828 
2829 	if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2830 		goto nla_put_failure;
2831 
2832 	if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2833 			!!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2834 		goto nla_put_failure;
2835 
2836 	if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2837 		goto nla_put_failure;
2838 
2839 	return nla_nest_end(skb, opts);
2840 
2841 nla_put_failure:
2842 	return -1;
2843 }
2844 
cake_dump_stats(struct Qdisc * sch,struct gnet_dump * d)2845 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2846 {
2847 	struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2848 	struct cake_sched_data *q = qdisc_priv(sch);
2849 	struct nlattr *tstats, *ts;
2850 	int i;
2851 
2852 	if (!stats)
2853 		return -1;
2854 
2855 #define PUT_STAT_U32(attr, data) do {				       \
2856 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2857 			goto nla_put_failure;			       \
2858 	} while (0)
2859 #define PUT_STAT_U64(attr, data) do {				       \
2860 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2861 					data, TCA_CAKE_STATS_PAD)) \
2862 			goto nla_put_failure;			       \
2863 	} while (0)
2864 
2865 	PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2866 	PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2867 	PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2868 	PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2869 	PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2870 	PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2871 	PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2872 	PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2873 
2874 #undef PUT_STAT_U32
2875 #undef PUT_STAT_U64
2876 
2877 	tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2878 	if (!tstats)
2879 		goto nla_put_failure;
2880 
2881 #define PUT_TSTAT_U32(attr, data) do {					\
2882 		if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2883 			goto nla_put_failure;				\
2884 	} while (0)
2885 #define PUT_TSTAT_U64(attr, data) do {					\
2886 		if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2887 					data, TCA_CAKE_TIN_STATS_PAD))	\
2888 			goto nla_put_failure;				\
2889 	} while (0)
2890 
2891 	for (i = 0; i < q->tin_cnt; i++) {
2892 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2893 
2894 		ts = nla_nest_start_noflag(d->skb, i + 1);
2895 		if (!ts)
2896 			goto nla_put_failure;
2897 
2898 		PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2899 		PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2900 		PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2901 
2902 		PUT_TSTAT_U32(TARGET_US,
2903 			      ktime_to_us(ns_to_ktime(b->cparams.target)));
2904 		PUT_TSTAT_U32(INTERVAL_US,
2905 			      ktime_to_us(ns_to_ktime(b->cparams.interval)));
2906 
2907 		PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2908 		PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2909 		PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2910 		PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2911 
2912 		PUT_TSTAT_U32(PEAK_DELAY_US,
2913 			      ktime_to_us(ns_to_ktime(b->peak_delay)));
2914 		PUT_TSTAT_U32(AVG_DELAY_US,
2915 			      ktime_to_us(ns_to_ktime(b->avge_delay)));
2916 		PUT_TSTAT_U32(BASE_DELAY_US,
2917 			      ktime_to_us(ns_to_ktime(b->base_delay)));
2918 
2919 		PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2920 		PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2921 		PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2922 
2923 		PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2924 					    b->decaying_flow_count);
2925 		PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2926 		PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2927 		PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2928 
2929 		PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2930 		nla_nest_end(d->skb, ts);
2931 	}
2932 
2933 #undef PUT_TSTAT_U32
2934 #undef PUT_TSTAT_U64
2935 
2936 	nla_nest_end(d->skb, tstats);
2937 	return nla_nest_end(d->skb, stats);
2938 
2939 nla_put_failure:
2940 	nla_nest_cancel(d->skb, stats);
2941 	return -1;
2942 }
2943 
cake_leaf(struct Qdisc * sch,unsigned long arg)2944 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2945 {
2946 	return NULL;
2947 }
2948 
cake_find(struct Qdisc * sch,u32 classid)2949 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2950 {
2951 	return 0;
2952 }
2953 
cake_bind(struct Qdisc * sch,unsigned long parent,u32 classid)2954 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2955 			       u32 classid)
2956 {
2957 	return 0;
2958 }
2959 
cake_unbind(struct Qdisc * q,unsigned long cl)2960 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2961 {
2962 }
2963 
cake_tcf_block(struct Qdisc * sch,unsigned long cl,struct netlink_ext_ack * extack)2964 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2965 					struct netlink_ext_ack *extack)
2966 {
2967 	struct cake_sched_data *q = qdisc_priv(sch);
2968 
2969 	if (cl)
2970 		return NULL;
2971 	return q->block;
2972 }
2973 
cake_dump_class(struct Qdisc * sch,unsigned long cl,struct sk_buff * skb,struct tcmsg * tcm)2974 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2975 			   struct sk_buff *skb, struct tcmsg *tcm)
2976 {
2977 	tcm->tcm_handle |= TC_H_MIN(cl);
2978 	return 0;
2979 }
2980 
cake_dump_class_stats(struct Qdisc * sch,unsigned long cl,struct gnet_dump * d)2981 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2982 				 struct gnet_dump *d)
2983 {
2984 	struct cake_sched_data *q = qdisc_priv(sch);
2985 	const struct cake_flow *flow = NULL;
2986 	struct gnet_stats_queue qs = { 0 };
2987 	struct nlattr *stats;
2988 	u32 idx = cl - 1;
2989 
2990 	if (idx < CAKE_QUEUES * q->tin_cnt) {
2991 		const struct cake_tin_data *b = \
2992 			&q->tins[q->tin_order[idx / CAKE_QUEUES]];
2993 		const struct sk_buff *skb;
2994 
2995 		flow = &b->flows[idx % CAKE_QUEUES];
2996 
2997 		if (flow->head) {
2998 			sch_tree_lock(sch);
2999 			skb = flow->head;
3000 			while (skb) {
3001 				qs.qlen++;
3002 				skb = skb->next;
3003 			}
3004 			sch_tree_unlock(sch);
3005 		}
3006 		qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3007 		qs.drops = flow->dropped;
3008 	}
3009 	if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3010 		return -1;
3011 	if (flow) {
3012 		ktime_t now = ktime_get();
3013 
3014 		stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3015 		if (!stats)
3016 			return -1;
3017 
3018 #define PUT_STAT_U32(attr, data) do {				       \
3019 		if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3020 			goto nla_put_failure;			       \
3021 	} while (0)
3022 #define PUT_STAT_S32(attr, data) do {				       \
3023 		if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3024 			goto nla_put_failure;			       \
3025 	} while (0)
3026 
3027 		PUT_STAT_S32(DEFICIT, flow->deficit);
3028 		PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3029 		PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3030 		PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3031 		if (flow->cvars.p_drop) {
3032 			PUT_STAT_S32(BLUE_TIMER_US,
3033 				     ktime_to_us(
3034 					     ktime_sub(now,
3035 						       flow->cvars.blue_timer)));
3036 		}
3037 		if (flow->cvars.dropping) {
3038 			PUT_STAT_S32(DROP_NEXT_US,
3039 				     ktime_to_us(
3040 					     ktime_sub(now,
3041 						       flow->cvars.drop_next)));
3042 		}
3043 
3044 		if (nla_nest_end(d->skb, stats) < 0)
3045 			return -1;
3046 	}
3047 
3048 	return 0;
3049 
3050 nla_put_failure:
3051 	nla_nest_cancel(d->skb, stats);
3052 	return -1;
3053 }
3054 
cake_walk(struct Qdisc * sch,struct qdisc_walker * arg)3055 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3056 {
3057 	struct cake_sched_data *q = qdisc_priv(sch);
3058 	unsigned int i, j;
3059 
3060 	if (arg->stop)
3061 		return;
3062 
3063 	for (i = 0; i < q->tin_cnt; i++) {
3064 		struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3065 
3066 		for (j = 0; j < CAKE_QUEUES; j++) {
3067 			if (list_empty(&b->flows[j].flowchain) ||
3068 			    arg->count < arg->skip) {
3069 				arg->count++;
3070 				continue;
3071 			}
3072 			if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) {
3073 				arg->stop = 1;
3074 				break;
3075 			}
3076 			arg->count++;
3077 		}
3078 	}
3079 }
3080 
3081 static const struct Qdisc_class_ops cake_class_ops = {
3082 	.leaf		=	cake_leaf,
3083 	.find		=	cake_find,
3084 	.tcf_block	=	cake_tcf_block,
3085 	.bind_tcf	=	cake_bind,
3086 	.unbind_tcf	=	cake_unbind,
3087 	.dump		=	cake_dump_class,
3088 	.dump_stats	=	cake_dump_class_stats,
3089 	.walk		=	cake_walk,
3090 };
3091 
3092 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3093 	.cl_ops		=	&cake_class_ops,
3094 	.id		=	"cake",
3095 	.priv_size	=	sizeof(struct cake_sched_data),
3096 	.enqueue	=	cake_enqueue,
3097 	.dequeue	=	cake_dequeue,
3098 	.peek		=	qdisc_peek_dequeued,
3099 	.init		=	cake_init,
3100 	.reset		=	cake_reset,
3101 	.destroy	=	cake_destroy,
3102 	.change		=	cake_change,
3103 	.dump		=	cake_dump,
3104 	.dump_stats	=	cake_dump_stats,
3105 	.owner		=	THIS_MODULE,
3106 };
3107 
cake_module_init(void)3108 static int __init cake_module_init(void)
3109 {
3110 	return register_qdisc(&cake_qdisc_ops);
3111 }
3112 
cake_module_exit(void)3113 static void __exit cake_module_exit(void)
3114 {
3115 	unregister_qdisc(&cake_qdisc_ops);
3116 }
3117 
3118 module_init(cake_module_init)
3119 module_exit(cake_module_exit)
3120 MODULE_AUTHOR("Jonathan Morton");
3121 MODULE_LICENSE("Dual BSD/GPL");
3122 MODULE_DESCRIPTION("The CAKE shaper.");
3123