xref: /freebsd/sys/netpfil/ipfw/dn_sched_wf2q.c (revision bdd1243d)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause
3  *
4  * Copyright (c) 2010 Riccardo Panicucci, Universita` di Pisa
5  * Copyright (c) 2000-2002 Luigi Rizzo, Universita` di Pisa
6  * All rights reserved
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * $FreeBSD$
32  */
33 
34 #ifdef _KERNEL
35 #include <sys/malloc.h>
36 #include <sys/socket.h>
37 #include <sys/socketvar.h>
38 #include <sys/kernel.h>
39 #include <sys/lock.h>
40 #include <sys/mbuf.h>
41 #include <sys/module.h>
42 #include <sys/rwlock.h>
43 #include <net/if.h>	/* IFNAMSIZ */
44 #include <netinet/in.h>
45 #include <netinet/ip_var.h>		/* ipfw_rule_ref */
46 #include <netinet/ip_fw.h>	/* flow_id */
47 #include <netinet/ip_dummynet.h>
48 #include <netpfil/ipfw/ip_fw_private.h>
49 #include <netpfil/ipfw/dn_heap.h>
50 #include <netpfil/ipfw/ip_dn_private.h>
51 #ifdef NEW_AQM
52 #include <netpfil/ipfw/dn_aqm.h>
53 #endif
54 #include <netpfil/ipfw/dn_sched.h>
55 #else
56 #include <dn_test.h>
57 #endif
58 
#ifndef MAX64
/*
 * Wraparound-aware maximum of two 64-bit timestamps: yields (y) when the
 * difference (y) - (x), reinterpreted as a signed 64-bit value, is
 * positive, and (x) otherwise.
 *
 * The whole expansion is parenthesized so that the macro can be used as
 * an operand of a larger expression (e.g. MAX64(a, b) + 1) without the
 * conditional operator capturing the surrounding operands.
 *
 * Each argument may be evaluated more than once: do not pass expressions
 * with side effects. Arguments are expected to be 64-bit quantities.
 */
#define MAX64(x,y)  ((((int64_t)((y) - (x))) > 0) ? (y) : (x))
#endif
62 
/*
 * Timestamps are computed on 64 bits using fixed-point arithmetic.
 * LMAX_BITS and WMAX_BITS are the maximum number of bits for the packet
 * length and for the sum of weights, respectively. FRAC_BITS is the
 * number of fractional bits. We want FRAC_BITS to be much larger than
 * WMAX_BITS to avoid too large errors when computing the inverse,
 * FRAC_BITS < 32 so we can do 1/w using an unsigned 32-bit division,
 * and, to avoid wraparounds, LMAX_BITS + WMAX_BITS + FRAC_BITS must be
 * much smaller than 64.
 * As an example:
 * FRAC_BITS = 26, LMAX_BITS = 14, WMAX_BITS = 19
 */
/*
 * FRAC_BITS is the number of fractional bits of the fixed-point format;
 * ONE_FP is the value 1.0 in that format. Each macro has its own guard so
 * that overriding FRAC_BITS from outside still yields a matching ONE_FP
 * instead of leaving ONE_FP undefined.
 */
#ifndef FRAC_BITS
#define FRAC_BITS    28 /* shift for fixed point arithmetic */
#endif
#ifndef ONE_FP
#define	ONE_FP	(1UL << FRAC_BITS)
#endif
78 
79 /*
80  * Private information for the scheduler instance:
81  * sch_heap (key is Finish time) returns the next queue to serve
82  * ne_heap (key is Start time) stores not-eligible queues
83  * idle_heap (key=start/finish time) stores idle flows. It must
84  *	support extract-from-middle.
85  * A flow is only in 1 of the three heaps.
86  * XXX todo: use a more efficient data structure, e.g. a tree sorted
87  * by F with min_subtree(S) in each node
88  */
89 struct wf2qp_si {
90     struct dn_heap sch_heap;	/* top extract - key Finish  time */
91     struct dn_heap ne_heap;	/* top extract - key Start   time */
92     struct dn_heap idle_heap;	/* random extract - key Start=Finish time */
93     uint64_t V;			/* virtual time */
94     uint32_t inv_wsum;		/* inverse of sum of weights */
95     uint32_t wsum;		/* sum of weights */
96 };
97 
98 struct wf2qp_queue {
99     struct dn_queue _q;
100     uint64_t S, F;		/* start time, finish time */
101     uint32_t inv_w;		/* ONE_FP / weight */
102     int32_t heap_pos;		/* position (index) of struct in heap */
103 };
104 
105 /*
106  * This file implements a WF2Q+ scheduler as it has been in dummynet
107  * since 2000.
108  * The scheduler supports per-flow queues and has O(log N) complexity.
109  *
110  * WF2Q+ needs to drain entries from the idle heap so that we
111  * can keep the sum of weights up to date. We can do it whenever
112  * we get a chance, or periodically, or following some other
113  * strategy. The function idle_check() drains at most N elements
114  * from the idle heap.
115  */
116 static void
117 idle_check(struct wf2qp_si *si, int n, int force)
118 {
119     struct dn_heap *h = &si->idle_heap;
120     while (n-- > 0 && h->elements > 0 &&
121 		(force || DN_KEY_LT(HEAP_TOP(h)->key, si->V))) {
122 	struct dn_queue *q = HEAP_TOP(h)->object;
123         struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
124 
125         heap_extract(h, NULL);
126         /* XXX to let the flowset delete the queue we should
127 	 * mark it as 'unused' by the scheduler.
128 	 */
129         alg_fq->S = alg_fq->F + 1; /* Mark timestamp as invalid. */
130         si->wsum -= q->fs->fs.par[0];	/* adjust sum of weights */
131 	if (si->wsum > 0)
132 		si->inv_wsum = ONE_FP/si->wsum;
133     }
134 }
135 
136 static int
137 wf2qp_enqueue(struct dn_sch_inst *_si, struct dn_queue *q, struct mbuf *m)
138 {
139     struct dn_fsk *fs = q->fs;
140     struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
141     struct wf2qp_queue *alg_fq;
142     uint64_t len = m->m_pkthdr.len;
143 
144     if (m != q->mq.head) {
145 	if (dn_enqueue(q, m, 0)) /* packet was dropped */
146 	    return 1;
147 	if (m != q->mq.head)	/* queue was already busy */
148 	    return 0;
149     }
150 
151     /* If reach this point, queue q was idle */
152     alg_fq = (struct wf2qp_queue *)q;
153 
154     if (DN_KEY_LT(alg_fq->F, alg_fq->S)) {
155         /* F<S means timestamps are invalid ->brand new queue. */
156         alg_fq->S = si->V;		/* init start time */
157         si->wsum += fs->fs.par[0];	/* add weight of new queue. */
158 	si->inv_wsum = ONE_FP/si->wsum;
159     } else { /* if it was idle then it was in the idle heap */
160         if (! heap_extract(&si->idle_heap, q))
161 		return 1;
162         alg_fq->S = MAX64(alg_fq->F, si->V);	/* compute new S */
163     }
164     alg_fq->F = alg_fq->S + len * alg_fq->inv_w;
165 
166     /* if nothing is backlogged, make sure this flow is eligible */
167     if (si->ne_heap.elements == 0 && si->sch_heap.elements == 0)
168         si->V = MAX64(alg_fq->S, si->V);
169 
170     /*
171      * Look at eligibility. A flow is not eligibile if S>V (when
172      * this happens, it means that there is some other flow already
173      * scheduled for the same pipe, so the sch_heap cannot be
174      * empty). If the flow is not eligible we just store it in the
175      * ne_heap. Otherwise, we store in the sch_heap.
176      * Note that for all flows in sch_heap (SCH), S_i <= V,
177      * and for all flows in ne_heap (NEH), S_i > V.
178      * So when we need to compute max(V, min(S_i)) forall i in
179      * SCH+NEH, we only need to look into NEH.
180      */
181     if (DN_KEY_LT(si->V, alg_fq->S)) {
182         /* S>V means flow Not eligible. */
183         if (si->sch_heap.elements == 0)
184             D("++ ouch! not eligible but empty scheduler!");
185         heap_insert(&si->ne_heap, alg_fq->S, q);
186     } else {
187         heap_insert(&si->sch_heap, alg_fq->F, q);
188     }
189     return 0;
190 }
191 
192 /* XXX invariant: sch > 0 || V >= min(S in neh) */
193 static struct mbuf *
194 wf2qp_dequeue(struct dn_sch_inst *_si)
195 {
196 	/* Access scheduler instance private data */
197 	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
198 	struct mbuf *m;
199 	struct dn_queue *q;
200 	struct dn_heap *sch = &si->sch_heap;
201 	struct dn_heap *neh = &si->ne_heap;
202 	struct wf2qp_queue *alg_fq;
203 
204 	if (sch->elements == 0 && neh->elements == 0) {
205 		/* we have nothing to do. We could kill the idle heap
206 		 * altogether and reset V
207 		 */
208 		idle_check(si, 0x7fffffff, 1);
209 		si->V = 0;
210 		si->wsum = 0;	/* should be set already */
211 		return NULL;	/* quick return if nothing to do */
212 	}
213 	idle_check(si, 1, 0);	/* drain something from the idle heap */
214 
215 	/* make sure at least one element is eligible, bumping V
216 	 * and moving entries that have become eligible.
217 	 * We need to repeat the first part twice, before and
218 	 * after extracting the candidate, or enqueue() will
219 	 * find the data structure in a wrong state.
220 	 */
221   m = NULL;
222   for(;;) {
223 	/*
224 	 * Compute V = max(V, min(S_i)). Remember that all elements
225 	 * in sch have by definition S_i <= V so if sch is not empty,
226 	 * V is surely the max and we must not update it. Conversely,
227 	 * if sch is empty we only need to look at neh.
228 	 * We don't need to move the queues, as it will be done at the
229 	 * next enqueue
230 	 */
231 	if (sch->elements == 0 && neh->elements > 0) {
232 		si->V = MAX64(si->V, HEAP_TOP(neh)->key);
233 	}
234 	while (neh->elements > 0 &&
235 		    DN_KEY_LEQ(HEAP_TOP(neh)->key, si->V)) {
236 		q = HEAP_TOP(neh)->object;
237 		alg_fq = (struct wf2qp_queue *)q;
238 		heap_extract(neh, NULL);
239 		heap_insert(sch, alg_fq->F, q);
240 	}
241 	if (m) /* pkt found in previous iteration */
242 		break;
243 	/* ok we have at least one eligible pkt */
244 	q = HEAP_TOP(sch)->object;
245 	alg_fq = (struct wf2qp_queue *)q;
246 	m = dn_dequeue(q);
247 	if (m == NULL)
248 		return NULL;
249 	heap_extract(sch, NULL); /* Remove queue from heap. */
250 	si->V += (uint64_t)(m->m_pkthdr.len) * si->inv_wsum;
251 	alg_fq->S = alg_fq->F;  /* Update start time. */
252 	if (q->mq.head == 0) {	/* not backlogged any more. */
253 		heap_insert(&si->idle_heap, alg_fq->F, q);
254 	} else {			/* Still backlogged. */
255 		/* Update F, store in neh or sch */
256 		uint64_t len = q->mq.head->m_pkthdr.len;
257 		alg_fq->F += len * alg_fq->inv_w;
258 		if (DN_KEY_LEQ(alg_fq->S, si->V)) {
259 			heap_insert(sch, alg_fq->F, q);
260 		} else {
261 			heap_insert(neh, alg_fq->S, q);
262 		}
263 	}
264     }
265 	return m;
266 }
267 
268 static int
269 wf2qp_new_sched(struct dn_sch_inst *_si)
270 {
271 	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
272 	int ofs = offsetof(struct wf2qp_queue, heap_pos);
273 
274 	/* all heaps support extract from middle */
275 	if (heap_init(&si->idle_heap, 16, ofs) ||
276 	    heap_init(&si->sch_heap, 16, ofs) ||
277 	    heap_init(&si->ne_heap, 16, ofs)) {
278 		heap_free(&si->ne_heap);
279 		heap_free(&si->sch_heap);
280 		heap_free(&si->idle_heap);
281 		return ENOMEM;
282 	}
283 	return 0;
284 }
285 
286 static int
287 wf2qp_free_sched(struct dn_sch_inst *_si)
288 {
289 	struct wf2qp_si *si = (struct wf2qp_si *)(_si + 1);
290 
291 	heap_free(&si->sch_heap);
292 	heap_free(&si->ne_heap);
293 	heap_free(&si->idle_heap);
294 
295 	return 0;
296 }
297 
298 static int
299 wf2qp_new_fsk(struct dn_fsk *fs)
300 {
301 	ipdn_bound_var(&fs->fs.par[0], 1,
302 		1, 100, "WF2Q+ weight");
303 	return 0;
304 }
305 
306 static int
307 wf2qp_new_queue(struct dn_queue *_q)
308 {
309 	struct wf2qp_queue *q = (struct wf2qp_queue *)_q;
310 
311 	_q->ni.oid.subtype = DN_SCHED_WF2QP;
312 	q->F = 0;	/* not strictly necessary */
313 	q->S = q->F + 1;    /* mark timestamp as invalid. */
314         q->inv_w = ONE_FP / _q->fs->fs.par[0];
315 	if (_q->mq.head != NULL) {
316 		wf2qp_enqueue(_q->_si, _q, _q->mq.head);
317 	}
318 	return 0;
319 }
320 
321 /*
322  * Called when the infrastructure removes a queue (e.g. flowset
323  * is reconfigured). Nothing to do if we did not 'own' the queue,
324  * otherwise remove it from the right heap and adjust the sum
325  * of weights.
326  */
327 static int
328 wf2qp_free_queue(struct dn_queue *q)
329 {
330 	struct wf2qp_queue *alg_fq = (struct wf2qp_queue *)q;
331 	struct wf2qp_si *si = (struct wf2qp_si *)(q->_si + 1);
332 
333 	if (alg_fq->S >= alg_fq->F + 1)
334 		return 0;	/* nothing to do, not in any heap */
335 	si->wsum -= q->fs->fs.par[0];
336 	if (si->wsum > 0)
337 		si->inv_wsum = ONE_FP/si->wsum;
338 
339 	/* extract from the heap. XXX TODO we may need to adjust V
340 	 * to make sure the invariants hold.
341 	 */
342 	heap_extract(&si->idle_heap, q);
343 	heap_extract(&si->ne_heap, q);
344 	heap_extract(&si->sch_heap, q);
345 
346 	return 0;
347 }
348 
349 /*
350  * WF2Q+ scheduler descriptor
351  * contains the type of the scheduler, the name, the size of the
352  * structures and function pointers.
353  */
/*
 * NOTE(review): every initializer is wrapped in _SI(); presumably the
 * macro may drop the designator on some builds, in which case the
 * entries are positional and their order must match struct dn_alg --
 * confirm before reordering anything here.
 */
static struct dn_alg wf2qp_desc = {
	_SI( .type = ) DN_SCHED_WF2QP,
	_SI( .name = ) "WF2Q+",
	_SI( .flags = ) DN_MULTIQUEUE,

	/*
	 * we need extra space in the si and the queue:
	 * a whole struct wf2qp_si per instance, and the part of
	 * struct wf2qp_queue that follows the embedded struct dn_queue.
	 */
	_SI( .schk_datalen = ) 0,
	_SI( .si_datalen = ) sizeof(struct wf2qp_si),
	_SI( .q_datalen = ) sizeof(struct wf2qp_queue) -
				sizeof(struct dn_queue),

	/* packet path */
	_SI( .enqueue = ) wf2qp_enqueue,
	_SI( .dequeue = ) wf2qp_dequeue,

	/* scheduler and instance lifecycle; config/destroy are unused */
	_SI( .config = )  NULL,
	_SI( .destroy = )  NULL,
	_SI( .new_sched = ) wf2qp_new_sched,
	_SI( .free_sched = ) wf2qp_free_sched,

	/* flowset hooks: only creation is handled */
	_SI( .new_fsk = ) wf2qp_new_fsk,
	_SI( .free_fsk = )  NULL,

	/* queue lifecycle */
	_SI( .new_queue = ) wf2qp_new_queue,
	_SI( .free_queue = ) wf2qp_free_queue,
#ifdef NEW_AQM
	/* no getconfig hook is provided */
	_SI( .getconfig = )  NULL,
#endif

};
383 
/*
 * Declare the module "dn_wf2qp" around the descriptor above.
 * NOTE(review): presumably this registers the scheduler with dummynet at
 * module load time -- confirm against the macro definition.
 */
DECLARE_DNSCHED_MODULE(dn_wf2qp, &wf2qp_desc);
385