xref: /dragonfly/sys/net/altq/altq_fairq.c (revision e5a92d33)
1 /*
2  * Copyright (c) 2008 The DragonFly Project.  All rights reserved.
3  *
4  * This code is derived from software contributed to The DragonFly Project
5  * by Matthew Dillon <dillon@backplane.com>
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  *
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in
15  *    the documentation and/or other materials provided with the
16  *    distribution.
17  * 3. Neither the name of The DragonFly Project nor the names of its
18  *    contributors may be used to endorse or promote products derived
19  *    from this software without specific, prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
24  * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
25  * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
26  * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
27  * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
28  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
29  * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
30  * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
31  * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  *
34  * $DragonFly: src/sys/net/altq/altq_fairq.c,v 1.2 2008/05/14 11:59:23 sephe Exp $
35  */
36 /*
37  * Matt: I gutted altq_priq.c and used it as a skeleton on which to build
38  * fairq.  The fairq algorithm is completely different than priq, of course,
39  * but because I used priq's skeleton I believe I should include priq's
40  * copyright.
41  *
42  * Copyright (C) 2000-2003
43  *	Sony Computer Science Laboratories Inc.  All rights reserved.
44  *
45  * Redistribution and use in source and binary forms, with or without
46  * modification, are permitted provided that the following conditions
47  * are met:
48  * 1. Redistributions of source code must retain the above copyright
49  *    notice, this list of conditions and the following disclaimer.
50  * 2. Redistributions in binary form must reproduce the above copyright
51  *    notice, this list of conditions and the following disclaimer in the
52  *    documentation and/or other materials provided with the distribution.
53  *
54  * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND
55  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
56  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
57  * ARE DISCLAIMED.  IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE
58  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
59  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
60  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
61  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
62  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
63  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
64  * SUCH DAMAGE.
65  */
66 
67 /*
68  * FAIRQ - take traffic classified by keep state (hashed into
69  *	   pf->state_hash) and bucketize it.  Fairly extract
70  *	   the first packet from each bucket in a round-robin fashion.
71  *
72  * TODO - better overall qlimit support (right now it is per-bucket).
73  *	- NOTE: red etc is per bucket, not overall.
74  *	- better service curve support.
75  *
76  * EXAMPLE:
77  *
78  *  altq on em0 fairq bandwidth 650Kb queue { std, bulk }
79  *  queue std  priority 3 bandwidth 200Kb \
80  *	fairq (buckets 64, default, hogs 1Kb) qlimit 50
81  *  queue bulk priority 2 bandwidth 100Kb \
82  *	fairq (buckets 64, hogs 1Kb) qlimit 50
83  *
84  *	NOTE: When the aggregate bandwidth is less than the link bandwidth
85  *	      any remaining bandwidth is dynamically assigned using the
86  *	      existing bandwidth specs as weightings.
87  *
88  *  pass out on em0 from any to any keep state queue std
89  *  pass out on em0 inet proto tcp ..... port ... keep state queue bulk
90  */
91 #include "opt_altq.h"
92 #include "opt_inet.h"
93 #include "opt_inet6.h"
94 
95 #ifdef ALTQ_FAIRQ  /* fairq is enabled in the kernel conf */
96 
97 #include <sys/param.h>
98 #include <sys/malloc.h>
99 #include <sys/mbuf.h>
100 #include <sys/socket.h>
101 #include <sys/sockio.h>
102 #include <sys/systm.h>
103 #include <sys/proc.h>
104 #include <sys/errno.h>
105 #include <sys/kernel.h>
106 #include <sys/queue.h>
107 #include <sys/thread.h>
108 
109 #include <net/if.h>
110 #include <net/ifq_var.h>
111 #include <netinet/in.h>
112 
113 #include <net/pf/pfvar.h>
114 #include <net/altq/altq.h>
115 #include <net/altq/altq_fairq.h>
116 
117 #include <sys/thread2.h>
118 
/*
 * FAIRQ works on a single ALTQ subqueue, the default one.  The two
 * macros below wrap ALTQ_SQ_LOCK()/ALTQ_SQ_UNLOCK() on that subqueue of
 * the given struct ifaltq.
 */
#define FAIRQ_SUBQ_INDEX	ALTQ_SUBQ_INDEX_DEFAULT
#define FAIRQ_LOCK(ifq)							\
	ALTQ_SQ_LOCK(&(ifq)->altq_subq[FAIRQ_SUBQ_INDEX])
#define FAIRQ_UNLOCK(ifq)						\
	ALTQ_SQ_UNLOCK(&(ifq)->altq_subq[FAIRQ_SUBQ_INDEX])
124 
125 /*
126  * function prototypes
127  */
128 static int	fairq_clear_interface(struct fairq_if *);
129 static int	fairq_request(struct ifaltq_subque *, int, void *);
130 static void	fairq_purge(struct fairq_if *);
131 static struct fairq_class *fairq_class_create(struct fairq_if *, int,
132 					int, u_int, struct fairq_opts *, int);
133 static int	fairq_class_destroy(struct fairq_class *);
134 static int	fairq_enqueue(struct ifaltq_subque *, struct mbuf *,
135 					struct altq_pktattr *);
136 static struct mbuf *fairq_dequeue(struct ifaltq_subque *, int);
137 
138 static int	fairq_addq(struct fairq_class *, struct mbuf *, int hash);
139 static struct mbuf *fairq_getq(struct fairq_class *, uint64_t);
140 static struct mbuf *fairq_pollq(struct fairq_class *, uint64_t, int *);
141 static fairq_bucket_t *fairq_selectq(struct fairq_class *);
142 static void	fairq_purgeq(struct fairq_class *);
143 
144 static void	get_class_stats(struct fairq_classstats *,
145 					struct fairq_class *);
146 static struct fairq_class *clh_to_clp(struct fairq_if *, uint32_t);
147 
148 int
149 fairq_pfattach(struct pf_altq *a, struct ifaltq *ifq)
150 {
151 	return altq_attach(ifq, ALTQT_FAIRQ, a->altq_disc, ifq_mapsubq_default,
152 	    fairq_enqueue, fairq_dequeue, fairq_request, NULL, NULL);
153 }
154 
155 int
156 fairq_add_altq(struct pf_altq *a)
157 {
158 	struct fairq_if *pif;
159 	struct ifnet *ifp;
160 
161 	ifnet_lock();
162 
163 	if ((ifp = ifunit(a->ifname)) == NULL) {
164 		ifnet_unlock();
165 		return (EINVAL);
166 	}
167 	if (!ifq_is_ready(&ifp->if_snd)) {
168 		ifnet_unlock();
169 		return (ENODEV);
170 	}
171 
172 	pif = kmalloc(sizeof(*pif), M_ALTQ, M_WAITOK | M_ZERO);
173 	pif->pif_bandwidth = a->ifbandwidth;
174 	pif->pif_maxpri = -1;
175 	pif->pif_ifq = &ifp->if_snd;
176 	ifq_purge_all(&ifp->if_snd);
177 
178 	ifnet_unlock();
179 
180 	/* keep the state in pf_altq */
181 	a->altq_disc = pif;
182 
183 	return (0);
184 }
185 
186 int
187 fairq_remove_altq(struct pf_altq *a)
188 {
189 	struct fairq_if *pif;
190 
191 	if ((pif = a->altq_disc) == NULL)
192 		return (EINVAL);
193 	a->altq_disc = NULL;
194 
195 	fairq_clear_interface(pif);
196 
197 	kfree(pif, M_ALTQ);
198 	return (0);
199 }
200 
201 static int
202 fairq_add_queue_locked(struct pf_altq *a, struct fairq_if *pif)
203 {
204 	struct fairq_class *cl;
205 
206 	KKASSERT(a->priority < FAIRQ_MAXPRI);
207 	KKASSERT(a->qid != 0);
208 
209 	if (pif->pif_classes[a->priority] != NULL)
210 		return (EBUSY);
211 	if (clh_to_clp(pif, a->qid) != NULL)
212 		return (EBUSY);
213 
214 	cl = fairq_class_create(pif, a->priority, a->qlimit, a->bandwidth,
215 			       &a->pq_u.fairq_opts, a->qid);
216 	if (cl == NULL)
217 		return (ENOMEM);
218 
219 	return (0);
220 }
221 
222 int
223 fairq_add_queue(struct pf_altq *a)
224 {
225 	struct fairq_if *pif;
226 	struct ifaltq *ifq;
227 	int error;
228 
229 	/* check parameters */
230 	if (a->priority >= FAIRQ_MAXPRI)
231 		return (EINVAL);
232 	if (a->qid == 0)
233 		return (EINVAL);
234 
235 	/* XXX not MP safe */
236 	if ((pif = a->altq_disc) == NULL)
237 		return (EINVAL);
238 	ifq = pif->pif_ifq;
239 
240 	FAIRQ_LOCK(ifq);
241 	error = fairq_add_queue_locked(a, pif);
242 	FAIRQ_UNLOCK(ifq);
243 
244 	return error;
245 }
246 
247 static int
248 fairq_remove_queue_locked(struct pf_altq *a, struct fairq_if *pif)
249 {
250 	struct fairq_class *cl;
251 
252 	if ((cl = clh_to_clp(pif, a->qid)) == NULL)
253 		return (EINVAL);
254 
255 	return (fairq_class_destroy(cl));
256 }
257 
258 int
259 fairq_remove_queue(struct pf_altq *a)
260 {
261 	struct fairq_if *pif;
262 	struct ifaltq *ifq;
263 	int error;
264 
265 	/* XXX not MP safe */
266 	if ((pif = a->altq_disc) == NULL)
267 		return (EINVAL);
268 	ifq = pif->pif_ifq;
269 
270 	FAIRQ_LOCK(ifq);
271 	error = fairq_remove_queue_locked(a, pif);
272 	FAIRQ_UNLOCK(ifq);
273 
274 	return error;
275 }
276 
277 int
278 fairq_getqstats(struct pf_altq *a, void *ubuf, int *nbytes)
279 {
280 	struct fairq_if *pif;
281 	struct fairq_class *cl;
282 	struct fairq_classstats stats;
283 	struct ifaltq *ifq;
284 	int error = 0;
285 
286 	if (*nbytes < sizeof(stats))
287 		return (EINVAL);
288 
289 	ifnet_lock();
290 
291 	/* XXX not MP safe */
292 	if ((pif = altq_lookup(a->ifname, ALTQT_FAIRQ)) == NULL) {
293 		ifnet_unlock();
294 		return (EBADF);
295 	}
296 	ifq = pif->pif_ifq;
297 
298 	FAIRQ_LOCK(ifq);
299 
300 	if ((cl = clh_to_clp(pif, a->qid)) == NULL) {
301 		FAIRQ_UNLOCK(ifq);
302 		ifnet_unlock();
303 		return (EINVAL);
304 	}
305 
306 	get_class_stats(&stats, cl);
307 
308 	FAIRQ_UNLOCK(ifq);
309 
310 	ifnet_unlock();
311 
312 	if ((error = copyout((caddr_t)&stats, ubuf, sizeof(stats))) != 0)
313 		return (error);
314 	*nbytes = sizeof(stats);
315 	return (0);
316 }
317 
318 /*
319  * bring the interface back to the initial state by discarding
320  * all the filters and classes.
321  */
322 static int
323 fairq_clear_interface(struct fairq_if *pif)
324 {
325 	struct fairq_class *cl;
326 	int pri;
327 
328 	/* clear out the classes */
329 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
330 		if ((cl = pif->pif_classes[pri]) != NULL)
331 			fairq_class_destroy(cl);
332 	}
333 
334 	return (0);
335 }
336 
337 static int
338 fairq_request(struct ifaltq_subque *ifsq, int req, void *arg)
339 {
340 	struct ifaltq *ifq = ifsq->ifsq_altq;
341 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
342 
343 	crit_enter();
344 	switch (req) {
345 	case ALTRQ_PURGE:
346 		if (ifsq_get_index(ifsq) == FAIRQ_SUBQ_INDEX) {
347 			fairq_purge(pif);
348 		} else {
349 			/*
350 			 * Race happened, the unrelated subqueue was
351 			 * picked during the packet scheduler transition.
352 			 */
353 			ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
354 		}
355 		break;
356 	}
357 	crit_exit();
358 	return (0);
359 }
360 
361 /* discard all the queued packets on the interface */
362 static void
363 fairq_purge(struct fairq_if *pif)
364 {
365 	struct fairq_class *cl;
366 	int pri;
367 
368 	for (pri = 0; pri <= pif->pif_maxpri; pri++) {
369 		if ((cl = pif->pif_classes[pri]) != NULL && cl->cl_head)
370 			fairq_purgeq(cl);
371 	}
372 	if (ifq_is_enabled(pif->pif_ifq))
373 		ALTQ_SQ_CNTR_RESET(&pif->pif_ifq->altq_subq[FAIRQ_SUBQ_INDEX]);
374 }
375 
376 static struct fairq_class *
377 fairq_class_create(struct fairq_if *pif, int pri, int qlimit,
378 		   u_int bandwidth, struct fairq_opts *opts, int qid)
379 {
380 	struct fairq_class *cl;
381 	int flags = opts->flags;
382 	u_int nbuckets = opts->nbuckets;
383 	int i;
384 
385 #ifndef ALTQ_RED
386 	if (flags & FARF_RED) {
387 #ifdef ALTQ_DEBUG
388 		kprintf("fairq_class_create: RED not configured for FAIRQ!\n");
389 #endif
390 		return (NULL);
391 	}
392 #endif
393 	if (nbuckets == 0)
394 		nbuckets = 256;
395 	if (nbuckets > FAIRQ_MAX_BUCKETS)
396 		nbuckets = FAIRQ_MAX_BUCKETS;
397 	/* enforce power-of-2 size */
398 	while ((nbuckets ^ (nbuckets - 1)) != ((nbuckets << 1) - 1))
399 		++nbuckets;
400 
401 	if ((cl = pif->pif_classes[pri]) != NULL) {
402 		/* modify the class instead of creating a new one */
403 		crit_enter();
404 		if (cl->cl_head)
405 			fairq_purgeq(cl);
406 		crit_exit();
407 #ifdef ALTQ_RIO
408 		if (cl->cl_qtype == Q_RIO)
409 			rio_destroy((rio_t *)cl->cl_red);
410 #endif
411 #ifdef ALTQ_RED
412 		if (cl->cl_qtype == Q_RED)
413 			red_destroy(cl->cl_red);
414 #endif
415 	} else {
416 		cl = kmalloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO);
417 		cl->cl_nbuckets = nbuckets;
418 		cl->cl_nbucket_mask = nbuckets - 1;
419 
420 		cl->cl_buckets = kmalloc(sizeof(*cl->cl_buckets) *
421 					 cl->cl_nbuckets,
422 					 M_ALTQ, M_WAITOK | M_ZERO);
423 		cl->cl_head = NULL;
424 	}
425 
426 	pif->pif_classes[pri] = cl;
427 	if (flags & FARF_DEFAULTCLASS)
428 		pif->pif_default = cl;
429 	if (qlimit == 0)
430 		qlimit = 50;  /* use default */
431 	cl->cl_qlimit = qlimit;
432 	for (i = 0; i < cl->cl_nbuckets; ++i) {
433 		qlimit(&cl->cl_buckets[i].queue) = qlimit;
434 	}
435 	cl->cl_bandwidth = bandwidth / 8;	/* cvt to bytes per second */
436 	cl->cl_qtype = Q_DROPTAIL;
437 	cl->cl_flags = flags & FARF_USERFLAGS;
438 	cl->cl_pri = pri;
439 	if (pri > pif->pif_maxpri)
440 		pif->pif_maxpri = pri;
441 	cl->cl_pif = pif;
442 	cl->cl_handle = qid;
443 	cl->cl_hogs_m1 = opts->hogs_m1 / 8;
444 	cl->cl_lssc_m1 = opts->lssc_m1 / 8;	/* NOT YET USED */
445 	cl->cl_bw_current = 0;
446 
447 #ifdef ALTQ_RED
448 	if (flags & (FARF_RED|FARF_RIO)) {
449 		int red_flags, red_pkttime;
450 
451 		red_flags = 0;
452 		if (flags & FARF_ECN)
453 			red_flags |= REDF_ECN;
454 #ifdef ALTQ_RIO
455 		if (flags & FARF_CLEARDSCP)
456 			red_flags |= RIOF_CLEARDSCP;
457 #endif
458 		if (pif->pif_bandwidth < 8)
459 			red_pkttime = 1000 * 1000 * 1000; /* 1 sec */
460 		else
461 			red_pkttime =
462 				(int64_t)pif->pif_ifq->altq_ifp->if_mtu *
463 				(1000 * 1000 * 1000) /
464 				(pif->pif_bandwidth / 8 + 1);
465 #ifdef ALTQ_RIO
466 		if (flags & FARF_RIO) {
467 			cl->cl_red = (red_t *)rio_alloc(0, NULL,
468 						red_flags, red_pkttime);
469 			if (cl->cl_red != NULL)
470 				cl->cl_qtype = Q_RIO;
471 		} else
472 #endif
473 		if (flags & FARF_RED) {
474 			cl->cl_red = red_alloc(0, 0,
475 			    cl->cl_qlimit * 10 / 100,
476 			    cl->cl_qlimit * 30 / 100,
477 			    red_flags, red_pkttime);
478 			if (cl->cl_red != NULL)
479 				cl->cl_qtype = Q_RED;
480 		}
481 	}
482 #endif /* ALTQ_RED */
483 
484 	return (cl);
485 }
486 
487 static int
488 fairq_class_destroy(struct fairq_class *cl)
489 {
490 	struct fairq_if *pif;
491 	int pri;
492 
493 	crit_enter();
494 
495 	if (cl->cl_head)
496 		fairq_purgeq(cl);
497 
498 	pif = cl->cl_pif;
499 	pif->pif_classes[cl->cl_pri] = NULL;
500 	if (pif->pif_poll_cache == cl)
501 		pif->pif_poll_cache = NULL;
502 	if (pif->pif_maxpri == cl->cl_pri) {
503 		for (pri = cl->cl_pri; pri >= 0; pri--)
504 			if (pif->pif_classes[pri] != NULL) {
505 				pif->pif_maxpri = pri;
506 				break;
507 			}
508 		if (pri < 0)
509 			pif->pif_maxpri = -1;
510 	}
511 	crit_exit();
512 
513 	if (cl->cl_red != NULL) {
514 #ifdef ALTQ_RIO
515 		if (cl->cl_qtype == Q_RIO)
516 			rio_destroy((rio_t *)cl->cl_red);
517 #endif
518 #ifdef ALTQ_RED
519 		if (cl->cl_qtype == Q_RED)
520 			red_destroy(cl->cl_red);
521 #endif
522 	}
523 	kfree(cl->cl_buckets, M_ALTQ);
524 	cl->cl_head = NULL;	/* sanity */
525 	cl->cl_buckets = NULL;	/* sanity */
526 	kfree(cl, M_ALTQ);
527 
528 	return (0);
529 }
530 
531 /*
532  * fairq_enqueue is an enqueue function to be registered to
533  * (*ifsq_enqueue) in struct ifaltq_subque.
534  */
535 static int
536 fairq_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
537     struct altq_pktattr *pktattr)
538 {
539 	struct ifaltq *ifq = ifsq->ifsq_altq;
540 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
541 	struct fairq_class *cl;
542 	int error;
543 	int len;
544 	int hash;
545 
546 	if (ifsq_get_index(ifsq) != FAIRQ_SUBQ_INDEX) {
547 		/*
548 		 * Race happened, the unrelated subqueue was
549 		 * picked during the packet scheduler transition.
550 		 */
551 		ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
552 		m_freem(m);
553 		return ENOBUFS;
554 	}
555 
556 	crit_enter();
557 
558 	/* grab class set by classifier */
559 	M_ASSERTPKTHDR(m);
560 	if (m->m_pkthdr.fw_flags & PF_MBUF_STRUCTURE) {
561 		cl = clh_to_clp(pif, m->m_pkthdr.pf.qid);
562 		if (m->m_pkthdr.pf.flags & PF_TAG_STATE_HASHED)
563 			hash = (int)m->m_pkthdr.pf.state_hash;
564 		else
565 			hash = 0;
566 	} else {
567 		cl = NULL;
568 		hash = 0;
569 	}
570 	if (cl == NULL) {
571 		cl = pif->pif_default;
572 		if (cl == NULL) {
573 			m_freem(m);
574 			error = ENOBUFS;
575 			goto done;
576 		}
577 	}
578 	cl->cl_flags |= FARF_HAS_PACKETS;
579 	cl->cl_pktattr = NULL;
580 	len = m_pktlen(m);
581 	if (fairq_addq(cl, m, hash) != 0) {
582 		/* drop occurred.  mbuf was freed in fairq_addq. */
583 		PKTCNTR_ADD(&cl->cl_dropcnt, len);
584 		error = ENOBUFS;
585 		goto done;
586 	}
587 	ALTQ_SQ_PKTCNT_INC(ifsq);
588 	error = 0;
589 done:
590 	crit_exit();
591 	return (error);
592 }
593 
594 /*
595  * fairq_dequeue is a dequeue function to be registered to
596  * (*ifsq_dequeue) in struct ifaltq_subque.
597  *
598  * note: ALTDQ_POLL returns the next packet without removing the packet
599  *	from the queue.  ALTDQ_REMOVE is a normal dequeue operation.
600  */
601 static struct mbuf *
602 fairq_dequeue(struct ifaltq_subque *ifsq, int op)
603 {
604 	struct ifaltq *ifq = ifsq->ifsq_altq;
605 	struct fairq_if *pif = (struct fairq_if *)ifq->altq_disc;
606 	struct fairq_class *cl;
607 	struct fairq_class *best_cl;
608 	struct mbuf *best_m;
609 	struct mbuf *m;
610 	uint64_t cur_time = read_machclk();
611 	uint64_t best_scale;
612 	uint64_t scale;
613 	int pri;
614 	int hit_limit;
615 
616 	if (ifsq_get_index(ifsq) != FAIRQ_SUBQ_INDEX) {
617 		/*
618 		 * Race happened, the unrelated subqueue was
619 		 * picked during the packet scheduler transition.
620 		 */
621 		ifsq_classic_request(ifsq, ALTRQ_PURGE, NULL);
622 		return NULL;
623 	}
624 
625 	if (ifsq_is_empty(ifsq)) {
626 		/* no packet in the queue */
627 		return (NULL);
628 	}
629 
630 	crit_enter();
631 	if (pif->pif_poll_cache && op == ALTDQ_REMOVE) {
632 		best_cl = pif->pif_poll_cache;
633 		m = fairq_getq(best_cl, cur_time);
634 		pif->pif_poll_cache = NULL;
635 		if (m) {
636 			ALTQ_SQ_PKTCNT_DEC(ifsq);
637 			PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
638 		}
639 	} else {
640 		best_cl = NULL;
641 		best_m = NULL;
642 		best_scale = 0xFFFFFFFFFFFFFFFFLLU;
643 
644 		for (pri = pif->pif_maxpri;  pri >= 0; pri--) {
645 			if ((cl = pif->pif_classes[pri]) == NULL)
646 				continue;
647 			if ((cl->cl_flags & FARF_HAS_PACKETS) == 0)
648 				continue;
649 			m = fairq_pollq(cl, cur_time, &hit_limit);
650 			if (m == NULL) {
651 				cl->cl_flags &= ~FARF_HAS_PACKETS;
652 				continue;
653 			}
654 
655 			/*
656 			 * We can halt the search immediately if the queue
657 			 * did not hit its bandwidth limit.
658 			 */
659 			if (hit_limit == 0) {
660 				best_cl = cl;
661 				best_m = m;
662 				break;
663 			}
664 
665 			/*
666 			 * Otherwise calculate the scale factor and select
667 			 * the queue with the lowest scale factor.  This
668 			 * apportions any unused bandwidth weighted by
669 			 * the relative bandwidth specification.
670 			 *
671 			 * scale = (bw / max) with a multiple of 256.
672 			 *
673 			 * The calculation is refactored to reduce the
674 			 * chance of overflow.
675 			 */
676 			scale = cl->cl_bw_current * 16 /
677 				(cl->cl_bandwidth / 16 + 1);
678 			if (best_scale > scale) {
679 				best_cl = cl;
680 				best_m = m;
681 				best_scale = scale;
682 			}
683 		}
684 
685 		if (op == ALTDQ_POLL) {
686 #ifdef foo
687 			/*
688 			 * Don't use poll cache; the poll/dequeue
689 			 * model is no longer applicable to SMP
690 			 * system.  e.g.
691 			 *    CPU-A            CPU-B
692 			 *      :                :
693 			 *    poll               :
694 			 *      :              poll
695 			 *    dequeue (+)        :
696 			 *
697 			 * The dequeue at (+) will hit the poll
698 			 * cache set by CPU-B.
699 			 */
700 			pif->pif_poll_cache = best_cl;
701 #endif
702 			m = best_m;
703 		} else if (best_cl) {
704 			m = fairq_getq(best_cl, cur_time);
705 			KKASSERT(best_m == m);
706 			ALTQ_SQ_PKTCNT_DEC(ifsq);
707 			PKTCNTR_ADD(&best_cl->cl_xmitcnt, m_pktlen(m));
708 		} else {
709 			m = NULL;
710 		}
711 	}
712 	crit_exit();
713 	return (m);
714 }
715 
716 static int
717 fairq_addq(struct fairq_class *cl, struct mbuf *m, int hash)
718 {
719 	fairq_bucket_t *b;
720 	u_int hindex;
721 	uint64_t bw;
722 
723 	/*
724 	 * If the packet doesn't have any keep state put it on the end of
725 	 * our queue.  XXX this can result in out of order delivery.
726 	 */
727 	if (hash == 0) {
728 		if (cl->cl_head)
729 			b = cl->cl_head->prev;
730 		else
731 			b = &cl->cl_buckets[0];
732 	} else {
733 		hindex = hash & cl->cl_nbucket_mask;
734 		b = &cl->cl_buckets[hindex];
735 	}
736 
737 	/*
738 	 * Add the bucket to the end of the circular list of active buckets.
739 	 *
740 	 * As a special case we add the bucket to the beginning of the list
741 	 * instead of the end if it was not previously on the list and if
742 	 * its traffic is less then the hog level.
743 	 */
744 	if (b->in_use == 0) {
745 		b->in_use = 1;
746 		if (cl->cl_head == NULL) {
747 			cl->cl_head = b;
748 			cl->cl_advanced = 1;
749 			b->next = b;
750 			b->prev = b;
751 		} else {
752 			b->next = cl->cl_head;
753 			b->prev = cl->cl_head->prev;
754 			b->prev->next = b;
755 			b->next->prev = b;
756 
757 			if (b->bw_delta && cl->cl_hogs_m1) {
758 				bw = b->bw_bytes * machclk_freq / b->bw_delta;
759 				if (bw < cl->cl_hogs_m1) {
760 					cl->cl_head = b;
761 					cl->cl_advanced = 1;
762 				}
763 			}
764 		}
765 	}
766 
767 #ifdef ALTQ_RIO
768 	if (cl->cl_qtype == Q_RIO)
769 		return rio_addq((rio_t *)cl->cl_red, &b->queue, m, cl->cl_pktattr);
770 #endif
771 #ifdef ALTQ_RED
772 	if (cl->cl_qtype == Q_RED)
773 		return red_addq(cl->cl_red, &b->queue, m, cl->cl_pktattr);
774 #endif
775 	if (qlen(&b->queue) >= qlimit(&b->queue)) {
776 		m_freem(m);
777 		return (-1);
778 	}
779 
780 	if (cl->cl_flags & FARF_CLEARDSCP)
781 		write_dsfield(m, cl->cl_pktattr, 0);
782 
783 	_addq(&b->queue, m);
784 
785 	return (0);
786 }
787 
788 static struct mbuf *
789 fairq_getq(struct fairq_class *cl, uint64_t cur_time)
790 {
791 	fairq_bucket_t *b;
792 	struct mbuf *m;
793 
794 	b = fairq_selectq(cl);
795 	if (b == NULL)
796 		m = NULL;
797 #ifdef ALTQ_RIO
798 	else if (cl->cl_qtype == Q_RIO)
799 		m = rio_getq((rio_t *)cl->cl_red, &b->queue);
800 #endif
801 #ifdef ALTQ_RED
802 	else if (cl->cl_qtype == Q_RED)
803 		m = red_getq(cl->cl_red, &b->queue);
804 #endif
805 	else
806 		m = _getq(&b->queue);
807 
808 	/*
809 	 * Calculate the BW change
810 	 */
811 	if (m != NULL) {
812 		uint64_t delta;
813 
814 		/*
815 		 * Per-class bandwidth calculation
816 		 */
817 		delta = (cur_time - cl->cl_last_time);
818 		if (delta > machclk_freq * 8)
819 			delta = machclk_freq * 8;
820 		cl->cl_bw_delta += delta;
821 		cl->cl_bw_bytes += m->m_pkthdr.len;
822 		cl->cl_last_time = cur_time;
823 
824 		/*
825 		 * Cap delta at ~machclk_freq to avoid overflows.
826 		 */
827 		if (cl->cl_bw_delta > machclk_freq) {
828 			uint64_t f = cl->cl_bw_delta * 32 / machclk_freq;
829 			cl->cl_bw_delta = cl->cl_bw_delta * 16 / f;
830 			cl->cl_bw_bytes = cl->cl_bw_bytes * 16 / f;
831 		}
832 
833 		/*
834 		 * Per-bucket bandwidth calculation.
835 		 */
836 		delta = (cur_time - b->last_time);
837 		if (delta > machclk_freq * 8)
838 			delta = machclk_freq * 8;
839 		b->bw_delta += delta;
840 		b->bw_bytes += m->m_pkthdr.len;
841 		b->last_time = cur_time;
842 
843 		/*
844 		 * Cap bw_delta at ~machclk_freq to avoid overflows.
845 		 */
846 		if (b->bw_delta > machclk_freq) {
847 			uint64_t f = b->bw_delta * 32 / machclk_freq;
848 			b->bw_delta = b->bw_delta * 16 / f;
849 			b->bw_bytes = b->bw_bytes * 16 / f;
850 		}
851 	}
852 	return(m);
853 }
854 
855 /*
856  * Figure out what the next packet would be if there were no limits.  If
857  * this class hits its bandwidth limit *hit_limit is set to no-zero, otherwise
858  * it is set to 0.  A non-NULL mbuf is returned either way.
859  */
860 static struct mbuf *
861 fairq_pollq(struct fairq_class *cl, uint64_t cur_time, int *hit_limit)
862 {
863 	fairq_bucket_t *b;
864 	struct mbuf *m;
865 	uint64_t delta;
866 	uint64_t bw;
867 
868 	*hit_limit = 0;
869 	b = fairq_selectq(cl);
870 	if (b == NULL)
871 		return(NULL);
872 	m = qhead(&b->queue);
873 	if (m == NULL)
874 		return(NULL);
875 	cl->cl_advanced = 1;	/* so next select/get doesn't re-advance */
876 
877 	/*
878 	 * Did this packet exceed the class bandwidth?
879 	 *
880 	 * Calculate the bandwidth component of the packet in bytes/sec.
881 	 * Avoid overflows when machclk_freq is very high.
882 	 */
883 	delta = cur_time - cl->cl_last_time;
884 	if (delta > machclk_freq * 8)
885 		delta = machclk_freq * 8;
886 	cl->cl_bw_delta += delta;
887 	cl->cl_last_time = cur_time;
888 
889 	if (cl->cl_bw_delta) {
890 		bw = (cl->cl_bw_bytes + m->m_pkthdr.len) *
891 		     machclk_freq / cl->cl_bw_delta;
892 		if (bw > cl->cl_bandwidth)
893 			*hit_limit = 1;
894 		cl->cl_bw_current = bw;
895 #if 0
896 		kprintf("BW %6lld relative to %6llu %d queue %p\n",
897 			bw, cl->cl_bandwidth, *hit_limit, b);
898 #endif
899 	}
900 	return(m);
901 }
902 
903 /*
904  * Locate the next queue we want to pull a packet out of.  This code
905  * is also responsible for removing empty buckets from the circular list.
906  */
907 static
908 fairq_bucket_t *
909 fairq_selectq(struct fairq_class *cl)
910 {
911 	fairq_bucket_t *b;
912 	uint64_t bw;
913 
914 	while ((b = cl->cl_head) != NULL) {
915 		/*
916 		 * Remove empty queues from consideration
917 		 */
918 		if (qempty(&b->queue)) {
919 			b->in_use = 0;
920 			cl->cl_head = b->next;
921 			cl->cl_advanced = 1;
922 			if (cl->cl_head == b) {
923 				cl->cl_head = NULL;
924 			} else {
925 				b->next->prev = b->prev;
926 				b->prev->next = b->next;
927 			}
928 			continue;
929 		}
930 
931 		/*
932 		 * Advance the round robin.  Queues with bandwidths less
933 		 * then the hog bandwidth are allowed to burst.
934 		 *
935 		 * Don't advance twice if the previous head emptied.
936 		 */
937 		if (cl->cl_advanced) {
938 			cl->cl_advanced = 0;
939 			break;
940 		}
941 		if (cl->cl_hogs_m1 == 0) {
942 			cl->cl_head = b->next;
943 		} else if (b->bw_delta) {
944 			bw = b->bw_bytes * machclk_freq / b->bw_delta;
945 			if (bw >= cl->cl_hogs_m1)
946 				cl->cl_head = b->next;
947 		}
948 
949 		/*
950 		 * Return the (possibly new) head.
951 		 */
952 		b = cl->cl_head;
953 		break;
954 	}
955 	return(b);
956 }
957 
958 static void
959 fairq_purgeq(struct fairq_class *cl)
960 {
961 	fairq_bucket_t *b;
962 	struct mbuf *m;
963 
964 	while ((b = fairq_selectq(cl)) != NULL) {
965 		while ((m = _getq(&b->queue)) != NULL) {
966 			PKTCNTR_ADD(&cl->cl_dropcnt, m_pktlen(m));
967 			m_freem(m);
968 		}
969 		KKASSERT(qlen(&b->queue) == 0);
970 	}
971 }
972 
973 static void
974 get_class_stats(struct fairq_classstats *sp, struct fairq_class *cl)
975 {
976 	fairq_bucket_t *b;
977 
978 	sp->class_handle = cl->cl_handle;
979 	sp->qlimit = cl->cl_qlimit;
980 	sp->xmit_cnt = cl->cl_xmitcnt;
981 	sp->drop_cnt = cl->cl_dropcnt;
982 	sp->qtype = cl->cl_qtype;
983 	sp->qlength = 0;
984 
985 	if (cl->cl_head) {
986 		b = cl->cl_head;
987 		do {
988 			sp->qlength += qlen(&b->queue);
989 			b = b->next;
990 		} while (b != cl->cl_head);
991 	}
992 
993 #ifdef ALTQ_RED
994 	if (cl->cl_qtype == Q_RED)
995 		red_getstats(cl->cl_red, &sp->red[0]);
996 #endif
997 #ifdef ALTQ_RIO
998 	if (cl->cl_qtype == Q_RIO)
999 		rio_getstats((rio_t *)cl->cl_red, &sp->red[0]);
1000 #endif
1001 }
1002 
1003 /* convert a class handle to the corresponding class pointer */
1004 static struct fairq_class *
1005 clh_to_clp(struct fairq_if *pif, uint32_t chandle)
1006 {
1007 	struct fairq_class *cl;
1008 	int idx;
1009 
1010 	if (chandle == 0)
1011 		return (NULL);
1012 
1013 	for (idx = pif->pif_maxpri; idx >= 0; idx--)
1014 		if ((cl = pif->pif_classes[idx]) != NULL &&
1015 		    cl->cl_handle == chandle)
1016 			return (cl);
1017 
1018 	return (NULL);
1019 }
1020 
1021 #endif /* ALTQ_FAIRQ */
1022