1 /* @(#)rm_class.c 1.48 97/12/05 SMI */ 2 /* $KAME: altq_rmclass.c,v 1.18 2003/11/06 06:32:53 kjc Exp $ */ 3 4 /* 5 * Copyright (c) 1991-1997 Regents of the University of California. 6 * All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 3. All advertising materials mentioning features or use of this software 17 * must display the following acknowledgement: 18 * This product includes software developed by the Network Research 19 * Group at Lawrence Berkeley Laboratory. 20 * 4. Neither the name of the University nor of the Laboratory may be used 21 * to endorse or promote products derived from this software without 22 * specific prior written permission. 23 * 24 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 25 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 26 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 27 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 28 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 29 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 30 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 32 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 33 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 34 * SUCH DAMAGE. 35 * 36 * LBL code modified by speer@eng.sun.com, May 1977. 37 * For questions and/or comments, please send mail to cbq@ee.lbl.gov 38 */ 39 40 #include "opt_altq.h" 41 #include "opt_inet.h" 42 #include "opt_inet6.h" 43 44 #ifdef ALTQ_CBQ /* cbq is enabled by ALTQ_CBQ option in opt_altq.h */ 45 46 #include <sys/param.h> 47 #include <sys/malloc.h> 48 #include <sys/mbuf.h> 49 #include <sys/socket.h> 50 #include <sys/systm.h> 51 #include <sys/callout.h> 52 #include <sys/errno.h> 53 #include <sys/time.h> 54 #include <sys/thread.h> 55 #include <sys/thread2.h> 56 57 #include <net/if.h> 58 #include <net/ifq_var.h> 59 #include <net/netmsg2.h> 60 #include <net/netisr2.h> 61 62 #include <net/altq/altq.h> 63 #include <net/altq/altq_rmclass.h> 64 #include <net/altq/altq_rmclass_debug.h> 65 #include <net/altq/altq_red.h> 66 #include <net/altq/altq_rio.h> 67 68 #ifdef CBQ_TRACE 69 static struct cbqtrace cbqtrace_buffer[NCBQTRACE+1]; 70 static struct cbqtrace *cbqtrace_ptr = NULL; 71 static int cbqtrace_count; 72 #endif 73 74 /* 75 * Local Macros 76 */ 77 78 #define reset_cutoff(ifd) { ifd->cutoff_ = RM_MAXDEPTH; } 79 80 /* 81 * Local routines. 82 */ 83 84 static int rmc_satisfied(struct rm_class *, struct timeval *); 85 static void rmc_wrr_set_weights(struct rm_ifdat *); 86 static void rmc_depth_compute(struct rm_class *); 87 static void rmc_depth_recompute(rm_class_t *); 88 89 static struct mbuf *_rmc_wrr_dequeue_next(struct rm_ifdat *, int); 90 static struct mbuf *_rmc_prr_dequeue_next(struct rm_ifdat *, int); 91 92 static int _rmc_addq(rm_class_t *, struct mbuf *); 93 static void _rmc_dropq(rm_class_t *); 94 static struct mbuf *_rmc_getq(rm_class_t *); 95 static struct mbuf *_rmc_pollq(rm_class_t *); 96 97 static int rmc_under_limit(struct rm_class *, struct timeval *); 98 static void rmc_tl_satisfied(struct rm_ifdat *, struct timeval *); 99 static void rmc_drop_action(struct rm_class *); 100 static void rmc_restart(void *); 101 static void rmc_restart_dispatch(netmsg_t); 102 static void rmc_root_overlimit(struct rm_class *, struct rm_class *); 103 104 #define BORROW_OFFTIME 105 /* 106 * BORROW_OFFTIME (experimental): 107 * borrow the offtime of the class borrowing from. 108 * the reason is that when its own offtime is set, the class is unable 109 * to borrow much, especially when cutoff is taking effect. 110 * but when the borrowed class is overloaded (advidle is close to minidle), 111 * use the borrowing class's offtime to avoid overload. 112 */ 113 #define ADJUST_CUTOFF 114 /* 115 * ADJUST_CUTOFF (experimental): 116 * if no underlimit class is found due to cutoff, increase cutoff and 117 * retry the scheduling loop. 118 * also, don't invoke delay_actions while cutoff is taking effect, 119 * since a sleeping class won't have a chance to be scheduled in the 120 * next loop. 121 * 122 * now heuristics for setting the top-level variable (cutoff_) becomes: 123 * 1. if a packet arrives for a not-overlimit class, set cutoff 124 * to the depth of the class. 125 * 2. if cutoff is i, and a packet arrives for an overlimit class 126 * with an underlimit ancestor at a lower level than i (say j), 127 * then set cutoff to j. 128 * 3. at scheduling a packet, if there is no underlimit class 129 * due to the current cutoff level, increase cutoff by 1 and 130 * then try to schedule again. 131 */ 132 133 /* 134 * rm_class_t * 135 * rmc_newclass(...) - Create a new resource management class at priority 136 * 'pri' on the interface given by 'ifd'. 137 * 138 * nsecPerByte is the data rate of the interface in nanoseconds/byte. 139 * E.g., 800 for a 10Mb/s ethernet. If the class gets less 140 * than 100% of the bandwidth, this number should be the 141 * 'effective' rate for the class. Let f be the 142 * bandwidth fraction allocated to this class, and let 143 * nsPerByte be the data rate of the output link in 144 * nanoseconds/byte. Then nsecPerByte is set to 145 * nsPerByte / f. E.g., 1600 (= 800 / .5) 146 * for a class that gets 50% of an ethernet's bandwidth. 147 * 148 * action the routine to call when the class is over limit. 149 * 150 * maxq max allowable queue size for class (in packets). 151 * 152 * parent parent class pointer. 153 * 154 * borrow class to borrow from (should be either 'parent' or null). 155 * 156 * maxidle max value allowed for class 'idle' time estimate (this 157 * parameter determines how large an initial burst of packets 158 * can be before overlimit action is invoked. 159 * 160 * offtime how long 'delay' action will delay when class goes over 161 * limit (this parameter determines the steady-state burst 162 * size when a class is running over its limit). 163 * 164 * Maxidle and offtime have to be computed from the following: If the 165 * average packet size is s, the bandwidth fraction allocated to this 166 * class is f, we want to allow b packet bursts, and the gain of the 167 * averaging filter is g (= 1 - 2^(-RM_FILTER_GAIN)), then: 168 * 169 * ptime = s * nsPerByte * (1 - f) / f 170 * maxidle = ptime * (1 - g^b) / g^b 171 * minidle = -ptime * (1 / (f - 1)) 172 * offtime = ptime * (1 + 1/(1 - g) * (1 - g^(b - 1)) / g^(b - 1) 173 * 174 * Operationally, it's convenient to specify maxidle & offtime in units 175 * independent of the link bandwidth so the maxidle & offtime passed to 176 * this routine are the above values multiplied by 8*f/(1000*nsPerByte). 177 * (The constant factor is a scale factor needed to make the parameters 178 * integers. This scaling also means that the 'unscaled' values of 179 * maxidle*nsecPerByte/8 and offtime*nsecPerByte/8 will be in microseconds, 180 * not nanoseconds.) Also note that the 'idle' filter computation keeps 181 * an estimate scaled upward by 2^RM_FILTER_GAIN so the passed value of 182 * maxidle also must be scaled upward by this value. Thus, the passed 183 * values for maxidle and offtime can be computed as follows: 184 * 185 * maxidle = maxidle * 2^RM_FILTER_GAIN * 8 / (1000 * nsecPerByte) 186 * offtime = offtime * 8 / (1000 * nsecPerByte) 187 * 188 * When USE_HRTIME is employed, then maxidle and offtime become: 189 * maxidle = maxilde * (8.0 / nsecPerByte); 190 * offtime = offtime * (8.0 / nsecPerByte); 191 */ 192 struct rm_class * 193 rmc_newclass(int pri, struct rm_ifdat *ifd, u_int nsecPerByte, 194 void (*action)(rm_class_t *, rm_class_t *), int maxq, 195 struct rm_class *parent, struct rm_class *borrow, u_int maxidle, 196 int minidle, u_int offtime, int pktsize, int flags) 197 { 198 struct rm_class *cl; 199 struct rm_class *peer; 200 201 if (pri >= RM_MAXPRIO) 202 return (NULL); 203 #ifndef ALTQ_RED 204 if (flags & RMCF_RED) { 205 #ifdef ALTQ_DEBUG 206 kprintf("rmc_newclass: RED not configured for CBQ!\n"); 207 #endif 208 return (NULL); 209 } 210 #endif 211 #ifndef ALTQ_RIO 212 if (flags & RMCF_RIO) { 213 #ifdef ALTQ_DEBUG 214 kprintf("rmc_newclass: RIO not configured for CBQ!\n"); 215 #endif 216 return (NULL); 217 } 218 #endif 219 220 cl = kmalloc(sizeof(*cl), M_ALTQ, M_WAITOK | M_ZERO); 221 callout_init(&cl->callout_); 222 netmsg_init(&cl->callout_nmsg_, NULL, &netisr_adone_rport, 223 MSGF_PRIORITY, rmc_restart_dispatch); 224 cl->callout_nmsg_.lmsg.u.ms_resultp = cl; 225 226 cl->q_ = kmalloc(sizeof(*cl->q_), M_ALTQ, M_WAITOK | M_ZERO); 227 228 /* 229 * Class initialization. 230 */ 231 cl->children_ = NULL; 232 cl->parent_ = parent; 233 cl->borrow_ = borrow; 234 cl->leaf_ = 1; 235 cl->ifdat_ = ifd; 236 cl->pri_ = pri; 237 cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ 238 cl->depth_ = 0; 239 cl->qthresh_ = 0; 240 cl->ns_per_byte_ = nsecPerByte; 241 242 qlimit(cl->q_) = maxq; 243 qtype(cl->q_) = Q_DROPHEAD; 244 qlen(cl->q_) = 0; 245 cl->flags_ = flags; 246 247 #if 1 /* minidle is also scaled in ALTQ */ 248 cl->minidle_ = (minidle * (int)nsecPerByte) / 8; 249 if (cl->minidle_ > 0) 250 cl->minidle_ = 0; 251 #else 252 cl->minidle_ = minidle; 253 #endif 254 cl->maxidle_ = (maxidle * nsecPerByte) / 8; 255 if (cl->maxidle_ == 0) 256 cl->maxidle_ = 1; 257 #if 1 /* offtime is also scaled in ALTQ */ 258 cl->avgidle_ = cl->maxidle_; 259 cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; 260 if (cl->offtime_ == 0) 261 cl->offtime_ = 1; 262 #else 263 cl->avgidle_ = 0; 264 cl->offtime_ = (offtime * nsecPerByte) / 8; 265 #endif 266 cl->overlimit = action; 267 268 #ifdef ALTQ_RED 269 if (flags & (RMCF_RED|RMCF_RIO)) { 270 int red_flags, red_pkttime; 271 272 red_flags = 0; 273 if (flags & RMCF_ECN) 274 red_flags |= REDF_ECN; 275 #ifdef ALTQ_RIO 276 if (flags & RMCF_CLEARDSCP) 277 red_flags |= RIOF_CLEARDSCP; 278 #endif 279 red_pkttime = nsecPerByte * pktsize / 1000; 280 281 if (flags & RMCF_RED) { 282 cl->red_ = red_alloc(0, 0, 283 qlimit(cl->q_) * 10/100, 284 qlimit(cl->q_) * 30/100, 285 red_flags, red_pkttime); 286 if (cl->red_ != NULL) 287 qtype(cl->q_) = Q_RED; 288 } 289 #ifdef ALTQ_RIO 290 else { 291 cl->red_ = (red_t *)rio_alloc(0, NULL, 292 red_flags, red_pkttime); 293 if (cl->red_ != NULL) 294 qtype(cl->q_) = Q_RIO; 295 } 296 #endif 297 } 298 #endif /* ALTQ_RED */ 299 300 /* 301 * put the class into the class tree 302 */ 303 crit_enter(); 304 if ((peer = ifd->active_[pri]) != NULL) { 305 /* find the last class at this pri */ 306 cl->peer_ = peer; 307 while (peer->peer_ != ifd->active_[pri]) 308 peer = peer->peer_; 309 peer->peer_ = cl; 310 } else { 311 ifd->active_[pri] = cl; 312 cl->peer_ = cl; 313 } 314 315 if (cl->parent_) { 316 cl->next_ = parent->children_; 317 parent->children_ = cl; 318 parent->leaf_ = 0; 319 } 320 321 /* 322 * Compute the depth of this class and its ancestors in the class 323 * hierarchy. 324 */ 325 rmc_depth_compute(cl); 326 327 /* 328 * If CBQ's WRR is enabled, then initialize the class WRR state. 329 */ 330 if (ifd->wrr_) { 331 ifd->num_[pri]++; 332 ifd->alloc_[pri] += cl->allotment_; 333 rmc_wrr_set_weights(ifd); 334 } 335 crit_exit(); 336 return (cl); 337 } 338 339 int 340 rmc_modclass(struct rm_class *cl, u_int nsecPerByte, int maxq, u_int maxidle, 341 int minidle, u_int offtime, int pktsize) 342 { 343 struct rm_ifdat *ifd; 344 u_int old_allotment; 345 346 ifd = cl->ifdat_; 347 old_allotment = cl->allotment_; 348 349 crit_enter(); 350 cl->allotment_ = RM_NS_PER_SEC / nsecPerByte; /* Bytes per sec */ 351 cl->qthresh_ = 0; 352 cl->ns_per_byte_ = nsecPerByte; 353 354 qlimit(cl->q_) = maxq; 355 356 #if 1 /* minidle is also scaled in ALTQ */ 357 cl->minidle_ = (minidle * nsecPerByte) / 8; 358 if (cl->minidle_ > 0) 359 cl->minidle_ = 0; 360 #else 361 cl->minidle_ = minidle; 362 #endif 363 cl->maxidle_ = (maxidle * nsecPerByte) / 8; 364 if (cl->maxidle_ == 0) 365 cl->maxidle_ = 1; 366 #if 1 /* offtime is also scaled in ALTQ */ 367 cl->avgidle_ = cl->maxidle_; 368 cl->offtime_ = ((offtime * nsecPerByte) / 8) >> RM_FILTER_GAIN; 369 if (cl->offtime_ == 0) 370 cl->offtime_ = 1; 371 #else 372 cl->avgidle_ = 0; 373 cl->offtime_ = (offtime * nsecPerByte) / 8; 374 #endif 375 376 /* 377 * If CBQ's WRR is enabled, then initialize the class WRR state. 378 */ 379 if (ifd->wrr_) { 380 ifd->alloc_[cl->pri_] += cl->allotment_ - old_allotment; 381 rmc_wrr_set_weights(ifd); 382 } 383 crit_exit(); 384 return (0); 385 } 386 387 /* 388 * static void 389 * rmc_wrr_set_weights(struct rm_ifdat *ifdat) - This function computes 390 * the appropriate run robin weights for the CBQ weighted round robin 391 * algorithm. 392 * 393 * Returns: NONE 394 */ 395 396 static void 397 rmc_wrr_set_weights(struct rm_ifdat *ifd) 398 { 399 int i; 400 struct rm_class *cl, *clh; 401 402 for (i = 0; i < RM_MAXPRIO; i++) { 403 /* 404 * This is inverted from that of the simulator to 405 * maintain precision. 406 */ 407 if (ifd->num_[i] == 0) 408 ifd->M_[i] = 0; 409 else 410 ifd->M_[i] = ifd->alloc_[i] / 411 (ifd->num_[i] * ifd->maxpkt_); 412 /* 413 * Compute the weighted allotment for each class. 414 * This takes the expensive div instruction out 415 * of the main loop for the wrr scheduling path. 416 * These only get recomputed when a class comes or 417 * goes. 418 */ 419 if (ifd->active_[i] != NULL) { 420 clh = cl = ifd->active_[i]; 421 do { 422 /* safe-guard for slow link or alloc_ == 0 */ 423 if (ifd->M_[i] == 0) 424 cl->w_allotment_ = 0; 425 else 426 cl->w_allotment_ = cl->allotment_ / 427 ifd->M_[i]; 428 cl = cl->peer_; 429 } while ((cl != NULL) && (cl != clh)); 430 } 431 } 432 } 433 434 int 435 rmc_get_weight(struct rm_ifdat *ifd, int pri) 436 { 437 if ((pri >= 0) && (pri < RM_MAXPRIO)) 438 return (ifd->M_[pri]); 439 else 440 return (0); 441 } 442 443 /* 444 * static void 445 * rmc_depth_compute(struct rm_class *cl) - This function computes the 446 * appropriate depth of class 'cl' and its ancestors. 447 * 448 * Returns: NONE 449 */ 450 451 static void 452 rmc_depth_compute(struct rm_class *cl) 453 { 454 rm_class_t *t = cl, *p; 455 456 /* 457 * Recompute the depth for the branch of the tree. 458 */ 459 while (t != NULL) { 460 p = t->parent_; 461 if (p && (t->depth_ >= p->depth_)) { 462 p->depth_ = t->depth_ + 1; 463 t = p; 464 } else 465 t = NULL; 466 } 467 } 468 469 /* 470 * static void 471 * rmc_depth_recompute(struct rm_class *cl) - This function re-computes 472 * the depth of the tree after a class has been deleted. 473 * 474 * Returns: NONE 475 */ 476 477 static void 478 rmc_depth_recompute(rm_class_t *cl) 479 { 480 #if 1 /* ALTQ */ 481 rm_class_t *p, *t; 482 483 p = cl; 484 while (p != NULL) { 485 if ((t = p->children_) == NULL) { 486 p->depth_ = 0; 487 } else { 488 int cdepth = 0; 489 490 while (t != NULL) { 491 if (t->depth_ > cdepth) 492 cdepth = t->depth_; 493 t = t->next_; 494 } 495 496 if (p->depth_ == cdepth + 1) 497 /* no change to this parent */ 498 return; 499 500 p->depth_ = cdepth + 1; 501 } 502 503 p = p->parent_; 504 } 505 #else 506 rm_class_t *t; 507 508 if (cl->depth_ >= 1) { 509 if (cl->children_ == NULL) { 510 cl->depth_ = 0; 511 } else if ((t = cl->children_) != NULL) { 512 while (t != NULL) { 513 if (t->children_ != NULL) 514 rmc_depth_recompute(t); 515 t = t->next_; 516 } 517 } else 518 rmc_depth_compute(cl); 519 } 520 #endif 521 } 522 523 /* 524 * void 525 * rmc_delete_class(struct rm_ifdat *ifdat, struct rm_class *cl) - This 526 * function deletes a class from the link-sharing structure and frees 527 * all resources associated with the class. 528 * 529 * Returns: NONE 530 */ 531 532 void 533 rmc_delete_class(struct rm_ifdat *ifd, struct rm_class *cl) 534 { 535 struct rm_class *p, *head, *previous; 536 struct netmsg_base smsg; 537 struct ifaltq_subque *ifsq = 538 &ifd->ifq_->altq_subq[ALTQ_SUBQ_INDEX_DEFAULT]; 539 540 KKASSERT(cl->children_ == NULL); 541 542 ALTQ_SQ_ASSERT_LOCKED(ifsq); 543 ALTQ_SQ_UNLOCK(ifsq); 544 callout_stop_sync(&cl->callout_); 545 /* Make sure that cl->callout_nmsg_ stops. */ 546 netmsg_init(&smsg, NULL, &curthread->td_msgport, 0, 547 netmsg_sync_handler); 548 lwkt_domsg(netisr_cpuport(0), &smsg.lmsg, 0); 549 ALTQ_SQ_LOCK(ifsq); 550 551 crit_enter(); 552 553 if (ifd->pollcache_ == cl) 554 ifd->pollcache_ = NULL; 555 556 /* 557 * Free packets in the packet queue. 558 * XXX - this may not be a desired behavior. Packets should be 559 * re-queued. 560 */ 561 rmc_dropall(cl); 562 563 /* 564 * If the class has a parent, then remove the class from the 565 * class from the parent's children chain. 566 */ 567 if (cl->parent_ != NULL) { 568 head = cl->parent_->children_; 569 p = previous = head; 570 if (head->next_ == NULL) { 571 KKASSERT(head == cl); 572 cl->parent_->children_ = NULL; 573 cl->parent_->leaf_ = 1; 574 } else while (p != NULL) { 575 if (p == cl) { 576 if (cl == head) 577 cl->parent_->children_ = cl->next_; 578 else 579 previous->next_ = cl->next_; 580 cl->next_ = NULL; 581 p = NULL; 582 } else { 583 previous = p; 584 p = p->next_; 585 } 586 } 587 } 588 589 /* 590 * Delete class from class priority peer list. 591 */ 592 if ((p = ifd->active_[cl->pri_]) != NULL) { 593 /* 594 * If there is more than one member of this priority 595 * level, then look for class(cl) in the priority level. 596 */ 597 if (p != p->peer_) { 598 while (p->peer_ != cl) 599 p = p->peer_; 600 p->peer_ = cl->peer_; 601 602 if (ifd->active_[cl->pri_] == cl) 603 ifd->active_[cl->pri_] = cl->peer_; 604 } else { 605 KKASSERT(p == cl); 606 ifd->active_[cl->pri_] = NULL; 607 } 608 } 609 610 /* 611 * Recompute the WRR weights. 612 */ 613 if (ifd->wrr_) { 614 ifd->alloc_[cl->pri_] -= cl->allotment_; 615 ifd->num_[cl->pri_]--; 616 rmc_wrr_set_weights(ifd); 617 } 618 619 /* 620 * Re-compute the depth of the tree. 621 */ 622 #if 1 /* ALTQ */ 623 rmc_depth_recompute(cl->parent_); 624 #else 625 rmc_depth_recompute(ifd->root_); 626 #endif 627 628 crit_exit(); 629 630 /* 631 * Free the class structure. 632 */ 633 if (cl->red_ != NULL) { 634 #ifdef ALTQ_RIO 635 if (q_is_rio(cl->q_)) 636 rio_destroy((rio_t *)cl->red_); 637 #endif 638 #ifdef ALTQ_RED 639 if (q_is_red(cl->q_)) 640 red_destroy(cl->red_); 641 #endif 642 } 643 kfree(cl->q_, M_ALTQ); 644 kfree(cl, M_ALTQ); 645 } 646 647 /* 648 * void 649 * rmc_init(...) - Initialize the resource management data structures 650 * associated with the output portion of interface 'ifp'. 'ifd' is 651 * where the structures will be built (for backwards compatibility, the 652 * structures aren't kept in the ifnet struct). 'nsecPerByte' 653 * gives the link speed (inverse of bandwidth) in nanoseconds/byte. 654 * 'restart' is the driver-specific routine that the generic 'delay 655 * until under limit' action will call to restart output. `maxq' 656 * is the queue size of the 'link' & 'default' classes. 'maxqueued' 657 * is the maximum number of packets that the resource management 658 * code will allow to be queued 'downstream' (this is typically 1). 659 * 660 * Returns: NONE 661 */ 662 663 void 664 rmc_init(struct ifaltq *ifq, struct rm_ifdat *ifd, u_int nsecPerByte, 665 void (*restart)(struct ifaltq *), int maxq, int maxqueued, u_int maxidle, 666 int minidle, u_int offtime, int flags) 667 { 668 int i, mtu; 669 670 /* 671 * Initialize the CBQ tracing/debug facility. 672 */ 673 CBQTRACEINIT(); 674 675 bzero(ifd, sizeof (*ifd)); 676 mtu = ifq->altq_ifp->if_mtu; 677 ifd->ifq_ = ifq; 678 ifd->restart = restart; 679 ifd->maxqueued_ = maxqueued; 680 ifd->ns_per_byte_ = nsecPerByte; 681 ifd->maxpkt_ = mtu; 682 ifd->wrr_ = (flags & RMCF_WRR) ? 1 : 0; 683 ifd->efficient_ = (flags & RMCF_EFFICIENT) ? 1 : 0; 684 #if 1 685 ifd->maxiftime_ = mtu * nsecPerByte / 1000 * 16; 686 if (mtu * nsecPerByte > 10 * 1000000) 687 ifd->maxiftime_ /= 4; 688 #endif 689 690 reset_cutoff(ifd); 691 CBQTRACE(rmc_init, 'INIT', ifd->cutoff_); 692 693 /* 694 * Initialize the CBQ's WRR state. 695 */ 696 for (i = 0; i < RM_MAXPRIO; i++) { 697 ifd->alloc_[i] = 0; 698 ifd->M_[i] = 0; 699 ifd->num_[i] = 0; 700 ifd->na_[i] = 0; 701 ifd->active_[i] = NULL; 702 } 703 704 /* 705 * Initialize current packet state. 706 */ 707 ifd->qi_ = 0; 708 ifd->qo_ = 0; 709 for (i = 0; i < RM_MAXQUEUED; i++) { 710 ifd->class_[i] = NULL; 711 ifd->curlen_[i] = 0; 712 ifd->borrowed_[i] = NULL; 713 } 714 715 /* 716 * Create the root class of the link-sharing structure. 717 */ 718 ifd->root_ = rmc_newclass(0, ifd, nsecPerByte, rmc_root_overlimit, 719 maxq, 0, 0, maxidle, minidle, offtime, 0, 0); 720 if (ifd->root_ == NULL) { 721 kprintf("rmc_init: root class not allocated\n"); 722 return ; 723 } 724 ifd->root_->depth_ = 0; 725 } 726 727 /* 728 * void 729 * rmc_queue_packet(struct rm_class *cl, struct mbuf *m) - Add packet given by 730 * mbuf 'm' to queue for resource class 'cl'. This routine is called 731 * by a driver's if_output routine. This routine must be called with 732 * output packet completion interrupts locked out (to avoid racing with 733 * rmc_dequeue_next). 734 * 735 * Returns: 0 on successful queueing 736 * -1 when packet drop occurs 737 */ 738 int 739 rmc_queue_packet(struct rm_class *cl, struct mbuf *m) 740 { 741 struct timeval now; 742 struct rm_ifdat *ifd = cl->ifdat_; 743 int cpri = cl->pri_; 744 int is_empty = qempty(cl->q_); 745 746 RM_GETTIME(now); 747 if (ifd->cutoff_ > 0) { 748 if (TV_LT(&cl->undertime_, &now)) { 749 if (ifd->cutoff_ > cl->depth_) 750 ifd->cutoff_ = cl->depth_; 751 CBQTRACE(rmc_queue_packet, 'ffoc', cl->depth_); 752 } 753 #if 1 /* ALTQ */ 754 else { 755 /* 756 * the class is overlimit. if the class has 757 * underlimit ancestors, set cutoff to the lowest 758 * depth among them. 759 */ 760 struct rm_class *borrow = cl->borrow_; 761 762 while (borrow != NULL && 763 borrow->depth_ < ifd->cutoff_) { 764 if (TV_LT(&borrow->undertime_, &now)) { 765 ifd->cutoff_ = borrow->depth_; 766 CBQTRACE(rmc_queue_packet, 'ffob', ifd->cutoff_); 767 break; 768 } 769 borrow = borrow->borrow_; 770 } 771 } 772 #else /* !ALTQ */ 773 else if ((ifd->cutoff_ > 1) && cl->borrow_) { 774 if (TV_LT(&cl->borrow_->undertime_, &now)) { 775 ifd->cutoff_ = cl->borrow_->depth_; 776 CBQTRACE(rmc_queue_packet, 'ffob', 777 cl->borrow_->depth_); 778 } 779 } 780 #endif /* !ALTQ */ 781 } 782 783 if (_rmc_addq(cl, m) < 0) 784 /* failed */ 785 return (-1); 786 787 if (is_empty) { 788 CBQTRACE(rmc_queue_packet, 'ytpe', cl->stats_.handle); 789 ifd->na_[cpri]++; 790 } 791 792 if (qlen(cl->q_) > qlimit(cl->q_)) { 793 /* note: qlimit can be set to 0 or 1 */ 794 rmc_drop_action(cl); 795 return (-1); 796 } 797 return (0); 798 } 799 800 /* 801 * void 802 * rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) - Check all 803 * classes to see if there are satified. 804 */ 805 806 static void 807 rmc_tl_satisfied(struct rm_ifdat *ifd, struct timeval *now) 808 { 809 int i; 810 rm_class_t *p, *bp; 811 812 for (i = RM_MAXPRIO - 1; i >= 0; i--) { 813 if ((bp = ifd->active_[i]) != NULL) { 814 p = bp; 815 do { 816 if (!rmc_satisfied(p, now)) { 817 ifd->cutoff_ = p->depth_; 818 return; 819 } 820 p = p->peer_; 821 } while (p != bp); 822 } 823 } 824 825 reset_cutoff(ifd); 826 } 827 828 /* 829 * rmc_satisfied - Return 1 of the class is satisfied. O, otherwise. 830 */ 831 832 static int 833 rmc_satisfied(struct rm_class *cl, struct timeval *now) 834 { 835 rm_class_t *p; 836 837 if (cl == NULL) 838 return (1); 839 if (TV_LT(now, &cl->undertime_)) 840 return (1); 841 if (cl->depth_ == 0) { 842 if (!cl->sleeping_ && (qlen(cl->q_) > cl->qthresh_)) 843 return (0); 844 else 845 return (1); 846 } 847 if (cl->children_ != NULL) { 848 p = cl->children_; 849 while (p != NULL) { 850 if (!rmc_satisfied(p, now)) 851 return (0); 852 p = p->next_; 853 } 854 } 855 856 return (1); 857 } 858 859 /* 860 * Return 1 if class 'cl' is under limit or can borrow from a parent, 861 * 0 if overlimit. As a side-effect, this routine will invoke the 862 * class overlimit action if the class if overlimit. 863 */ 864 865 static int 866 rmc_under_limit(struct rm_class *cl, struct timeval *now) 867 { 868 rm_class_t *p = cl; 869 rm_class_t *top; 870 struct rm_ifdat *ifd = cl->ifdat_; 871 872 ifd->borrowed_[ifd->qi_] = NULL; 873 /* 874 * If cl is the root class, then always return that it is 875 * underlimit. Otherwise, check to see if the class is underlimit. 876 */ 877 if (cl->parent_ == NULL) 878 return (1); 879 880 if (cl->sleeping_) { 881 if (TV_LT(now, &cl->undertime_)) 882 return (0); 883 884 callout_stop(&cl->callout_); 885 cl->sleeping_ = 0; 886 cl->undertime_.tv_sec = 0; 887 return (1); 888 } 889 890 top = NULL; 891 while (cl->undertime_.tv_sec && TV_LT(now, &cl->undertime_)) { 892 if (((cl = cl->borrow_) == NULL) || 893 (cl->depth_ > ifd->cutoff_)) { 894 #ifdef ADJUST_CUTOFF 895 if (cl != NULL) 896 /* cutoff is taking effect, just 897 return false without calling 898 the delay action. */ 899 return (0); 900 #endif 901 #ifdef BORROW_OFFTIME 902 /* 903 * check if the class can borrow offtime too. 904 * borrow offtime from the top of the borrow 905 * chain if the top class is not overloaded. 906 */ 907 if (cl != NULL) { 908 /* cutoff is taking effect, use this class as top. */ 909 top = cl; 910 CBQTRACE(rmc_under_limit, 'ffou', ifd->cutoff_); 911 } 912 if (top != NULL && top->avgidle_ == top->minidle_) 913 top = NULL; 914 p->overtime_ = *now; 915 (p->overlimit)(p, top); 916 #else 917 p->overtime_ = *now; 918 (p->overlimit)(p, NULL); 919 #endif 920 return (0); 921 } 922 top = cl; 923 } 924 925 if (cl != p) 926 ifd->borrowed_[ifd->qi_] = cl; 927 return (1); 928 } 929 930 /* 931 * _rmc_wrr_dequeue_next() - This is scheduler for WRR as opposed to 932 * Packet-by-packet round robin. 933 * 934 * The heart of the weighted round-robin scheduler, which decides which 935 * class next gets to send a packet. Highest priority first, then 936 * weighted round-robin within priorites. 937 * 938 * Each able-to-send class gets to send until its byte allocation is 939 * exhausted. Thus, the active pointer is only changed after a class has 940 * exhausted its allocation. 941 * 942 * If the scheduler finds no class that is underlimit or able to borrow, 943 * then the first class found that had a nonzero queue and is allowed to 944 * borrow gets to send. 945 */ 946 947 static struct mbuf * 948 _rmc_wrr_dequeue_next(struct rm_ifdat *ifd, int op) 949 { 950 struct rm_class *cl = NULL, *first = NULL; 951 u_int deficit; 952 int cpri; 953 struct mbuf *m; 954 struct timeval now; 955 956 RM_GETTIME(now); 957 958 /* 959 * if the driver polls the top of the queue and then removes 960 * the polled packet, we must return the same packet. 961 */ 962 if (op == ALTDQ_REMOVE && ifd->pollcache_) { 963 cl = ifd->pollcache_; 964 cpri = cl->pri_; 965 if (ifd->efficient_) { 966 /* check if this class is overlimit */ 967 if (cl->undertime_.tv_sec != 0 && 968 rmc_under_limit(cl, &now) == 0) 969 first = cl; 970 } 971 ifd->pollcache_ = NULL; 972 goto _wrr_out; 973 } 974 /* mode == ALTDQ_POLL || pollcache == NULL */ 975 ifd->pollcache_ = NULL; 976 ifd->borrowed_[ifd->qi_] = NULL; 977 #ifdef ADJUST_CUTOFF 978 _again: 979 #endif 980 for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { 981 if (ifd->na_[cpri] == 0) 982 continue; 983 deficit = 0; 984 /* 985 * Loop through twice for a priority level, if some class 986 * was unable to send a packet the first round because 987 * of the weighted round-robin mechanism. 988 * During the second loop at this level, deficit==2. 989 * (This second loop is not needed if for every class, 990 * "M[cl->pri_])" times "cl->allotment" is greater than 991 * the byte size for the largest packet in the class.) 992 */ 993 _wrr_loop: 994 cl = ifd->active_[cpri]; 995 KKASSERT(cl != NULL); 996 do { 997 if ((deficit < 2) && (cl->bytes_alloc_ <= 0)) 998 cl->bytes_alloc_ += cl->w_allotment_; 999 if (!qempty(cl->q_)) { 1000 if ((cl->undertime_.tv_sec == 0) || 1001 rmc_under_limit(cl, &now)) { 1002 if (cl->bytes_alloc_ > 0 || deficit > 1) 1003 goto _wrr_out; 1004 1005 /* underlimit but no alloc */ 1006 deficit = 1; 1007 #if 1 1008 ifd->borrowed_[ifd->qi_] = NULL; 1009 #endif 1010 } 1011 else if (first == NULL && cl->borrow_ != NULL) 1012 first = cl; /* borrowing candidate */ 1013 } 1014 1015 cl->bytes_alloc_ = 0; 1016 cl = cl->peer_; 1017 } while (cl != ifd->active_[cpri]); 1018 1019 if (deficit == 1) { 1020 /* first loop found an underlimit class with deficit */ 1021 /* Loop on same priority level, with new deficit. */ 1022 deficit = 2; 1023 goto _wrr_loop; 1024 } 1025 } 1026 1027 #ifdef ADJUST_CUTOFF 1028 /* 1029 * no underlimit class found. if cutoff is taking effect, 1030 * increase cutoff and try again. 1031 */ 1032 if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { 1033 ifd->cutoff_++; 1034 CBQTRACE(_rmc_wrr_dequeue_next, 'ojda', ifd->cutoff_); 1035 goto _again; 1036 } 1037 #endif /* ADJUST_CUTOFF */ 1038 /* 1039 * If LINK_EFFICIENCY is turned on, then the first overlimit 1040 * class we encounter will send a packet if all the classes 1041 * of the link-sharing structure are overlimit. 1042 */ 1043 reset_cutoff(ifd); 1044 CBQTRACE(_rmc_wrr_dequeue_next, 'otsr', ifd->cutoff_); 1045 1046 if (!ifd->efficient_ || first == NULL) 1047 return (NULL); 1048 1049 cl = first; 1050 cpri = cl->pri_; 1051 #if 0 /* too time-consuming for nothing */ 1052 if (cl->sleeping_) 1053 callout_stop(&cl->callout_); 1054 cl->sleeping_ = 0; 1055 cl->undertime_.tv_sec = 0; 1056 #endif 1057 ifd->borrowed_[ifd->qi_] = cl->borrow_; 1058 ifd->cutoff_ = cl->borrow_->depth_; 1059 1060 /* 1061 * Deque the packet and do the book keeping... 1062 */ 1063 _wrr_out: 1064 if (op == ALTDQ_REMOVE) { 1065 m = _rmc_getq(cl); 1066 if (m == NULL) 1067 panic("_rmc_wrr_dequeue_next"); 1068 if (qempty(cl->q_)) 1069 ifd->na_[cpri]--; 1070 1071 /* 1072 * Update class statistics and link data. 1073 */ 1074 if (cl->bytes_alloc_ > 0) 1075 cl->bytes_alloc_ -= m_pktlen(m); 1076 1077 if ((cl->bytes_alloc_ <= 0) || first == cl) 1078 ifd->active_[cl->pri_] = cl->peer_; 1079 else 1080 ifd->active_[cl->pri_] = cl; 1081 1082 ifd->class_[ifd->qi_] = cl; 1083 ifd->curlen_[ifd->qi_] = m_pktlen(m); 1084 ifd->now_[ifd->qi_] = now; 1085 ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; 1086 ifd->queued_++; 1087 } else { 1088 /* mode == ALTDQ_PPOLL */ 1089 m = _rmc_pollq(cl); 1090 #ifdef foo 1091 /* 1092 * Don't use poll cache; the poll/dequeue 1093 * model is no longer applicable to SMP 1094 * system. e.g. 1095 * CPU-A CPU-B 1096 * : : 1097 * poll : 1098 * : poll 1099 * dequeue (+) : 1100 * 1101 * The dequeue at (+) will hit the poll 1102 * cache set by CPU-B. 1103 */ 1104 ifd->pollcache_ = cl; 1105 #endif 1106 } 1107 return (m); 1108 } 1109 1110 /* 1111 * Dequeue & return next packet from the highest priority class that 1112 * has a packet to send & has enough allocation to send it. This 1113 * routine is called by a driver whenever it needs a new packet to 1114 * output. 1115 */ 1116 static struct mbuf * 1117 _rmc_prr_dequeue_next(struct rm_ifdat *ifd, int op) 1118 { 1119 struct mbuf *m; 1120 int cpri; 1121 struct rm_class *cl, *first = NULL; 1122 struct timeval now; 1123 1124 RM_GETTIME(now); 1125 1126 /* 1127 * if the driver polls the top of the queue and then removes 1128 * the polled packet, we must return the same packet. 1129 */ 1130 if (op == ALTDQ_REMOVE && ifd->pollcache_) { 1131 cl = ifd->pollcache_; 1132 cpri = cl->pri_; 1133 ifd->pollcache_ = NULL; 1134 goto _prr_out; 1135 } 1136 /* mode == ALTDQ_POLL || pollcache == NULL */ 1137 ifd->pollcache_ = NULL; 1138 ifd->borrowed_[ifd->qi_] = NULL; 1139 #ifdef ADJUST_CUTOFF 1140 _again: 1141 #endif 1142 for (cpri = RM_MAXPRIO - 1; cpri >= 0; cpri--) { 1143 if (ifd->na_[cpri] == 0) 1144 continue; 1145 cl = ifd->active_[cpri]; 1146 KKASSERT(cl != NULL); 1147 do { 1148 if (!qempty(cl->q_)) { 1149 if ((cl->undertime_.tv_sec == 0) || 1150 rmc_under_limit(cl, &now)) 1151 goto _prr_out; 1152 if (first == NULL && cl->borrow_ != NULL) 1153 first = cl; 1154 } 1155 cl = cl->peer_; 1156 } while (cl != ifd->active_[cpri]); 1157 } 1158 1159 #ifdef ADJUST_CUTOFF 1160 /* 1161 * no underlimit class found. if cutoff is taking effect, increase 1162 * cutoff and try again. 1163 */ 1164 if (first != NULL && ifd->cutoff_ < ifd->root_->depth_) { 1165 ifd->cutoff_++; 1166 goto _again; 1167 } 1168 #endif /* ADJUST_CUTOFF */ 1169 /* 1170 * If LINK_EFFICIENCY is turned on, then the first overlimit 1171 * class we encounter will send a packet if all the classes 1172 * of the link-sharing structure are overlimit. 1173 */ 1174 reset_cutoff(ifd); 1175 if (!ifd->efficient_ || first == NULL) 1176 return (NULL); 1177 1178 cl = first; 1179 cpri = cl->pri_; 1180 #if 0 /* too time-consuming for nothing */ 1181 if (cl->sleeping_) 1182 callout_stop(&cl->callout_); 1183 cl->sleeping_ = 0; 1184 cl->undertime_.tv_sec = 0; 1185 #endif 1186 ifd->borrowed_[ifd->qi_] = cl->borrow_; 1187 ifd->cutoff_ = cl->borrow_->depth_; 1188 1189 /* 1190 * Deque the packet and do the book keeping... 1191 */ 1192 _prr_out: 1193 if (op == ALTDQ_REMOVE) { 1194 m = _rmc_getq(cl); 1195 if (m == NULL) 1196 panic("_rmc_prr_dequeue_next"); 1197 if (qempty(cl->q_)) 1198 ifd->na_[cpri]--; 1199 1200 ifd->active_[cpri] = cl->peer_; 1201 1202 ifd->class_[ifd->qi_] = cl; 1203 ifd->curlen_[ifd->qi_] = m_pktlen(m); 1204 ifd->now_[ifd->qi_] = now; 1205 ifd->qi_ = (ifd->qi_ + 1) % ifd->maxqueued_; 1206 ifd->queued_++; 1207 } else { 1208 /* mode == ALTDQ_POLL */ 1209 m = _rmc_pollq(cl); 1210 #ifdef foo 1211 /* 1212 * Don't use poll cache; the poll/dequeue 1213 * model is no longer applicable to SMP 1214 * system. e.g. 1215 * CPU-A CPU-B 1216 * : : 1217 * poll : 1218 * : poll 1219 * dequeue (+) : 1220 * 1221 * The dequeue at (+) will hit the poll 1222 * cache set by CPU-B. 1223 */ 1224 ifd->pollcache_ = cl; 1225 #endif 1226 } 1227 return (m); 1228 } 1229 1230 /* 1231 * struct mbuf * 1232 * rmc_dequeue_next(struct rm_ifdat *ifd, struct timeval *now) - this function 1233 * is invoked by the packet driver to get the next packet to be 1234 * dequeued and output on the link. If WRR is enabled, then the 1235 * WRR dequeue next routine will determine the next packet to sent. 1236 * Otherwise, packet-by-packet round robin is invoked. 1237 * 1238 * Returns: NULL, if a packet is not available or if all 1239 * classes are overlimit. 1240 * 1241 * Otherwise, Pointer to the next packet. 1242 */ 1243 1244 struct mbuf * 1245 rmc_dequeue_next(struct rm_ifdat *ifd, int mode) 1246 { 1247 if (ifd->queued_ >= ifd->maxqueued_) 1248 return (NULL); 1249 else if (ifd->wrr_) 1250 return (_rmc_wrr_dequeue_next(ifd, mode)); 1251 else 1252 return (_rmc_prr_dequeue_next(ifd, mode)); 1253 } 1254 1255 /* 1256 * Update the utilization estimate for the packet that just completed. 1257 * The packet's class & the parent(s) of that class all get their 1258 * estimators updated. This routine is called by the driver's output- 1259 * packet-completion interrupt service routine. 1260 */ 1261 1262 /* 1263 * a macro to approximate "divide by 1000" that gives 0.000999, 1264 * if a value has enough effective digits. 1265 * (on pentium, mul takes 9 cycles but div takes 46!) 1266 */ 1267 #define NSEC_TO_USEC(t) (((t) >> 10) + ((t) >> 16) + ((t) >> 17)) 1268 void 1269 rmc_update_class_util(struct rm_ifdat *ifd) 1270 { 1271 int idle, avgidle, pktlen; 1272 int pkt_time, tidle; 1273 rm_class_t *cl, *borrowed; 1274 rm_class_t *borrows; 1275 struct timeval *nowp; 1276 1277 /* 1278 * Get the most recent completed class. 1279 */ 1280 if ((cl = ifd->class_[ifd->qo_]) == NULL) 1281 return; 1282 1283 pktlen = ifd->curlen_[ifd->qo_]; 1284 borrowed = ifd->borrowed_[ifd->qo_]; 1285 borrows = borrowed; 1286 1287 PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); 1288 1289 /* 1290 * Run estimator on class and its ancestors. 1291 */ 1292 /* 1293 * rm_update_class_util is designed to be called when the 1294 * transfer is completed from a xmit complete interrupt, 1295 * but most drivers don't implement an upcall for that. 1296 * so, just use estimated completion time. 1297 * as a result, ifd->qi_ and ifd->qo_ are always synced. 1298 */ 1299 nowp = &ifd->now_[ifd->qo_]; 1300 /* get pkt_time (for link) in usec */ 1301 #if 1 /* use approximation */ 1302 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_; 1303 pkt_time = NSEC_TO_USEC(pkt_time); 1304 #else 1305 pkt_time = ifd->curlen_[ifd->qo_] * ifd->ns_per_byte_ / 1000; 1306 #endif 1307 #if 1 /* ALTQ4PPP */ 1308 if (TV_LT(nowp, &ifd->ifnow_)) { 1309 int iftime; 1310 1311 /* 1312 * make sure the estimated completion time does not go 1313 * too far. it can happen when the link layer supports 1314 * data compression or the interface speed is set to 1315 * a much lower value. 1316 */ 1317 TV_DELTA(&ifd->ifnow_, nowp, iftime); 1318 if (iftime+pkt_time < ifd->maxiftime_) { 1319 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1320 } else { 1321 TV_ADD_DELTA(nowp, ifd->maxiftime_, &ifd->ifnow_); 1322 } 1323 } else { 1324 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1325 } 1326 #else 1327 if (TV_LT(nowp, &ifd->ifnow_)) { 1328 TV_ADD_DELTA(&ifd->ifnow_, pkt_time, &ifd->ifnow_); 1329 } else { 1330 TV_ADD_DELTA(nowp, pkt_time, &ifd->ifnow_); 1331 } 1332 #endif 1333 1334 while (cl != NULL) { 1335 TV_DELTA(&ifd->ifnow_, &cl->last_, idle); 1336 if (idle >= 2000000) 1337 /* 1338 * this class is idle enough, reset avgidle. 1339 * (TV_DELTA returns 2000000 us when delta is large.) 1340 */ 1341 cl->avgidle_ = cl->maxidle_; 1342 1343 /* get pkt_time (for class) in usec */ 1344 #if 1 /* use approximation */ 1345 pkt_time = pktlen * cl->ns_per_byte_; 1346 pkt_time = NSEC_TO_USEC(pkt_time); 1347 #else 1348 pkt_time = pktlen * cl->ns_per_byte_ / 1000; 1349 #endif 1350 idle -= pkt_time; 1351 1352 avgidle = cl->avgidle_; 1353 avgidle += idle - (avgidle >> RM_FILTER_GAIN); 1354 cl->avgidle_ = avgidle; 1355 1356 /* Are we overlimit ? */ 1357 if (avgidle <= 0) { 1358 CBQTRACE(rmc_update_class_util, 'milo', cl->stats_.handle); 1359 #if 1 /* ALTQ */ 1360 /* 1361 * need some lower bound for avgidle, otherwise 1362 * a borrowing class gets unbounded penalty. 1363 */ 1364 if (avgidle < cl->minidle_) 1365 avgidle = cl->avgidle_ = cl->minidle_; 1366 #endif 1367 /* set next idle to make avgidle 0 */ 1368 tidle = pkt_time + 1369 (((1 - RM_POWER) * avgidle) >> RM_FILTER_GAIN); 1370 TV_ADD_DELTA(nowp, tidle, &cl->undertime_); 1371 ++cl->stats_.over; 1372 } else { 1373 cl->avgidle_ = 1374 (avgidle > cl->maxidle_) ? cl->maxidle_ : avgidle; 1375 cl->undertime_.tv_sec = 0; 1376 if (cl->sleeping_) { 1377 callout_stop(&cl->callout_); 1378 cl->sleeping_ = 0; 1379 } 1380 } 1381 1382 if (borrows != NULL) { 1383 if (borrows != cl) 1384 ++cl->stats_.borrows; 1385 else 1386 borrows = NULL; 1387 } 1388 cl->last_ = ifd->ifnow_; 1389 cl->last_pkttime_ = pkt_time; 1390 1391 #if 1 1392 if (cl->parent_ == NULL) { 1393 /* take stats of root class */ 1394 PKTCNTR_ADD(&cl->stats_.xmit_cnt, pktlen); 1395 } 1396 #endif 1397 1398 cl = cl->parent_; 1399 } 1400 1401 /* 1402 * Check to see if cutoff needs to set to a new level. 1403 */ 1404 cl = ifd->class_[ifd->qo_]; 1405 if (borrowed && (ifd->cutoff_ >= borrowed->depth_)) { 1406 #if 1 /* ALTQ */ 1407 if ((qlen(cl->q_) <= 0) || TV_LT(nowp, &borrowed->undertime_)) { 1408 rmc_tl_satisfied(ifd, nowp); 1409 CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); 1410 } else { 1411 ifd->cutoff_ = borrowed->depth_; 1412 CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); 1413 } 1414 #else /* !ALTQ */ 1415 if ((qlen(cl->q_) <= 1) || TV_LT(&now, &borrowed->undertime_)) { 1416 reset_cutoff(ifd); 1417 #ifdef notdef 1418 rmc_tl_satisfied(ifd, &now); 1419 #endif 1420 CBQTRACE(rmc_update_class_util, 'broe', ifd->cutoff_); 1421 } else { 1422 ifd->cutoff_ = borrowed->depth_; 1423 CBQTRACE(rmc_update_class_util, 'ffob', borrowed->depth_); 1424 } 1425 #endif /* !ALTQ */ 1426 } 1427 1428 /* 1429 * Release class slot 1430 */ 1431 ifd->borrowed_[ifd->qo_] = NULL; 1432 ifd->class_[ifd->qo_] = NULL; 1433 ifd->qo_ = (ifd->qo_ + 1) % ifd->maxqueued_; 1434 ifd->queued_--; 1435 } 1436 1437 /* 1438 * void 1439 * rmc_drop_action(struct rm_class *cl) - Generic (not protocol-specific) 1440 * over-limit action routines. These get invoked by rmc_under_limit() 1441 * if a class with packets to send if over its bandwidth limit & can't 1442 * borrow from a parent class. 1443 * 1444 * Returns: NONE 1445 */ 1446 1447 static void 1448 rmc_drop_action(struct rm_class *cl) 1449 { 1450 struct rm_ifdat *ifd = cl->ifdat_; 1451 1452 KKASSERT(qlen(cl->q_) > 0); 1453 _rmc_dropq(cl); 1454 if (qempty(cl->q_)) 1455 ifd->na_[cl->pri_]--; 1456 } 1457 1458 void 1459 rmc_dropall(struct rm_class *cl) 1460 { 1461 struct rm_ifdat *ifd = cl->ifdat_; 1462 1463 if (!qempty(cl->q_)) { 1464 _flushq(cl->q_); 1465 1466 ifd->na_[cl->pri_]--; 1467 } 1468 } 1469 1470 /* 1471 * void 1472 * rmc_delay_action(struct rm_class *cl) - This function is the generic CBQ 1473 * delay action routine. It is invoked via rmc_under_limit when the 1474 * packet is discoverd to be overlimit. 1475 * 1476 * If the delay action is result of borrow class being overlimit, then 1477 * delay for the offtime of the borrowing class that is overlimit. 1478 * 1479 * Returns: NONE 1480 */ 1481 1482 void 1483 rmc_delay_action(struct rm_class *cl, struct rm_class *borrow) 1484 { 1485 int delay, t, extradelay; 1486 1487 cl->stats_.overactions++; 1488 TV_DELTA(&cl->undertime_, &cl->overtime_, delay); 1489 #ifndef BORROW_OFFTIME 1490 delay += cl->offtime_; 1491 #endif 1492 1493 if (!cl->sleeping_) { 1494 CBQTRACE(rmc_delay_action, 'yled', cl->stats_.handle); 1495 #ifdef BORROW_OFFTIME 1496 if (borrow != NULL) 1497 extradelay = borrow->offtime_; 1498 else 1499 #endif 1500 extradelay = cl->offtime_; 1501 1502 #ifdef ALTQ 1503 /* 1504 * XXX recalculate suspend time: 1505 * current undertime is (tidle + pkt_time) calculated 1506 * from the last transmission. 1507 * tidle: time required to bring avgidle back to 0 1508 * pkt_time: target waiting time for this class 1509 * we need to replace pkt_time by offtime 1510 */ 1511 extradelay -= cl->last_pkttime_; 1512 #endif 1513 if (extradelay > 0) { 1514 TV_ADD_DELTA(&cl->undertime_, extradelay, &cl->undertime_); 1515 delay += extradelay; 1516 } 1517 1518 cl->sleeping_ = 1; 1519 cl->stats_.delays++; 1520 1521 /* 1522 * Since packets are phased randomly with respect to the 1523 * clock, 1 tick (the next clock tick) can be an arbitrarily 1524 * short time so we have to wait for at least two ticks. 1525 * NOTE: If there's no other traffic, we need the timer as 1526 * a 'backstop' to restart this class. 1527 */ 1528 if (delay > ustick * 2) 1529 t = (delay + ustick - 1) / ustick; 1530 else 1531 t = 2; 1532 callout_reset_bycpu(&cl->callout_, t, rmc_restart, cl, 0); 1533 } 1534 } 1535 1536 /* 1537 * void 1538 * rmc_restart() - is just a helper routine for rmc_delay_action -- it is 1539 * called by the system timer code & is responsible checking if the 1540 * class is still sleeping (it might have been restarted as a side 1541 * effect of the queue scan on a packet arrival) and, if so, restarting 1542 * output for the class. Inspecting the class state & restarting output 1543 * require locking the class structure. In general the driver is 1544 * responsible for locking but this is the only routine that is not 1545 * called directly or indirectly from the interface driver so it has 1546 * know about system locking conventions. Under bsd, locking is done 1547 * by raising IPL to splimp so that's what's implemented here. On a 1548 * different system this would probably need to be changed. 1549 * 1550 * Since this function is called from an independant timeout, we 1551 * have to set up the lock conditions expected for the ALTQ operation. 1552 * Note that the restart will probably fall through to an if_start. 1553 * 1554 * Returns: NONE 1555 */ 1556 1557 static void 1558 rmc_restart_dispatch(netmsg_t nmsg) 1559 { 1560 struct rm_class *cl = nmsg->lmsg.u.ms_resultp; 1561 struct rm_ifdat *ifd = cl->ifdat_; 1562 struct ifaltq_subque *ifsq = 1563 &ifd->ifq_->altq_subq[ALTQ_SUBQ_INDEX_DEFAULT]; 1564 1565 ASSERT_NETISR0; 1566 1567 crit_enter(); 1568 lwkt_replymsg(&nmsg->lmsg, 0); /* reply ASAP */ 1569 crit_exit(); 1570 1571 ALTQ_SQ_LOCK(ifsq); 1572 if (cl->sleeping_) { 1573 cl->sleeping_ = 0; 1574 cl->undertime_.tv_sec = 0; 1575 1576 if (ifd->queued_ < ifd->maxqueued_ && ifd->restart != NULL) { 1577 CBQTRACE(rmc_restart, 'trts', cl->stats_.handle); 1578 (ifd->restart)(ifd->ifq_); 1579 } 1580 } 1581 ALTQ_SQ_UNLOCK(ifsq); 1582 } 1583 1584 static void 1585 rmc_restart(void *xcl) 1586 { 1587 struct rm_class *cl = xcl; 1588 struct lwkt_msg *lmsg = &cl->callout_nmsg_.lmsg; 1589 1590 KASSERT(mycpuid == 0, ("not on cpu0")); 1591 crit_enter(); 1592 if (lmsg->ms_flags & MSGF_DONE) 1593 lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg); 1594 crit_exit(); 1595 } 1596 1597 /* 1598 * void 1599 * rmc_root_overlimit(struct rm_class *cl) - This the generic overlimit 1600 * handling routine for the root class of the link sharing structure. 1601 * 1602 * Returns: NONE 1603 */ 1604 1605 static void 1606 rmc_root_overlimit(struct rm_class *cl, struct rm_class *borrow) 1607 { 1608 panic("rmc_root_overlimit"); 1609 } 1610 1611 /* 1612 * Packet Queue handling routines. Eventually, this is to localize the 1613 * effects on the code whether queues are red queues or droptail 1614 * queues. 1615 */ 1616 1617 static int 1618 _rmc_addq(rm_class_t *cl, struct mbuf *m) 1619 { 1620 #ifdef ALTQ_RIO 1621 if (q_is_rio(cl->q_)) 1622 return rio_addq((rio_t *)cl->red_, cl->q_, m, cl->pktattr_); 1623 #endif 1624 #ifdef ALTQ_RED 1625 if (q_is_red(cl->q_)) 1626 return red_addq(cl->red_, cl->q_, m, cl->pktattr_); 1627 #endif /* ALTQ_RED */ 1628 1629 if (cl->flags_ & RMCF_CLEARDSCP) 1630 write_dsfield(m, cl->pktattr_, 0); 1631 1632 _addq(cl->q_, m); 1633 return (0); 1634 } 1635 1636 /* note: _rmc_dropq is not called for red */ 1637 static void 1638 _rmc_dropq(rm_class_t *cl) 1639 { 1640 struct mbuf *m; 1641 1642 if ((m = _getq(cl->q_)) != NULL) 1643 m_freem(m); 1644 } 1645 1646 static struct mbuf * 1647 _rmc_getq(rm_class_t *cl) 1648 { 1649 #ifdef ALTQ_RIO 1650 if (q_is_rio(cl->q_)) 1651 return rio_getq((rio_t *)cl->red_, cl->q_); 1652 #endif 1653 #ifdef ALTQ_RED 1654 if (q_is_red(cl->q_)) 1655 return red_getq(cl->red_, cl->q_); 1656 #endif 1657 return _getq(cl->q_); 1658 } 1659 1660 static struct mbuf * 1661 _rmc_pollq(rm_class_t *cl) 1662 { 1663 return qhead(cl->q_); 1664 } 1665 1666 #ifdef CBQ_TRACE 1667 /* 1668 * DDB hook to trace cbq events: 1669 * the last 1024 events are held in a circular buffer. 1670 * use "call cbqtrace_dump(N)" to display 20 events from Nth event. 1671 */ 1672 void cbqtrace_dump(int); 1673 static char *rmc_funcname(void *); 1674 1675 static struct rmc_funcs { 1676 void *func; 1677 char *name; 1678 } rmc_funcs[] = { 1679 rmc_init, "rmc_init", 1680 rmc_queue_packet, "rmc_queue_packet", 1681 rmc_under_limit, "rmc_under_limit", 1682 rmc_update_class_util, "rmc_update_class_util", 1683 rmc_delay_action, "rmc_delay_action", 1684 rmc_restart, "rmc_restart", 1685 _rmc_wrr_dequeue_next, "_rmc_wrr_dequeue_next", 1686 NULL, NULL 1687 }; 1688 1689 static char * 1690 rmc_funcname(void *func) 1691 { 1692 struct rmc_funcs *fp; 1693 1694 for (fp = rmc_funcs; fp->func != NULL; fp++) { 1695 if (fp->func == func) 1696 return (fp->name); 1697 } 1698 1699 return ("unknown"); 1700 } 1701 1702 void 1703 cbqtrace_dump(int counter) 1704 { 1705 int i, *p; 1706 char *cp; 1707 1708 counter = counter % NCBQTRACE; 1709 p = (int *)&cbqtrace_buffer[counter]; 1710 1711 for (i=0; i<20; i++) { 1712 kprintf("[0x%x] ", *p++); 1713 kprintf("%s: ", rmc_funcname((void *)*p++)); 1714 cp = (char *)p++; 1715 kprintf("%c%c%c%c: ", cp[0], cp[1], cp[2], cp[3]); 1716 kprintf("%d\n",*p++); 1717 1718 if (p >= (int *)&cbqtrace_buffer[NCBQTRACE]) 1719 p = (int *)cbqtrace_buffer; 1720 } 1721 } 1722 #endif /* CBQ_TRACE */ 1723 #endif /* ALTQ_CBQ */ 1724