1 /* $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $ */ 2 /* $DragonFly: src/sys/net/altq/altq_red.c,v 1.4 2006/12/22 23:44:55 swildner Exp $ */ 3 4 /* 5 * Copyright (C) 1997-2003 6 * Sony Computer Science Laboratories Inc. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 */ 30 /* 31 * Copyright (c) 1990-1994 Regents of the University of California. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 3. All advertising materials mentioning features or use of this software 43 * must display the following acknowledgement: 44 * This product includes software developed by the Computer Systems 45 * Engineering Group at Lawrence Berkeley Laboratory. 46 * 4. Neither the name of the University nor of the Laboratory may be used 47 * to endorse or promote products derived from this software without 48 * specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 */ 62 63 #include "opt_altq.h" 64 #include "opt_inet.h" 65 #include "opt_inet6.h" 66 67 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ 68 69 #include <sys/param.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/systm.h> 74 #include <sys/errno.h> 75 76 #include <net/if.h> 77 #include <net/ifq_var.h> 78 79 #include <netinet/in.h> 80 #include <netinet/in_systm.h> 81 #include <netinet/ip.h> 82 #ifdef INET6 83 #include <netinet/ip6.h> 84 #endif 85 86 #include <net/pf/pfvar.h> 87 #include <net/altq/altq.h> 88 #include <net/altq/altq_red.h> 89 90 /* 91 * ALTQ/RED (Random Early Detection) implementation using 32-bit 92 * fixed-point calculation. 93 * 94 * written by kjc using the ns code as a reference. 95 * you can learn more about red and ns from Sally's home page at 96 * http://www-nrg.ee.lbl.gov/floyd/ 97 * 98 * most of the red parameter values are fixed in this implementation 99 * to prevent fixed-point overflow/underflow. 100 * if you change the parameters, watch out for overflow/underflow! 101 * 102 * the parameters used are recommended values by Sally. 103 * the corresponding ns config looks: 104 * q_weight=0.00195 105 * minthresh=5 maxthresh=15 queue-size=60 106 * linterm=30 107 * dropmech=drop-tail 108 * bytes=false (can't be handled by 32-bit fixed-point) 109 * doubleq=false dqthresh=false 110 * wait=true 111 */ 112 /* 113 * alternative red parameters for a slow link. 114 * 115 * assume the queue length becomes from zero to L and keeps L, it takes 116 * N packets for q_avg to reach 63% of L. 117 * when q_weight is 0.002, N is about 500 packets. 118 * for a slow link like dial-up, 500 packets takes more than 1 minute! 119 * when q_weight is 0.008, N is about 127 packets. 120 * when q_weight is 0.016, N is about 63 packets. 121 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 122 * are allowed for 0.016. 123 * see Sally's paper for more details. 124 */ 125 /* normal red parameters */ 126 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 127 /* q_weight = 0.00195 */ 128 129 /* red parameters for a slow link */ 130 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 131 /* q_weight = 0.0078125 */ 132 133 /* red parameters for a very slow link (e.g., dialup) */ 134 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 135 /* q_weight = 0.015625 */ 136 137 /* fixed-point uses 12-bit decimal places */ 138 #define FP_SHIFT 12 /* fixed-point shift */ 139 140 /* red parameters for drop probability */ 141 #define INV_P_MAX 10 /* inverse of max drop probability */ 142 #define TH_MIN 5 /* min threshold */ 143 #define TH_MAX 15 /* max threshold */ 144 145 #define RED_LIMIT 60 /* default max queue lenght */ 146 #define RED_STATS /* collect statistics */ 147 148 /* 149 * our default policy for forced-drop is drop-tail. 150 * (in altq-1.1.2 or earlier, the default was random-drop. 151 * but it makes more sense to punish the cause of the surge.) 152 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 153 */ 154 155 /* default red parameter values */ 156 static int default_th_min = TH_MIN; 157 static int default_th_max = TH_MAX; 158 static int default_inv_pmax = INV_P_MAX; 159 160 /* 161 * red support routines 162 */ 163 red_t * 164 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime) 165 { 166 red_t *rp; 167 int w, i; 168 int npkts_per_sec; 169 170 rp = kmalloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO); 171 rp->red_avg = 0; 172 rp->red_idle = 1; 173 174 if (weight == 0) 175 rp->red_weight = W_WEIGHT; 176 else 177 rp->red_weight = weight; 178 if (inv_pmax == 0) 179 rp->red_inv_pmax = default_inv_pmax; 180 else 181 rp->red_inv_pmax = inv_pmax; 182 if (th_min == 0) 183 rp->red_thmin = default_th_min; 184 else 185 rp->red_thmin = th_min; 186 if (th_max == 0) 187 rp->red_thmax = default_th_max; 188 else 189 rp->red_thmax = th_max; 190 191 rp->red_flags = flags; 192 193 if (pkttime == 0) 194 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 195 rp->red_pkttime = 800; 196 else 197 rp->red_pkttime = pkttime; 198 199 if (weight == 0) { 200 /* when the link is very slow, adjust red parameters */ 201 npkts_per_sec = 1000000 / rp->red_pkttime; 202 if (npkts_per_sec < 50) { 203 /* up to about 400Kbps */ 204 rp->red_weight = W_WEIGHT_2; 205 } else if (npkts_per_sec < 300) { 206 /* up to about 2.4Mbps */ 207 rp->red_weight = W_WEIGHT_1; 208 } 209 } 210 211 /* calculate wshift. weight must be power of 2 */ 212 w = rp->red_weight; 213 for (i = 0; w > 1; i++) 214 w = w >> 1; 215 rp->red_wshift = i; 216 w = 1 << rp->red_wshift; 217 if (w != rp->red_weight) { 218 kprintf("invalid weight value %d for red! use %d\n", 219 rp->red_weight, w); 220 rp->red_weight = w; 221 } 222 223 /* 224 * thmin_s and thmax_s are scaled versions of th_min and th_max 225 * to be compared with avg. 226 */ 227 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 228 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 229 230 /* 231 * precompute probability denominator 232 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 233 */ 234 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) 235 * rp->red_inv_pmax) << FP_SHIFT; 236 237 /* allocate weight table */ 238 rp->red_wtab = wtab_alloc(rp->red_weight); 239 240 microtime(&rp->red_last); 241 return (rp); 242 } 243 244 void 245 red_destroy(red_t *rp) 246 { 247 wtab_destroy(rp->red_wtab); 248 kfree(rp, M_ALTQ); 249 } 250 251 void 252 red_getstats(red_t *rp, struct redstats *sp) 253 { 254 sp->q_avg = rp->red_avg >> rp->red_wshift; 255 sp->xmit_cnt = rp->red_stats.xmit_cnt; 256 sp->drop_cnt = rp->red_stats.drop_cnt; 257 sp->drop_forced = rp->red_stats.drop_forced; 258 sp->drop_unforced = rp->red_stats.drop_unforced; 259 sp->marked_packets = rp->red_stats.marked_packets; 260 } 261 262 int 263 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) 264 { 265 int avg, droptype; 266 int n; 267 268 avg = rp->red_avg; 269 270 /* 271 * if we were idle, we pretend that n packets arrived during 272 * the idle period. 273 */ 274 if (rp->red_idle) { 275 struct timeval now; 276 int t; 277 278 rp->red_idle = 0; 279 microtime(&now); 280 t = (now.tv_sec - rp->red_last.tv_sec); 281 if (t > 60) { 282 /* 283 * being idle for more than 1 minute, set avg to zero. 284 * this prevents t from overflow. 285 */ 286 avg = 0; 287 } else { 288 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 289 n = t / rp->red_pkttime - 1; 290 291 /* the following line does (avg = (1 - Wq)^n * avg) */ 292 if (n > 0) 293 avg = (avg >> FP_SHIFT) * 294 pow_w(rp->red_wtab, n); 295 } 296 } 297 298 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 299 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 300 rp->red_avg = avg; /* save the new value */ 301 302 /* 303 * red_count keeps a tally of arriving traffic that has not 304 * been dropped. 305 */ 306 rp->red_count++; 307 308 /* see if we drop early */ 309 droptype = DTYPE_NODROP; 310 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 311 if (avg >= rp->red_thmax_s) { 312 /* avg >= th_max: forced drop */ 313 droptype = DTYPE_FORCED; 314 } else if (rp->red_old == 0) { 315 /* first exceeds th_min */ 316 rp->red_count = 1; 317 rp->red_old = 1; 318 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 319 rp->red_probd, rp->red_count)) { 320 /* mark or drop by red */ 321 if ((rp->red_flags & REDF_ECN) && 322 mark_ecn(m, pktattr, rp->red_flags)) { 323 /* successfully marked. do not drop. */ 324 rp->red_count = 0; 325 #ifdef RED_STATS 326 rp->red_stats.marked_packets++; 327 #endif 328 } else { 329 /* unforced drop by red */ 330 droptype = DTYPE_EARLY; 331 } 332 } 333 } else { 334 /* avg < th_min */ 335 rp->red_old = 0; 336 } 337 338 /* 339 * if the queue length hits the hard limit, it's a forced drop. 340 */ 341 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 342 droptype = DTYPE_FORCED; 343 344 #ifdef RED_RANDOM_DROP 345 /* if successful or forced drop, enqueue this packet. */ 346 if (droptype != DTYPE_EARLY) 347 _addq(q, m); 348 #else 349 /* if successful, enqueue this packet. */ 350 if (droptype == DTYPE_NODROP) 351 _addq(q, m); 352 #endif 353 if (droptype != DTYPE_NODROP) { 354 if (droptype == DTYPE_EARLY) { 355 /* drop the incoming packet */ 356 #ifdef RED_STATS 357 rp->red_stats.drop_unforced++; 358 #endif 359 } else { 360 /* forced drop, select a victim packet in the queue. */ 361 #ifdef RED_RANDOM_DROP 362 m = _getq_random(q); 363 #endif 364 #ifdef RED_STATS 365 rp->red_stats.drop_forced++; 366 #endif 367 } 368 #ifdef RED_STATS 369 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); 370 #endif 371 rp->red_count = 0; 372 m_freem(m); 373 return (-1); 374 } 375 /* successfully queued */ 376 #ifdef RED_STATS 377 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); 378 #endif 379 return (0); 380 } 381 382 /* 383 * early-drop probability is calculated as follows: 384 * prob = p_max * (avg - th_min) / (th_max - th_min) 385 * prob_a = prob / (2 - count*prob) 386 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 387 * here prob_a increases as successive undrop count increases. 388 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 389 * becomes 1 when (count >= (2 / prob))). 390 */ 391 int 392 drop_early(int fp_len, int fp_probd, int count) 393 { 394 int d; /* denominator of drop-probability */ 395 396 d = fp_probd - count * fp_len; 397 if (d <= 0) { 398 /* count exceeds the hard limit: drop or mark */ 399 return (1); 400 } 401 402 /* 403 * now the range of d is [1..600] in fixed-point. (when 404 * th_max-th_min=10 and p_max=1/30) 405 * drop probability = (avg - TH_MIN) / d 406 */ 407 408 if ((karc4random() % d) < fp_len) { 409 /* drop or mark */ 410 return (1); 411 } 412 /* no drop/mark */ 413 return (0); 414 } 415 416 /* 417 * try to mark CE bit to the packet. 418 * returns 1 if successfully marked, 0 otherwise. 419 */ 420 int 421 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) 422 { 423 struct mbuf *m0; 424 void *hdr; 425 int af; 426 427 if (m->m_pkthdr.fw_flags & PF_MBUF_STRUCTURE) { 428 af = m->m_pkthdr.pf.ecn_af; 429 hdr = m->m_pkthdr.pf.hdr; 430 } else if (pktattr) { 431 af = pktattr->pattr_af; 432 hdr = pktattr->pattr_hdr; 433 } else { 434 return (0); 435 } 436 437 if (af != AF_INET && af != AF_INET6) 438 return (0); 439 440 /* verify that pattr_hdr is within the mbuf data */ 441 for (m0 = m; m0 != NULL; m0 = m0->m_next) { 442 if (((caddr_t)hdr >= m0->m_data) && 443 ((caddr_t)hdr < m0->m_data + m0->m_len)) 444 break; 445 } 446 if (m0 == NULL) { 447 /* ick, tag info is stale */ 448 return (0); 449 } 450 451 switch (af) { 452 case AF_INET: 453 if (flags & REDF_ECN4) { 454 struct ip *ip = hdr; 455 uint8_t otos; 456 int sum; 457 458 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) 459 return (0); /* not-ECT */ 460 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 461 return (1); /* already marked */ 462 463 /* 464 * ecn-capable but not marked, 465 * mark CE and update checksum 466 */ 467 otos = ip->ip_tos; 468 ip->ip_tos |= IPTOS_ECN_CE; 469 /* 470 * update checksum (from RFC1624) 471 * HC' = ~(~HC + ~m + m') 472 */ 473 sum = ~ntohs(ip->ip_sum) & 0xffff; 474 sum += (~otos & 0xffff) + ip->ip_tos; 475 sum = (sum >> 16) + (sum & 0xffff); 476 sum += (sum >> 16); /* add carry */ 477 ip->ip_sum = htons(~sum & 0xffff); 478 return (1); 479 } 480 break; 481 #ifdef INET6 482 case AF_INET6: 483 if (flags & REDF_ECN6) { 484 struct ip6_hdr *ip6 = hdr; 485 uint32_t flowlabel; 486 487 flowlabel = ntohl(ip6->ip6_flow); 488 if ((flowlabel >> 28) != 6) 489 return (0); /* version mismatch! */ 490 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 491 (IPTOS_ECN_NOTECT << 20)) 492 return (0); /* not-ECT */ 493 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 494 (IPTOS_ECN_CE << 20)) 495 return (1); /* already marked */ 496 /* 497 * ecn-capable but not marked, mark CE 498 */ 499 flowlabel |= (IPTOS_ECN_CE << 20); 500 ip6->ip6_flow = htonl(flowlabel); 501 return (1); 502 } 503 break; 504 #endif /* INET6 */ 505 } 506 507 /* not marked */ 508 return (0); 509 } 510 511 struct mbuf * 512 red_getq(red_t *rp, class_queue_t *q) 513 { 514 struct mbuf *m; 515 516 if ((m = _getq(q)) == NULL) { 517 if (rp->red_idle == 0) { 518 rp->red_idle = 1; 519 microtime(&rp->red_last); 520 } 521 return NULL; 522 } 523 524 rp->red_idle = 0; 525 return (m); 526 } 527 528 /* 529 * helper routine to calibrate avg during idle. 530 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 531 * here Wq = 1/weight and the code assumes Wq is close to zero. 532 * 533 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 534 */ 535 static SLIST_HEAD(, wtab) wtab_list = SLIST_HEAD_INITIALIZER(&wtab_list); 536 537 struct wtab * 538 wtab_alloc(int weight) 539 { 540 struct wtab *w; 541 int i; 542 543 SLIST_FOREACH(w, &wtab_list, w_link) { 544 if (w->w_weight == weight) { 545 w->w_refcount++; 546 return (w); 547 } 548 } 549 550 w = kmalloc(sizeof(*w), M_ALTQ, M_WAITOK | M_ZERO); 551 w->w_weight = weight; 552 w->w_refcount = 1; 553 SLIST_INSERT_HEAD(&wtab_list, w, w_link); 554 555 /* initialize the weight table */ 556 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 557 for (i = 1; i < 32; i++) { 558 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 559 if (w->w_tab[i] == 0 && w->w_param_max == 0) 560 w->w_param_max = 1 << i; 561 } 562 563 return (w); 564 } 565 566 int 567 wtab_destroy(struct wtab *w) 568 { 569 if (--w->w_refcount > 0) 570 return (0); 571 572 SLIST_REMOVE(&wtab_list, w, wtab, w_link); 573 kfree(w, M_ALTQ); 574 575 return (0); 576 } 577 578 int32_t 579 pow_w(struct wtab *w, int n) 580 { 581 int i, bit; 582 int32_t val; 583 584 if (n >= w->w_param_max) 585 return (0); 586 587 val = 1 << FP_SHIFT; 588 if (n <= 0) 589 return (val); 590 591 bit = 1; 592 i = 0; 593 while (n) { 594 if (n & bit) { 595 val = (val * w->w_tab[i]) >> FP_SHIFT; 596 n &= ~bit; 597 } 598 i++; 599 bit <<= 1; 600 } 601 return (val); 602 } 603 604 #endif /* ALTQ_RED */ 605