1 /* $KAME: altq_red.c,v 1.19 2004/04/17 10:54:49 kjc Exp $ */ 2 /* $DragonFly: src/sys/net/altq/altq_red.c,v 1.4 2006/12/22 23:44:55 swildner Exp $ */ 3 4 /* 5 * Copyright (C) 1997-2003 6 * Sony Computer Science Laboratories Inc. All rights reserved. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY SONY CSL AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL SONY CSL OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 * 29 */ 30 /* 31 * Copyright (c) 1990-1994 Regents of the University of California. 32 * All rights reserved. 33 * 34 * Redistribution and use in source and binary forms, with or without 35 * modification, are permitted provided that the following conditions 36 * are met: 37 * 1. Redistributions of source code must retain the above copyright 38 * notice, this list of conditions and the following disclaimer. 39 * 2. Redistributions in binary form must reproduce the above copyright 40 * notice, this list of conditions and the following disclaimer in the 41 * documentation and/or other materials provided with the distribution. 42 * 3. All advertising materials mentioning features or use of this software 43 * must display the following acknowledgement: 44 * This product includes software developed by the Computer Systems 45 * Engineering Group at Lawrence Berkeley Laboratory. 46 * 4. Neither the name of the University nor of the Laboratory may be used 47 * to endorse or promote products derived from this software without 48 * specific prior written permission. 49 * 50 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 51 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 52 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 53 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 54 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 55 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 56 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 57 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 58 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 59 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 60 * SUCH DAMAGE. 61 */ 62 63 #include "opt_altq.h" 64 #include "opt_inet.h" 65 #include "opt_inet6.h" 66 67 #ifdef ALTQ_RED /* red is enabled by ALTQ_RED option in opt_altq.h */ 68 69 #include <sys/param.h> 70 #include <sys/malloc.h> 71 #include <sys/mbuf.h> 72 #include <sys/socket.h> 73 #include <sys/systm.h> 74 #include <sys/errno.h> 75 76 #include <net/if.h> 77 78 #include <netinet/in.h> 79 #include <netinet/in_systm.h> 80 #include <netinet/ip.h> 81 #ifdef INET6 82 #include <netinet/ip6.h> 83 #endif 84 85 #include <net/pf/pfvar.h> 86 #include <net/altq/altq.h> 87 #include <net/altq/altq_red.h> 88 89 /* 90 * ALTQ/RED (Random Early Detection) implementation using 32-bit 91 * fixed-point calculation. 92 * 93 * written by kjc using the ns code as a reference. 94 * you can learn more about red and ns from Sally's home page at 95 * http://www-nrg.ee.lbl.gov/floyd/ 96 * 97 * most of the red parameter values are fixed in this implementation 98 * to prevent fixed-point overflow/underflow. 99 * if you change the parameters, watch out for overflow/underflow! 100 * 101 * the parameters used are recommended values by Sally. 102 * the corresponding ns config looks: 103 * q_weight=0.00195 104 * minthresh=5 maxthresh=15 queue-size=60 105 * linterm=30 106 * dropmech=drop-tail 107 * bytes=false (can't be handled by 32-bit fixed-point) 108 * doubleq=false dqthresh=false 109 * wait=true 110 */ 111 /* 112 * alternative red parameters for a slow link. 113 * 114 * assume the queue length becomes from zero to L and keeps L, it takes 115 * N packets for q_avg to reach 63% of L. 116 * when q_weight is 0.002, N is about 500 packets. 117 * for a slow link like dial-up, 500 packets takes more than 1 minute! 118 * when q_weight is 0.008, N is about 127 packets. 119 * when q_weight is 0.016, N is about 63 packets. 120 * bursts of 50 packets are allowed for 0.002, bursts of 25 packets 121 * are allowed for 0.016. 122 * see Sally's paper for more details. 123 */ 124 /* normal red parameters */ 125 #define W_WEIGHT 512 /* inverse of weight of EWMA (511/512) */ 126 /* q_weight = 0.00195 */ 127 128 /* red parameters for a slow link */ 129 #define W_WEIGHT_1 128 /* inverse of weight of EWMA (127/128) */ 130 /* q_weight = 0.0078125 */ 131 132 /* red parameters for a very slow link (e.g., dialup) */ 133 #define W_WEIGHT_2 64 /* inverse of weight of EWMA (63/64) */ 134 /* q_weight = 0.015625 */ 135 136 /* fixed-point uses 12-bit decimal places */ 137 #define FP_SHIFT 12 /* fixed-point shift */ 138 139 /* red parameters for drop probability */ 140 #define INV_P_MAX 10 /* inverse of max drop probability */ 141 #define TH_MIN 5 /* min threshold */ 142 #define TH_MAX 15 /* max threshold */ 143 144 #define RED_LIMIT 60 /* default max queue lenght */ 145 #define RED_STATS /* collect statistics */ 146 147 /* 148 * our default policy for forced-drop is drop-tail. 149 * (in altq-1.1.2 or earlier, the default was random-drop. 150 * but it makes more sense to punish the cause of the surge.) 151 * to switch to the random-drop policy, define "RED_RANDOM_DROP". 152 */ 153 154 /* default red parameter values */ 155 static int default_th_min = TH_MIN; 156 static int default_th_max = TH_MAX; 157 static int default_inv_pmax = INV_P_MAX; 158 159 /* 160 * red support routines 161 */ 162 red_t * 163 red_alloc(int weight, int inv_pmax, int th_min, int th_max, int flags, int pkttime) 164 { 165 red_t *rp; 166 int w, i; 167 int npkts_per_sec; 168 169 rp = kmalloc(sizeof(*rp), M_ALTQ, M_WAITOK | M_ZERO); 170 rp->red_avg = 0; 171 rp->red_idle = 1; 172 173 if (weight == 0) 174 rp->red_weight = W_WEIGHT; 175 else 176 rp->red_weight = weight; 177 if (inv_pmax == 0) 178 rp->red_inv_pmax = default_inv_pmax; 179 else 180 rp->red_inv_pmax = inv_pmax; 181 if (th_min == 0) 182 rp->red_thmin = default_th_min; 183 else 184 rp->red_thmin = th_min; 185 if (th_max == 0) 186 rp->red_thmax = default_th_max; 187 else 188 rp->red_thmax = th_max; 189 190 rp->red_flags = flags; 191 192 if (pkttime == 0) 193 /* default packet time: 1000 bytes / 10Mbps * 8 * 1000000 */ 194 rp->red_pkttime = 800; 195 else 196 rp->red_pkttime = pkttime; 197 198 if (weight == 0) { 199 /* when the link is very slow, adjust red parameters */ 200 npkts_per_sec = 1000000 / rp->red_pkttime; 201 if (npkts_per_sec < 50) { 202 /* up to about 400Kbps */ 203 rp->red_weight = W_WEIGHT_2; 204 } else if (npkts_per_sec < 300) { 205 /* up to about 2.4Mbps */ 206 rp->red_weight = W_WEIGHT_1; 207 } 208 } 209 210 /* calculate wshift. weight must be power of 2 */ 211 w = rp->red_weight; 212 for (i = 0; w > 1; i++) 213 w = w >> 1; 214 rp->red_wshift = i; 215 w = 1 << rp->red_wshift; 216 if (w != rp->red_weight) { 217 kprintf("invalid weight value %d for red! use %d\n", 218 rp->red_weight, w); 219 rp->red_weight = w; 220 } 221 222 /* 223 * thmin_s and thmax_s are scaled versions of th_min and th_max 224 * to be compared with avg. 225 */ 226 rp->red_thmin_s = rp->red_thmin << (rp->red_wshift + FP_SHIFT); 227 rp->red_thmax_s = rp->red_thmax << (rp->red_wshift + FP_SHIFT); 228 229 /* 230 * precompute probability denominator 231 * probd = (2 * (TH_MAX-TH_MIN) / pmax) in fixed-point 232 */ 233 rp->red_probd = (2 * (rp->red_thmax - rp->red_thmin) 234 * rp->red_inv_pmax) << FP_SHIFT; 235 236 /* allocate weight table */ 237 rp->red_wtab = wtab_alloc(rp->red_weight); 238 239 microtime(&rp->red_last); 240 return (rp); 241 } 242 243 void 244 red_destroy(red_t *rp) 245 { 246 wtab_destroy(rp->red_wtab); 247 kfree(rp, M_ALTQ); 248 } 249 250 void 251 red_getstats(red_t *rp, struct redstats *sp) 252 { 253 sp->q_avg = rp->red_avg >> rp->red_wshift; 254 sp->xmit_cnt = rp->red_stats.xmit_cnt; 255 sp->drop_cnt = rp->red_stats.drop_cnt; 256 sp->drop_forced = rp->red_stats.drop_forced; 257 sp->drop_unforced = rp->red_stats.drop_unforced; 258 sp->marked_packets = rp->red_stats.marked_packets; 259 } 260 261 int 262 red_addq(red_t *rp, class_queue_t *q, struct mbuf *m, struct altq_pktattr *pktattr) 263 { 264 int avg, droptype; 265 int n; 266 267 avg = rp->red_avg; 268 269 /* 270 * if we were idle, we pretend that n packets arrived during 271 * the idle period. 272 */ 273 if (rp->red_idle) { 274 struct timeval now; 275 int t; 276 277 rp->red_idle = 0; 278 microtime(&now); 279 t = (now.tv_sec - rp->red_last.tv_sec); 280 if (t > 60) { 281 /* 282 * being idle for more than 1 minute, set avg to zero. 283 * this prevents t from overflow. 284 */ 285 avg = 0; 286 } else { 287 t = t * 1000000 + (now.tv_usec - rp->red_last.tv_usec); 288 n = t / rp->red_pkttime - 1; 289 290 /* the following line does (avg = (1 - Wq)^n * avg) */ 291 if (n > 0) 292 avg = (avg >> FP_SHIFT) * 293 pow_w(rp->red_wtab, n); 294 } 295 } 296 297 /* run estimator. (note: avg is scaled by WEIGHT in fixed-point) */ 298 avg += (qlen(q) << FP_SHIFT) - (avg >> rp->red_wshift); 299 rp->red_avg = avg; /* save the new value */ 300 301 /* 302 * red_count keeps a tally of arriving traffic that has not 303 * been dropped. 304 */ 305 rp->red_count++; 306 307 /* see if we drop early */ 308 droptype = DTYPE_NODROP; 309 if (avg >= rp->red_thmin_s && qlen(q) > 1) { 310 if (avg >= rp->red_thmax_s) { 311 /* avg >= th_max: forced drop */ 312 droptype = DTYPE_FORCED; 313 } else if (rp->red_old == 0) { 314 /* first exceeds th_min */ 315 rp->red_count = 1; 316 rp->red_old = 1; 317 } else if (drop_early((avg - rp->red_thmin_s) >> rp->red_wshift, 318 rp->red_probd, rp->red_count)) { 319 /* mark or drop by red */ 320 if ((rp->red_flags & REDF_ECN) && 321 mark_ecn(m, pktattr, rp->red_flags)) { 322 /* successfully marked. do not drop. */ 323 rp->red_count = 0; 324 #ifdef RED_STATS 325 rp->red_stats.marked_packets++; 326 #endif 327 } else { 328 /* unforced drop by red */ 329 droptype = DTYPE_EARLY; 330 } 331 } 332 } else { 333 /* avg < th_min */ 334 rp->red_old = 0; 335 } 336 337 /* 338 * if the queue length hits the hard limit, it's a forced drop. 339 */ 340 if (droptype == DTYPE_NODROP && qlen(q) >= qlimit(q)) 341 droptype = DTYPE_FORCED; 342 343 #ifdef RED_RANDOM_DROP 344 /* if successful or forced drop, enqueue this packet. */ 345 if (droptype != DTYPE_EARLY) 346 _addq(q, m); 347 #else 348 /* if successful, enqueue this packet. */ 349 if (droptype == DTYPE_NODROP) 350 _addq(q, m); 351 #endif 352 if (droptype != DTYPE_NODROP) { 353 if (droptype == DTYPE_EARLY) { 354 /* drop the incoming packet */ 355 #ifdef RED_STATS 356 rp->red_stats.drop_unforced++; 357 #endif 358 } else { 359 /* forced drop, select a victim packet in the queue. */ 360 #ifdef RED_RANDOM_DROP 361 m = _getq_random(q); 362 #endif 363 #ifdef RED_STATS 364 rp->red_stats.drop_forced++; 365 #endif 366 } 367 #ifdef RED_STATS 368 PKTCNTR_ADD(&rp->red_stats.drop_cnt, m_pktlen(m)); 369 #endif 370 rp->red_count = 0; 371 m_freem(m); 372 return (-1); 373 } 374 /* successfully queued */ 375 #ifdef RED_STATS 376 PKTCNTR_ADD(&rp->red_stats.xmit_cnt, m_pktlen(m)); 377 #endif 378 return (0); 379 } 380 381 /* 382 * early-drop probability is calculated as follows: 383 * prob = p_max * (avg - th_min) / (th_max - th_min) 384 * prob_a = prob / (2 - count*prob) 385 * = (avg-th_min) / (2*(th_max-th_min)*inv_p_max - count*(avg-th_min)) 386 * here prob_a increases as successive undrop count increases. 387 * (prob_a starts from prob/2, becomes prob when (count == (1 / prob)), 388 * becomes 1 when (count >= (2 / prob))). 389 */ 390 int 391 drop_early(int fp_len, int fp_probd, int count) 392 { 393 int d; /* denominator of drop-probability */ 394 395 d = fp_probd - count * fp_len; 396 if (d <= 0) { 397 /* count exceeds the hard limit: drop or mark */ 398 return (1); 399 } 400 401 /* 402 * now the range of d is [1..600] in fixed-point. (when 403 * th_max-th_min=10 and p_max=1/30) 404 * drop probability = (avg - TH_MIN) / d 405 */ 406 407 if ((karc4random() % d) < fp_len) { 408 /* drop or mark */ 409 return (1); 410 } 411 /* no drop/mark */ 412 return (0); 413 } 414 415 /* 416 * try to mark CE bit to the packet. 417 * returns 1 if successfully marked, 0 otherwise. 418 */ 419 int 420 mark_ecn(struct mbuf *m, struct altq_pktattr *pktattr, int flags) 421 { 422 struct mbuf *m0; 423 void *hdr; 424 int af; 425 426 if ((m->m_pkthdr.fw_flags & ALTQ_MBUF_TAGGED) == 0) 427 return (0); 428 af = m->m_pkthdr.ecn_af; 429 hdr = m->m_pkthdr.header; 430 431 if (af != AF_INET && af != AF_INET6) 432 return (0); 433 434 /* verify that pattr_hdr is within the mbuf data */ 435 for (m0 = m; m0 != NULL; m0 = m0->m_next) { 436 if (((caddr_t)hdr >= m0->m_data) && 437 ((caddr_t)hdr < m0->m_data + m0->m_len)) 438 break; 439 } 440 if (m0 == NULL) { 441 /* ick, tag info is stale */ 442 return (0); 443 } 444 445 switch (af) { 446 case AF_INET: 447 if (flags & REDF_ECN4) { 448 struct ip *ip = hdr; 449 uint8_t otos; 450 int sum; 451 452 if (ip->ip_v != 4) 453 return (0); /* version mismatch! */ 454 455 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_NOTECT) 456 return (0); /* not-ECT */ 457 if ((ip->ip_tos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) 458 return (1); /* already marked */ 459 460 /* 461 * ecn-capable but not marked, 462 * mark CE and update checksum 463 */ 464 otos = ip->ip_tos; 465 ip->ip_tos |= IPTOS_ECN_CE; 466 /* 467 * update checksum (from RFC1624) 468 * HC' = ~(~HC + ~m + m') 469 */ 470 sum = ~ntohs(ip->ip_sum) & 0xffff; 471 sum += (~otos & 0xffff) + ip->ip_tos; 472 sum = (sum >> 16) + (sum & 0xffff); 473 sum += (sum >> 16); /* add carry */ 474 ip->ip_sum = htons(~sum & 0xffff); 475 return (1); 476 } 477 break; 478 #ifdef INET6 479 case AF_INET6: 480 if (flags & REDF_ECN6) { 481 struct ip6_hdr *ip6 = hdr; 482 uint32_t flowlabel; 483 484 flowlabel = ntohl(ip6->ip6_flow); 485 if ((flowlabel >> 28) != 6) 486 return (0); /* version mismatch! */ 487 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 488 (IPTOS_ECN_NOTECT << 20)) 489 return (0); /* not-ECT */ 490 if ((flowlabel & (IPTOS_ECN_MASK << 20)) == 491 (IPTOS_ECN_CE << 20)) 492 return (1); /* already marked */ 493 /* 494 * ecn-capable but not marked, mark CE 495 */ 496 flowlabel |= (IPTOS_ECN_CE << 20); 497 ip6->ip6_flow = htonl(flowlabel); 498 return (1); 499 } 500 break; 501 #endif /* INET6 */ 502 } 503 504 /* not marked */ 505 return (0); 506 } 507 508 struct mbuf * 509 red_getq(red_t *rp, class_queue_t *q) 510 { 511 struct mbuf *m; 512 513 if ((m = _getq(q)) == NULL) { 514 if (rp->red_idle == 0) { 515 rp->red_idle = 1; 516 microtime(&rp->red_last); 517 } 518 return NULL; 519 } 520 521 rp->red_idle = 0; 522 return (m); 523 } 524 525 /* 526 * helper routine to calibrate avg during idle. 527 * pow_w(wtab, n) returns (1 - Wq)^n in fixed-point 528 * here Wq = 1/weight and the code assumes Wq is close to zero. 529 * 530 * w_tab[n] holds ((1 - Wq)^(2^n)) in fixed-point. 531 */ 532 static SLIST_HEAD(, wtab) wtab_list = SLIST_HEAD_INITIALIZER(&wtab_list); 533 534 struct wtab * 535 wtab_alloc(int weight) 536 { 537 struct wtab *w; 538 int i; 539 540 SLIST_FOREACH(w, &wtab_list, w_link) { 541 if (w->w_weight == weight) { 542 w->w_refcount++; 543 return (w); 544 } 545 } 546 547 w = kmalloc(sizeof(*w), M_ALTQ, M_WAITOK | M_ZERO); 548 w->w_weight = weight; 549 w->w_refcount = 1; 550 SLIST_INSERT_HEAD(&wtab_list, w, w_link); 551 552 /* initialize the weight table */ 553 w->w_tab[0] = ((weight - 1) << FP_SHIFT) / weight; 554 for (i = 1; i < 32; i++) { 555 w->w_tab[i] = (w->w_tab[i-1] * w->w_tab[i-1]) >> FP_SHIFT; 556 if (w->w_tab[i] == 0 && w->w_param_max == 0) 557 w->w_param_max = 1 << i; 558 } 559 560 return (w); 561 } 562 563 int 564 wtab_destroy(struct wtab *w) 565 { 566 if (--w->w_refcount > 0) 567 return (0); 568 569 SLIST_REMOVE(&wtab_list, w, wtab, w_link); 570 kfree(w, M_ALTQ); 571 572 return (0); 573 } 574 575 int32_t 576 pow_w(struct wtab *w, int n) 577 { 578 int i, bit; 579 int32_t val; 580 581 if (n >= w->w_param_max) 582 return (0); 583 584 val = 1 << FP_SHIFT; 585 if (n <= 0) 586 return (val); 587 588 bit = 1; 589 i = 0; 590 while (n) { 591 if (n & bit) { 592 val = (val * w->w_tab[i]) >> FP_SHIFT; 593 n &= ~bit; 594 } 595 i++; 596 bit <<= 1; 597 } 598 return (val); 599 } 600 601 #endif /* ALTQ_RED */ 602