1 /*- 2 * Copyright (c) 2017 Netflix, Inc. 3 * 4 * Redistribution and use in source and binary forms, with or without 5 * modification, are permitted provided that the following conditions 6 * are met: 7 * 1. Redistributions of source code must retain the above copyright 8 * notice, this list of conditions and the following disclaimer. 9 * 2. Redistributions in binary form must reproduce the above copyright 10 * notice, this list of conditions and the following disclaimer in the 11 * documentation and/or other materials provided with the distribution. 12 * 13 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 16 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 23 * SUCH DAMAGE. 24 * 25 */ 26 #include <sys/cdefs.h> 27 __FBSDID("$FreeBSD$"); 28 #include <sys/types.h> 29 #include <sys/queue.h> 30 #include <sys/socket.h> 31 #include <sys/mbuf.h> 32 #include <sys/sockopt.h> 33 #include <netinet/tcp.h> 34 #include <netinet/tcp_var.h> 35 #include <netinet/tcp_seq.h> 36 #ifndef _KERNEL 37 #include <stdio.h> 38 #include <unistd.h> 39 #include <string.h> 40 #include <strings.h> 41 #include <stdlib.h> 42 #include <limits.h> 43 #include <getopt.h> 44 #endif 45 #include "sack_filter.h" 46 47 /* 48 * Sack filter is used to filter out sacks 49 * that have already been processed. The idea 50 * is pretty simple really, consider two sacks 51 * 52 * SACK 1 53 * cum-ack A 54 * sack B - C 55 * SACK 2 56 * cum-ack A 57 * sack D - E 58 * sack B - C 59 * 60 * The previous sack information (B-C) is repeated 61 * in SACK 2. If the receiver gets SACK 1 and then 62 * SACK 2 then any work associated with B-C as already 63 * been completed. This only effects where we may have 64 * (as in bbr or rack) cases where we walk a linked list. 65 * 66 * Now the utility trys to keep everything in a single 67 * cache line. This means that its not perfect and 68 * it could be that so big of sack's come that a 69 * "remembered" processed sack falls off the list and 70 * so gets re-processed. Thats ok, it just means we 71 * did some extra work. We could of course take more 72 * cache line hits by expanding the size of this 73 * structure, but then that would cost more. 74 */ 75 76 #ifndef _KERNEL 77 int detailed_dump = 0; 78 uint64_t cnt_skipped_oldsack = 0; 79 uint64_t cnt_used_oldsack = 0; 80 int highest_used=0; 81 int over_written=0; 82 int empty_avail=0; 83 int no_collapse = 0; 84 FILE *out = NULL; 85 FILE *in = NULL; 86 #endif 87 88 #define sack_blk_used(sf, i) ((1 << i) & sf->sf_bits) 89 #define sack_blk_set(sf, i) ((1 << i) | sf->sf_bits) 90 #define sack_blk_clr(sf, i) (~(1 << i) & sf->sf_bits) 91 92 #ifndef _KERNEL 93 static 94 #endif 95 void 96 sack_filter_clear(struct sack_filter *sf, tcp_seq seq) 97 { 98 sf->sf_ack = seq; 99 sf->sf_bits = 0; 100 sf->sf_cur = 0; 101 sf->sf_used = 0; 102 } 103 /* 104 * Given a previous sack filter block, filter out 105 * any entries where the cum-ack moves over them 106 * fully or partially. 107 */ 108 static void 109 sack_filter_prune(struct sack_filter *sf, tcp_seq th_ack) 110 { 111 int32_t i; 112 /* start with the oldest */ 113 for (i = 0; i < SACK_FILTER_BLOCKS; i++) { 114 if (sack_blk_used(sf, i)) { 115 if (SEQ_GT(th_ack, sf->sf_blks[i].end)) { 116 /* This block is consumed */ 117 sf->sf_bits = sack_blk_clr(sf, i); 118 sf->sf_used--; 119 } else if (SEQ_GT(th_ack, sf->sf_blks[i].start)) { 120 /* Some of it is acked */ 121 sf->sf_blks[i].start = th_ack; 122 /* We could in theory break here, but 123 * there are some broken implementations 124 * that send multiple blocks. We want 125 * to catch them all with similar seq's. 126 */ 127 } 128 } 129 } 130 sf->sf_ack = th_ack; 131 } 132 133 /* 134 * Return true if you find that 135 * the sackblock b is on the score 136 * board. Update it along the way 137 * if part of it is on the board. 138 */ 139 static int32_t 140 is_sack_on_board(struct sack_filter *sf, struct sackblk *b) 141 { 142 int32_t i, cnt; 143 for (i = sf->sf_cur, cnt=0; cnt < SACK_FILTER_BLOCKS; cnt++) { 144 if (sack_blk_used(sf, i)) { 145 if (SEQ_LT(b->start, sf->sf_ack)) { 146 /* Behind cum-ack update */ 147 b->start = sf->sf_ack; 148 } 149 if (SEQ_LT(b->end, sf->sf_ack)) { 150 /* End back behind too */ 151 b->end = sf->sf_ack; 152 } 153 if (b->start == b->end) 154 return(1); 155 /* Jonathans Rule 1 */ 156 if (SEQ_LEQ(sf->sf_blks[i].start, b->start) && 157 SEQ_GEQ(sf->sf_blks[i].end, b->end)) { 158 /** 159 * Our board has this entirely in 160 * whole or in part: 161 * 162 * board |-------------| 163 * sack |-------------| 164 * <or> 165 * board |-------------| 166 * sack |----| 167 * 168 */ 169 return(1); 170 } 171 /* Jonathans Rule 2 */ 172 if(SEQ_LT(sf->sf_blks[i].end, b->start)) { 173 /** 174 * Not near each other: 175 * 176 * board |---| 177 * sack |---| 178 */ 179 goto nxt_blk; 180 } 181 /* Jonathans Rule 3 */ 182 if (SEQ_GT(sf->sf_blks[i].start, b->end)) { 183 /** 184 * Not near each other: 185 * 186 * board |---| 187 * sack |---| 188 */ 189 goto nxt_blk; 190 } 191 if (SEQ_LEQ(sf->sf_blks[i].start, b->start)) { 192 /** 193 * The board block partial meets: 194 * 195 * board |--------| 196 * sack |----------| 197 * <or> 198 * board |--------| 199 * sack |--------------| 200 * 201 * up with this one (we have part of it). 202 * 1) Update the board block to the new end 203 * and 204 * 2) Update the start of this block to my end. 205 */ 206 b->start = sf->sf_blks[i].end; 207 sf->sf_blks[i].end = b->end; 208 goto nxt_blk; 209 } 210 if (SEQ_GEQ(sf->sf_blks[i].end, b->end)) { 211 /** 212 * The board block partial meets: 213 * 214 * board |--------| 215 * sack |----------| 216 * <or> 217 * board |----| 218 * sack |----------| 219 * 1) Update the board block to the new start 220 * and 221 * 2) Update the start of this block to my end. 222 */ 223 b->end = sf->sf_blks[i].start; 224 sf->sf_blks[i].start = b->start; 225 goto nxt_blk; 226 } 227 } 228 nxt_blk: 229 i++; 230 i %= SACK_FILTER_BLOCKS; 231 } 232 /* Did we totally consume it in pieces? */ 233 if (b->start != b->end) 234 return(0); 235 else 236 return(1); 237 } 238 239 static int32_t 240 sack_filter_old(struct sack_filter *sf, struct sackblk *in, int numblks) 241 { 242 int32_t num, i; 243 struct sackblk blkboard[TCP_MAX_SACK]; 244 /* 245 * An old sack has arrived. It may contain data 246 * we do not have. We might not have it since 247 * we could have had a lost ack <or> we might have the 248 * entire thing on our current board. We want to prune 249 * off anything we have. With this function though we 250 * won't add to the board. 251 */ 252 for( i = 0, num = 0; i<numblks; i++ ) { 253 if (is_sack_on_board(sf, &in[i])) { 254 #ifndef _KERNEL 255 cnt_skipped_oldsack++; 256 #endif 257 continue; 258 } 259 /* Did not find it (or found only 260 * a piece of it). Copy it to 261 * our outgoing board. 262 */ 263 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); 264 #ifndef _KERNEL 265 cnt_used_oldsack++; 266 #endif 267 num++; 268 } 269 if (num) { 270 memcpy(in, blkboard, (num * sizeof(struct sackblk))); 271 } 272 return (num); 273 } 274 275 /* 276 * Given idx its used but there is space available 277 * move the entry to the next free slot 278 */ 279 static void 280 sack_move_to_empty(struct sack_filter *sf, uint32_t idx) 281 { 282 int32_t i, cnt; 283 284 i = (idx + 1) % SACK_FILTER_BLOCKS; 285 for (cnt=0; cnt <(SACK_FILTER_BLOCKS-1); cnt++) { 286 if (sack_blk_used(sf, i) == 0) { 287 memcpy(&sf->sf_blks[i], &sf->sf_blks[idx], sizeof(struct sackblk)); 288 sf->sf_bits = sack_blk_clr(sf, idx); 289 sf->sf_bits = sack_blk_set(sf, i); 290 return; 291 } 292 i++; 293 i %= SACK_FILTER_BLOCKS; 294 } 295 } 296 297 static int32_t 298 sack_filter_new(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) 299 { 300 struct sackblk blkboard[TCP_MAX_SACK]; 301 int32_t num, i; 302 /* 303 * First lets trim the old and possibly 304 * throw any away we have. 305 */ 306 for(i=0, num=0; i<numblks; i++) { 307 if (is_sack_on_board(sf, &in[i])) 308 continue; 309 memcpy(&blkboard[num], &in[i], sizeof(struct sackblk)); 310 num++; 311 } 312 if (num == 0) 313 return(num); 314 315 /* Now what we are left is either 316 * completely merged on to the board 317 * from the above steps, or are new 318 * and need to be added to the board 319 * with the last one updated to current. 320 * 321 * First copy it out we want to return that 322 * to our caller for processing. 323 */ 324 memcpy(in, blkboard, (num * sizeof(struct sackblk))); 325 numblks = num; 326 /* Now go through and add to our board as needed */ 327 for(i=(num-1); i>=0; i--) { 328 if (is_sack_on_board(sf, &blkboard[i])) 329 continue; 330 /* Add this guy its not listed */ 331 sf->sf_cur++; 332 sf->sf_cur %= SACK_FILTER_BLOCKS; 333 if ((sack_blk_used(sf, sf->sf_cur)) && 334 (sf->sf_used < SACK_FILTER_BLOCKS)) { 335 sack_move_to_empty(sf, sf->sf_cur); 336 } 337 #ifndef _KERNEL 338 if (sack_blk_used(sf, sf->sf_cur)) { 339 over_written++; 340 if (sf->sf_used < SACK_FILTER_BLOCKS) 341 empty_avail++; 342 } 343 #endif 344 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); 345 if (sack_blk_used(sf, sf->sf_cur) == 0) { 346 sf->sf_used++; 347 #ifndef _KERNEL 348 if (sf->sf_used > highest_used) 349 highest_used = sf->sf_used; 350 #endif 351 sf->sf_bits = sack_blk_set(sf, sf->sf_cur); 352 } 353 } 354 return(numblks); 355 } 356 357 /* 358 * Given a sack block on the board (the skip index) see if 359 * any other used entries overlap or meet, if so return the index. 360 */ 361 static int32_t 362 sack_blocks_overlap_or_meet(struct sack_filter *sf, struct sackblk *sb, uint32_t skip) 363 { 364 int32_t i; 365 366 for(i=0; i<SACK_FILTER_BLOCKS; i++) { 367 if (sack_blk_used(sf, i) == 0) 368 continue; 369 if (i == skip) 370 continue; 371 if (SEQ_GEQ(sf->sf_blks[i].end, sb->start) && 372 SEQ_LEQ(sf->sf_blks[i].end, sb->end) && 373 SEQ_LEQ(sf->sf_blks[i].start, sb->start)) { 374 /** 375 * The two board blocks meet: 376 * 377 * board1 |--------| 378 * board2 |----------| 379 * <or> 380 * board1 |--------| 381 * board2 |--------------| 382 * <or> 383 * board1 |--------| 384 * board2 |--------| 385 */ 386 return(i); 387 } 388 if (SEQ_LEQ(sf->sf_blks[i].start, sb->end) && 389 SEQ_GEQ(sf->sf_blks[i].start, sb->start) && 390 SEQ_GEQ(sf->sf_blks[i].end, sb->end)) { 391 /** 392 * The board block partial meets: 393 * 394 * board |--------| 395 * sack |----------| 396 * <or> 397 * board |----| 398 * sack |----------| 399 * 1) Update the board block to the new start 400 * and 401 * 2) Update the start of this block to my end. 402 */ 403 return(i); 404 } 405 } 406 return (-1); 407 } 408 409 /* 410 * Collapse entry src into entry into 411 * and free up the src entry afterwards. 412 */ 413 static void 414 sack_collapse(struct sack_filter *sf, int32_t src, int32_t into) 415 { 416 if (SEQ_LT(sf->sf_blks[src].start, sf->sf_blks[into].start)) { 417 /* src has a lower starting point */ 418 sf->sf_blks[into].start = sf->sf_blks[src].start; 419 } 420 if (SEQ_GT(sf->sf_blks[src].end, sf->sf_blks[into].end)) { 421 /* src has a higher ending point */ 422 sf->sf_blks[into].end = sf->sf_blks[src].end; 423 } 424 sf->sf_bits = sack_blk_clr(sf, src); 425 sf->sf_used--; 426 } 427 428 static void 429 sack_board_collapse(struct sack_filter *sf) 430 { 431 int32_t i, j, i_d, j_d; 432 433 for(i=0; i<SACK_FILTER_BLOCKS; i++) { 434 if (sack_blk_used(sf, i) == 0) 435 continue; 436 /* 437 * Look at all other blocks but this guy 438 * to see if they overlap. If so we collapse 439 * the two blocks together. 440 */ 441 j = sack_blocks_overlap_or_meet(sf, &sf->sf_blks[i], i); 442 if (j == -1) { 443 /* No overlap */ 444 continue; 445 } 446 /* 447 * Ok j and i overlap with each other, collapse the 448 * one out furthest away from the current position. 449 */ 450 if (sf->sf_cur > i) 451 i_d = sf->sf_cur - i; 452 else 453 i_d = i - sf->sf_cur; 454 if (sf->sf_cur > j) 455 j_d = sf->sf_cur - j; 456 else 457 j_d = j - sf->sf_cur; 458 if (j_d > i_d) { 459 sack_collapse(sf, j, i); 460 } else 461 sack_collapse(sf, i, j); 462 } 463 } 464 465 #ifndef _KERNEL 466 static 467 #endif 468 int 469 sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack) 470 { 471 int32_t i, ret; 472 473 if (numblks > TCP_MAX_SACK) { 474 panic("sf:%p sb:%p Impossible number of sack blocks %d > 4\n", 475 sf, in, 476 numblks); 477 return(numblks); 478 } 479 if ((sf->sf_used == 0) && numblks) { 480 /* 481 * We are brand new add the blocks in 482 * reverse order. Note we can see more 483 * than one in new, since ack's could be lost. 484 */ 485 sf->sf_ack = th_ack; 486 for(i=(numblks-1), sf->sf_cur=0; i >= 0; i--) { 487 memcpy(&sf->sf_blks[sf->sf_cur], &in[i], sizeof(struct sackblk)); 488 sf->sf_bits = sack_blk_set(sf, sf->sf_cur); 489 sf->sf_cur++; 490 sf->sf_cur %= SACK_FILTER_BLOCKS; 491 sf->sf_used++; 492 #ifndef _KERNEL 493 if (sf->sf_used > highest_used) 494 highest_used = sf->sf_used; 495 #endif 496 } 497 if (sf->sf_cur) 498 sf->sf_cur--; 499 return(numblks); 500 } 501 if (SEQ_GT(th_ack, sf->sf_ack)) { 502 sack_filter_prune(sf, th_ack); 503 } 504 if (numblks) { 505 if (SEQ_GEQ(th_ack, sf->sf_ack)) { 506 ret = sack_filter_new(sf, in, numblks, th_ack); 507 } else { 508 ret = sack_filter_old(sf, in, numblks); 509 } 510 } else 511 ret = 0; 512 #ifndef _KERNEL 513 if ((sf->sf_used > 1) && (no_collapse == 0)) 514 sack_board_collapse(sf); 515 516 #else 517 if (sf->sf_used > 1) 518 sack_board_collapse(sf); 519 520 #endif 521 return (ret); 522 } 523 524 #ifndef _KERNEL 525 uint64_t saved=0; 526 uint64_t tot_sack_blks=0; 527 528 static void 529 sack_filter_dump(FILE *out, struct sack_filter *sf) 530 { 531 int i; 532 fprintf(out, " sf_ack:%u sf_bits:0x%x c:%d used:%d\n", 533 sf->sf_ack, sf->sf_bits, 534 sf->sf_cur, sf->sf_used); 535 536 for(i=0; i<SACK_FILTER_BLOCKS; i++) { 537 if (sack_blk_used(sf, i)) { 538 fprintf(out, "Entry:%d start:%u end:%u\n", i, 539 sf->sf_blks[i].start, 540 sf->sf_blks[i].end); 541 } 542 } 543 } 544 545 int 546 main(int argc, char **argv) 547 { 548 char buffer[512]; 549 struct sackblk blks[TCP_MAX_SACK]; 550 FILE *err; 551 tcp_seq th_ack, snd_una; 552 struct sack_filter sf; 553 int32_t numblks,i; 554 int snd_una_set=0; 555 double a, b, c; 556 int invalid_sack_print = 0; 557 uint32_t chg_remembered=0; 558 uint32_t sack_chg=0; 559 char line_buf[10][256]; 560 int line_buf_at=0; 561 562 in = stdin; 563 out = stdout; 564 while ((i = getopt(argc, argv, "ndIi:o:?h")) != -1) { 565 switch (i) { 566 case 'n': 567 no_collapse = 1; 568 break; 569 case 'd': 570 detailed_dump = 1; 571 break; 572 case'I': 573 invalid_sack_print = 1; 574 break; 575 case 'i': 576 in = fopen(optarg, "r"); 577 if (in == NULL) { 578 fprintf(stderr, "Fatal error can't open %s for input\n", optarg); 579 exit(-1); 580 } 581 break; 582 case 'o': 583 out = fopen(optarg, "w"); 584 if (out == NULL) { 585 fprintf(stderr, "Fatal error can't open %s for output\n", optarg); 586 exit(-1); 587 } 588 break; 589 default: 590 case '?': 591 case 'h': 592 fprintf(stderr, "Use %s [ -i infile -o outfile -I]\n", argv[0]); 593 return(0); 594 break; 595 }; 596 } 597 sack_filter_clear(&sf, 0); 598 memset(buffer, 0, sizeof(buffer)); 599 memset(blks, 0, sizeof(blks)); 600 numblks = 0; 601 fprintf(out, "************************************\n"); 602 while (fgets(buffer, sizeof(buffer), in) != NULL) { 603 sprintf(line_buf[line_buf_at], "%s", buffer); 604 line_buf_at++; 605 if (strncmp(buffer, "QUIT", 4) == 0) { 606 break; 607 } else if (strncmp(buffer, "DONE", 4) == 0) { 608 int nn, ii; 609 if (numblks) { 610 uint32_t szof, tot_chg; 611 for(ii=0; ii<line_buf_at; ii++) { 612 fprintf(out, "%s", line_buf[ii]); 613 } 614 fprintf(out, "------------------------------------\n"); 615 nn = sack_filter_blks(&sf, blks, numblks, th_ack); 616 saved += numblks - nn; 617 tot_sack_blks += numblks; 618 fprintf(out, "ACK:%u\n", sf.sf_ack); 619 for(ii=0, tot_chg=0; ii<nn; ii++) { 620 szof = blks[ii].end - blks[ii].start; 621 tot_chg += szof; 622 fprintf(out, "SACK:%u:%u [%u]\n", 623 blks[ii].start, 624 blks[ii].end, szof); 625 } 626 fprintf(out,"************************************\n"); 627 chg_remembered = tot_chg; 628 if (detailed_dump) { 629 sack_filter_dump(out, &sf); 630 fprintf(out,"************************************\n"); 631 } 632 } 633 memset(blks, 0, sizeof(blks)); 634 memset(line_buf, 0, sizeof(line_buf)); 635 line_buf_at=0; 636 numblks = 0; 637 } else if (strncmp(buffer, "CHG:", 4) == 0) { 638 sack_chg = strtoul(&buffer[4], NULL, 0); 639 if ((sack_chg != chg_remembered) && 640 (sack_chg > chg_remembered)){ 641 fprintf(out,"***WARNING WILL RODGERS DANGER!! sack_chg:%u last:%u\n", 642 sack_chg, chg_remembered 643 ); 644 } 645 sack_chg = chg_remembered = 0; 646 } else if (strncmp(buffer, "RXT", 3) == 0) { 647 sack_filter_clear(&sf, snd_una); 648 } else if (strncmp(buffer, "ACK:", 4) == 0) { 649 th_ack = strtoul(&buffer[4], NULL, 0); 650 if (snd_una_set == 0) { 651 snd_una = th_ack; 652 snd_una_set = 1; 653 } else if (SEQ_GT(th_ack, snd_una)) { 654 snd_una = th_ack; 655 } 656 } else if (strncmp(buffer, "EXIT", 4) == 0) { 657 sack_filter_clear(&sf, snd_una); 658 sack_chg = chg_remembered = 0; 659 } else if (strncmp(buffer, "SACK:", 5) == 0) { 660 char *end=NULL; 661 uint32_t start; 662 uint32_t endv; 663 start = strtoul(&buffer[5], &end, 0); 664 if (end) { 665 endv = strtoul(&end[1], NULL, 0); 666 } else { 667 fprintf(out, "--Sack invalid skip 0 start:%u : ??\n", start); 668 continue; 669 } 670 if (SEQ_LT(endv, start)) { 671 fprintf(out, "--Sack invalid skip 1 endv:%u < start:%u\n", endv, start); 672 continue; 673 } 674 if (numblks == TCP_MAX_SACK) { 675 fprintf(out, "--Exceeded max %d\n", numblks); 676 exit(0); 677 } 678 blks[numblks].start = start; 679 blks[numblks].end = endv; 680 numblks++; 681 } 682 memset(buffer, 0, sizeof(buffer)); 683 } 684 if (in != stdin) { 685 fclose(in); 686 } 687 if (out != stdout) { 688 fclose(out); 689 } 690 a = saved * 100.0; 691 b = tot_sack_blks * 1.0; 692 if (b > 0.0) 693 c = a/b; 694 else 695 c = 0.0; 696 if (out != stdout) 697 err = stdout; 698 else 699 err = stderr; 700 fprintf(err, "Saved %lu sack blocks out of %lu (%2.3f%%) old_skip:%lu old_usd:%lu high_cnt:%d ow:%d ea:%d\n", 701 saved, tot_sack_blks, c, cnt_skipped_oldsack, cnt_used_oldsack, highest_used, over_written, empty_avail); 702 return(0); 703 } 704 #endif 705