1 /* crc32.c -- compute the CRC-32 of a data stream 2 * Copyright (C) 1995-2022 Mark Adler 3 * For conditions of distribution and use, see copyright notice in zlib.h 4 * 5 * This interleaved implementation of a CRC makes use of pipelined multiple 6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to 7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution. 8 */ 9 10 /* @(#) $Id$ */ 11 12 /* 13 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore 14 protection on the static variables used to control the first-use generation 15 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should 16 first call get_crc_table() to initialize the tables before allowing more than 17 one thread to use crc32(). 18 19 MAKECRCH can be #defined to write out crc32.h. A main() routine is also 20 produced, so that this one source file can be compiled to an executable. 21 */ 22 23 #ifdef MAKECRCH 24 # include <stdio.h> 25 # ifndef DYNAMIC_CRC_TABLE 26 # define DYNAMIC_CRC_TABLE 27 # endif /* !DYNAMIC_CRC_TABLE */ 28 #endif /* MAKECRCH */ 29 30 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */ 31 32 /* 33 A CRC of a message is computed on N braids of words in the message, where 34 each word consists of W bytes (4 or 8). If N is 3, for example, then three 35 running sparse CRCs are calculated respectively on each braid, at these 36 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ... 37 This is done starting at a word boundary, and continues until as many blocks 38 of N * W bytes as are available have been processed. The results are combined 39 into a single CRC at the end. For this code, N must be in the range 1..6 and 40 W must be 4 or 8. The upper limit on N can be increased if desired by adding 41 more #if blocks, extending the patterns apparent in the code. In addition, 42 crc32.h would need to be regenerated, if the maximum N value is increased. 43 44 N and W are chosen empirically by benchmarking the execution time on a given 45 processor. The choices for N and W below were based on testing on Intel Kaby 46 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64 47 Octeon II processors. The Intel, AMD, and ARM processors were all fastest 48 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4. 49 They were all tested with either gcc or clang, all using the -O3 optimization 50 level. Your mileage may vary. 51 */ 52 53 /* Define N */ 54 #ifdef Z_TESTN 55 # define N Z_TESTN 56 #else 57 # define N 5 58 #endif 59 #if N < 1 || N > 6 60 # error N must be in 1..6 61 #endif 62 63 /* 64 z_crc_t must be at least 32 bits. z_word_t must be at least as long as 65 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and 66 that bytes are eight bits. 67 */ 68 69 /* 70 Define W and the associated z_word_t type. If W is not defined, then a 71 braided calculation is not used, and the associated tables and code are not 72 compiled. 73 */ 74 #ifdef Z_TESTW 75 # if Z_TESTW-1 != -1 76 # define W Z_TESTW 77 # endif 78 #else 79 # ifdef MAKECRCH 80 # define W 8 /* required for MAKECRCH */ 81 # else 82 # if defined(__x86_64__) || defined(__aarch64__) 83 # define W 8 84 # else 85 # define W 4 86 # endif 87 # endif 88 #endif 89 #ifdef W 90 # if W == 8 && defined(Z_U8) 91 typedef Z_U8 z_word_t; 92 # elif defined(Z_U4) 93 # undef W 94 # define W 4 95 typedef Z_U4 z_word_t; 96 # else 97 # undef W 98 # endif 99 #endif 100 101 /* If available, use the ARM processor CRC32 instruction. */ 102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8 103 # define ARMCRC32 104 #endif 105 106 /* Local functions. */ 107 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b)); 108 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k)); 109 110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE)) 111 local z_word_t byte_swap OF((z_word_t word)); 112 #endif 113 114 #if defined(W) && !defined(ARMCRC32) 115 local z_crc_t crc_word OF((z_word_t data)); 116 local z_word_t crc_word_big OF((z_word_t data)); 117 #endif 118 119 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE)) 120 /* 121 Swap the bytes in a z_word_t to convert between little and big endian. Any 122 self-respecting compiler will optimize this to a single machine byte-swap 123 instruction, if one is available. This assumes that word_t is either 32 bits 124 or 64 bits. 125 */ 126 local z_word_t byte_swap(word) 127 z_word_t word; 128 { 129 # if W == 8 130 return 131 (word & 0xff00000000000000) >> 56 | 132 (word & 0xff000000000000) >> 40 | 133 (word & 0xff0000000000) >> 24 | 134 (word & 0xff00000000) >> 8 | 135 (word & 0xff000000) << 8 | 136 (word & 0xff0000) << 24 | 137 (word & 0xff00) << 40 | 138 (word & 0xff) << 56; 139 # else /* W == 4 */ 140 return 141 (word & 0xff000000) >> 24 | 142 (word & 0xff0000) >> 8 | 143 (word & 0xff00) << 8 | 144 (word & 0xff) << 24; 145 # endif 146 } 147 #endif 148 149 /* CRC polynomial. */ 150 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */ 151 152 #ifdef DYNAMIC_CRC_TABLE 153 154 local z_crc_t FAR crc_table[256]; 155 local z_crc_t FAR x2n_table[32]; 156 local void make_crc_table OF((void)); 157 #ifdef W 158 local z_word_t FAR crc_big_table[256]; 159 local z_crc_t FAR crc_braid_table[W][256]; 160 local z_word_t FAR crc_braid_big_table[W][256]; 161 local void braid OF((z_crc_t [][256], z_word_t [][256], int, int)); 162 #endif 163 #ifdef MAKECRCH 164 local void write_table OF((FILE *, const z_crc_t FAR *, int)); 165 local void write_table32hi OF((FILE *, const z_word_t FAR *, int)); 166 local void write_table64 OF((FILE *, const z_word_t FAR *, int)); 167 #endif /* MAKECRCH */ 168 169 /* 170 Define a once() function depending on the availability of atomics. If this is 171 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in 172 multiple threads, and if atomics are not available, then get_crc_table() must 173 be called to initialize the tables and must return before any threads are 174 allowed to compute or combine CRCs. 175 */ 176 177 /* Definition of once functionality. */ 178 typedef struct once_s once_t; 179 local void once OF((once_t *, void (*)(void))); 180 181 /* Check for the availability of atomics. */ 182 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \ 183 !defined(__STDC_NO_ATOMICS__) 184 185 #include <stdatomic.h> 186 187 /* Structure for once(), which must be initialized with ONCE_INIT. */ 188 struct once_s { 189 atomic_flag begun; 190 atomic_int done; 191 }; 192 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0} 193 194 /* 195 Run the provided init() function exactly once, even if multiple threads 196 invoke once() at the same time. The state must be a once_t initialized with 197 ONCE_INIT. 198 */ 199 local void once(state, init) 200 once_t *state; 201 void (*init)(void); 202 { 203 if (!atomic_load(&state->done)) { 204 if (atomic_flag_test_and_set(&state->begun)) 205 while (!atomic_load(&state->done)) 206 ; 207 else { 208 init(); 209 atomic_store(&state->done, 1); 210 } 211 } 212 } 213 214 #else /* no atomics */ 215 216 /* Structure for once(), which must be initialized with ONCE_INIT. */ 217 struct once_s { 218 volatile int begun; 219 volatile int done; 220 }; 221 #define ONCE_INIT {0, 0} 222 223 /* Test and set. Alas, not atomic, but tries to minimize the period of 224 vulnerability. */ 225 local int test_and_set OF((int volatile *)); 226 local int test_and_set(flag) 227 int volatile *flag; 228 { 229 int was; 230 231 was = *flag; 232 *flag = 1; 233 return was; 234 } 235 236 /* Run the provided init() function once. This is not thread-safe. */ 237 local void once(state, init) 238 once_t *state; 239 void (*init)(void); 240 { 241 if (!state->done) { 242 if (test_and_set(&state->begun)) 243 while (!state->done) 244 ; 245 else { 246 init(); 247 state->done = 1; 248 } 249 } 250 } 251 252 #endif 253 254 /* State for once(). */ 255 local once_t made = ONCE_INIT; 256 257 /* 258 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: 259 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. 260 261 Polynomials over GF(2) are represented in binary, one bit per coefficient, 262 with the lowest powers in the most significant bit. Then adding polynomials 263 is just exclusive-or, and multiplying a polynomial by x is a right shift by 264 one. If we call the above polynomial p, and represent a byte as the 265 polynomial q, also with the lowest power in the most significant bit (so the 266 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p, 267 where a mod b means the remainder after dividing a by b. 268 269 This calculation is done using the shift-register method of multiplying and 270 taking the remainder. The register is initialized to zero, and for each 271 incoming bit, x^32 is added mod p to the register if the bit is a one (where 272 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x 273 (which is shifting right by one and adding x^32 mod p if the bit shifted out 274 is a one). We start with the highest power (least significant bit) of q and 275 repeat for all eight bits of q. 276 277 The table is simply the CRC of all possible eight bit values. This is all the 278 information needed to generate CRCs on data a byte at a time for all 279 combinations of CRC register values and incoming bytes. 280 */ 281 282 local void make_crc_table() 283 { 284 unsigned i, j, n; 285 z_crc_t p; 286 287 /* initialize the CRC of bytes tables */ 288 for (i = 0; i < 256; i++) { 289 p = i; 290 for (j = 0; j < 8; j++) 291 p = p & 1 ? (p >> 1) ^ POLY : p >> 1; 292 crc_table[i] = p; 293 #ifdef W 294 crc_big_table[i] = byte_swap(p); 295 #endif 296 } 297 298 /* initialize the x^2^n mod p(x) table */ 299 p = (z_crc_t)1 << 30; /* x^1 */ 300 x2n_table[0] = p; 301 for (n = 1; n < 32; n++) 302 x2n_table[n] = p = multmodp(p, p); 303 304 #ifdef W 305 /* initialize the braiding tables -- needs x2n_table[] */ 306 braid(crc_braid_table, crc_braid_big_table, N, W); 307 #endif 308 309 #ifdef MAKECRCH 310 { 311 /* 312 The crc32.h header file contains tables for both 32-bit and 64-bit 313 z_word_t's, and so requires a 64-bit type be available. In that case, 314 z_word_t must be defined to be 64-bits. This code then also generates 315 and writes out the tables for the case that z_word_t is 32 bits. 316 */ 317 #if !defined(W) || W != 8 318 # error Need a 64-bit integer type in order to generate crc32.h. 319 #endif 320 FILE *out; 321 int k, n; 322 z_crc_t ltl[8][256]; 323 z_word_t big[8][256]; 324 325 out = fopen("crc32.h", "w"); 326 if (out == NULL) return; 327 328 /* write out little-endian CRC table to crc32.h */ 329 fprintf(out, 330 "/* crc32.h -- tables for rapid CRC calculation\n" 331 " * Generated automatically by crc32.c\n */\n" 332 "\n" 333 "local const z_crc_t FAR crc_table[] = {\n" 334 " "); 335 write_table(out, crc_table, 256); 336 fprintf(out, 337 "};\n"); 338 339 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */ 340 fprintf(out, 341 "\n" 342 "#ifdef W\n" 343 "\n" 344 "#if W == 8\n" 345 "\n" 346 "local const z_word_t FAR crc_big_table[] = {\n" 347 " "); 348 write_table64(out, crc_big_table, 256); 349 fprintf(out, 350 "};\n"); 351 352 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */ 353 fprintf(out, 354 "\n" 355 "#else /* W == 4 */\n" 356 "\n" 357 "local const z_word_t FAR crc_big_table[] = {\n" 358 " "); 359 write_table32hi(out, crc_big_table, 256); 360 fprintf(out, 361 "};\n" 362 "\n" 363 "#endif\n"); 364 365 /* write out braid tables for each value of N */ 366 for (n = 1; n <= 6; n++) { 367 fprintf(out, 368 "\n" 369 "#if N == %d\n", n); 370 371 /* compute braid tables for this N and 64-bit word_t */ 372 braid(ltl, big, n, 8); 373 374 /* write out braid tables for 64-bit z_word_t to crc32.h */ 375 fprintf(out, 376 "\n" 377 "#if W == 8\n" 378 "\n" 379 "local const z_crc_t FAR crc_braid_table[][256] = {\n"); 380 for (k = 0; k < 8; k++) { 381 fprintf(out, " {"); 382 write_table(out, ltl[k], 256); 383 fprintf(out, "}%s", k < 7 ? ",\n" : ""); 384 } 385 fprintf(out, 386 "};\n" 387 "\n" 388 "local const z_word_t FAR crc_braid_big_table[][256] = {\n"); 389 for (k = 0; k < 8; k++) { 390 fprintf(out, " {"); 391 write_table64(out, big[k], 256); 392 fprintf(out, "}%s", k < 7 ? ",\n" : ""); 393 } 394 fprintf(out, 395 "};\n"); 396 397 /* compute braid tables for this N and 32-bit word_t */ 398 braid(ltl, big, n, 4); 399 400 /* write out braid tables for 32-bit z_word_t to crc32.h */ 401 fprintf(out, 402 "\n" 403 "#else /* W == 4 */\n" 404 "\n" 405 "local const z_crc_t FAR crc_braid_table[][256] = {\n"); 406 for (k = 0; k < 4; k++) { 407 fprintf(out, " {"); 408 write_table(out, ltl[k], 256); 409 fprintf(out, "}%s", k < 3 ? ",\n" : ""); 410 } 411 fprintf(out, 412 "};\n" 413 "\n" 414 "local const z_word_t FAR crc_braid_big_table[][256] = {\n"); 415 for (k = 0; k < 4; k++) { 416 fprintf(out, " {"); 417 write_table32hi(out, big[k], 256); 418 fprintf(out, "}%s", k < 3 ? ",\n" : ""); 419 } 420 fprintf(out, 421 "};\n" 422 "\n" 423 "#endif\n" 424 "\n" 425 "#endif\n"); 426 } 427 fprintf(out, 428 "\n" 429 "#endif\n"); 430 431 /* write out zeros operator table to crc32.h */ 432 fprintf(out, 433 "\n" 434 "local const z_crc_t FAR x2n_table[] = {\n" 435 " "); 436 write_table(out, x2n_table, 32); 437 fprintf(out, 438 "};\n"); 439 fclose(out); 440 } 441 #endif /* MAKECRCH */ 442 } 443 444 #ifdef MAKECRCH 445 446 /* 447 Write the 32-bit values in table[0..k-1] to out, five per line in 448 hexadecimal separated by commas. 449 */ 450 local void write_table(out, table, k) 451 FILE *out; 452 const z_crc_t FAR *table; 453 int k; 454 { 455 int n; 456 457 for (n = 0; n < k; n++) 458 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ", 459 (unsigned long)(table[n]), 460 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", ")); 461 } 462 463 /* 464 Write the high 32-bits of each value in table[0..k-1] to out, five per line 465 in hexadecimal separated by commas. 466 */ 467 local void write_table32hi(out, table, k) 468 FILE *out; 469 const z_word_t FAR *table; 470 int k; 471 { 472 int n; 473 474 for (n = 0; n < k; n++) 475 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ", 476 (unsigned long)(table[n] >> 32), 477 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", ")); 478 } 479 480 /* 481 Write the 64-bit values in table[0..k-1] to out, three per line in 482 hexadecimal separated by commas. This assumes that if there is a 64-bit 483 type, then there is also a long long integer type, and it is at least 64 484 bits. If not, then the type cast and format string can be adjusted 485 accordingly. 486 */ 487 local void write_table64(out, table, k) 488 FILE *out; 489 const z_word_t FAR *table; 490 int k; 491 { 492 int n; 493 494 for (n = 0; n < k; n++) 495 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ", 496 (unsigned long long)(table[n]), 497 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", ")); 498 } 499 500 /* Actually do the deed. */ 501 int main() 502 { 503 make_crc_table(); 504 return 0; 505 } 506 507 #endif /* MAKECRCH */ 508 509 #ifdef W 510 /* 511 Generate the little and big-endian braid tables for the given n and z_word_t 512 size w. Each array must have room for w blocks of 256 elements. 513 */ 514 local void braid(ltl, big, n, w) 515 z_crc_t ltl[][256]; 516 z_word_t big[][256]; 517 int n; 518 int w; 519 { 520 int k; 521 z_crc_t i, p, q; 522 for (k = 0; k < w; k++) { 523 p = x2nmodp((n * w + 3 - k) << 3, 0); 524 ltl[k][0] = 0; 525 big[w - 1 - k][0] = 0; 526 for (i = 1; i < 256; i++) { 527 ltl[k][i] = q = multmodp(i << 24, p); 528 big[w - 1 - k][i] = byte_swap(q); 529 } 530 } 531 } 532 #endif 533 534 #else /* !DYNAMIC_CRC_TABLE */ 535 /* ======================================================================== 536 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers 537 * of x for combining CRC-32s, all made by make_crc_table(). 538 */ 539 #include "crc32.h" 540 #endif /* DYNAMIC_CRC_TABLE */ 541 542 /* ======================================================================== 543 * Routines used for CRC calculation. Some are also required for the table 544 * generation above. 545 */ 546 547 /* 548 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial, 549 reflected. For speed, this requires that a not be zero. 550 */ 551 local z_crc_t multmodp(a, b) 552 z_crc_t a; 553 z_crc_t b; 554 { 555 z_crc_t m, p; 556 557 m = (z_crc_t)1 << 31; 558 p = 0; 559 for (;;) { 560 if (a & m) { 561 p ^= b; 562 if ((a & (m - 1)) == 0) 563 break; 564 } 565 m >>= 1; 566 b = b & 1 ? (b >> 1) ^ POLY : b >> 1; 567 } 568 return p; 569 } 570 571 /* 572 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been 573 initialized. 574 */ 575 local z_crc_t x2nmodp(n, k) 576 z_off64_t n; 577 unsigned k; 578 { 579 z_crc_t p; 580 581 p = (z_crc_t)1 << 31; /* x^0 == 1 */ 582 while (n) { 583 if (n & 1) 584 p = multmodp(x2n_table[k & 31], p); 585 n >>= 1; 586 k++; 587 } 588 return p; 589 } 590 591 /* ========================================================================= 592 * This function can be used by asm versions of crc32(), and to force the 593 * generation of the CRC tables in a threaded application. 594 */ 595 const z_crc_t FAR * ZEXPORT get_crc_table() 596 { 597 #ifdef DYNAMIC_CRC_TABLE 598 once(&made, make_crc_table); 599 #endif /* DYNAMIC_CRC_TABLE */ 600 return (const z_crc_t FAR *)crc_table; 601 } 602 603 /* ========================================================================= 604 * Use ARM machine instructions if available. This will compute the CRC about 605 * ten times faster than the braided calculation. This code does not check for 606 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will 607 * only be defined if the compilation specifies an ARM processor architecture 608 * that has the instructions. For example, compiling with -march=armv8.1-a or 609 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32 610 * instructions. 611 */ 612 #ifdef ARMCRC32 613 614 /* 615 Constants empirically determined to maximize speed. These values are from 616 measurements on a Cortex-A57. Your mileage may vary. 617 */ 618 #define Z_BATCH 3990 /* number of words in a batch */ 619 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */ 620 #define Z_BATCH_MIN 800 /* fewest words in a final batch */ 621 622 unsigned long ZEXPORT crc32_z(crc, buf, len) 623 unsigned long crc; 624 const unsigned char FAR *buf; 625 z_size_t len; 626 { 627 z_crc_t val; 628 z_word_t crc1, crc2; 629 const z_word_t *word; 630 z_word_t val0, val1, val2; 631 z_size_t last, last2, i; 632 z_size_t num; 633 634 /* Return initial CRC, if requested. */ 635 if (buf == Z_NULL) return 0; 636 637 #ifdef DYNAMIC_CRC_TABLE 638 once(&made, make_crc_table); 639 #endif /* DYNAMIC_CRC_TABLE */ 640 641 /* Pre-condition the CRC */ 642 crc = (~crc) & 0xffffffff; 643 644 /* Compute the CRC up to a word boundary. */ 645 while (len && ((z_size_t)buf & 7) != 0) { 646 len--; 647 val = *buf++; 648 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); 649 } 650 651 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */ 652 word = (z_word_t const *)buf; 653 num = len >> 3; 654 len &= 7; 655 656 /* Do three interleaved CRCs to realize the throughput of one crc32x 657 instruction per cycle. Each CRC is calculated on Z_BATCH words. The 658 three CRCs are combined into a single CRC after each set of batches. */ 659 while (num >= 3 * Z_BATCH) { 660 crc1 = 0; 661 crc2 = 0; 662 for (i = 0; i < Z_BATCH; i++) { 663 val0 = word[i]; 664 val1 = word[i + Z_BATCH]; 665 val2 = word[i + 2 * Z_BATCH]; 666 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 667 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); 668 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); 669 } 670 word += 3 * Z_BATCH; 671 num -= 3 * Z_BATCH; 672 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1; 673 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2; 674 } 675 676 /* Do one last smaller batch with the remaining words, if there are enough 677 to pay for the combination of CRCs. */ 678 last = num / 3; 679 if (last >= Z_BATCH_MIN) { 680 last2 = last << 1; 681 crc1 = 0; 682 crc2 = 0; 683 for (i = 0; i < last; i++) { 684 val0 = word[i]; 685 val1 = word[i + last]; 686 val2 = word[i + last2]; 687 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 688 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1)); 689 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2)); 690 } 691 word += 3 * last; 692 num -= 3 * last; 693 val = x2nmodp(last, 6); 694 crc = multmodp(val, crc) ^ crc1; 695 crc = multmodp(val, crc) ^ crc2; 696 } 697 698 /* Compute the CRC on any remaining words. */ 699 for (i = 0; i < num; i++) { 700 val0 = word[i]; 701 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0)); 702 } 703 word += num; 704 705 /* Complete the CRC on any remaining bytes. */ 706 buf = (const unsigned char FAR *)word; 707 while (len) { 708 len--; 709 val = *buf++; 710 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val)); 711 } 712 713 /* Return the CRC, post-conditioned. */ 714 return crc ^ 0xffffffff; 715 } 716 717 #else 718 719 #ifdef W 720 721 /* 722 Return the CRC of the W bytes in the word_t data, taking the 723 least-significant byte of the word as the first byte of data, without any pre 724 or post conditioning. This is used to combine the CRCs of each braid. 725 */ 726 local z_crc_t crc_word(data) 727 z_word_t data; 728 { 729 int k; 730 for (k = 0; k < W; k++) 731 data = (data >> 8) ^ crc_table[data & 0xff]; 732 return (z_crc_t)data; 733 } 734 735 local z_word_t crc_word_big(data) 736 z_word_t data; 737 { 738 int k; 739 for (k = 0; k < W; k++) 740 data = (data << 8) ^ 741 crc_big_table[(data >> ((W - 1) << 3)) & 0xff]; 742 return data; 743 } 744 745 #endif 746 747 /* ========================================================================= */ 748 unsigned long ZEXPORT crc32_z(crc, buf, len) 749 unsigned long crc; 750 const unsigned char FAR *buf; 751 z_size_t len; 752 { 753 /* Return initial CRC, if requested. */ 754 if (buf == Z_NULL) return 0; 755 756 #ifdef DYNAMIC_CRC_TABLE 757 once(&made, make_crc_table); 758 #endif /* DYNAMIC_CRC_TABLE */ 759 760 /* Pre-condition the CRC */ 761 crc = (~crc) & 0xffffffff; 762 763 #ifdef W 764 765 /* If provided enough bytes, do a braided CRC calculation. */ 766 if (len >= N * W + W - 1) { 767 z_size_t blks; 768 z_word_t const *words; 769 unsigned endian; 770 int k; 771 772 /* Compute the CRC up to a z_word_t boundary. */ 773 while (len && ((z_size_t)buf & (W - 1)) != 0) { 774 len--; 775 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 776 } 777 778 /* Compute the CRC on as many N z_word_t blocks as are available. */ 779 blks = len / (N * W); 780 len -= blks * N * W; 781 words = (z_word_t const *)buf; 782 783 /* Do endian check at execution time instead of compile time, since ARM 784 processors can change the endianess at execution time. If the 785 compiler knows what the endianess will be, it can optimize out the 786 check and the unused branch. */ 787 endian = 1; 788 if (*(unsigned char *)&endian) { 789 /* Little endian. */ 790 791 z_crc_t crc0; 792 z_word_t word0; 793 #if N > 1 794 z_crc_t crc1; 795 z_word_t word1; 796 #if N > 2 797 z_crc_t crc2; 798 z_word_t word2; 799 #if N > 3 800 z_crc_t crc3; 801 z_word_t word3; 802 #if N > 4 803 z_crc_t crc4; 804 z_word_t word4; 805 #if N > 5 806 z_crc_t crc5; 807 z_word_t word5; 808 #endif 809 #endif 810 #endif 811 #endif 812 #endif 813 814 /* Initialize the CRC for each braid. */ 815 crc0 = crc; 816 #if N > 1 817 crc1 = 0; 818 #if N > 2 819 crc2 = 0; 820 #if N > 3 821 crc3 = 0; 822 #if N > 4 823 crc4 = 0; 824 #if N > 5 825 crc5 = 0; 826 #endif 827 #endif 828 #endif 829 #endif 830 #endif 831 832 /* 833 Process the first blks-1 blocks, computing the CRCs on each braid 834 independently. 835 */ 836 while (--blks) { 837 /* Load the word for each braid into registers. */ 838 word0 = crc0 ^ words[0]; 839 #if N > 1 840 word1 = crc1 ^ words[1]; 841 #if N > 2 842 word2 = crc2 ^ words[2]; 843 #if N > 3 844 word3 = crc3 ^ words[3]; 845 #if N > 4 846 word4 = crc4 ^ words[4]; 847 #if N > 5 848 word5 = crc5 ^ words[5]; 849 #endif 850 #endif 851 #endif 852 #endif 853 #endif 854 words += N; 855 856 /* Compute and update the CRC for each word. The loop should 857 get unrolled. */ 858 crc0 = crc_braid_table[0][word0 & 0xff]; 859 #if N > 1 860 crc1 = crc_braid_table[0][word1 & 0xff]; 861 #if N > 2 862 crc2 = crc_braid_table[0][word2 & 0xff]; 863 #if N > 3 864 crc3 = crc_braid_table[0][word3 & 0xff]; 865 #if N > 4 866 crc4 = crc_braid_table[0][word4 & 0xff]; 867 #if N > 5 868 crc5 = crc_braid_table[0][word5 & 0xff]; 869 #endif 870 #endif 871 #endif 872 #endif 873 #endif 874 for (k = 1; k < W; k++) { 875 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff]; 876 #if N > 1 877 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff]; 878 #if N > 2 879 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff]; 880 #if N > 3 881 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff]; 882 #if N > 4 883 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff]; 884 #if N > 5 885 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff]; 886 #endif 887 #endif 888 #endif 889 #endif 890 #endif 891 } 892 } 893 894 /* 895 Process the last block, combining the CRCs of the N braids at the 896 same time. 897 */ 898 crc = crc_word(crc0 ^ words[0]); 899 #if N > 1 900 crc = crc_word(crc1 ^ words[1] ^ crc); 901 #if N > 2 902 crc = crc_word(crc2 ^ words[2] ^ crc); 903 #if N > 3 904 crc = crc_word(crc3 ^ words[3] ^ crc); 905 #if N > 4 906 crc = crc_word(crc4 ^ words[4] ^ crc); 907 #if N > 5 908 crc = crc_word(crc5 ^ words[5] ^ crc); 909 #endif 910 #endif 911 #endif 912 #endif 913 #endif 914 words += N; 915 } 916 else { 917 /* Big endian. */ 918 919 z_word_t crc0, word0, comb; 920 #if N > 1 921 z_word_t crc1, word1; 922 #if N > 2 923 z_word_t crc2, word2; 924 #if N > 3 925 z_word_t crc3, word3; 926 #if N > 4 927 z_word_t crc4, word4; 928 #if N > 5 929 z_word_t crc5, word5; 930 #endif 931 #endif 932 #endif 933 #endif 934 #endif 935 936 /* Initialize the CRC for each braid. */ 937 crc0 = byte_swap(crc); 938 #if N > 1 939 crc1 = 0; 940 #if N > 2 941 crc2 = 0; 942 #if N > 3 943 crc3 = 0; 944 #if N > 4 945 crc4 = 0; 946 #if N > 5 947 crc5 = 0; 948 #endif 949 #endif 950 #endif 951 #endif 952 #endif 953 954 /* 955 Process the first blks-1 blocks, computing the CRCs on each braid 956 independently. 957 */ 958 while (--blks) { 959 /* Load the word for each braid into registers. */ 960 word0 = crc0 ^ words[0]; 961 #if N > 1 962 word1 = crc1 ^ words[1]; 963 #if N > 2 964 word2 = crc2 ^ words[2]; 965 #if N > 3 966 word3 = crc3 ^ words[3]; 967 #if N > 4 968 word4 = crc4 ^ words[4]; 969 #if N > 5 970 word5 = crc5 ^ words[5]; 971 #endif 972 #endif 973 #endif 974 #endif 975 #endif 976 words += N; 977 978 /* Compute and update the CRC for each word. The loop should 979 get unrolled. */ 980 crc0 = crc_braid_big_table[0][word0 & 0xff]; 981 #if N > 1 982 crc1 = crc_braid_big_table[0][word1 & 0xff]; 983 #if N > 2 984 crc2 = crc_braid_big_table[0][word2 & 0xff]; 985 #if N > 3 986 crc3 = crc_braid_big_table[0][word3 & 0xff]; 987 #if N > 4 988 crc4 = crc_braid_big_table[0][word4 & 0xff]; 989 #if N > 5 990 crc5 = crc_braid_big_table[0][word5 & 0xff]; 991 #endif 992 #endif 993 #endif 994 #endif 995 #endif 996 for (k = 1; k < W; k++) { 997 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff]; 998 #if N > 1 999 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff]; 1000 #if N > 2 1001 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff]; 1002 #if N > 3 1003 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff]; 1004 #if N > 4 1005 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff]; 1006 #if N > 5 1007 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff]; 1008 #endif 1009 #endif 1010 #endif 1011 #endif 1012 #endif 1013 } 1014 } 1015 1016 /* 1017 Process the last block, combining the CRCs of the N braids at the 1018 same time. 1019 */ 1020 comb = crc_word_big(crc0 ^ words[0]); 1021 #if N > 1 1022 comb = crc_word_big(crc1 ^ words[1] ^ comb); 1023 #if N > 2 1024 comb = crc_word_big(crc2 ^ words[2] ^ comb); 1025 #if N > 3 1026 comb = crc_word_big(crc3 ^ words[3] ^ comb); 1027 #if N > 4 1028 comb = crc_word_big(crc4 ^ words[4] ^ comb); 1029 #if N > 5 1030 comb = crc_word_big(crc5 ^ words[5] ^ comb); 1031 #endif 1032 #endif 1033 #endif 1034 #endif 1035 #endif 1036 words += N; 1037 crc = byte_swap(comb); 1038 } 1039 1040 /* 1041 Update the pointer to the remaining bytes to process. 1042 */ 1043 buf = (unsigned char const *)words; 1044 } 1045 1046 #endif /* W */ 1047 1048 /* Complete the computation of the CRC on any remaining bytes. */ 1049 while (len >= 8) { 1050 len -= 8; 1051 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1052 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1053 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1054 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1055 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1056 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1057 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1058 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1059 } 1060 while (len) { 1061 len--; 1062 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff]; 1063 } 1064 1065 /* Return the CRC, post-conditioned. */ 1066 return crc ^ 0xffffffff; 1067 } 1068 1069 #endif 1070 1071 /* ========================================================================= */ 1072 unsigned long ZEXPORT crc32(crc, buf, len) 1073 unsigned long crc; 1074 const unsigned char FAR *buf; 1075 uInt len; 1076 { 1077 return crc32_z(crc, buf, len); 1078 } 1079 1080 /* ========================================================================= */ 1081 uLong ZEXPORT crc32_combine64(crc1, crc2, len2) 1082 uLong crc1; 1083 uLong crc2; 1084 z_off64_t len2; 1085 { 1086 #ifdef DYNAMIC_CRC_TABLE 1087 once(&made, make_crc_table); 1088 #endif /* DYNAMIC_CRC_TABLE */ 1089 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff); 1090 } 1091 1092 /* ========================================================================= */ 1093 uLong ZEXPORT crc32_combine(crc1, crc2, len2) 1094 uLong crc1; 1095 uLong crc2; 1096 z_off_t len2; 1097 { 1098 return crc32_combine64(crc1, crc2, (z_off64_t)len2); 1099 } 1100 1101 /* ========================================================================= */ 1102 uLong ZEXPORT crc32_combine_gen64(len2) 1103 z_off64_t len2; 1104 { 1105 #ifdef DYNAMIC_CRC_TABLE 1106 once(&made, make_crc_table); 1107 #endif /* DYNAMIC_CRC_TABLE */ 1108 return x2nmodp(len2, 3); 1109 } 1110 1111 /* ========================================================================= */ 1112 uLong ZEXPORT crc32_combine_gen(len2) 1113 z_off_t len2; 1114 { 1115 return crc32_combine_gen64((z_off64_t)len2); 1116 } 1117 1118 /* ========================================================================= */ 1119 uLong ZEXPORT crc32_combine_op(crc1, crc2, op) 1120 uLong crc1; 1121 uLong crc2; 1122 uLong op; 1123 { 1124 return multmodp(op, crc1) ^ (crc2 & 0xffffffff); 1125 } 1126