1 /* crc32.c -- compute the CRC-32 of a data stream
2 * Copyright (C) 1995-2022 Mark Adler
3 * For conditions of distribution and use, see copyright notice in zlib.h
4 *
5 * This interleaved implementation of a CRC makes use of pipelined multiple
6 * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7 * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8 */
9
10 /*
11 Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
12 protection on the static variables used to control the first-use generation
13 of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
14 first call get_crc_table() to initialize the tables before allowing more than
15 one thread to use crc32().
16
17 MAKECRCH can be #defined to write out crc32.h. A main() routine is also
18 produced, so that this one source file can be compiled to an executable.
19 */
20
21 #ifdef MAKECRCH
22 # include <stdio.h>
23 # ifndef DYNAMIC_CRC_TABLE
24 # define DYNAMIC_CRC_TABLE
25 # endif /* !DYNAMIC_CRC_TABLE */
26 #endif /* MAKECRCH */
27
28 #include "zutil.h" /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
29
30 /*
31 A CRC of a message is computed on N braids of words in the message, where
32 each word consists of W bytes (4 or 8). If N is 3, for example, then three
33 running sparse CRCs are calculated respectively on each braid, at these
34 indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
35 This is done starting at a word boundary, and continues until as many blocks
36 of N * W bytes as are available have been processed. The results are combined
37 into a single CRC at the end. For this code, N must be in the range 1..6 and
38 W must be 4 or 8. The upper limit on N can be increased if desired by adding
39 more #if blocks, extending the patterns apparent in the code. In addition,
40 crc32.h would need to be regenerated, if the maximum N value is increased.
41
42 N and W are chosen empirically by benchmarking the execution time on a given
43 processor. The choices for N and W below were based on testing on Intel Kaby
44 Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
45 Octeon II processors. The Intel, AMD, and ARM processors were all fastest
46 with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
47 They were all tested with either gcc or clang, all using the -O3 optimization
48 level. Your mileage may vary.
49 */
50
51 /* Define N */
52 #ifdef Z_TESTN
53 # define N Z_TESTN
54 #else
55 # define N 5
56 #endif
57 #if N < 1 || N > 6
58 # error N must be in 1..6
59 #endif
60
61 /*
62 z_crc_t must be at least 32 bits. z_word_t must be at least as long as
63 z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
64 that bytes are eight bits.
65 */
66
67 /*
68 Define W and the associated z_word_t type. If W is not defined, then a
69 braided calculation is not used, and the associated tables and code are not
70 compiled.
71 */
72 #ifdef Z_TESTW
73 # if Z_TESTW-1 != -1
74 # define W Z_TESTW
75 # endif
76 #else
77 # ifdef MAKECRCH
78 # define W 8 /* required for MAKECRCH */
79 # else
80 # if defined(__x86_64__) || defined(__aarch64__)
81 # define W 8
82 # else
83 # define W 4
84 # endif
85 # endif
86 #endif
87 #ifdef W
88 # if W == 8 && defined(Z_U8)
89 typedef Z_U8 z_word_t;
90 # elif defined(Z_U4)
91 # undef W
92 # define W 4
93 typedef Z_U4 z_word_t;
94 # else
95 # undef W
96 # endif
97 #endif
98
99 /* Local functions. */
100 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
101 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
102
103 /* If available, use the ARM processor CRC32 instruction. */
104 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
105 # define ARMCRC32
106 #endif
107
108 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
109 /*
110 Swap the bytes in a z_word_t to convert between little and big endian. Any
111 self-respecting compiler will optimize this to a single machine byte-swap
112 instruction, if one is available. This assumes that word_t is either 32 bits
113 or 64 bits.
114 */
byte_swap(z_word_t word)115 local z_word_t byte_swap(z_word_t word)
116 {
117 # if W == 8
118 return
119 (word & 0xff00000000000000) >> 56 |
120 (word & 0xff000000000000) >> 40 |
121 (word & 0xff0000000000) >> 24 |
122 (word & 0xff00000000) >> 8 |
123 (word & 0xff000000) << 8 |
124 (word & 0xff0000) << 24 |
125 (word & 0xff00) << 40 |
126 (word & 0xff) << 56;
127 # else /* W == 4 */
128 return
129 (word & 0xff000000) >> 24 |
130 (word & 0xff0000) >> 8 |
131 (word & 0xff00) << 8 |
132 (word & 0xff) << 24;
133 # endif
134 }
135 #endif
136
137 /* CRC polynomial. */
138 #define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
139
140 #ifdef DYNAMIC_CRC_TABLE
141
142 local z_crc_t FAR crc_table[256];
143 local z_crc_t FAR x2n_table[32];
144 local void make_crc_table OF((void));
145 #ifdef W
146 local z_word_t FAR crc_big_table[256];
147 local z_crc_t FAR crc_braid_table[W][256];
148 local z_word_t FAR crc_braid_big_table[W][256];
149 local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
150 #endif
151 #ifdef MAKECRCH
152 local void write_table OF((FILE *, const z_crc_t FAR *, int));
153 local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
154 local void write_table64 OF((FILE *, const z_word_t FAR *, int));
155 #endif /* MAKECRCH */
156
157 /*
158 Define a once() function depending on the availability of atomics. If this is
159 compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
160 multiple threads, and if atomics are not available, then get_crc_table() must
161 be called to initialize the tables and must return before any threads are
162 allowed to compute or combine CRCs.
163 */
164
165 /* Definition of once functionality. */
166 typedef struct once_s once_t;
167 local void once OF((once_t *, void (*)(void)));
168
169 /* Check for the availability of atomics. */
170 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
171 !defined(__STDC_NO_ATOMICS__)
172
173 #include <stdatomic.h>
174
175 /* Structure for once(), which must be initialized with ONCE_INIT. */
176 struct once_s {
177 atomic_flag begun;
178 atomic_int done;
179 };
180 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
181
182 /*
183 Run the provided init() function exactly once, even if multiple threads
184 invoke once() at the same time. The state must be a once_t initialized with
185 ONCE_INIT.
186 */
once(state,init)187 local void once(state, init)
188 once_t *state;
189 void (*init)(void);
190 {
191 if (!atomic_load(&state->done)) {
192 if (atomic_flag_test_and_set(&state->begun))
193 while (!atomic_load(&state->done))
194 ;
195 else {
196 init();
197 atomic_store(&state->done, 1);
198 }
199 }
200 }
201
202 #else /* no atomics */
203
204 /* Structure for once(), which must be initialized with ONCE_INIT. */
205 struct once_s {
206 volatile int begun;
207 volatile int done;
208 };
209 #define ONCE_INIT {0, 0}
210
211 /* Test and set. Alas, not atomic, but tries to minimize the period of
212 vulnerability. */
213 local int test_and_set OF((int volatile *));
test_and_set(flag)214 local int test_and_set(flag)
215 int volatile *flag;
216 {
217 int was;
218
219 was = *flag;
220 *flag = 1;
221 return was;
222 }
223
224 /* Run the provided init() function once. This is not thread-safe. */
once(state,init)225 local void once(state, init)
226 once_t *state;
227 void (*init)(void);
228 {
229 if (!state->done) {
230 if (test_and_set(&state->begun))
231 while (!state->done)
232 ;
233 else {
234 init();
235 state->done = 1;
236 }
237 }
238 }
239
240 #endif
241
242 /* State for once(). */
243 local once_t made = ONCE_INIT;
244
245 /*
246 Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
247 x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
248
249 Polynomials over GF(2) are represented in binary, one bit per coefficient,
250 with the lowest powers in the most significant bit. Then adding polynomials
251 is just exclusive-or, and multiplying a polynomial by x is a right shift by
252 one. If we call the above polynomial p, and represent a byte as the
253 polynomial q, also with the lowest power in the most significant bit (so the
254 byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
255 where a mod b means the remainder after dividing a by b.
256
257 This calculation is done using the shift-register method of multiplying and
258 taking the remainder. The register is initialized to zero, and for each
259 incoming bit, x^32 is added mod p to the register if the bit is a one (where
260 x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
261 (which is shifting right by one and adding x^32 mod p if the bit shifted out
262 is a one). We start with the highest power (least significant bit) of q and
263 repeat for all eight bits of q.
264
265 The table is simply the CRC of all possible eight bit values. This is all the
266 information needed to generate CRCs on data a byte at a time for all
267 combinations of CRC register values and incoming bytes.
268 */
269
make_crc_table()270 local void make_crc_table()
271 {
272 unsigned i, j, n;
273 z_crc_t p;
274
275 /* initialize the CRC of bytes tables */
276 for (i = 0; i < 256; i++) {
277 p = i;
278 for (j = 0; j < 8; j++)
279 p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
280 crc_table[i] = p;
281 #ifdef W
282 crc_big_table[i] = byte_swap(p);
283 #endif
284 }
285
286 /* initialize the x^2^n mod p(x) table */
287 p = (z_crc_t)1 << 30; /* x^1 */
288 x2n_table[0] = p;
289 for (n = 1; n < 32; n++)
290 x2n_table[n] = p = multmodp(p, p);
291
292 #ifdef W
293 /* initialize the braiding tables -- needs x2n_table[] */
294 braid(crc_braid_table, crc_braid_big_table, N, W);
295 #endif
296
297 #ifdef MAKECRCH
298 {
299 /*
300 The crc32.h header file contains tables for both 32-bit and 64-bit
301 z_word_t's, and so requires a 64-bit type be available. In that case,
302 z_word_t must be defined to be 64-bits. This code then also generates
303 and writes out the tables for the case that z_word_t is 32 bits.
304 */
305 #if !defined(W) || W != 8
306 # error Need a 64-bit integer type in order to generate crc32.h.
307 #endif
308 FILE *out;
309 int k, n;
310 z_crc_t ltl[8][256];
311 z_word_t big[8][256];
312
313 out = fopen("crc32.h", "w");
314 if (out == NULL) return;
315
316 /* write out little-endian CRC table to crc32.h */
317 fprintf(out,
318 "/* crc32.h -- tables for rapid CRC calculation\n"
319 " * Generated automatically by crc32.c\n */\n"
320 "\n"
321 "local const z_crc_t FAR crc_table[] = {\n"
322 " ");
323 write_table(out, crc_table, 256);
324 fprintf(out,
325 "};\n");
326
327 /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
328 fprintf(out,
329 "\n"
330 "#ifdef W\n"
331 "\n"
332 "#if W == 8\n"
333 "\n"
334 "local const z_word_t FAR crc_big_table[] = {\n"
335 " ");
336 write_table64(out, crc_big_table, 256);
337 fprintf(out,
338 "};\n");
339
340 /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
341 fprintf(out,
342 "\n"
343 "#else /* W == 4 */\n"
344 "\n"
345 "local const z_word_t FAR crc_big_table[] = {\n"
346 " ");
347 write_table32hi(out, crc_big_table, 256);
348 fprintf(out,
349 "};\n"
350 "\n"
351 "#endif\n");
352
353 /* write out braid tables for each value of N */
354 for (n = 1; n <= 6; n++) {
355 fprintf(out,
356 "\n"
357 "#if N == %d\n", n);
358
359 /* compute braid tables for this N and 64-bit word_t */
360 braid(ltl, big, n, 8);
361
362 /* write out braid tables for 64-bit z_word_t to crc32.h */
363 fprintf(out,
364 "\n"
365 "#if W == 8\n"
366 "\n"
367 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
368 for (k = 0; k < 8; k++) {
369 fprintf(out, " {");
370 write_table(out, ltl[k], 256);
371 fprintf(out, "}%s", k < 7 ? ",\n" : "");
372 }
373 fprintf(out,
374 "};\n"
375 "\n"
376 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
377 for (k = 0; k < 8; k++) {
378 fprintf(out, " {");
379 write_table64(out, big[k], 256);
380 fprintf(out, "}%s", k < 7 ? ",\n" : "");
381 }
382 fprintf(out,
383 "};\n");
384
385 /* compute braid tables for this N and 32-bit word_t */
386 braid(ltl, big, n, 4);
387
388 /* write out braid tables for 32-bit z_word_t to crc32.h */
389 fprintf(out,
390 "\n"
391 "#else /* W == 4 */\n"
392 "\n"
393 "local const z_crc_t FAR crc_braid_table[][256] = {\n");
394 for (k = 0; k < 4; k++) {
395 fprintf(out, " {");
396 write_table(out, ltl[k], 256);
397 fprintf(out, "}%s", k < 3 ? ",\n" : "");
398 }
399 fprintf(out,
400 "};\n"
401 "\n"
402 "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
403 for (k = 0; k < 4; k++) {
404 fprintf(out, " {");
405 write_table32hi(out, big[k], 256);
406 fprintf(out, "}%s", k < 3 ? ",\n" : "");
407 }
408 fprintf(out,
409 "};\n"
410 "\n"
411 "#endif\n"
412 "\n"
413 "#endif\n");
414 }
415 fprintf(out,
416 "\n"
417 "#endif\n");
418
419 /* write out zeros operator table to crc32.h */
420 fprintf(out,
421 "\n"
422 "local const z_crc_t FAR x2n_table[] = {\n"
423 " ");
424 write_table(out, x2n_table, 32);
425 fprintf(out,
426 "};\n");
427 fclose(out);
428 }
429 #endif /* MAKECRCH */
430 }
431
432 #ifdef MAKECRCH
433
434 /*
435 Write the 32-bit values in table[0..k-1] to out, five per line in
436 hexadecimal separated by commas.
437 */
write_table(out,table,k)438 local void write_table(out, table, k)
439 FILE *out;
440 const z_crc_t FAR *table;
441 int k;
442 {
443 int n;
444
445 for (n = 0; n < k; n++)
446 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
447 (unsigned long)(table[n]),
448 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
449 }
450
451 /*
452 Write the high 32-bits of each value in table[0..k-1] to out, five per line
453 in hexadecimal separated by commas.
454 */
write_table32hi(out,table,k)455 local void write_table32hi(out, table, k)
456 FILE *out;
457 const z_word_t FAR *table;
458 int k;
459 {
460 int n;
461
462 for (n = 0; n < k; n++)
463 fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : " ",
464 (unsigned long)(table[n] >> 32),
465 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
466 }
467
468 /*
469 Write the 64-bit values in table[0..k-1] to out, three per line in
470 hexadecimal separated by commas. This assumes that if there is a 64-bit
471 type, then there is also a long long integer type, and it is at least 64
472 bits. If not, then the type cast and format string can be adjusted
473 accordingly.
474 */
write_table64(out,table,k)475 local void write_table64(out, table, k)
476 FILE *out;
477 const z_word_t FAR *table;
478 int k;
479 {
480 int n;
481
482 for (n = 0; n < k; n++)
483 fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : " ",
484 (unsigned long long)(table[n]),
485 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
486 }
487
488 /* Actually do the deed. */
main()489 int main()
490 {
491 make_crc_table();
492 return 0;
493 }
494
495 #endif /* MAKECRCH */
496
497 #ifdef W
498 /*
499 Generate the little and big-endian braid tables for the given n and z_word_t
500 size w. Each array must have room for w blocks of 256 elements.
501 */
braid(ltl,big,n,w)502 local void braid(ltl, big, n, w)
503 z_crc_t ltl[][256];
504 z_word_t big[][256];
505 int n;
506 int w;
507 {
508 int k;
509 z_crc_t i, p, q;
510 for (k = 0; k < w; k++) {
511 p = x2nmodp((n * w + 3 - k) << 3, 0);
512 ltl[k][0] = 0;
513 big[w - 1 - k][0] = 0;
514 for (i = 1; i < 256; i++) {
515 ltl[k][i] = q = multmodp(i << 24, p);
516 big[w - 1 - k][i] = byte_swap(q);
517 }
518 }
519 }
520 #endif
521
522 #else /* !DYNAMIC_CRC_TABLE */
523 /* ========================================================================
524 * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
525 * of x for combining CRC-32s, all made by make_crc_table().
526 */
527 #include "crc32.h"
528 #endif /* DYNAMIC_CRC_TABLE */
529
530 /* ========================================================================
531 * Routines used for CRC calculation. Some are also required for the table
532 * generation above.
533 */
534
535 /*
536 Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
537 reflected. For speed, this requires that a not be zero.
538 */
multmodp(z_crc_t a,z_crc_t b)539 local z_crc_t multmodp(z_crc_t a, z_crc_t b)
540 {
541 z_crc_t m, p;
542
543 m = (z_crc_t)1 << 31;
544 p = 0;
545 for (;;) {
546 if (a & m) {
547 p ^= b;
548 if ((a & (m - 1)) == 0)
549 break;
550 }
551 m >>= 1;
552 b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
553 }
554 return p;
555 }
556
557 /*
558 Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
559 initialized.
560 */
x2nmodp(z_off64_t n,unsigned k)561 local z_crc_t x2nmodp(z_off64_t n, unsigned k)
562 {
563 z_crc_t p;
564
565 p = (z_crc_t)1 << 31; /* x^0 == 1 */
566 while (n) {
567 if (n & 1)
568 p = multmodp(x2n_table[k & 31], p);
569 n >>= 1;
570 k++;
571 }
572 return p;
573 }
574
575 /* =========================================================================
576 * This function can be used by asm versions of crc32(), and to force the
577 * generation of the CRC tables in a threaded application.
578 */
get_crc_table(void)579 const z_crc_t FAR * ZEXPORT get_crc_table(void)
580 {
581 #ifdef DYNAMIC_CRC_TABLE
582 once(&made, make_crc_table);
583 #endif /* DYNAMIC_CRC_TABLE */
584 return (const z_crc_t FAR *)crc_table;
585 }
586
587 /* =========================================================================
588 * Use ARM machine instructions if available. This will compute the CRC about
589 * ten times faster than the braided calculation. This code does not check for
590 * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
591 * only be defined if the compilation specifies an ARM processor architecture
592 * that has the instructions. For example, compiling with -march=armv8.1-a or
593 * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
594 * instructions.
595 */
596 #ifdef ARMCRC32
597
598 /*
599 Constants empirically determined to maximize speed. These values are from
600 measurements on a Cortex-A57. Your mileage may vary.
601 */
602 #define Z_BATCH 3990 /* number of words in a batch */
603 #define Z_BATCH_ZEROS 0xa10d3d0c /* computed from Z_BATCH = 3990 */
604 #define Z_BATCH_MIN 800 /* fewest words in a final batch */
605
crc32_z(crc,buf,len)606 unsigned long ZEXPORT crc32_z(crc, buf, len)
607 unsigned long crc;
608 const unsigned char FAR *buf;
609 z_size_t len;
610 {
611 z_crc_t val;
612 z_word_t crc1, crc2;
613 const z_word_t *word;
614 z_word_t val0, val1, val2;
615 z_size_t last, last2, i;
616 z_size_t num;
617
618 /* Return initial CRC, if requested. */
619 if (buf == Z_NULL) return 0;
620
621 #ifdef DYNAMIC_CRC_TABLE
622 once(&made, make_crc_table);
623 #endif /* DYNAMIC_CRC_TABLE */
624
625 /* Pre-condition the CRC */
626 crc = (~crc) & 0xffffffff;
627
628 /* Compute the CRC up to a word boundary. */
629 while (len && ((z_size_t)buf & 7) != 0) {
630 len--;
631 val = *buf++;
632 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
633 }
634
635 /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
636 word = (z_word_t const *)buf;
637 num = len >> 3;
638 len &= 7;
639
640 /* Do three interleaved CRCs to realize the throughput of one crc32x
641 instruction per cycle. Each CRC is calcuated on Z_BATCH words. The three
642 CRCs are combined into a single CRC after each set of batches. */
643 while (num >= 3 * Z_BATCH) {
644 crc1 = 0;
645 crc2 = 0;
646 for (i = 0; i < Z_BATCH; i++) {
647 val0 = word[i];
648 val1 = word[i + Z_BATCH];
649 val2 = word[i + 2 * Z_BATCH];
650 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
651 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
652 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
653 }
654 word += 3 * Z_BATCH;
655 num -= 3 * Z_BATCH;
656 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
657 crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
658 }
659
660 /* Do one last smaller batch with the remaining words, if there are enough
661 to pay for the combination of CRCs. */
662 last = num / 3;
663 if (last >= Z_BATCH_MIN) {
664 last2 = last << 1;
665 crc1 = 0;
666 crc2 = 0;
667 for (i = 0; i < last; i++) {
668 val0 = word[i];
669 val1 = word[i + last];
670 val2 = word[i + last2];
671 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
672 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
673 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
674 }
675 word += 3 * last;
676 num -= 3 * last;
677 val = x2nmodp(last, 6);
678 crc = multmodp(val, crc) ^ crc1;
679 crc = multmodp(val, crc) ^ crc2;
680 }
681
682 /* Compute the CRC on any remaining words. */
683 for (i = 0; i < num; i++) {
684 val0 = word[i];
685 __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
686 }
687 word += num;
688
689 /* Complete the CRC on any remaining bytes. */
690 buf = (const unsigned char FAR *)word;
691 while (len) {
692 len--;
693 val = *buf++;
694 __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
695 }
696
697 /* Return the CRC, post-conditioned. */
698 return crc ^ 0xffffffff;
699 }
700
701 #else
702
703 #ifdef W
704
705 /*
706 Return the CRC of the W bytes in the word_t data, taking the
707 least-significant byte of the word as the first byte of data, without any pre
708 or post conditioning. This is used to combine the CRCs of each braid.
709 */
crc_word(z_word_t data)710 local z_crc_t crc_word(z_word_t data)
711 {
712 int k;
713 for (k = 0; k < W; k++)
714 data = (data >> 8) ^ crc_table[data & 0xff];
715 return (z_crc_t)data;
716 }
717
crc_word_big(z_word_t data)718 local z_word_t crc_word_big(z_word_t data)
719 {
720 int k;
721 for (k = 0; k < W; k++)
722 data = (data << 8) ^
723 crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
724 return data;
725 }
726
727 #endif
728
729 /* ========================================================================= */
crc32_z(unsigned long crc,const unsigned char FAR * buf,z_size_t len)730 unsigned long ZEXPORT crc32_z(unsigned long crc, const unsigned char FAR *buf,
731 z_size_t len)
732 {
733 /* Return initial CRC, if requested. */
734 if (buf == Z_NULL) return 0;
735
736 #ifdef DYNAMIC_CRC_TABLE
737 once(&made, make_crc_table);
738 #endif /* DYNAMIC_CRC_TABLE */
739
740 /* Pre-condition the CRC */
741 crc = (~crc) & 0xffffffff;
742
743 #ifdef W
744
745 /* If provided enough bytes, do a braided CRC calculation. */
746 if (len >= N * W + W - 1) {
747 z_size_t blks;
748 z_word_t const *words;
749 unsigned endian;
750 int k;
751
752 /* Compute the CRC up to a z_word_t boundary. */
753 while (len && ((z_size_t)buf & (W - 1)) != 0) {
754 len--;
755 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
756 }
757
758 /* Compute the CRC on as many N z_word_t blocks as are available. */
759 blks = len / (N * W);
760 len -= blks * N * W;
761 words = (z_word_t const *)buf;
762
763 /* Do endian check at execution time instead of compile time, since ARM
764 processors can change the endianess at execution time. If the
765 compiler knows what the endianess will be, it can optimize out the
766 check and the unused branch. */
767 endian = 1;
768 if (*(unsigned char *)&endian) {
769 /* Little endian. */
770
771 z_crc_t crc0;
772 z_word_t word0;
773 #if N > 1
774 z_crc_t crc1;
775 z_word_t word1;
776 #if N > 2
777 z_crc_t crc2;
778 z_word_t word2;
779 #if N > 3
780 z_crc_t crc3;
781 z_word_t word3;
782 #if N > 4
783 z_crc_t crc4;
784 z_word_t word4;
785 #if N > 5
786 z_crc_t crc5;
787 z_word_t word5;
788 #endif
789 #endif
790 #endif
791 #endif
792 #endif
793
794 /* Initialize the CRC for each braid. */
795 crc0 = crc;
796 #if N > 1
797 crc1 = 0;
798 #if N > 2
799 crc2 = 0;
800 #if N > 3
801 crc3 = 0;
802 #if N > 4
803 crc4 = 0;
804 #if N > 5
805 crc5 = 0;
806 #endif
807 #endif
808 #endif
809 #endif
810 #endif
811
812 /*
813 Process the first blks-1 blocks, computing the CRCs on each braid
814 independently.
815 */
816 while (--blks) {
817 /* Load the word for each braid into registers. */
818 word0 = crc0 ^ words[0];
819 #if N > 1
820 word1 = crc1 ^ words[1];
821 #if N > 2
822 word2 = crc2 ^ words[2];
823 #if N > 3
824 word3 = crc3 ^ words[3];
825 #if N > 4
826 word4 = crc4 ^ words[4];
827 #if N > 5
828 word5 = crc5 ^ words[5];
829 #endif
830 #endif
831 #endif
832 #endif
833 #endif
834 words += N;
835
836 /* Compute and update the CRC for each word. The loop should
837 get unrolled. */
838 crc0 = crc_braid_table[0][word0 & 0xff];
839 #if N > 1
840 crc1 = crc_braid_table[0][word1 & 0xff];
841 #if N > 2
842 crc2 = crc_braid_table[0][word2 & 0xff];
843 #if N > 3
844 crc3 = crc_braid_table[0][word3 & 0xff];
845 #if N > 4
846 crc4 = crc_braid_table[0][word4 & 0xff];
847 #if N > 5
848 crc5 = crc_braid_table[0][word5 & 0xff];
849 #endif
850 #endif
851 #endif
852 #endif
853 #endif
854 for (k = 1; k < W; k++) {
855 crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
856 #if N > 1
857 crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
858 #if N > 2
859 crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
860 #if N > 3
861 crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
862 #if N > 4
863 crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
864 #if N > 5
865 crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
866 #endif
867 #endif
868 #endif
869 #endif
870 #endif
871 }
872 }
873
874 /*
875 Process the last block, combining the CRCs of the N braids at the
876 same time.
877 */
878 crc = crc_word(crc0 ^ words[0]);
879 #if N > 1
880 crc = crc_word(crc1 ^ words[1] ^ crc);
881 #if N > 2
882 crc = crc_word(crc2 ^ words[2] ^ crc);
883 #if N > 3
884 crc = crc_word(crc3 ^ words[3] ^ crc);
885 #if N > 4
886 crc = crc_word(crc4 ^ words[4] ^ crc);
887 #if N > 5
888 crc = crc_word(crc5 ^ words[5] ^ crc);
889 #endif
890 #endif
891 #endif
892 #endif
893 #endif
894 words += N;
895 }
896 else {
897 /* Big endian. */
898
899 z_word_t crc0, word0, comb;
900 #if N > 1
901 z_word_t crc1, word1;
902 #if N > 2
903 z_word_t crc2, word2;
904 #if N > 3
905 z_word_t crc3, word3;
906 #if N > 4
907 z_word_t crc4, word4;
908 #if N > 5
909 z_word_t crc5, word5;
910 #endif
911 #endif
912 #endif
913 #endif
914 #endif
915
916 /* Initialize the CRC for each braid. */
917 crc0 = byte_swap(crc);
918 #if N > 1
919 crc1 = 0;
920 #if N > 2
921 crc2 = 0;
922 #if N > 3
923 crc3 = 0;
924 #if N > 4
925 crc4 = 0;
926 #if N > 5
927 crc5 = 0;
928 #endif
929 #endif
930 #endif
931 #endif
932 #endif
933
934 /*
935 Process the first blks-1 blocks, computing the CRCs on each braid
936 independently.
937 */
938 while (--blks) {
939 /* Load the word for each braid into registers. */
940 word0 = crc0 ^ words[0];
941 #if N > 1
942 word1 = crc1 ^ words[1];
943 #if N > 2
944 word2 = crc2 ^ words[2];
945 #if N > 3
946 word3 = crc3 ^ words[3];
947 #if N > 4
948 word4 = crc4 ^ words[4];
949 #if N > 5
950 word5 = crc5 ^ words[5];
951 #endif
952 #endif
953 #endif
954 #endif
955 #endif
956 words += N;
957
958 /* Compute and update the CRC for each word. The loop should
959 get unrolled. */
960 crc0 = crc_braid_big_table[0][word0 & 0xff];
961 #if N > 1
962 crc1 = crc_braid_big_table[0][word1 & 0xff];
963 #if N > 2
964 crc2 = crc_braid_big_table[0][word2 & 0xff];
965 #if N > 3
966 crc3 = crc_braid_big_table[0][word3 & 0xff];
967 #if N > 4
968 crc4 = crc_braid_big_table[0][word4 & 0xff];
969 #if N > 5
970 crc5 = crc_braid_big_table[0][word5 & 0xff];
971 #endif
972 #endif
973 #endif
974 #endif
975 #endif
976 for (k = 1; k < W; k++) {
977 crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
978 #if N > 1
979 crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
980 #if N > 2
981 crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
982 #if N > 3
983 crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
984 #if N > 4
985 crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
986 #if N > 5
987 crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
988 #endif
989 #endif
990 #endif
991 #endif
992 #endif
993 }
994 }
995
996 /*
997 Process the last block, combining the CRCs of the N braids at the
998 same time.
999 */
1000 comb = crc_word_big(crc0 ^ words[0]);
1001 #if N > 1
1002 comb = crc_word_big(crc1 ^ words[1] ^ comb);
1003 #if N > 2
1004 comb = crc_word_big(crc2 ^ words[2] ^ comb);
1005 #if N > 3
1006 comb = crc_word_big(crc3 ^ words[3] ^ comb);
1007 #if N > 4
1008 comb = crc_word_big(crc4 ^ words[4] ^ comb);
1009 #if N > 5
1010 comb = crc_word_big(crc5 ^ words[5] ^ comb);
1011 #endif
1012 #endif
1013 #endif
1014 #endif
1015 #endif
1016 words += N;
1017 crc = byte_swap(comb);
1018 }
1019
1020 /*
1021 Update the pointer to the remaining bytes to process.
1022 */
1023 buf = (unsigned char const *)words;
1024 }
1025
1026 #endif /* W */
1027
1028 /* Complete the computation of the CRC on any remaining bytes. */
1029 while (len >= 8) {
1030 len -= 8;
1031 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1032 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1033 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1034 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1035 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1036 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1037 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1038 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1039 }
1040 while (len) {
1041 len--;
1042 crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1043 }
1044
1045 /* Return the CRC, post-conditioned. */
1046 return crc ^ 0xffffffff;
1047 }
1048
1049 #endif
1050
1051 /* ========================================================================= */
crc32(unsigned long crc,const unsigned char FAR * buf,uInt len)1052 unsigned long ZEXPORT crc32(unsigned long crc, const unsigned char FAR *buf,
1053 uInt len)
1054 {
1055 return crc32_z(crc, buf, len);
1056 }
1057
1058 /* ========================================================================= */
crc32_combine64(uLong crc1,uLong crc2,z_off64_t len2)1059 uLong ZEXPORT crc32_combine64(uLong crc1, uLong crc2, z_off64_t len2)
1060 {
1061 #ifdef DYNAMIC_CRC_TABLE
1062 once(&made, make_crc_table);
1063 #endif /* DYNAMIC_CRC_TABLE */
1064 return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1065 }
1066
1067 /* ========================================================================= */
crc32_combine(uLong crc1,uLong crc2,z_off_t len2)1068 uLong ZEXPORT crc32_combine(uLong crc1, uLong crc2, z_off_t len2)
1069 {
1070 return crc32_combine64(crc1, crc2, len2);
1071 }
1072
1073 /* ========================================================================= */
crc32_combine_gen64(z_off64_t len2)1074 uLong ZEXPORT crc32_combine_gen64(z_off64_t len2)
1075 {
1076 #ifdef DYNAMIC_CRC_TABLE
1077 once(&made, make_crc_table);
1078 #endif /* DYNAMIC_CRC_TABLE */
1079 return x2nmodp(len2, 3);
1080 }
1081
1082 /* ========================================================================= */
crc32_combine_gen(z_off_t len2)1083 uLong ZEXPORT crc32_combine_gen(z_off_t len2)
1084 {
1085 return crc32_combine_gen64(len2);
1086 }
1087
1088 /* ========================================================================= */
crc32_combine_op(uLong crc1,uLong crc2,uLong op)1089 uLong crc32_combine_op(uLong crc1, uLong crc2, uLong op)
1090 {
1091 return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1092 }
1093