xref: /reactos/sdk/lib/3rdparty/zlib/crc32.c (revision e5993f13)
1 /* crc32.c -- compute the CRC-32 of a data stream
2  * Copyright (C) 1995-2022 Mark Adler
3  * For conditions of distribution and use, see copyright notice in zlib.h
4  *
5  * This interleaved implementation of a CRC makes use of pipelined multiple
6  * arithmetic-logic units, commonly found in modern CPU cores. It is due to
7  * Kadatch and Jenkins (2010). See doc/crc-doc.1.0.pdf in this distribution.
8  */
9 
10 /* @(#) $Id$ */
11 
12 /*
13   Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
14   protection on the static variables used to control the first-use generation
15   of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
16   first call get_crc_table() to initialize the tables before allowing more than
17   one thread to use crc32().
18 
19   MAKECRCH can be #defined to write out crc32.h. A main() routine is also
20   produced, so that this one source file can be compiled to an executable.
21  */
22 
23 #ifdef MAKECRCH
24 #  include <stdio.h>
25 #  ifndef DYNAMIC_CRC_TABLE
26 #    define DYNAMIC_CRC_TABLE
27 #  endif /* !DYNAMIC_CRC_TABLE */
28 #endif /* MAKECRCH */
29 
30 #include "zutil.h"      /* for Z_U4, Z_U8, z_crc_t, and FAR definitions */
31 
32  /*
33   A CRC of a message is computed on N braids of words in the message, where
34   each word consists of W bytes (4 or 8). If N is 3, for example, then three
35   running sparse CRCs are calculated respectively on each braid, at these
36   indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
37   This is done starting at a word boundary, and continues until as many blocks
38   of N * W bytes as are available have been processed. The results are combined
39   into a single CRC at the end. For this code, N must be in the range 1..6 and
40   W must be 4 or 8. The upper limit on N can be increased if desired by adding
41   more #if blocks, extending the patterns apparent in the code. In addition,
42   crc32.h would need to be regenerated, if the maximum N value is increased.
43 
44   N and W are chosen empirically by benchmarking the execution time on a given
45   processor. The choices for N and W below were based on testing on Intel Kaby
46   Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
47   Octeon II processors. The Intel, AMD, and ARM processors were all fastest
48   with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
49   They were all tested with either gcc or clang, all using the -O3 optimization
50   level. Your mileage may vary.
51  */
52 
53 /* Define N */
54 #ifdef Z_TESTN
55 #  define N Z_TESTN
56 #else
57 #  define N 5
58 #endif
59 #if N < 1 || N > 6
60 #  error N must be in 1..6
61 #endif
62 
63 /*
64   z_crc_t must be at least 32 bits. z_word_t must be at least as long as
65   z_crc_t. It is assumed here that z_word_t is either 32 bits or 64 bits, and
66   that bytes are eight bits.
67  */
68 
69 /*
70   Define W and the associated z_word_t type. If W is not defined, then a
71   braided calculation is not used, and the associated tables and code are not
72   compiled.
73  */
74 #ifdef Z_TESTW
75 #  if Z_TESTW-1 != -1
76 #    define W Z_TESTW
77 #  endif
78 #else
79 #  ifdef MAKECRCH
80 #    define W 8         /* required for MAKECRCH */
81 #  else
82 #    if defined(__x86_64__) || defined(__aarch64__)
83 #      define W 8
84 #    else
85 #      define W 4
86 #    endif
87 #  endif
88 #endif
89 #ifdef W
90 #  if W == 8 && defined(Z_U8)
91      typedef Z_U8 z_word_t;
92 #  elif defined(Z_U4)
93 #    undef W
94 #    define W 4
95      typedef Z_U4 z_word_t;
96 #  else
97 #    undef W
98 #  endif
99 #endif
100 
101 /* If available, use the ARM processor CRC32 instruction. */
102 #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) && W == 8
103 #  define ARMCRC32
104 #endif
105 
106 /* Local functions. */
107 local z_crc_t multmodp OF((z_crc_t a, z_crc_t b));
108 local z_crc_t x2nmodp OF((z_off64_t n, unsigned k));
109 
110 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
111     local z_word_t byte_swap OF((z_word_t word));
112 #endif
113 
114 #if defined(W) && !defined(ARMCRC32)
115     local z_crc_t crc_word OF((z_word_t data));
116     local z_word_t crc_word_big OF((z_word_t data));
117 #endif
118 
119 #if defined(W) && (!defined(ARMCRC32) || defined(DYNAMIC_CRC_TABLE))
120 /*
121   Swap the bytes in a z_word_t to convert between little and big endian. Any
122   self-respecting compiler will optimize this to a single machine byte-swap
123   instruction, if one is available. This assumes that word_t is either 32 bits
124   or 64 bits.
125  */
126 local z_word_t byte_swap(word)
127     z_word_t word;
128 {
129 #  if W == 8
130     return
131         (word & 0xff00000000000000) >> 56 |
132         (word & 0xff000000000000) >> 40 |
133         (word & 0xff0000000000) >> 24 |
134         (word & 0xff00000000) >> 8 |
135         (word & 0xff000000) << 8 |
136         (word & 0xff0000) << 24 |
137         (word & 0xff00) << 40 |
138         (word & 0xff) << 56;
139 #  else   /* W == 4 */
140     return
141         (word & 0xff000000) >> 24 |
142         (word & 0xff0000) >> 8 |
143         (word & 0xff00) << 8 |
144         (word & 0xff) << 24;
145 #  endif
146 }
147 #endif
148 
149 /* CRC polynomial. */
150 #define POLY 0xedb88320         /* p(x) reflected, with x^32 implied */
151 
152 #ifdef DYNAMIC_CRC_TABLE
153 
154 local z_crc_t FAR crc_table[256];
155 local z_crc_t FAR x2n_table[32];
156 local void make_crc_table OF((void));
157 #ifdef W
158    local z_word_t FAR crc_big_table[256];
159    local z_crc_t FAR crc_braid_table[W][256];
160    local z_word_t FAR crc_braid_big_table[W][256];
161    local void braid OF((z_crc_t [][256], z_word_t [][256], int, int));
162 #endif
163 #ifdef MAKECRCH
164    local void write_table OF((FILE *, const z_crc_t FAR *, int));
165    local void write_table32hi OF((FILE *, const z_word_t FAR *, int));
166    local void write_table64 OF((FILE *, const z_word_t FAR *, int));
167 #endif /* MAKECRCH */
168 
169 /*
170   Define a once() function depending on the availability of atomics. If this is
171   compiled with DYNAMIC_CRC_TABLE defined, and if CRCs will be computed in
172   multiple threads, and if atomics are not available, then get_crc_table() must
173   be called to initialize the tables and must return before any threads are
174   allowed to compute or combine CRCs.
175  */
176 
177 /* Definition of once functionality. */
178 typedef struct once_s once_t;
179 local void once OF((once_t *, void (*)(void)));
180 
181 /* Check for the availability of atomics. */
182 #if defined(__STDC__) && __STDC_VERSION__ >= 201112L && \
183     !defined(__STDC_NO_ATOMICS__)
184 
185 #include <stdatomic.h>
186 
187 /* Structure for once(), which must be initialized with ONCE_INIT. */
188 struct once_s {
189     atomic_flag begun;
190     atomic_int done;
191 };
192 #define ONCE_INIT {ATOMIC_FLAG_INIT, 0}
193 
194 /*
195   Run the provided init() function exactly once, even if multiple threads
196   invoke once() at the same time. The state must be a once_t initialized with
197   ONCE_INIT.
198  */
199 local void once(state, init)
200     once_t *state;
201     void (*init)(void);
202 {
203     if (!atomic_load(&state->done)) {
204         if (atomic_flag_test_and_set(&state->begun))
205             while (!atomic_load(&state->done))
206                 ;
207         else {
208             init();
209             atomic_store(&state->done, 1);
210         }
211     }
212 }
213 
214 #else   /* no atomics */
215 
216 /* Structure for once(), which must be initialized with ONCE_INIT. */
217 struct once_s {
218     volatile int begun;
219     volatile int done;
220 };
221 #define ONCE_INIT {0, 0}
222 
223 /* Test and set. Alas, not atomic, but tries to minimize the period of
224    vulnerability. */
225 local int test_and_set OF((int volatile *));
226 local int test_and_set(flag)
227     int volatile *flag;
228 {
229     int was;
230 
231     was = *flag;
232     *flag = 1;
233     return was;
234 }
235 
236 /* Run the provided init() function once. This is not thread-safe. */
237 local void once(state, init)
238     once_t *state;
239     void (*init)(void);
240 {
241     if (!state->done) {
242         if (test_and_set(&state->begun))
243             while (!state->done)
244                 ;
245         else {
246             init();
247             state->done = 1;
248         }
249     }
250 }
251 
252 #endif
253 
254 /* State for once(). */
255 local once_t made = ONCE_INIT;
256 
257 /*
258   Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
259   x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
260 
261   Polynomials over GF(2) are represented in binary, one bit per coefficient,
262   with the lowest powers in the most significant bit. Then adding polynomials
263   is just exclusive-or, and multiplying a polynomial by x is a right shift by
264   one. If we call the above polynomial p, and represent a byte as the
265   polynomial q, also with the lowest power in the most significant bit (so the
266   byte 0xb1 is the polynomial x^7+x^3+x^2+1), then the CRC is (q*x^32) mod p,
267   where a mod b means the remainder after dividing a by b.
268 
269   This calculation is done using the shift-register method of multiplying and
270   taking the remainder. The register is initialized to zero, and for each
271   incoming bit, x^32 is added mod p to the register if the bit is a one (where
272   x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by x
273   (which is shifting right by one and adding x^32 mod p if the bit shifted out
274   is a one). We start with the highest power (least significant bit) of q and
275   repeat for all eight bits of q.
276 
277   The table is simply the CRC of all possible eight bit values. This is all the
278   information needed to generate CRCs on data a byte at a time for all
279   combinations of CRC register values and incoming bytes.
280  */
281 
282 local void make_crc_table()
283 {
284     unsigned i, j, n;
285     z_crc_t p;
286 
287     /* initialize the CRC of bytes tables */
288     for (i = 0; i < 256; i++) {
289         p = i;
290         for (j = 0; j < 8; j++)
291             p = p & 1 ? (p >> 1) ^ POLY : p >> 1;
292         crc_table[i] = p;
293 #ifdef W
294         crc_big_table[i] = byte_swap(p);
295 #endif
296     }
297 
298     /* initialize the x^2^n mod p(x) table */
299     p = (z_crc_t)1 << 30;         /* x^1 */
300     x2n_table[0] = p;
301     for (n = 1; n < 32; n++)
302         x2n_table[n] = p = multmodp(p, p);
303 
304 #ifdef W
305     /* initialize the braiding tables -- needs x2n_table[] */
306     braid(crc_braid_table, crc_braid_big_table, N, W);
307 #endif
308 
309 #ifdef MAKECRCH
310     {
311         /*
312           The crc32.h header file contains tables for both 32-bit and 64-bit
313           z_word_t's, and so requires a 64-bit type be available. In that case,
314           z_word_t must be defined to be 64-bits. This code then also generates
315           and writes out the tables for the case that z_word_t is 32 bits.
316          */
317 #if !defined(W) || W != 8
318 #  error Need a 64-bit integer type in order to generate crc32.h.
319 #endif
320         FILE *out;
321         int k, n;
322         z_crc_t ltl[8][256];
323         z_word_t big[8][256];
324 
325         out = fopen("crc32.h", "w");
326         if (out == NULL) return;
327 
328         /* write out little-endian CRC table to crc32.h */
329         fprintf(out,
330             "/* crc32.h -- tables for rapid CRC calculation\n"
331             " * Generated automatically by crc32.c\n */\n"
332             "\n"
333             "local const z_crc_t FAR crc_table[] = {\n"
334             "    ");
335         write_table(out, crc_table, 256);
336         fprintf(out,
337             "};\n");
338 
339         /* write out big-endian CRC table for 64-bit z_word_t to crc32.h */
340         fprintf(out,
341             "\n"
342             "#ifdef W\n"
343             "\n"
344             "#if W == 8\n"
345             "\n"
346             "local const z_word_t FAR crc_big_table[] = {\n"
347             "    ");
348         write_table64(out, crc_big_table, 256);
349         fprintf(out,
350             "};\n");
351 
352         /* write out big-endian CRC table for 32-bit z_word_t to crc32.h */
353         fprintf(out,
354             "\n"
355             "#else /* W == 4 */\n"
356             "\n"
357             "local const z_word_t FAR crc_big_table[] = {\n"
358             "    ");
359         write_table32hi(out, crc_big_table, 256);
360         fprintf(out,
361             "};\n"
362             "\n"
363             "#endif\n");
364 
365         /* write out braid tables for each value of N */
366         for (n = 1; n <= 6; n++) {
367             fprintf(out,
368             "\n"
369             "#if N == %d\n", n);
370 
371             /* compute braid tables for this N and 64-bit word_t */
372             braid(ltl, big, n, 8);
373 
374             /* write out braid tables for 64-bit z_word_t to crc32.h */
375             fprintf(out,
376             "\n"
377             "#if W == 8\n"
378             "\n"
379             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
380             for (k = 0; k < 8; k++) {
381                 fprintf(out, "   {");
382                 write_table(out, ltl[k], 256);
383                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
384             }
385             fprintf(out,
386             "};\n"
387             "\n"
388             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
389             for (k = 0; k < 8; k++) {
390                 fprintf(out, "   {");
391                 write_table64(out, big[k], 256);
392                 fprintf(out, "}%s", k < 7 ? ",\n" : "");
393             }
394             fprintf(out,
395             "};\n");
396 
397             /* compute braid tables for this N and 32-bit word_t */
398             braid(ltl, big, n, 4);
399 
400             /* write out braid tables for 32-bit z_word_t to crc32.h */
401             fprintf(out,
402             "\n"
403             "#else /* W == 4 */\n"
404             "\n"
405             "local const z_crc_t FAR crc_braid_table[][256] = {\n");
406             for (k = 0; k < 4; k++) {
407                 fprintf(out, "   {");
408                 write_table(out, ltl[k], 256);
409                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
410             }
411             fprintf(out,
412             "};\n"
413             "\n"
414             "local const z_word_t FAR crc_braid_big_table[][256] = {\n");
415             for (k = 0; k < 4; k++) {
416                 fprintf(out, "   {");
417                 write_table32hi(out, big[k], 256);
418                 fprintf(out, "}%s", k < 3 ? ",\n" : "");
419             }
420             fprintf(out,
421             "};\n"
422             "\n"
423             "#endif\n"
424             "\n"
425             "#endif\n");
426         }
427         fprintf(out,
428             "\n"
429             "#endif\n");
430 
431         /* write out zeros operator table to crc32.h */
432         fprintf(out,
433             "\n"
434             "local const z_crc_t FAR x2n_table[] = {\n"
435             "    ");
436         write_table(out, x2n_table, 32);
437         fprintf(out,
438             "};\n");
439         fclose(out);
440     }
441 #endif /* MAKECRCH */
442 }
443 
444 #ifdef MAKECRCH
445 
446 /*
447    Write the 32-bit values in table[0..k-1] to out, five per line in
448    hexadecimal separated by commas.
449  */
450 local void write_table(out, table, k)
451     FILE *out;
452     const z_crc_t FAR *table;
453     int k;
454 {
455     int n;
456 
457     for (n = 0; n < k; n++)
458         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
459                 (unsigned long)(table[n]),
460                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
461 }
462 
463 /*
464    Write the high 32-bits of each value in table[0..k-1] to out, five per line
465    in hexadecimal separated by commas.
466  */
467 local void write_table32hi(out, table, k)
468 FILE *out;
469 const z_word_t FAR *table;
470 int k;
471 {
472     int n;
473 
474     for (n = 0; n < k; n++)
475         fprintf(out, "%s0x%08lx%s", n == 0 || n % 5 ? "" : "    ",
476                 (unsigned long)(table[n] >> 32),
477                 n == k - 1 ? "" : (n % 5 == 4 ? ",\n" : ", "));
478 }
479 
480 /*
481   Write the 64-bit values in table[0..k-1] to out, three per line in
482   hexadecimal separated by commas. This assumes that if there is a 64-bit
483   type, then there is also a long long integer type, and it is at least 64
484   bits. If not, then the type cast and format string can be adjusted
485   accordingly.
486  */
487 local void write_table64(out, table, k)
488     FILE *out;
489     const z_word_t FAR *table;
490     int k;
491 {
492     int n;
493 
494     for (n = 0; n < k; n++)
495         fprintf(out, "%s0x%016llx%s", n == 0 || n % 3 ? "" : "    ",
496                 (unsigned long long)(table[n]),
497                 n == k - 1 ? "" : (n % 3 == 2 ? ",\n" : ", "));
498 }
499 
500 /* Actually do the deed. */
501 int main()
502 {
503     make_crc_table();
504     return 0;
505 }
506 
507 #endif /* MAKECRCH */
508 
509 #ifdef W
510 /*
511   Generate the little and big-endian braid tables for the given n and z_word_t
512   size w. Each array must have room for w blocks of 256 elements.
513  */
514 local void braid(ltl, big, n, w)
515     z_crc_t ltl[][256];
516     z_word_t big[][256];
517     int n;
518     int w;
519 {
520     int k;
521     z_crc_t i, p, q;
522     for (k = 0; k < w; k++) {
523         p = x2nmodp((n * w + 3 - k) << 3, 0);
524         ltl[k][0] = 0;
525         big[w - 1 - k][0] = 0;
526         for (i = 1; i < 256; i++) {
527             ltl[k][i] = q = multmodp(i << 24, p);
528             big[w - 1 - k][i] = byte_swap(q);
529         }
530     }
531 }
532 #endif
533 
534 #else /* !DYNAMIC_CRC_TABLE */
535 /* ========================================================================
536  * Tables for byte-wise and braided CRC-32 calculations, and a table of powers
537  * of x for combining CRC-32s, all made by make_crc_table().
538  */
539 #include "crc32.h"
540 #endif /* DYNAMIC_CRC_TABLE */
541 
542 /* ========================================================================
543  * Routines used for CRC calculation. Some are also required for the table
544  * generation above.
545  */
546 
547 /*
548   Return a(x) multiplied by b(x) modulo p(x), where p(x) is the CRC polynomial,
549   reflected. For speed, this requires that a not be zero.
550  */
551 local z_crc_t multmodp(a, b)
552     z_crc_t a;
553     z_crc_t b;
554 {
555     z_crc_t m, p;
556 
557     m = (z_crc_t)1 << 31;
558     p = 0;
559     for (;;) {
560         if (a & m) {
561             p ^= b;
562             if ((a & (m - 1)) == 0)
563                 break;
564         }
565         m >>= 1;
566         b = b & 1 ? (b >> 1) ^ POLY : b >> 1;
567     }
568     return p;
569 }
570 
571 /*
572   Return x^(n * 2^k) modulo p(x). Requires that x2n_table[] has been
573   initialized.
574  */
575 local z_crc_t x2nmodp(n, k)
576     z_off64_t n;
577     unsigned k;
578 {
579     z_crc_t p;
580 
581     p = (z_crc_t)1 << 31;           /* x^0 == 1 */
582     while (n) {
583         if (n & 1)
584             p = multmodp(x2n_table[k & 31], p);
585         n >>= 1;
586         k++;
587     }
588     return p;
589 }
590 
591 /* =========================================================================
592  * This function can be used by asm versions of crc32(), and to force the
593  * generation of the CRC tables in a threaded application.
594  */
595 const z_crc_t FAR * ZEXPORT get_crc_table()
596 {
597 #ifdef DYNAMIC_CRC_TABLE
598     once(&made, make_crc_table);
599 #endif /* DYNAMIC_CRC_TABLE */
600     return (const z_crc_t FAR *)crc_table;
601 }
602 
603 /* =========================================================================
604  * Use ARM machine instructions if available. This will compute the CRC about
605  * ten times faster than the braided calculation. This code does not check for
606  * the presence of the CRC instruction at run time. __ARM_FEATURE_CRC32 will
607  * only be defined if the compilation specifies an ARM processor architecture
608  * that has the instructions. For example, compiling with -march=armv8.1-a or
609  * -march=armv8-a+crc, or -march=native if the compile machine has the crc32
610  * instructions.
611  */
612 #ifdef ARMCRC32
613 
614 /*
615    Constants empirically determined to maximize speed. These values are from
616    measurements on a Cortex-A57. Your mileage may vary.
617  */
618 #define Z_BATCH 3990                /* number of words in a batch */
619 #define Z_BATCH_ZEROS 0xa10d3d0c    /* computed from Z_BATCH = 3990 */
620 #define Z_BATCH_MIN 800             /* fewest words in a final batch */
621 
622 unsigned long ZEXPORT crc32_z(crc, buf, len)
623     unsigned long crc;
624     const unsigned char FAR *buf;
625     z_size_t len;
626 {
627     z_crc_t val;
628     z_word_t crc1, crc2;
629     const z_word_t *word;
630     z_word_t val0, val1, val2;
631     z_size_t last, last2, i;
632     z_size_t num;
633 
634     /* Return initial CRC, if requested. */
635     if (buf == Z_NULL) return 0;
636 
637 #ifdef DYNAMIC_CRC_TABLE
638     once(&made, make_crc_table);
639 #endif /* DYNAMIC_CRC_TABLE */
640 
641     /* Pre-condition the CRC */
642     crc = (~crc) & 0xffffffff;
643 
644     /* Compute the CRC up to a word boundary. */
645     while (len && ((z_size_t)buf & 7) != 0) {
646         len--;
647         val = *buf++;
648         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
649     }
650 
651     /* Prepare to compute the CRC on full 64-bit words word[0..num-1]. */
652     word = (z_word_t const *)buf;
653     num = len >> 3;
654     len &= 7;
655 
656     /* Do three interleaved CRCs to realize the throughput of one crc32x
657        instruction per cycle. Each CRC is calculated on Z_BATCH words. The
658        three CRCs are combined into a single CRC after each set of batches. */
659     while (num >= 3 * Z_BATCH) {
660         crc1 = 0;
661         crc2 = 0;
662         for (i = 0; i < Z_BATCH; i++) {
663             val0 = word[i];
664             val1 = word[i + Z_BATCH];
665             val2 = word[i + 2 * Z_BATCH];
666             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
667             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
668             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
669         }
670         word += 3 * Z_BATCH;
671         num -= 3 * Z_BATCH;
672         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc1;
673         crc = multmodp(Z_BATCH_ZEROS, crc) ^ crc2;
674     }
675 
676     /* Do one last smaller batch with the remaining words, if there are enough
677        to pay for the combination of CRCs. */
678     last = num / 3;
679     if (last >= Z_BATCH_MIN) {
680         last2 = last << 1;
681         crc1 = 0;
682         crc2 = 0;
683         for (i = 0; i < last; i++) {
684             val0 = word[i];
685             val1 = word[i + last];
686             val2 = word[i + last2];
687             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
688             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc1) : "r"(val1));
689             __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc2) : "r"(val2));
690         }
691         word += 3 * last;
692         num -= 3 * last;
693         val = x2nmodp(last, 6);
694         crc = multmodp(val, crc) ^ crc1;
695         crc = multmodp(val, crc) ^ crc2;
696     }
697 
698     /* Compute the CRC on any remaining words. */
699     for (i = 0; i < num; i++) {
700         val0 = word[i];
701         __asm__ volatile("crc32x %w0, %w0, %x1" : "+r"(crc) : "r"(val0));
702     }
703     word += num;
704 
705     /* Complete the CRC on any remaining bytes. */
706     buf = (const unsigned char FAR *)word;
707     while (len) {
708         len--;
709         val = *buf++;
710         __asm__ volatile("crc32b %w0, %w0, %w1" : "+r"(crc) : "r"(val));
711     }
712 
713     /* Return the CRC, post-conditioned. */
714     return crc ^ 0xffffffff;
715 }
716 
717 #else
718 
719 #ifdef W
720 
721 /*
722   Return the CRC of the W bytes in the word_t data, taking the
723   least-significant byte of the word as the first byte of data, without any pre
724   or post conditioning. This is used to combine the CRCs of each braid.
725  */
726 local z_crc_t crc_word(data)
727     z_word_t data;
728 {
729     int k;
730     for (k = 0; k < W; k++)
731         data = (data >> 8) ^ crc_table[data & 0xff];
732     return (z_crc_t)data;
733 }
734 
735 local z_word_t crc_word_big(data)
736     z_word_t data;
737 {
738     int k;
739     for (k = 0; k < W; k++)
740         data = (data << 8) ^
741             crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
742     return data;
743 }
744 
745 #endif
746 
747 /* ========================================================================= */
748 unsigned long ZEXPORT crc32_z(crc, buf, len)
749     unsigned long crc;
750     const unsigned char FAR *buf;
751     z_size_t len;
752 {
753     /* Return initial CRC, if requested. */
754     if (buf == Z_NULL) return 0;
755 
756 #ifdef DYNAMIC_CRC_TABLE
757     once(&made, make_crc_table);
758 #endif /* DYNAMIC_CRC_TABLE */
759 
760     /* Pre-condition the CRC */
761     crc = (~crc) & 0xffffffff;
762 
763 #ifdef W
764 
765     /* If provided enough bytes, do a braided CRC calculation. */
766     if (len >= N * W + W - 1) {
767         z_size_t blks;
768         z_word_t const *words;
769         unsigned endian;
770         int k;
771 
772         /* Compute the CRC up to a z_word_t boundary. */
773         while (len && ((z_size_t)buf & (W - 1)) != 0) {
774             len--;
775             crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
776         }
777 
778         /* Compute the CRC on as many N z_word_t blocks as are available. */
779         blks = len / (N * W);
780         len -= blks * N * W;
781         words = (z_word_t const *)buf;
782 
783         /* Do endian check at execution time instead of compile time, since ARM
784            processors can change the endianess at execution time. If the
785            compiler knows what the endianess will be, it can optimize out the
786            check and the unused branch. */
787         endian = 1;
788         if (*(unsigned char *)&endian) {
789             /* Little endian. */
790 
791             z_crc_t crc0;
792             z_word_t word0;
793 #if N > 1
794             z_crc_t crc1;
795             z_word_t word1;
796 #if N > 2
797             z_crc_t crc2;
798             z_word_t word2;
799 #if N > 3
800             z_crc_t crc3;
801             z_word_t word3;
802 #if N > 4
803             z_crc_t crc4;
804             z_word_t word4;
805 #if N > 5
806             z_crc_t crc5;
807             z_word_t word5;
808 #endif
809 #endif
810 #endif
811 #endif
812 #endif
813 
814             /* Initialize the CRC for each braid. */
815             crc0 = crc;
816 #if N > 1
817             crc1 = 0;
818 #if N > 2
819             crc2 = 0;
820 #if N > 3
821             crc3 = 0;
822 #if N > 4
823             crc4 = 0;
824 #if N > 5
825             crc5 = 0;
826 #endif
827 #endif
828 #endif
829 #endif
830 #endif
831 
832             /*
833               Process the first blks-1 blocks, computing the CRCs on each braid
834               independently.
835              */
836             while (--blks) {
837                 /* Load the word for each braid into registers. */
838                 word0 = crc0 ^ words[0];
839 #if N > 1
840                 word1 = crc1 ^ words[1];
841 #if N > 2
842                 word2 = crc2 ^ words[2];
843 #if N > 3
844                 word3 = crc3 ^ words[3];
845 #if N > 4
846                 word4 = crc4 ^ words[4];
847 #if N > 5
848                 word5 = crc5 ^ words[5];
849 #endif
850 #endif
851 #endif
852 #endif
853 #endif
854                 words += N;
855 
856                 /* Compute and update the CRC for each word. The loop should
857                    get unrolled. */
858                 crc0 = crc_braid_table[0][word0 & 0xff];
859 #if N > 1
860                 crc1 = crc_braid_table[0][word1 & 0xff];
861 #if N > 2
862                 crc2 = crc_braid_table[0][word2 & 0xff];
863 #if N > 3
864                 crc3 = crc_braid_table[0][word3 & 0xff];
865 #if N > 4
866                 crc4 = crc_braid_table[0][word4 & 0xff];
867 #if N > 5
868                 crc5 = crc_braid_table[0][word5 & 0xff];
869 #endif
870 #endif
871 #endif
872 #endif
873 #endif
874                 for (k = 1; k < W; k++) {
875                     crc0 ^= crc_braid_table[k][(word0 >> (k << 3)) & 0xff];
876 #if N > 1
877                     crc1 ^= crc_braid_table[k][(word1 >> (k << 3)) & 0xff];
878 #if N > 2
879                     crc2 ^= crc_braid_table[k][(word2 >> (k << 3)) & 0xff];
880 #if N > 3
881                     crc3 ^= crc_braid_table[k][(word3 >> (k << 3)) & 0xff];
882 #if N > 4
883                     crc4 ^= crc_braid_table[k][(word4 >> (k << 3)) & 0xff];
884 #if N > 5
885                     crc5 ^= crc_braid_table[k][(word5 >> (k << 3)) & 0xff];
886 #endif
887 #endif
888 #endif
889 #endif
890 #endif
891                 }
892             }
893 
894             /*
895               Process the last block, combining the CRCs of the N braids at the
896               same time.
897              */
898             crc = crc_word(crc0 ^ words[0]);
899 #if N > 1
900             crc = crc_word(crc1 ^ words[1] ^ crc);
901 #if N > 2
902             crc = crc_word(crc2 ^ words[2] ^ crc);
903 #if N > 3
904             crc = crc_word(crc3 ^ words[3] ^ crc);
905 #if N > 4
906             crc = crc_word(crc4 ^ words[4] ^ crc);
907 #if N > 5
908             crc = crc_word(crc5 ^ words[5] ^ crc);
909 #endif
910 #endif
911 #endif
912 #endif
913 #endif
914             words += N;
915         }
916         else {
917             /* Big endian. */
918 
919             z_word_t crc0, word0, comb;
920 #if N > 1
921             z_word_t crc1, word1;
922 #if N > 2
923             z_word_t crc2, word2;
924 #if N > 3
925             z_word_t crc3, word3;
926 #if N > 4
927             z_word_t crc4, word4;
928 #if N > 5
929             z_word_t crc5, word5;
930 #endif
931 #endif
932 #endif
933 #endif
934 #endif
935 
936             /* Initialize the CRC for each braid. */
937             crc0 = byte_swap(crc);
938 #if N > 1
939             crc1 = 0;
940 #if N > 2
941             crc2 = 0;
942 #if N > 3
943             crc3 = 0;
944 #if N > 4
945             crc4 = 0;
946 #if N > 5
947             crc5 = 0;
948 #endif
949 #endif
950 #endif
951 #endif
952 #endif
953 
954             /*
955               Process the first blks-1 blocks, computing the CRCs on each braid
956               independently.
957              */
958             while (--blks) {
959                 /* Load the word for each braid into registers. */
960                 word0 = crc0 ^ words[0];
961 #if N > 1
962                 word1 = crc1 ^ words[1];
963 #if N > 2
964                 word2 = crc2 ^ words[2];
965 #if N > 3
966                 word3 = crc3 ^ words[3];
967 #if N > 4
968                 word4 = crc4 ^ words[4];
969 #if N > 5
970                 word5 = crc5 ^ words[5];
971 #endif
972 #endif
973 #endif
974 #endif
975 #endif
976                 words += N;
977 
978                 /* Compute and update the CRC for each word. The loop should
979                    get unrolled. */
980                 crc0 = crc_braid_big_table[0][word0 & 0xff];
981 #if N > 1
982                 crc1 = crc_braid_big_table[0][word1 & 0xff];
983 #if N > 2
984                 crc2 = crc_braid_big_table[0][word2 & 0xff];
985 #if N > 3
986                 crc3 = crc_braid_big_table[0][word3 & 0xff];
987 #if N > 4
988                 crc4 = crc_braid_big_table[0][word4 & 0xff];
989 #if N > 5
990                 crc5 = crc_braid_big_table[0][word5 & 0xff];
991 #endif
992 #endif
993 #endif
994 #endif
995 #endif
996                 for (k = 1; k < W; k++) {
997                     crc0 ^= crc_braid_big_table[k][(word0 >> (k << 3)) & 0xff];
998 #if N > 1
999                     crc1 ^= crc_braid_big_table[k][(word1 >> (k << 3)) & 0xff];
1000 #if N > 2
1001                     crc2 ^= crc_braid_big_table[k][(word2 >> (k << 3)) & 0xff];
1002 #if N > 3
1003                     crc3 ^= crc_braid_big_table[k][(word3 >> (k << 3)) & 0xff];
1004 #if N > 4
1005                     crc4 ^= crc_braid_big_table[k][(word4 >> (k << 3)) & 0xff];
1006 #if N > 5
1007                     crc5 ^= crc_braid_big_table[k][(word5 >> (k << 3)) & 0xff];
1008 #endif
1009 #endif
1010 #endif
1011 #endif
1012 #endif
1013                 }
1014             }
1015 
1016             /*
1017               Process the last block, combining the CRCs of the N braids at the
1018               same time.
1019              */
1020             comb = crc_word_big(crc0 ^ words[0]);
1021 #if N > 1
1022             comb = crc_word_big(crc1 ^ words[1] ^ comb);
1023 #if N > 2
1024             comb = crc_word_big(crc2 ^ words[2] ^ comb);
1025 #if N > 3
1026             comb = crc_word_big(crc3 ^ words[3] ^ comb);
1027 #if N > 4
1028             comb = crc_word_big(crc4 ^ words[4] ^ comb);
1029 #if N > 5
1030             comb = crc_word_big(crc5 ^ words[5] ^ comb);
1031 #endif
1032 #endif
1033 #endif
1034 #endif
1035 #endif
1036             words += N;
1037             crc = byte_swap(comb);
1038         }
1039 
1040         /*
1041           Update the pointer to the remaining bytes to process.
1042          */
1043         buf = (unsigned char const *)words;
1044     }
1045 
1046 #endif /* W */
1047 
1048     /* Complete the computation of the CRC on any remaining bytes. */
1049     while (len >= 8) {
1050         len -= 8;
1051         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1052         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1053         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1054         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1055         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1056         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1057         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1058         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1059     }
1060     while (len) {
1061         len--;
1062         crc = (crc >> 8) ^ crc_table[(crc ^ *buf++) & 0xff];
1063     }
1064 
1065     /* Return the CRC, post-conditioned. */
1066     return crc ^ 0xffffffff;
1067 }
1068 
1069 #endif
1070 
1071 /* ========================================================================= */
1072 unsigned long ZEXPORT crc32(crc, buf, len)
1073     unsigned long crc;
1074     const unsigned char FAR *buf;
1075     uInt len;
1076 {
1077     return crc32_z(crc, buf, len);
1078 }
1079 
1080 /* ========================================================================= */
1081 uLong ZEXPORT crc32_combine64(crc1, crc2, len2)
1082     uLong crc1;
1083     uLong crc2;
1084     z_off64_t len2;
1085 {
1086 #ifdef DYNAMIC_CRC_TABLE
1087     once(&made, make_crc_table);
1088 #endif /* DYNAMIC_CRC_TABLE */
1089     return multmodp(x2nmodp(len2, 3), crc1) ^ (crc2 & 0xffffffff);
1090 }
1091 
1092 /* ========================================================================= */
1093 uLong ZEXPORT crc32_combine(crc1, crc2, len2)
1094     uLong crc1;
1095     uLong crc2;
1096     z_off_t len2;
1097 {
1098     return crc32_combine64(crc1, crc2, (z_off64_t)len2);
1099 }
1100 
1101 /* ========================================================================= */
1102 uLong ZEXPORT crc32_combine_gen64(len2)
1103     z_off64_t len2;
1104 {
1105 #ifdef DYNAMIC_CRC_TABLE
1106     once(&made, make_crc_table);
1107 #endif /* DYNAMIC_CRC_TABLE */
1108     return x2nmodp(len2, 3);
1109 }
1110 
1111 /* ========================================================================= */
1112 uLong ZEXPORT crc32_combine_gen(len2)
1113     z_off_t len2;
1114 {
1115     return crc32_combine_gen64((z_off64_t)len2);
1116 }
1117 
1118 /* ========================================================================= */
1119 uLong ZEXPORT crc32_combine_op(crc1, crc2, op)
1120     uLong crc1;
1121     uLong crc2;
1122     uLong op;
1123 {
1124     return multmodp(op, crc1) ^ (crc2 & 0xffffffff);
1125 }
1126