xref: /dragonfly/test/debug/crc32hw.c (revision a4fe36f1)
1*a4fe36f1SMatthew Dillon /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2*a4fe36f1SMatthew Dillon  * Copyright (C) 2013 Mark Adler
3*a4fe36f1SMatthew Dillon  * Version 1.1  1 Aug 2013  Mark Adler
4*a4fe36f1SMatthew Dillon  */
5*a4fe36f1SMatthew Dillon 
6*a4fe36f1SMatthew Dillon /*
7*a4fe36f1SMatthew Dillon   This software is provided 'as-is', without any express or implied
8*a4fe36f1SMatthew Dillon   warranty.  In no event will the author be held liable for any damages
9*a4fe36f1SMatthew Dillon   arising from the use of this software.
10*a4fe36f1SMatthew Dillon 
11*a4fe36f1SMatthew Dillon   Permission is granted to anyone to use this software for any purpose,
12*a4fe36f1SMatthew Dillon   including commercial applications, and to alter it and redistribute it
13*a4fe36f1SMatthew Dillon   freely, subject to the following restrictions:
14*a4fe36f1SMatthew Dillon 
15*a4fe36f1SMatthew Dillon   1. The origin of this software must not be misrepresented; you must not
16*a4fe36f1SMatthew Dillon      claim that you wrote the original software. If you use this software
17*a4fe36f1SMatthew Dillon      in a product, an acknowledgment in the product documentation would be
18*a4fe36f1SMatthew Dillon      appreciated but is not required.
19*a4fe36f1SMatthew Dillon   2. Altered source versions must be plainly marked as such, and must not be
20*a4fe36f1SMatthew Dillon      misrepresented as being the original software.
21*a4fe36f1SMatthew Dillon   3. This notice may not be removed or altered from any source distribution.
22*a4fe36f1SMatthew Dillon 
23*a4fe36f1SMatthew Dillon   Mark Adler
24*a4fe36f1SMatthew Dillon   madler@alumni.caltech.edu
25*a4fe36f1SMatthew Dillon  */
26*a4fe36f1SMatthew Dillon 
27*a4fe36f1SMatthew Dillon /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
28*a4fe36f1SMatthew Dillon    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
29*a4fe36f1SMatthew Dillon    version is provided as a fall-back, as well as for speed comparisons. */
30*a4fe36f1SMatthew Dillon 
31*a4fe36f1SMatthew Dillon /* Version history:
32*a4fe36f1SMatthew Dillon    1.0  10 Feb 2013  First version
33*a4fe36f1SMatthew Dillon    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
34*a4fe36f1SMatthew Dillon  */
35*a4fe36f1SMatthew Dillon 
36*a4fe36f1SMatthew Dillon #include <stdio.h>
37*a4fe36f1SMatthew Dillon #include <stdlib.h>
38*a4fe36f1SMatthew Dillon #include <stdint.h>
39*a4fe36f1SMatthew Dillon #include <unistd.h>
40*a4fe36f1SMatthew Dillon #include <pthread.h>
41*a4fe36f1SMatthew Dillon 
42*a4fe36f1SMatthew Dillon /* CRC-32C (iSCSI) polynomial in reversed bit order. */
43*a4fe36f1SMatthew Dillon #define POLY 0x82f63b78
44*a4fe36f1SMatthew Dillon 
45*a4fe36f1SMatthew Dillon /* Table for a quadword-at-a-time software crc. */
46*a4fe36f1SMatthew Dillon static pthread_once_t crc32c_once_sw = PTHREAD_ONCE_INIT;
47*a4fe36f1SMatthew Dillon static uint32_t crc32c_table[8][256];
48*a4fe36f1SMatthew Dillon 
49*a4fe36f1SMatthew Dillon /* Construct table for software CRC-32C calculation. */
crc32c_init_sw(void)50*a4fe36f1SMatthew Dillon static void crc32c_init_sw(void)
51*a4fe36f1SMatthew Dillon {
52*a4fe36f1SMatthew Dillon     uint32_t n, crc, k;
53*a4fe36f1SMatthew Dillon 
54*a4fe36f1SMatthew Dillon     for (n = 0; n < 256; n++) {
55*a4fe36f1SMatthew Dillon         crc = n;
56*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
57*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
58*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
59*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
60*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
61*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
62*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
63*a4fe36f1SMatthew Dillon         crc = crc & 1 ? (crc >> 1) ^ POLY : crc >> 1;
64*a4fe36f1SMatthew Dillon         crc32c_table[0][n] = crc;
65*a4fe36f1SMatthew Dillon     }
66*a4fe36f1SMatthew Dillon     for (n = 0; n < 256; n++) {
67*a4fe36f1SMatthew Dillon         crc = crc32c_table[0][n];
68*a4fe36f1SMatthew Dillon         for (k = 1; k < 8; k++) {
69*a4fe36f1SMatthew Dillon             crc = crc32c_table[0][crc & 0xff] ^ (crc >> 8);
70*a4fe36f1SMatthew Dillon             crc32c_table[k][n] = crc;
71*a4fe36f1SMatthew Dillon         }
72*a4fe36f1SMatthew Dillon     }
73*a4fe36f1SMatthew Dillon }
74*a4fe36f1SMatthew Dillon 
75*a4fe36f1SMatthew Dillon /* Table-driven software version as a fall-back.  This is about 15 times slower
76*a4fe36f1SMatthew Dillon    than using the hardware instructions.  This assumes little-endian integers,
77*a4fe36f1SMatthew Dillon    as is the case on Intel processors that the assembler code here is for. */
crc32c_sw(uint32_t crci,const void * buf,size_t len)78*a4fe36f1SMatthew Dillon static uint32_t crc32c_sw(uint32_t crci, const void *buf, size_t len)
79*a4fe36f1SMatthew Dillon {
80*a4fe36f1SMatthew Dillon     const unsigned char *next = buf;
81*a4fe36f1SMatthew Dillon     uint64_t crc;
82*a4fe36f1SMatthew Dillon 
83*a4fe36f1SMatthew Dillon exit(1);
84*a4fe36f1SMatthew Dillon     pthread_once(&crc32c_once_sw, crc32c_init_sw);
85*a4fe36f1SMatthew Dillon     crc = crci ^ 0xffffffff;
86*a4fe36f1SMatthew Dillon     while (len && ((uintptr_t)next & 7) != 0) {
87*a4fe36f1SMatthew Dillon         crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
88*a4fe36f1SMatthew Dillon         len--;
89*a4fe36f1SMatthew Dillon     }
90*a4fe36f1SMatthew Dillon     while (len >= 8) {
91*a4fe36f1SMatthew Dillon         crc ^= *(uint64_t *)next;
92*a4fe36f1SMatthew Dillon         crc = crc32c_table[7][crc & 0xff] ^
93*a4fe36f1SMatthew Dillon               crc32c_table[6][(crc >> 8) & 0xff] ^
94*a4fe36f1SMatthew Dillon               crc32c_table[5][(crc >> 16) & 0xff] ^
95*a4fe36f1SMatthew Dillon               crc32c_table[4][(crc >> 24) & 0xff] ^
96*a4fe36f1SMatthew Dillon               crc32c_table[3][(crc >> 32) & 0xff] ^
97*a4fe36f1SMatthew Dillon               crc32c_table[2][(crc >> 40) & 0xff] ^
98*a4fe36f1SMatthew Dillon               crc32c_table[1][(crc >> 48) & 0xff] ^
99*a4fe36f1SMatthew Dillon               crc32c_table[0][crc >> 56];
100*a4fe36f1SMatthew Dillon         next += 8;
101*a4fe36f1SMatthew Dillon         len -= 8;
102*a4fe36f1SMatthew Dillon     }
103*a4fe36f1SMatthew Dillon     while (len) {
104*a4fe36f1SMatthew Dillon         crc = crc32c_table[0][(crc ^ *next++) & 0xff] ^ (crc >> 8);
105*a4fe36f1SMatthew Dillon         len--;
106*a4fe36f1SMatthew Dillon     }
107*a4fe36f1SMatthew Dillon     return (uint32_t)crc ^ 0xffffffff;
108*a4fe36f1SMatthew Dillon }
109*a4fe36f1SMatthew Dillon 
110*a4fe36f1SMatthew Dillon /* Multiply a matrix times a vector over the Galois field of two elements,
111*a4fe36f1SMatthew Dillon    GF(2).  Each element is a bit in an unsigned integer.  mat must have at
112*a4fe36f1SMatthew Dillon    least as many entries as the power of two for most significant one bit in
113*a4fe36f1SMatthew Dillon    vec. */
gf2_matrix_times(uint32_t * mat,uint32_t vec)114*a4fe36f1SMatthew Dillon static inline uint32_t gf2_matrix_times(uint32_t *mat, uint32_t vec)
115*a4fe36f1SMatthew Dillon {
116*a4fe36f1SMatthew Dillon     uint32_t sum;
117*a4fe36f1SMatthew Dillon 
118*a4fe36f1SMatthew Dillon     sum = 0;
119*a4fe36f1SMatthew Dillon     while (vec) {
120*a4fe36f1SMatthew Dillon         if (vec & 1)
121*a4fe36f1SMatthew Dillon             sum ^= *mat;
122*a4fe36f1SMatthew Dillon         vec >>= 1;
123*a4fe36f1SMatthew Dillon         mat++;
124*a4fe36f1SMatthew Dillon     }
125*a4fe36f1SMatthew Dillon     return sum;
126*a4fe36f1SMatthew Dillon }
127*a4fe36f1SMatthew Dillon 
128*a4fe36f1SMatthew Dillon /* Multiply a matrix by itself over GF(2).  Both mat and square must have 32
129*a4fe36f1SMatthew Dillon    rows. */
gf2_matrix_square(uint32_t * square,uint32_t * mat)130*a4fe36f1SMatthew Dillon static inline void gf2_matrix_square(uint32_t *square, uint32_t *mat)
131*a4fe36f1SMatthew Dillon {
132*a4fe36f1SMatthew Dillon     int n;
133*a4fe36f1SMatthew Dillon 
134*a4fe36f1SMatthew Dillon     for (n = 0; n < 32; n++)
135*a4fe36f1SMatthew Dillon         square[n] = gf2_matrix_times(mat, mat[n]);
136*a4fe36f1SMatthew Dillon }
137*a4fe36f1SMatthew Dillon 
138*a4fe36f1SMatthew Dillon /* Construct an operator to apply len zeros to a crc.  len must be a power of
139*a4fe36f1SMatthew Dillon    two.  If len is not a power of two, then the result is the same as for the
140*a4fe36f1SMatthew Dillon    largest power of two less than len.  The result for len == 0 is the same as
141*a4fe36f1SMatthew Dillon    for len == 1.  A version of this routine could be easily written for any
142*a4fe36f1SMatthew Dillon    len, but that is not needed for this application. */
crc32c_zeros_op(uint32_t * even,size_t len)143*a4fe36f1SMatthew Dillon static void crc32c_zeros_op(uint32_t *even, size_t len)
144*a4fe36f1SMatthew Dillon {
145*a4fe36f1SMatthew Dillon     int n;
146*a4fe36f1SMatthew Dillon     uint32_t row;
147*a4fe36f1SMatthew Dillon     uint32_t odd[32];       /* odd-power-of-two zeros operator */
148*a4fe36f1SMatthew Dillon 
149*a4fe36f1SMatthew Dillon     /* put operator for one zero bit in odd */
150*a4fe36f1SMatthew Dillon     odd[0] = POLY;              /* CRC-32C polynomial */
151*a4fe36f1SMatthew Dillon     row = 1;
152*a4fe36f1SMatthew Dillon     for (n = 1; n < 32; n++) {
153*a4fe36f1SMatthew Dillon         odd[n] = row;
154*a4fe36f1SMatthew Dillon         row <<= 1;
155*a4fe36f1SMatthew Dillon     }
156*a4fe36f1SMatthew Dillon 
157*a4fe36f1SMatthew Dillon     /* put operator for two zero bits in even */
158*a4fe36f1SMatthew Dillon     gf2_matrix_square(even, odd);
159*a4fe36f1SMatthew Dillon 
160*a4fe36f1SMatthew Dillon     /* put operator for four zero bits in odd */
161*a4fe36f1SMatthew Dillon     gf2_matrix_square(odd, even);
162*a4fe36f1SMatthew Dillon 
163*a4fe36f1SMatthew Dillon     /* first square will put the operator for one zero byte (eight zero bits),
164*a4fe36f1SMatthew Dillon        in even -- next square puts operator for two zero bytes in odd, and so
165*a4fe36f1SMatthew Dillon        on, until len has been rotated down to zero */
166*a4fe36f1SMatthew Dillon     do {
167*a4fe36f1SMatthew Dillon         gf2_matrix_square(even, odd);
168*a4fe36f1SMatthew Dillon         len >>= 1;
169*a4fe36f1SMatthew Dillon         if (len == 0)
170*a4fe36f1SMatthew Dillon             return;
171*a4fe36f1SMatthew Dillon         gf2_matrix_square(odd, even);
172*a4fe36f1SMatthew Dillon         len >>= 1;
173*a4fe36f1SMatthew Dillon     } while (len);
174*a4fe36f1SMatthew Dillon 
175*a4fe36f1SMatthew Dillon     /* answer ended up in odd -- copy to even */
176*a4fe36f1SMatthew Dillon     for (n = 0; n < 32; n++)
177*a4fe36f1SMatthew Dillon         even[n] = odd[n];
178*a4fe36f1SMatthew Dillon }
179*a4fe36f1SMatthew Dillon 
180*a4fe36f1SMatthew Dillon /* Take a length and build four lookup tables for applying the zeros operator
181*a4fe36f1SMatthew Dillon    for that length, byte-by-byte on the operand. */
crc32c_zeros(uint32_t zeros[][256],size_t len)182*a4fe36f1SMatthew Dillon static void crc32c_zeros(uint32_t zeros[][256], size_t len)
183*a4fe36f1SMatthew Dillon {
184*a4fe36f1SMatthew Dillon     uint32_t n;
185*a4fe36f1SMatthew Dillon     uint32_t op[32];
186*a4fe36f1SMatthew Dillon 
187*a4fe36f1SMatthew Dillon     crc32c_zeros_op(op, len);
188*a4fe36f1SMatthew Dillon     for (n = 0; n < 256; n++) {
189*a4fe36f1SMatthew Dillon         zeros[0][n] = gf2_matrix_times(op, n);
190*a4fe36f1SMatthew Dillon         zeros[1][n] = gf2_matrix_times(op, n << 8);
191*a4fe36f1SMatthew Dillon         zeros[2][n] = gf2_matrix_times(op, n << 16);
192*a4fe36f1SMatthew Dillon         zeros[3][n] = gf2_matrix_times(op, n << 24);
193*a4fe36f1SMatthew Dillon     }
194*a4fe36f1SMatthew Dillon }
195*a4fe36f1SMatthew Dillon 
196*a4fe36f1SMatthew Dillon /* Apply the zeros operator table to crc. */
crc32c_shift(uint32_t zeros[][256],uint32_t crc)197*a4fe36f1SMatthew Dillon static inline uint32_t crc32c_shift(uint32_t zeros[][256], uint32_t crc)
198*a4fe36f1SMatthew Dillon {
199*a4fe36f1SMatthew Dillon     return zeros[0][crc & 0xff] ^ zeros[1][(crc >> 8) & 0xff] ^
200*a4fe36f1SMatthew Dillon            zeros[2][(crc >> 16) & 0xff] ^ zeros[3][crc >> 24];
201*a4fe36f1SMatthew Dillon }
202*a4fe36f1SMatthew Dillon 
203*a4fe36f1SMatthew Dillon /* Block sizes for three-way parallel crc computation.  LONG and SHORT must
204*a4fe36f1SMatthew Dillon    both be powers of two.  The associated string constants must be set
205*a4fe36f1SMatthew Dillon    accordingly, for use in constructing the assembler instructions. */
206*a4fe36f1SMatthew Dillon #define LONG 8192
207*a4fe36f1SMatthew Dillon #define LONGx1 "8192"
208*a4fe36f1SMatthew Dillon #define LONGx2 "16384"
209*a4fe36f1SMatthew Dillon #define SHORT 256
210*a4fe36f1SMatthew Dillon #define SHORTx1 "256"
211*a4fe36f1SMatthew Dillon #define SHORTx2 "512"
212*a4fe36f1SMatthew Dillon 
213*a4fe36f1SMatthew Dillon /* Tables for hardware crc that shift a crc by LONG and SHORT zeros. */
214*a4fe36f1SMatthew Dillon static pthread_once_t crc32c_once_hw = PTHREAD_ONCE_INIT;
215*a4fe36f1SMatthew Dillon static uint32_t crc32c_long[4][256];
216*a4fe36f1SMatthew Dillon static uint32_t crc32c_short[4][256];
217*a4fe36f1SMatthew Dillon 
218*a4fe36f1SMatthew Dillon /* Initialize tables for shifting crcs. */
crc32c_init_hw(void)219*a4fe36f1SMatthew Dillon static void crc32c_init_hw(void)
220*a4fe36f1SMatthew Dillon {
221*a4fe36f1SMatthew Dillon     crc32c_zeros(crc32c_long, LONG);
222*a4fe36f1SMatthew Dillon     crc32c_zeros(crc32c_short, SHORT);
223*a4fe36f1SMatthew Dillon }
224*a4fe36f1SMatthew Dillon 
225*a4fe36f1SMatthew Dillon /* Compute CRC-32C using the Intel hardware instruction. */
crc32c_hw(uint32_t crc,const void * buf,size_t len)226*a4fe36f1SMatthew Dillon static uint32_t crc32c_hw(uint32_t crc, const void *buf, size_t len)
227*a4fe36f1SMatthew Dillon {
228*a4fe36f1SMatthew Dillon     const unsigned char *next = buf;
229*a4fe36f1SMatthew Dillon     const unsigned char *end;
230*a4fe36f1SMatthew Dillon     uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */
231*a4fe36f1SMatthew Dillon 
232*a4fe36f1SMatthew Dillon     /* populate shift tables the first time through */
233*a4fe36f1SMatthew Dillon     pthread_once(&crc32c_once_hw, crc32c_init_hw);
234*a4fe36f1SMatthew Dillon 
235*a4fe36f1SMatthew Dillon     /* pre-process the crc */
236*a4fe36f1SMatthew Dillon     crc0 = crc ^ 0xffffffff;
237*a4fe36f1SMatthew Dillon 
238*a4fe36f1SMatthew Dillon     /* compute the crc for up to seven leading bytes to bring the data pointer
239*a4fe36f1SMatthew Dillon        to an eight-byte boundary */
240*a4fe36f1SMatthew Dillon     while (len && ((uintptr_t)next & 7) != 0) {
241*a4fe36f1SMatthew Dillon         __asm__("crc32b\t" "(%1), %0"
242*a4fe36f1SMatthew Dillon                 : "=r"(crc0)
243*a4fe36f1SMatthew Dillon                 : "r"(next), "0"(crc0));
244*a4fe36f1SMatthew Dillon         next++;
245*a4fe36f1SMatthew Dillon         len--;
246*a4fe36f1SMatthew Dillon     }
247*a4fe36f1SMatthew Dillon 
248*a4fe36f1SMatthew Dillon     /* compute the crc on sets of LONG*3 bytes, executing three independent crc
249*a4fe36f1SMatthew Dillon        instructions, each on LONG bytes -- this is optimized for the Nehalem,
250*a4fe36f1SMatthew Dillon        Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
251*a4fe36f1SMatthew Dillon        throughput of one crc per cycle, but a latency of three cycles */
252*a4fe36f1SMatthew Dillon     while (len >= LONG*3) {
253*a4fe36f1SMatthew Dillon         crc1 = 0;
254*a4fe36f1SMatthew Dillon         crc2 = 0;
255*a4fe36f1SMatthew Dillon         end = next + LONG;
256*a4fe36f1SMatthew Dillon         do {
257*a4fe36f1SMatthew Dillon             __asm__("crc32q\t" "(%3), %0\n\t"
258*a4fe36f1SMatthew Dillon                     "crc32q\t" LONGx1 "(%3), %1\n\t"
259*a4fe36f1SMatthew Dillon                     "crc32q\t" LONGx2 "(%3), %2"
260*a4fe36f1SMatthew Dillon                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
261*a4fe36f1SMatthew Dillon                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
262*a4fe36f1SMatthew Dillon             next += 8;
263*a4fe36f1SMatthew Dillon         } while (next < end);
264*a4fe36f1SMatthew Dillon         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc1;
265*a4fe36f1SMatthew Dillon         crc0 = crc32c_shift(crc32c_long, crc0) ^ crc2;
266*a4fe36f1SMatthew Dillon         next += LONG*2;
267*a4fe36f1SMatthew Dillon         len -= LONG*3;
268*a4fe36f1SMatthew Dillon     }
269*a4fe36f1SMatthew Dillon 
270*a4fe36f1SMatthew Dillon     /* do the same thing, but now on SHORT*3 blocks for the remaining data less
271*a4fe36f1SMatthew Dillon        than a LONG*3 block */
272*a4fe36f1SMatthew Dillon     while (len >= SHORT*3) {
273*a4fe36f1SMatthew Dillon         crc1 = 0;
274*a4fe36f1SMatthew Dillon         crc2 = 0;
275*a4fe36f1SMatthew Dillon         end = next + SHORT;
276*a4fe36f1SMatthew Dillon         do {
277*a4fe36f1SMatthew Dillon             __asm__("crc32q\t" "(%3), %0\n\t"
278*a4fe36f1SMatthew Dillon                     "crc32q\t" SHORTx1 "(%3), %1\n\t"
279*a4fe36f1SMatthew Dillon                     "crc32q\t" SHORTx2 "(%3), %2"
280*a4fe36f1SMatthew Dillon                     : "=r"(crc0), "=r"(crc1), "=r"(crc2)
281*a4fe36f1SMatthew Dillon                     : "r"(next), "0"(crc0), "1"(crc1), "2"(crc2));
282*a4fe36f1SMatthew Dillon             next += 8;
283*a4fe36f1SMatthew Dillon         } while (next < end);
284*a4fe36f1SMatthew Dillon         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc1;
285*a4fe36f1SMatthew Dillon         crc0 = crc32c_shift(crc32c_short, crc0) ^ crc2;
286*a4fe36f1SMatthew Dillon         next += SHORT*2;
287*a4fe36f1SMatthew Dillon         len -= SHORT*3;
288*a4fe36f1SMatthew Dillon     }
289*a4fe36f1SMatthew Dillon 
290*a4fe36f1SMatthew Dillon     /* compute the crc on the remaining eight-byte units less than a SHORT*3
291*a4fe36f1SMatthew Dillon        block */
292*a4fe36f1SMatthew Dillon     end = next + (len - (len & 7));
293*a4fe36f1SMatthew Dillon     while (next < end) {
294*a4fe36f1SMatthew Dillon         __asm__("crc32q\t" "(%1), %0"
295*a4fe36f1SMatthew Dillon                 : "=r"(crc0)
296*a4fe36f1SMatthew Dillon                 : "r"(next), "0"(crc0));
297*a4fe36f1SMatthew Dillon         next += 8;
298*a4fe36f1SMatthew Dillon     }
299*a4fe36f1SMatthew Dillon     len &= 7;
300*a4fe36f1SMatthew Dillon 
301*a4fe36f1SMatthew Dillon     /* compute the crc for up to seven trailing bytes */
302*a4fe36f1SMatthew Dillon     while (len) {
303*a4fe36f1SMatthew Dillon         __asm__("crc32b\t" "(%1), %0"
304*a4fe36f1SMatthew Dillon                 : "=r"(crc0)
305*a4fe36f1SMatthew Dillon                 : "r"(next), "0"(crc0));
306*a4fe36f1SMatthew Dillon         next++;
307*a4fe36f1SMatthew Dillon         len--;
308*a4fe36f1SMatthew Dillon     }
309*a4fe36f1SMatthew Dillon 
310*a4fe36f1SMatthew Dillon     /* return a post-processed crc */
311*a4fe36f1SMatthew Dillon     return (uint32_t)crc0 ^ 0xffffffff;
312*a4fe36f1SMatthew Dillon }
313*a4fe36f1SMatthew Dillon 
314*a4fe36f1SMatthew Dillon /* Check for SSE 4.2.  SSE 4.2 was first supported in Nehalem processors
315*a4fe36f1SMatthew Dillon    introduced in November, 2008.  This does not check for the existence of the
316*a4fe36f1SMatthew Dillon    cpuid instruction itself, which was introduced on the 486SL in 1992, so this
317*a4fe36f1SMatthew Dillon    will fail on earlier x86 processors.  cpuid works on all Pentium and later
318*a4fe36f1SMatthew Dillon    processors. */
319*a4fe36f1SMatthew Dillon #define SSE42(have) \
320*a4fe36f1SMatthew Dillon     do { \
321*a4fe36f1SMatthew Dillon         uint32_t eax, ecx; \
322*a4fe36f1SMatthew Dillon         eax = 1; \
323*a4fe36f1SMatthew Dillon         __asm__("cpuid" \
324*a4fe36f1SMatthew Dillon                 : "=c"(ecx) \
325*a4fe36f1SMatthew Dillon                 : "a"(eax) \
326*a4fe36f1SMatthew Dillon                 : "%ebx", "%edx"); \
327*a4fe36f1SMatthew Dillon         (have) = (ecx >> 20) & 1; \
328*a4fe36f1SMatthew Dillon     } while (0)
329*a4fe36f1SMatthew Dillon 
330*a4fe36f1SMatthew Dillon /* Compute a CRC-32C.  If the crc32 instruction is available, use the hardware
331*a4fe36f1SMatthew Dillon    version.  Otherwise, use the software version. */
crc32c(uint32_t crc,const void * buf,size_t len)332*a4fe36f1SMatthew Dillon uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
333*a4fe36f1SMatthew Dillon {
334*a4fe36f1SMatthew Dillon     int sse42;
335*a4fe36f1SMatthew Dillon 
336*a4fe36f1SMatthew Dillon     SSE42(sse42);
337*a4fe36f1SMatthew Dillon     return sse42 ? crc32c_hw(crc, buf, len) : crc32c_sw(crc, buf, len);
338*a4fe36f1SMatthew Dillon }
339*a4fe36f1SMatthew Dillon 
340*a4fe36f1SMatthew Dillon #ifdef TEST
341*a4fe36f1SMatthew Dillon 
342*a4fe36f1SMatthew Dillon #define SIZE (262144*3)
343*a4fe36f1SMatthew Dillon #define CHUNK SIZE
344*a4fe36f1SMatthew Dillon 
main(int argc,char ** argv)345*a4fe36f1SMatthew Dillon int main(int argc, char **argv)
346*a4fe36f1SMatthew Dillon {
347*a4fe36f1SMatthew Dillon     char *buf;
348*a4fe36f1SMatthew Dillon     ssize_t got;
349*a4fe36f1SMatthew Dillon     size_t off, n;
350*a4fe36f1SMatthew Dillon     uint32_t crc;
351*a4fe36f1SMatthew Dillon 
352*a4fe36f1SMatthew Dillon     (void)argv;
353*a4fe36f1SMatthew Dillon     crc = 0;
354*a4fe36f1SMatthew Dillon     buf = malloc(SIZE);
355*a4fe36f1SMatthew Dillon     if (buf == NULL) {
356*a4fe36f1SMatthew Dillon         fputs("out of memory", stderr);
357*a4fe36f1SMatthew Dillon         return 1;
358*a4fe36f1SMatthew Dillon     }
359*a4fe36f1SMatthew Dillon     while ((got = read(0, buf, SIZE)) > 0) {
360*a4fe36f1SMatthew Dillon         off = 0;
361*a4fe36f1SMatthew Dillon         do {
362*a4fe36f1SMatthew Dillon             n = (size_t)got - off;
363*a4fe36f1SMatthew Dillon             if (n > CHUNK)
364*a4fe36f1SMatthew Dillon                 n = CHUNK;
365*a4fe36f1SMatthew Dillon             crc = argc > 1 ? crc32c_sw(crc, buf + off, n) :
366*a4fe36f1SMatthew Dillon                              crc32c(crc, buf + off, n);
367*a4fe36f1SMatthew Dillon             off += n;
368*a4fe36f1SMatthew Dillon         } while (off < (size_t)got);
369*a4fe36f1SMatthew Dillon     }
370*a4fe36f1SMatthew Dillon     free(buf);
371*a4fe36f1SMatthew Dillon     if (got == -1) {
372*a4fe36f1SMatthew Dillon         fputs("read error\n", stderr);
373*a4fe36f1SMatthew Dillon         return 1;
374*a4fe36f1SMatthew Dillon     }
375*a4fe36f1SMatthew Dillon     printf("%08x\n", crc);
376*a4fe36f1SMatthew Dillon     return 0;
377*a4fe36f1SMatthew Dillon }
378*a4fe36f1SMatthew Dillon 
379*a4fe36f1SMatthew Dillon #endif /* TEST */
380