1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2  * Copyright (C) 2013 Mark Adler
3  * Version 1.1  1 Aug 2013  Mark Adler
4  *
5  * Adapted by Rodrigo Tobar for inclusion in the crc32c python package
6  */
7 
8 /*
9   This software is provided 'as-is', without any express or implied
10   warranty.  In no event will the author be held liable for any damages
11   arising from the use of this software.
12 
13   Permission is granted to anyone to use this software for any purpose,
14   including commercial applications, and to alter it and redistribute it
15   freely, subject to the following restrictions:
16 
17   1. The origin of this software must not be misrepresented; you must not
18      claim that you wrote the original software. If you use this software
19      in a product, an acknowledgment in the product documentation would be
20      appreciated but is not required.
21   2. Altered source versions must be plainly marked as such, and must not be
22      misrepresented as being the original software.
23   3. This notice may not be removed or altered from any source distribution.
24 
25   Mark Adler
26   madler@alumni.caltech.edu
27  */
28 
29 /* Use hardware CRC instruction on Intel SSE 4.2 processors.  This computes a
30    CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc.  A software
31    version is provided as a fall-back, as well as for speed comparisons. */
32 
33 /* Version history:
34    1.0  10 Feb 2013  First version
35    1.1   1 Aug 2013  Correct comments on why three crc instructions in parallel
36  */
37 
38 /* Altered version
39  * This version modified to fit into the benchmarking code retrieved from
40  * http://www.evanjones.ca/crc32c.html
41  * 1.2  20 Mar 2016  Ferry Toth - Fit into benchmarking
42  * 1.3  07 May 2016  Ferry Toth - Applied some speed ups by putting more CRC32 in the short and long loop
43  *                              - Moved crc32q into macro's and put alternative code there for 32bit operation
44 */
45 
46 #if defined(IS_INTEL)
47 
48 #include "common.h"
49 
50 /*
51  * MSVC/icc don't have __builtin_ia32_crc32_* functions. Instead they have
52  * the _mm_crc32_* intrinsics, which accomplish the same at the end of the day
53  */
54 #if defined(_MSC_VER) || defined(__ICC)
55 # include <nmmintrin.h>
56 # define __builtin_ia32_crc32qi _mm_crc32_u8
57 # define __builtin_ia32_crc32hi _mm_crc32_u16
58 # define __builtin_ia32_crc32si _mm_crc32_u32
59 # define __builtin_ia32_crc32di _mm_crc32_u64
60 #endif /* defined(_MSC_VER) || defined(__ICC) */
61 
62 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
63 #define POLY 0x82f63b78
64 
/* Return the product of a bit-matrix and a bit-vector over GF(2).

   Each row of mat and the vector vec are bit sets packed into a uint32_t.
   The result is the XOR of every row mat[k] for which bit k of vec is set.
   Rows are only visited up to the most significant set bit of vec, so mat
   needs at least that many entries. */
static CRC32C_INLINE uint32_t gf2_matrix_times ( uint32_t *mat, uint32_t vec )
{
        uint32_t product = 0;
        uint32_t *row;

        /* consume vec one bit at a time, lowest bit first, stepping the row
           cursor in lock-step; stop as soon as no set bits remain */
        for ( row = mat; vec != 0; vec >>= 1, row++ ) {
                if ( ( vec & 1u ) != 0 ) {
                        product ^= *row;
                }
        }
        return product;
}
82 
/* Square a 32x32 bit-matrix over GF(2): square = mat * mat.

   Row n of the result is mat applied to row n of mat.  Both square and mat
   must provide 32 rows. */
static CRC32C_INLINE void gf2_matrix_square ( uint32_t *square, uint32_t *mat )
{
        uint32_t *dst = square;
        uint32_t *src = mat;
        uint32_t *const stop = mat + 32;

        while ( src != stop ) {
                uint32_t row = *src++;

                *dst++ = gf2_matrix_times ( mat, row );
        }
}
92 
93 /* Construct an operator to apply len zeros to a crc.  len must be a power of
94    two.  If len is not a power of two, then the result is the same as for the
95    largest power of two less than len.  The result for len == 0 is the same as
96    for len == 1.  A version of this routine could be easily written for any
97    len, but that is not needed for this application. */
crc32c_zeros_op(uint32_t * even,size_t len)98 static void crc32c_zeros_op ( uint32_t *even, size_t len )
99 {
100         int n;
101         uint32_t row;
102         uint32_t odd[32];       /* odd-power-of-two zeros operator */
103 
104         /* put operator for one zero bit in odd */
105         odd[0] = POLY;              /* CRC-32C polynomial */
106         row = 1;
107         for ( n = 1; n < 32; n++ ) {
108                 odd[n] = row;
109                 row <<= 1;
110         }
111 
112         /* put operator for two zero bits in even */
113         gf2_matrix_square ( even, odd );
114 
115         /* put operator for four zero bits in odd */
116         gf2_matrix_square ( odd, even );
117 
118         /* first square will put the operator for one zero byte (eight zero bits),
119            in even -- next square puts operator for two zero bytes in odd, and so
120            on, until len has been rotated down to zero */
121         do {
122                 gf2_matrix_square ( even, odd );
123                 len >>= 1;
124                 if ( len == 0 )
125                         return;
126                 gf2_matrix_square ( odd, even );
127                 len >>= 1;
128         } while ( len );
129 
130         /* answer ended up in odd -- copy to even */
131         for ( n = 0; n < 32; n++ )
132                 even[n] = odd[n];
133 }
134 
/* Fill four 256-entry lookup tables that together apply the len-zero-bytes
   operator to a 32-bit crc one byte at a time: zeros[k][b] holds the operator
   applied to the value whose byte k (counting from the least significant
   byte) is b and whose other bytes are zero. */
static void crc32c_zeros ( uint32_t zeros[][256], size_t len )
{
        uint32_t op[32];
        uint32_t byte;
        unsigned int lane;

        crc32c_zeros_op ( op, len );

        for ( lane = 0; lane < 4; lane++ ) {
                for ( byte = 0; byte < 256; byte++ ) {
                        zeros[lane][byte] = gf2_matrix_times ( op, byte << ( 8 * lane ) );
                }
        }
}
150 
151 
/* Apply a zeros-operator table set to crc: split crc into its four bytes,
   look each one up in the table for its byte position, and XOR the four
   partial results together. */
static CRC32C_INLINE uint32_t crc32c_shift ( uint32_t zeros[][256], uint32_t crc )
{
        const uint32_t b0 = crc & 0xff;
        const uint32_t b1 = ( crc >> 8 ) & 0xff;
        const uint32_t b2 = ( crc >> 16 ) & 0xff;
        const uint32_t b3 = crc >> 24;

        return zeros[0][b0] ^ zeros[1][b1] ^ zeros[2][b2] ^ zeros[3][b3];
}
158 
/* Block sizes, in bytes, for the three-way parallel crc computation.

   LONG and SHORT must both be powers of two, because crc32c_zeros_op() only
   builds operators for power-of-two lengths.  They must also be at least 32,
   because the block loops in _crc32c_hw_adler() consume 32 bytes of each
   stream per iteration.  Both requirements are checked at compile time
   below.

   LONGx1/LONGx2 and SHORTx1/SHORTx2 are the decimal strings for one and two
   times the block size, originally intended for constructing assembler
   instructions.  No code visible in this file uses them, but they must be
   kept in step with LONG and SHORT if either is changed. */
#define LONG 8192
#define LONGx1 "8192"
#define LONGx2 "16384"
#define SHORT 256
#define SHORTx1 "256"
#define SHORTx2 "512"

#if ( LONG & ( LONG - 1 ) ) != 0 || LONG < 32
# error "LONG must be a power of two and at least 32"
#endif
#if ( SHORT & ( SHORT - 1 ) ) != 0 || SHORT < 32
# error "SHORT must be a power of two and at least 32"
#endif

/* Lookup tables for the hardware crc that shift a crc by LONG and by SHORT
   zero bytes.  They start out zeroed and are filled in by
   crc32c_init_hw_adler(). */
static uint32_t crc32c_long[4][256];
static uint32_t crc32c_short[4][256];
172 
173 /* Initialize tables for shifting crcs. */
crc32c_init_hw_adler(void)174 void crc32c_init_hw_adler( void )
175 {
176         crc32c_zeros ( crc32c_long, LONG );
177         crc32c_zeros ( crc32c_short, SHORT );
178 }
179 
/* CRCtriplet(crc, buf, size, i): advance three independent crc streams by
   eight bytes each.

   crc is a name prefix: the macro updates the variables crc0, crc1 and crc2
   (formed by token pasting).  Stream k reads its eight bytes at
   buf + i + k * size.  buf is a byte pointer and is not modified.

   Without CRC32C_IS_64_BITS each stream is fed two 32-bit words; the low
   words of all three streams are processed before the high words so the
   three crc instructions stay interleaved.  With CRC32C_IS_64_BITS a single
   64-bit instruction per stream is used.

   The expansion is one statement (do { } while (0)) so the macro is safe in
   unbraced if/else bodies, and every argument is parenthesized so that
   expressions such as "a + b" may be passed for size or i. */
#ifndef CRC32C_IS_64_BITS
#define CRCtriplet(crc, buf, size, i) \
    do { \
        crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *(const uint32_t *) ((buf) + (i))); \
        crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *(const uint32_t *) ((buf) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *(const uint32_t *) ((buf) + (i) + 2 * (size))); \
        crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i))); \
        crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i) + 2 * (size))); \
    } while (0)
#else
#define CRCtriplet(crc, buf, size, i) \
    do { \
        crc ## 0 = __builtin_ia32_crc32di(crc ## 0, *(const uint64_t *) ((buf) + (i))); \
        crc ## 1 = __builtin_ia32_crc32di(crc ## 1, *(const uint64_t *) ((buf) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32di(crc ## 2, *(const uint64_t *) ((buf) + (i) + 2 * (size))); \
    } while (0)
#endif
194 
195 
/* CRCsinglet(crc, buf): fold the eight bytes at buf into crc and advance buf
   by eight bytes.

   crc and buf must both be modifiable lvalues; buf is a byte pointer.
   Without CRC32C_IS_64_BITS the eight bytes are consumed as two 32-bit
   words, otherwise as one 64-bit word.

   The expansion is one statement (do { } while (0)) with parenthesized
   arguments, so it can be used in unbraced if/else bodies and between case
   labels. */
#ifndef CRC32C_IS_64_BITS
#define CRCsinglet(crc, buf) \
    do { \
        (crc) = __builtin_ia32_crc32si((crc), *(const uint32_t *) (buf)); \
        (crc) = __builtin_ia32_crc32si((crc), *(const uint32_t *) ((buf) + sizeof(uint32_t))); \
        (buf) += 2 * sizeof(uint32_t); \
    } while (0)
#else
#define CRCsinglet(crc, buf) \
    do { \
        (crc) = __builtin_ia32_crc32di((crc), *(const uint64_t *) (buf)); \
        (buf) += sizeof(uint64_t); \
    } while (0)
#endif
204 
205 /* Compute CRC-32C using the Intel hardware instruction. */
_crc32c_hw_adler(uint32_t crc,const unsigned char * buf,unsigned long len)206 uint32_t _crc32c_hw_adler(uint32_t crc, const unsigned char *buf, unsigned long len)
207 {
208         const unsigned char *next = buf;
209         const unsigned char *end;
210         unsigned short count;
211 
212 #ifndef CRC32C_IS_64_BITS
213         uint32_t crc0, crc1, crc2;
214 #else
215         uint64_t crc0, crc1, crc2;      /* need to be 64 bits for crc32q */
216 #endif
217         uint32_t crc32bit;
218 
219         crc32bit = crc;
220         // in len > 256 compute the crc for up to seven leading bytes to bring the data pointer to an eight-byte boundary
221         if ( len > 128 ) {
222                 unsigned char align = ( 8 - ( uintptr_t ) next ) % 8;            // byte to boundary
223                 len -= align;
224                 if ( ( align % 2 ) != 0 ) crc32bit = __builtin_ia32_crc32qi ( crc32bit, *next );
225                 next += align;
226                 switch ( align / 2 ) {
227                 case 3:
228                         crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 6 ) ); // 6 char, remain 4
229                 case 2:
230                         crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) ( next - 4 ) ); // 4 char, remain 0
231                         break;
232                 case 1:
233                         crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 2 ) ); // 2 char, remain 0
234                 case 0:
235                         break;
236                 }
237         };
238 
239         /* compute the crc on sets of LONG*3 bytes, executing three independent crc
240            instructions, each on LONG bytes -- this is optimized for the Nehalem,
241            Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
242            throughput of one crc per cycle, but a latency of three cycles */
243 
244         crc0 = crc32bit;
245         while ( len >= LONG*3 ) {
246                 crc1 = 0;
247                 crc2 = 0;
248                 end = next + LONG;
249                 do {
250                         CRCtriplet ( crc, next, LONG, 0 );
251                         CRCtriplet ( crc, next, LONG, 8 );
252                         CRCtriplet ( crc, next, LONG, 16 );
253                         CRCtriplet ( crc, next, LONG, 24 );
254                         next += 32;
255                 } while ( next < end );
256                 crc0 = crc32c_shift ( crc32c_long, (uint32_t)crc0 ) ^ crc1;
257                 crc0 = crc32c_shift ( crc32c_long, (uint32_t)crc0 ) ^ crc2;
258                 next += LONG*2;
259                 len -= LONG*3;
260         }
261 
262         /* do the same thing, but now on SHORT*3 blocks for the remaining data less
263            than a LONG*3 block */
264         while ( len >= SHORT*3 ) {
265                 crc1 = 0;
266                 crc2 = 0;
267                 end = next + SHORT;
268                 do {
269                         CRCtriplet ( crc, next, SHORT, 0 );
270                         CRCtriplet ( crc, next, SHORT, 8 );
271                         CRCtriplet ( crc, next, SHORT, 16 );
272                         CRCtriplet ( crc, next, SHORT, 24 );
273                         next += 32;
274                 } while ( next < end );
275                 crc0 = crc32c_shift ( crc32c_short, (uint32_t)crc0 ) ^ crc1;
276                 crc0 = crc32c_shift ( crc32c_short, (uint32_t)crc0 ) ^ crc2;
277                 next += SHORT*2;
278                 len -= SHORT*3;
279         }
280 
281         /* compute the crc on the remaining eight-byte units less than a SHORT*3
282            block */
283 
284         // use Duff's device, a for() loop inside a switch() statement. This is Legal
285         if ( ( count = ( len - ( len & 7 ) ) ) >= 8 ) { // needs to execute crc at least once
286                 unsigned short n;
287                 len -= count;
288                 count /= 8;                        // count number of crc32di
289                 n = ( count + 15 ) / 16;
290                 switch ( count % 16 ) {
291                 case 0:
292                         do {
293                                 CRCsinglet ( crc0, next );
294                         case 15:
295                                 CRCsinglet ( crc0, next );
296                         case 14:
297                                 CRCsinglet ( crc0, next );
298                         case 13:
299                                 CRCsinglet ( crc0, next );
300                         case 12:
301                                 CRCsinglet ( crc0, next );
302                         case 11:
303                                 CRCsinglet ( crc0, next );
304                         case 10:
305                                 CRCsinglet ( crc0, next );
306                         case 9:
307                                 CRCsinglet ( crc0, next );
308                         case 8:
309                                 CRCsinglet ( crc0, next );
310                         case 7:
311                                 CRCsinglet ( crc0, next );
312                         case 6:
313                                 CRCsinglet ( crc0, next );
314                         case 5:
315                                 CRCsinglet ( crc0, next );
316                         case 4:
317                                 CRCsinglet ( crc0, next );
318                         case 3:
319                                 CRCsinglet ( crc0, next );
320                         case 2:
321                                 CRCsinglet ( crc0, next );
322                         case 1:
323                                 CRCsinglet ( crc0, next );
324                         } while ( --n > 0 );
325                 }
326         };
327 
328         /* compute the crc for up to seven trailing bytes */
329         crc32bit = (uint32_t)crc0;
330         if ( ( len % 2 ) != 0 ) crc32bit = __builtin_ia32_crc32qi ( crc32bit, * ( next ) ); // 1 char, remain even
331         next += len;
332         switch ( len / 2 ) {
333         case 3:
334                 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 6 ) ); // 2 char, remain 4
335         case 2:
336                 crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) ( next - 4 ) ); // 4 char, remain 0
337                 break;
338         case 1:
339                 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 2 ) ); // 2 char, remain 0
340                 break;
341         case 0:
342                 break;
343         }
344         return ( uint32_t ) crc32bit;
345 }
346 
347 #endif // defined(IS_INTEL)