1 /* crc32c.c -- compute CRC-32C using the Intel crc32 instruction
2 * Copyright (C) 2013 Mark Adler
3 * Version 1.1 1 Aug 2013 Mark Adler
4 *
5 * Adapted by Rodrigo Tobar for inclusion in the crc32c python package
6 */
7
8 /*
9 This software is provided 'as-is', without any express or implied
10 warranty. In no event will the author be held liable for any damages
11 arising from the use of this software.
12
13 Permission is granted to anyone to use this software for any purpose,
14 including commercial applications, and to alter it and redistribute it
15 freely, subject to the following restrictions:
16
17 1. The origin of this software must not be misrepresented; you must not
18 claim that you wrote the original software. If you use this software
19 in a product, an acknowledgment in the product documentation would be
20 appreciated but is not required.
21 2. Altered source versions must be plainly marked as such, and must not be
22 misrepresented as being the original software.
23 3. This notice may not be removed or altered from any source distribution.
24
25 Mark Adler
26 madler@alumni.caltech.edu
27 */
28
29 /* Use hardware CRC instruction on Intel SSE 4.2 processors. This computes a
30 CRC-32C, *not* the CRC-32 used by Ethernet and zip, gzip, etc. A software
31 version is provided as a fall-back, as well as for speed comparisons. */
32
33 /* Version history:
34 1.0 10 Feb 2013 First version
35 1.1 1 Aug 2013 Correct comments on why three crc instructions in parallel
36 */
37
38 /* Altered version
39 * This version modified to fit into the benchmarking code retrieved from
40 * http://www.evanjones.ca/crc32c.html
41 * 1.2 20 Mar 2016 Ferry Toth - Fit into benchmarking
42 * 1.3 07 May 2016 Ferry Toth - Applied some speed ups by putting more CRC32 in the short and long loop
 *     - Moved crc32q into macros and put alternative code there for 32-bit operation
44 */
45
46 #if defined(IS_INTEL)
47
48 #include "common.h"
49
50 /*
51 * MSVC/icc don't have __builtin_ia32_crc32_* functions. Instead they have
52 * the _mm_crc32_* intrinsics, which accomplish the same at the end of the day
53 */
54 #if defined(_MSC_VER) || defined(__ICC)
55 # include <nmmintrin.h>
56 # define __builtin_ia32_crc32qi _mm_crc32_u8
57 # define __builtin_ia32_crc32hi _mm_crc32_u16
58 # define __builtin_ia32_crc32si _mm_crc32_u32
59 # define __builtin_ia32_crc32di _mm_crc32_u64
60 #endif /* defined(_MSC_VER) || defined(__ICC) */
61
62 /* CRC-32C (iSCSI) polynomial in reversed bit order. */
63 #define POLY 0x82f63b78
64
/* Multiply the GF(2) matrix mat by the bit-vector vec.  Each set bit k of vec
   selects entry mat[k]; the product is the XOR of every selected entry.  mat
   must hold at least as many entries as the position of the most significant
   one bit in vec requires. */
static CRC32C_INLINE uint32_t gf2_matrix_times ( uint32_t *mat, uint32_t vec )
{
    uint32_t acc = 0;
    uint32_t *row;

    /* walk the rows in step with the bits of vec, lowest bit first */
    for ( row = mat; vec != 0; vec >>= 1, row++ ) {
        if ( ( vec & 1 ) != 0 ) {
            acc ^= *row;
        }
    }
    return acc;
}
82
83 /* Multiply a matrix by itself over GF(2). Both mat and square must have 32
84 rows. */
gf2_matrix_square(uint32_t * square,uint32_t * mat)85 static CRC32C_INLINE void gf2_matrix_square ( uint32_t *square, uint32_t *mat )
86 {
87 int n;
88
89 for ( n = 0; n < 32; n++ )
90 square[n] = gf2_matrix_times ( mat, mat[n] );
91 }
92
93 /* Construct an operator to apply len zeros to a crc. len must be a power of
94 two. If len is not a power of two, then the result is the same as for the
95 largest power of two less than len. The result for len == 0 is the same as
96 for len == 1. A version of this routine could be easily written for any
97 len, but that is not needed for this application. */
crc32c_zeros_op(uint32_t * even,size_t len)98 static void crc32c_zeros_op ( uint32_t *even, size_t len )
99 {
100 int n;
101 uint32_t row;
102 uint32_t odd[32]; /* odd-power-of-two zeros operator */
103
104 /* put operator for one zero bit in odd */
105 odd[0] = POLY; /* CRC-32C polynomial */
106 row = 1;
107 for ( n = 1; n < 32; n++ ) {
108 odd[n] = row;
109 row <<= 1;
110 }
111
112 /* put operator for two zero bits in even */
113 gf2_matrix_square ( even, odd );
114
115 /* put operator for four zero bits in odd */
116 gf2_matrix_square ( odd, even );
117
118 /* first square will put the operator for one zero byte (eight zero bits),
119 in even -- next square puts operator for two zero bytes in odd, and so
120 on, until len has been rotated down to zero */
121 do {
122 gf2_matrix_square ( even, odd );
123 len >>= 1;
124 if ( len == 0 )
125 return;
126 gf2_matrix_square ( odd, even );
127 len >>= 1;
128 } while ( len );
129
130 /* answer ended up in odd -- copy to even */
131 for ( n = 0; n < 32; n++ )
132 even[n] = odd[n];
133 }
134
/* Fill four 256-entry lookup tables for the zeros operator of the given
   length.  zeros[k][b] is the operator applied to byte value b placed in byte
   position k of a 32-bit crc, so the operator can be applied byte-by-byte. */
static void crc32c_zeros ( uint32_t zeros[][256], size_t len )
{
    uint32_t op[32];
    uint32_t byte;
    int lane;

    crc32c_zeros_op ( op, len );
    for ( byte = 0; byte < 256; byte++ ) {
        for ( lane = 0; lane < 4; lane++ ) {
            zeros[lane][byte] = gf2_matrix_times ( op, byte << ( 8 * lane ) );
        }
    }
}
150
151
/* Apply a zeros-operator table set to crc: look up each of the four bytes of
   crc in the table for its byte position and XOR the four results. */
static CRC32C_INLINE uint32_t crc32c_shift ( uint32_t zeros[][256], uint32_t crc )
{
    const uint32_t b0 = crc & 0xff;
    const uint32_t b1 = ( crc >> 8 ) & 0xff;
    const uint32_t b2 = ( crc >> 16 ) & 0xff;
    const uint32_t b3 = crc >> 24;

    return zeros[0][b0] ^ zeros[1][b1] ^ zeros[2][b2] ^ zeros[3][b3];
}
158
/* Block sizes, in bytes, for the three-way parallel crc computation.  LONG and
   SHORT must both be powers of two.  The x1/x2 string constants spell out
   LONG, 2*LONG, SHORT and 2*SHORT as decimal text and must be kept in step
   with the numeric values.  They were meant for constructing assembler
   instructions; nothing in the code visible here references them
   (NOTE(review): possibly leftovers -- confirm no other user before
   removing). */
#define LONG 8192
#define LONGx1 "8192"
#define LONGx2 "16384"
#define SHORT 256
#define SHORTx1 "256"
#define SHORTx2 "512"

/* Tables for hardware crc that shift a crc by LONG and SHORT zeros.  They are
   filled in by crc32c_init_hw_adler() and consulted through crc32c_shift(). */
static uint32_t crc32c_long[4][256];
static uint32_t crc32c_short[4][256];
172
173 /* Initialize tables for shifting crcs. */
crc32c_init_hw_adler(void)174 void crc32c_init_hw_adler( void )
175 {
176 crc32c_zeros ( crc32c_long, LONG );
177 crc32c_zeros ( crc32c_short, SHORT );
178 }
179
/* CRCtriplet(crc, buf, size, i): feed eight bytes into each of three
   independent crc accumulators.

   crc is a name prefix, not a variable: the accumulators are the variables
   named crc0, crc1 and crc2 (formed by token pasting).  crc0 consumes the
   eight bytes at buf + i, crc1 those at buf + i + size and crc2 those at
   buf + i + 2*size.  buf is not advanced.  buf, size and i are evaluated
   more than once, so they must be free of side effects.

   Without CRC32C_IS_64_BITS the eight bytes are consumed as two 32-bit crc
   instructions per accumulator (low words of all three first, then the high
   words); with it, as a single 64-bit instruction per accumulator.

   The body is wrapped in do { } while (0) so that an invocation followed by a
   semicolon is exactly one statement, and the loads go through
   const-qualified pointers so a const buffer pointer keeps its qualifier. */
#ifndef CRC32C_IS_64_BITS
#define CRCtriplet(crc, buf, size, i) \
    do { \
        crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *(const uint32_t *) ((buf) + (i))); \
        crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *(const uint32_t *) ((buf) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *(const uint32_t *) ((buf) + (i) + 2 * (size))); \
        crc ## 0 = __builtin_ia32_crc32si(crc ## 0, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i))); \
        crc ## 1 = __builtin_ia32_crc32si(crc ## 1, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32si(crc ## 2, *(const uint32_t *) ((buf) + sizeof(uint32_t) + (i) + 2 * (size))); \
    } while (0)
#else
#define CRCtriplet(crc, buf, size, i) \
    do { \
        crc ## 0 = __builtin_ia32_crc32di(crc ## 0, *(const uint64_t *) ((buf) + (i))); \
        crc ## 1 = __builtin_ia32_crc32di(crc ## 1, *(const uint64_t *) ((buf) + (i) + (size))); \
        crc ## 2 = __builtin_ia32_crc32di(crc ## 2, *(const uint64_t *) ((buf) + (i) + 2 * (size))); \
    } while (0)
#endif
194
195
/* CRCsinglet(crc, buf): feed the eight bytes at buf into the accumulator crc
   and advance buf by eight bytes.

   Without CRC32C_IS_64_BITS this is two 32-bit crc instructions; with it, a
   single 64-bit one.  crc and buf must be modifiable lvalues and are
   evaluated more than once, so they must be free of side effects.

   The body is wrapped in do { } while (0) so that an invocation followed by a
   semicolon is exactly one statement (it may still sit directly after a case
   label), and the loads go through const-qualified pointers so a const
   buffer pointer keeps its qualifier. */
#ifndef CRC32C_IS_64_BITS
#define CRCsinglet(crc, buf) \
    do { \
        (crc) = __builtin_ia32_crc32si((crc), *(const uint32_t *) (buf)); \
        (crc) = __builtin_ia32_crc32si((crc), *(const uint32_t *) ((buf) + sizeof(uint32_t))); \
        (buf) += 2 * sizeof(uint32_t); \
    } while (0)
#else
#define CRCsinglet(crc, buf) \
    do { \
        (crc) = __builtin_ia32_crc32di((crc), *(const uint64_t *) (buf)); \
        (buf) += sizeof(uint64_t); \
    } while (0)
#endif
204
205 /* Compute CRC-32C using the Intel hardware instruction. */
_crc32c_hw_adler(uint32_t crc,const unsigned char * buf,unsigned long len)206 uint32_t _crc32c_hw_adler(uint32_t crc, const unsigned char *buf, unsigned long len)
207 {
208 const unsigned char *next = buf;
209 const unsigned char *end;
210 unsigned short count;
211
212 #ifndef CRC32C_IS_64_BITS
213 uint32_t crc0, crc1, crc2;
214 #else
215 uint64_t crc0, crc1, crc2; /* need to be 64 bits for crc32q */
216 #endif
217 uint32_t crc32bit;
218
219 crc32bit = crc;
220 // in len > 256 compute the crc for up to seven leading bytes to bring the data pointer to an eight-byte boundary
221 if ( len > 128 ) {
222 unsigned char align = ( 8 - ( uintptr_t ) next ) % 8; // byte to boundary
223 len -= align;
224 if ( ( align % 2 ) != 0 ) crc32bit = __builtin_ia32_crc32qi ( crc32bit, *next );
225 next += align;
226 switch ( align / 2 ) {
227 case 3:
228 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 6 ) ); // 6 char, remain 4
229 case 2:
230 crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) ( next - 4 ) ); // 4 char, remain 0
231 break;
232 case 1:
233 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 2 ) ); // 2 char, remain 0
234 case 0:
235 break;
236 }
237 };
238
239 /* compute the crc on sets of LONG*3 bytes, executing three independent crc
240 instructions, each on LONG bytes -- this is optimized for the Nehalem,
241 Westmere, Sandy Bridge, and Ivy Bridge architectures, which have a
242 throughput of one crc per cycle, but a latency of three cycles */
243
244 crc0 = crc32bit;
245 while ( len >= LONG*3 ) {
246 crc1 = 0;
247 crc2 = 0;
248 end = next + LONG;
249 do {
250 CRCtriplet ( crc, next, LONG, 0 );
251 CRCtriplet ( crc, next, LONG, 8 );
252 CRCtriplet ( crc, next, LONG, 16 );
253 CRCtriplet ( crc, next, LONG, 24 );
254 next += 32;
255 } while ( next < end );
256 crc0 = crc32c_shift ( crc32c_long, (uint32_t)crc0 ) ^ crc1;
257 crc0 = crc32c_shift ( crc32c_long, (uint32_t)crc0 ) ^ crc2;
258 next += LONG*2;
259 len -= LONG*3;
260 }
261
262 /* do the same thing, but now on SHORT*3 blocks for the remaining data less
263 than a LONG*3 block */
264 while ( len >= SHORT*3 ) {
265 crc1 = 0;
266 crc2 = 0;
267 end = next + SHORT;
268 do {
269 CRCtriplet ( crc, next, SHORT, 0 );
270 CRCtriplet ( crc, next, SHORT, 8 );
271 CRCtriplet ( crc, next, SHORT, 16 );
272 CRCtriplet ( crc, next, SHORT, 24 );
273 next += 32;
274 } while ( next < end );
275 crc0 = crc32c_shift ( crc32c_short, (uint32_t)crc0 ) ^ crc1;
276 crc0 = crc32c_shift ( crc32c_short, (uint32_t)crc0 ) ^ crc2;
277 next += SHORT*2;
278 len -= SHORT*3;
279 }
280
281 /* compute the crc on the remaining eight-byte units less than a SHORT*3
282 block */
283
284 // use Duff's device, a for() loop inside a switch() statement. This is Legal
285 if ( ( count = ( len - ( len & 7 ) ) ) >= 8 ) { // needs to execute crc at least once
286 unsigned short n;
287 len -= count;
288 count /= 8; // count number of crc32di
289 n = ( count + 15 ) / 16;
290 switch ( count % 16 ) {
291 case 0:
292 do {
293 CRCsinglet ( crc0, next );
294 case 15:
295 CRCsinglet ( crc0, next );
296 case 14:
297 CRCsinglet ( crc0, next );
298 case 13:
299 CRCsinglet ( crc0, next );
300 case 12:
301 CRCsinglet ( crc0, next );
302 case 11:
303 CRCsinglet ( crc0, next );
304 case 10:
305 CRCsinglet ( crc0, next );
306 case 9:
307 CRCsinglet ( crc0, next );
308 case 8:
309 CRCsinglet ( crc0, next );
310 case 7:
311 CRCsinglet ( crc0, next );
312 case 6:
313 CRCsinglet ( crc0, next );
314 case 5:
315 CRCsinglet ( crc0, next );
316 case 4:
317 CRCsinglet ( crc0, next );
318 case 3:
319 CRCsinglet ( crc0, next );
320 case 2:
321 CRCsinglet ( crc0, next );
322 case 1:
323 CRCsinglet ( crc0, next );
324 } while ( --n > 0 );
325 }
326 };
327
328 /* compute the crc for up to seven trailing bytes */
329 crc32bit = (uint32_t)crc0;
330 if ( ( len % 2 ) != 0 ) crc32bit = __builtin_ia32_crc32qi ( crc32bit, * ( next ) ); // 1 char, remain even
331 next += len;
332 switch ( len / 2 ) {
333 case 3:
334 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 6 ) ); // 2 char, remain 4
335 case 2:
336 crc32bit = __builtin_ia32_crc32si ( crc32bit, * ( uint32_t* ) ( next - 4 ) ); // 4 char, remain 0
337 break;
338 case 1:
339 crc32bit = __builtin_ia32_crc32hi ( crc32bit, * ( uint16_t* ) ( next - 2 ) ); // 2 char, remain 0
340 break;
341 case 0:
342 break;
343 }
344 return ( uint32_t ) crc32bit;
345 }
346
347 #endif // defined(IS_INTEL)