/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

*******************************************************/

/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 */

#include <gcrypt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "config.h"
#define U64_C(c) (c##ULL) /* force a 64-bit type even on LLP64 targets */

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

#define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__) && defined(HAVE_CLMUL_INSTRUCTION)

#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
#pragma GCC target("no-sse")
#endif

#define ALIGNED_16 __attribute__((aligned(16)))

struct u16_unaligned_s {
  u16 a;
} __attribute__((packed, aligned(1), may_alias));

/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s {
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};

/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 = {
    {
        /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
        U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
        U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
        U64_C(0x163cd6124), 0                   /* y = 2 */
    },
    {/* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
     U64_C(0x1f7011641), U64_C(0x1db710641)}};
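
/* Illustrative only, kept out of the build: a bit-serial sketch of how
 * the folding constants above can be derived.  xpow_mod_p() is a
 * hypothetical helper, not part of this file's API; it computes
 * x^n mod P(x) for the non-reflected CRC32 polynomial, after which the
 * 33-bit bit-reversal named reverse_33bits in the comments above would
 * yield the table entries. */
#if 0
static u64 xpow_mod_p(unsigned int n) {
  const u64 poly = U64_C(0x104c11db7); /* P(x), 33 bits */
  u64 r = 1;                           /* start from x^0 */
  while (n--) {
    r <<= 1;                               /* multiply by x */
    if (r & (U64_C(1) << 32)) r ^= poly;   /* reduce mod P(x) */
  }
  return r;
}
/* e.g. reverse_33bits(xpow_mod_p(32 * 17)) should equal k[0] above. */
#endif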

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 = {
    {U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f)}, /* 9 */
    {U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e)},
    {U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d)},
    {U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c)},
    {U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b)},
    {U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a)},
    {U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09)}, /* 15 */
};
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 = {
    {U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff)}, /* 5 */
    {U64_C(0xffff070603020100), U64_C(0xffffffffffffffff)},
    {U64_C(0xff07060503020100), U64_C(0xffffffffffffffff)}, /* 7 */
};
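
/* These tables are pshufb/pand control data: a 0xff lane makes pshufb
 * write a zero byte.  Indexed by the number of remaining input bytes,
 * the rows of crc32_refl_shuf_shift byte-shift an XMM register left or
 * right, crc32_partial_fold_input_mask keeps only the yet-unprocessed
 * tail bytes of the final 16-byte load, and the merge tables splice a
 * leading 8- or 4-byte load with an overlapping tail load into one
 * register. */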

/* PCLMUL functions for reflected CRC32. */
static inline void crc32_reflected_bulk(u32 *pcrc, const byte *inbuf,
                                        size_t inlen,
                                        const struct crc32_consts_s *consts) {
  if (inlen >= 8 * 16) {
    asm volatile(
        "movd %[crc], %%xmm4\n\t"
        "movdqu %[inbuf_0], %%xmm0\n\t"
        "movdqu %[inbuf_1], %%xmm1\n\t"
        "movdqu %[inbuf_2], %%xmm2\n\t"
        "movdqu %[inbuf_3], %%xmm3\n\t"
        "pxor %%xmm4, %%xmm0\n\t"
        :
        : [inbuf_0] "m"(inbuf[0 * 16]), [inbuf_1] "m"(inbuf[1 * 16]),
          [inbuf_2] "m"(inbuf[2 * 16]), [inbuf_3] "m"(inbuf[3 * 16]),
          [crc] "m"(*pcrc));

    inbuf += 4 * 16;
    inlen -= 4 * 16;

    asm volatile("movdqa %[k1k2], %%xmm4\n\t" : : [k1k2] "m"(consts->k[1 - 1]));

    /* Fold by 4. */
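    /* Each pclmulqdq pair below multiplies one 128-bit accumulator by the
     * k1:k2 constants ($0x00 selects the low qwords of both operands,
     * $0x11 the high qwords), advancing the accumulator by 512 bits of
     * input so it can be XORed with the block four steps ahead. */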
    while (inlen >= 4 * 16) {
      asm volatile(
          "movdqu %[inbuf_0], %%xmm5\n\t"
          "movdqa %%xmm0, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm0\n\t"
          "pxor %%xmm6, %%xmm0\n\t"

          "movdqu %[inbuf_1], %%xmm5\n\t"
          "movdqa %%xmm1, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm1\n\t"
          "pxor %%xmm6, %%xmm1\n\t"

          "movdqu %[inbuf_2], %%xmm5\n\t"
          "movdqa %%xmm2, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm2\n\t"
          "pxor %%xmm6, %%xmm2\n\t"

          "movdqu %[inbuf_3], %%xmm5\n\t"
          "movdqa %%xmm3, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm3\n\t"
          "pxor %%xmm6, %%xmm3\n\t"
          :
          : [inbuf_0] "m"(inbuf[0 * 16]), [inbuf_1] "m"(inbuf[1 * 16]),
            [inbuf_2] "m"(inbuf[2 * 16]), [inbuf_3] "m"(inbuf[3 * 16]));

      inbuf += 4 * 16;
      inlen -= 4 * 16;
    }

    asm volatile(
        "movdqa %[k3k4], %%xmm6\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        :
        : [k3k4] "m"(consts->k[3 - 1]), [my_p] "m"(consts->my_p[0]));

    /* Fold 4 to 1. */

    asm volatile(
        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"

        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm2, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"

        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm3, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"
        :
        :);
  } else {
    asm volatile(
        "movd %[crc], %%xmm1\n\t"
        "movdqu %[inbuf], %%xmm0\n\t"
        "movdqa %[k3k4], %%xmm6\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        :
        : [inbuf] "m"(*inbuf), [crc] "m"(*pcrc), [k3k4] "m"(consts->k[3 - 1]),
          [my_p] "m"(consts->my_p[0]));

    inbuf += 16;
    inlen -= 16;
  }

  /* Fold by 1. */
  if (inlen >= 16) {
    while (inlen >= 16) {
      /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
      asm volatile(
          "movdqu %[inbuf], %%xmm2\n\t"
          "movdqa %%xmm0, %%xmm1\n\t"
          "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
          "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
          "pxor %%xmm2, %%xmm0\n\t"
          "pxor %%xmm1, %%xmm0\n\t"
          :
          : [inbuf] "m"(*inbuf));

      inbuf += 16;
      inlen -= 16;
    }
  }

  /* Partial fold. */
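  /* The remaining 1..15 bytes are absorbed without a scalar tail loop:
   * the final 16 input bytes are reloaded (overlapping data already
   * consumed), the stale low bytes are masked off, and the pshufb shift
   * masks splice accumulator and tail so that one more regular fold
   * covers them. */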
  if (inlen) {
    /* Load last input and add padding zeros. */
    asm volatile(
        "movdqu %[shr_shuf], %%xmm3\n\t"
        "movdqu %[shl_shuf], %%xmm4\n\t"
        "movdqu %[mask], %%xmm2\n\t"

        "movdqa %%xmm0, %%xmm1\n\t"
        "pshufb %%xmm4, %%xmm0\n\t"
        "movdqu %[inbuf], %%xmm4\n\t"
        "pshufb %%xmm3, %%xmm1\n\t"
        "pand %%xmm4, %%xmm2\n\t"
        "por %%xmm1, %%xmm2\n\t"

        "movdqa %%xmm0, %%xmm1\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
        "pxor %%xmm2, %%xmm0\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        :
        : [inbuf] "m"(*(inbuf - 16 + inlen)),
          [mask] "m"(crc32_partial_fold_input_mask[inlen]),
          [shl_shuf] "m"(crc32_refl_shuf_shift[inlen]),
          [shr_shuf] "m"(crc32_refl_shuf_shift[inlen + 16]));

    inbuf += inlen;
    inlen -= inlen;
  }

  /* Final fold. */
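  /* Reduce the 128-bit accumulator to 96 and then 64 bits using the
   * k4/k5 constants, then finish with Barrett reduction: multiply by the
   * precomputed floor(x^64 / P(x)) (my_p[0]) and by P(x) (my_p[1]) to
   * obtain the 32-bit remainder without a division.  The result lands in
   * dword 2 of XMM0, which pextrd stores to *pcrc. */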
  asm volatile(/* reduce 128-bits to 96-bits */
               "movdqa %%xmm0, %%xmm1\n\t"
               "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
               "psrldq $8, %%xmm1\n\t"
               "pxor %%xmm1, %%xmm0\n\t"

               /* reduce 96-bits to 64-bits */
               "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
               "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
               "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
               "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

               /* barrett reduction */
               "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
               "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
               "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
               "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
               "pxor %%xmm1, %%xmm0\n\t"

               /* store CRC */
               "pextrd $2, %%xmm0, %[out]\n\t"
               : [out] "=m"(*pcrc)
               : [k5] "m"(consts->k[5 - 1]));
}

static inline void crc32_reflected_less_than_16(
    u32 *pcrc, const byte *inbuf, size_t inlen,
    const struct crc32_consts_s *consts) {
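  /* Strategy: 1..3 bytes are mixed into the CRC in scalar registers and
   * finished with a single Barrett reduction; exactly 4 bytes need only
   * the Barrett step; 5..15 bytes are assembled into one XMM register
   * via the merge shuffle tables and reduced with one fold. */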
  if (inlen < 4) {
    u32 crc = *pcrc;
    u32 data;

    asm volatile("movdqa %[my_p], %%xmm5\n\t" : : [my_p] "m"(consts->my_p[0]));

    if (inlen == 1) {
      data = inbuf[0];
      data ^= crc;
      data <<= 24;
      crc >>= 8;
    } else if (inlen == 2) {
      data = ((const struct u16_unaligned_s *)inbuf)->a;
      data ^= crc;
      data <<= 16;
      crc >>= 16;
    } else {
      data = ((const struct u16_unaligned_s *)inbuf)->a;
      data |= inbuf[2] << 16;
      data ^= crc;
      data <<= 8;
      crc >>= 24;
    }

    /* Barrett reduction */
    asm volatile(
        "movd %[in], %%xmm0\n\t"
        "movd %[crc], %%xmm1\n\t"

        "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "psllq $32, %%xmm1\n\t"
        "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
        "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "pxor %%xmm1, %%xmm0\n\t"

        "pextrd $1, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [in] "rm"(data), [crc] "rm"(crc));
  } else if (inlen == 4) {
    /* Barrett reduction */
    asm volatile(
        "movd %[crc], %%xmm1\n\t"
        "movd %[in], %%xmm0\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        "pxor %%xmm1, %%xmm0\n\t"

        "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
        "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

        "pextrd $1, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [in] "m"(*inbuf), [crc] "m"(*pcrc), [my_p] "m"(consts->my_p[0]));
  } else {
    asm volatile(
        "movdqu %[shuf], %%xmm4\n\t"
        "movd %[crc], %%xmm1\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        "movdqa %[k3k4], %%xmm6\n\t"
        :
        : [shuf] "m"(crc32_refl_shuf_shift[inlen]), [crc] "m"(*pcrc),
          [my_p] "m"(consts->my_p[0]), [k3k4] "m"(consts->k[3 - 1]));

    if (inlen >= 8) {
      asm volatile("movq %[inbuf], %%xmm0\n\t" : : [inbuf] "m"(*inbuf));
      if (inlen > 8) {
        asm volatile(/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                     "movq %[inbuf_tail], %%xmm2\n\t"
                     "punpcklqdq %%xmm2, %%xmm0\n\t"
                     "pshufb %[merge_shuf], %%xmm0\n\t"
                     :
                     : [inbuf_tail] "m"(inbuf[inlen - 8]),
                       [merge_shuf] "m"(*crc32_merge9to15_shuf[inlen - 9]));
      }
    } else {
      asm volatile(
          "movd %[inbuf], %%xmm0\n\t"
          "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
          "pshufb %[merge_shuf], %%xmm0\n\t"
          :
          : [inbuf] "m"(*inbuf), [inbuf_tail] "m"(inbuf[inlen - 4]),
            [merge_shuf] "m"(*crc32_merge5to7_shuf[inlen - 5]));
    }

    /* Final fold. */
    asm volatile(
        "pxor %%xmm1, %%xmm0\n\t"
        "pshufb %%xmm4, %%xmm0\n\t"

        /* reduce 128-bits to 96-bits */
        "movdqa %%xmm0, %%xmm1\n\t"
        "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
        "psrldq $8, %%xmm1\n\t"
        "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */

        /* reduce 96-bits to 64-bits */
        "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
        "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
        "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
        "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

        /* barrett reduction */
        "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
        "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
        "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
        "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
        "pxor %%xmm1, %%xmm0\n\t"

        /* store CRC */
        "pextrd $2, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [k5] "m"(consts->k[5 - 1]));
  }
}

void crc32_intel_pclmul(u32 *pcrc, const byte *inbuf, size_t inlen) {
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile(
      "movdqu %%xmm6, 0*16(%0)\n\t"
      "movdqu %%xmm7, 1*16(%0)\n\t"
      :
      : "r"(win64tmp)
      : "memory");
#endif

  if (!inlen) return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile(
      "movdqu 0*16(%0), %%xmm6\n\t"
      "movdqu 1*16(%0), %%xmm7\n\t"
      :
      : "r"(win64tmp)
      : "memory");
#endif
}
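
#if 0
/* Usage sketch, not built: the names below (crc32_reference,
 * crc32_selftest) are illustrative, not part of this file's API.
 * crc32_intel_pclmul() only advances the raw reflected CRC-32 state
 * (polynomial 0xedb88320); seeding with 0xffffffff and the final
 * bit-complement are assumed to be the caller's responsibility, so the
 * self-test compares raw states. */
static u32 crc32_reference(u32 crc, const byte *buf, size_t len) {
  int i;
  while (len--) {
    crc ^= *buf++;
    for (i = 0; i < 8; i++)
      crc = (crc >> 1) ^ ((crc & 1) ? 0xedb88320U : 0);
  }
  return crc;
}

static int crc32_selftest(const byte *buf, size_t len) {
  u32 hw = 0xffffffffU;
  u32 sw = crc32_reference(0xffffffffU, buf, len);
  crc32_intel_pclmul(&hw, buf, len);
  return hw == sw; /* non-zero when hardware and reference agree */
}
#endif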

#endif