/******************************************************
Copyright (c) 2017 Percona LLC and/or its affiliates.

CRC32 using Intel's PCLMUL instruction.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

*******************************************************/

/* crc-intel-pclmul.c - Intel PCLMUL accelerated CRC implementation
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 */

#include <gcrypt.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "config.h"

#define U64_C(c) (c##UL)

typedef uint32_t u32;
typedef uint16_t u16;
typedef uint64_t u64;
#ifndef byte
typedef uint8_t byte;
#endif

#define _gcry_bswap32 __builtin_bswap32

#if __GNUC__ >= 4 && defined(__x86_64__) && defined(HAVE_CLMUL_INSTRUCTION)

#if _GCRY_GCC_VERSION >= 40400 /* 4.4 */
/* Prevent compiler from issuing SSE instructions between asm blocks. */
#pragma GCC target("no-sse")
#endif

#define ALIGNED_16 __attribute__((aligned(16)))

struct u16_unaligned_s {
  u16 a;
} __attribute__((packed, aligned(1), may_alias));

/* Constants structure for generic reflected/non-reflected CRC32 CLMUL
 * functions. */
struct crc32_consts_s {
  /* k: { x^(32*17), x^(32*15), x^(32*5), x^(32*3), x^(32*2), 0 } mod P(x) */
  u64 k[6];
  /* my_p: { floor(x^64 / P(x)), P(x) } */
  u64 my_p[2];
};

/* CLMUL constants for CRC32 and CRC32RFC1510. */
static const struct crc32_consts_s crc32_consts ALIGNED_16 = {
    {
        /* k[6] = reverse_33bits( x^(32*y) mod P(x) ) */
        U64_C(0x154442bd4), U64_C(0x1c6e41596), /* y = { 17, 15 } */
        U64_C(0x1751997d0), U64_C(0x0ccaa009e), /* y = { 5, 3 } */
        U64_C(0x163cd6124), 0                   /* y = 2 */
    },
    {/* my_p[2] = reverse_33bits ( { floor(x^64 / P(x)), P(x) } ) */
     U64_C(0x1f7011641), U64_C(0x1db710641)}};
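
/* A reference sketch (guarded out, not compiled) of how the constants above
 * can be derived from the relations documented in the comments.  The helper
 * names 'crc32_xnmodp' and 'crc32_bitrev33' are illustrative only; they are
 * not part of this file or of libgcrypt. */
#if 0
/* Compute x^n mod P(x) over GF(2) for the CRC-32 polynomial
 * P(x) = 0x104C11DB7. */
static u32 crc32_xnmodp(unsigned int n) {
  u32 r = 1; /* x^0 */
  while (n--) {
    u32 carry = r & 0x80000000U;
    r <<= 1;
    if (carry) r ^= 0x04C11DB7U; /* reduce by P(x) without its x^32 term */
  }
  return r;
}

/* Reverse the low 33 bits of v (bit i <-> bit 32 - i). */
static u64 crc32_bitrev33(u64 v) {
  u64 r = 0;
  int i;
  for (i = 0; i <= 32; i++) r |= ((v >> i) & 1) << (32 - i);
  return r;
}

/* Following the comments above, one expects for example:
 *   crc32_bitrev33(crc32_xnmodp(32 * 17)) == U64_C(0x154442bd4)   k[0]
 *   crc32_bitrev33(crc32_xnmodp(32 * 2))  == U64_C(0x163cd6124)   k[4]
 *   crc32_bitrev33(U64_C(0x104C11DB7))    == U64_C(0x1db710641)   my_p[1]
 */
#endif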

/* Common constants for CRC32 algorithms. */
static const byte crc32_refl_shuf_shift[3 * 16] ALIGNED_16 = {
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,
    0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
static const byte crc32_partial_fold_input_mask[16 + 16] ALIGNED_16 = {
    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
    0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
    0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
};
static const u64 crc32_merge9to15_shuf[15 - 9 + 1][2] ALIGNED_16 = {
    {U64_C(0x0706050403020100), U64_C(0xffffffffffffff0f)}, /* 9 */
    {U64_C(0x0706050403020100), U64_C(0xffffffffffff0f0e)},
    {U64_C(0x0706050403020100), U64_C(0xffffffffff0f0e0d)},
    {U64_C(0x0706050403020100), U64_C(0xffffffff0f0e0d0c)},
    {U64_C(0x0706050403020100), U64_C(0xffffff0f0e0d0c0b)},
    {U64_C(0x0706050403020100), U64_C(0xffff0f0e0d0c0b0a)},
    {U64_C(0x0706050403020100), U64_C(0xff0f0e0d0c0b0a09)}, /* 15 */
};
static const u64 crc32_merge5to7_shuf[7 - 5 + 1][2] ALIGNED_16 = {
    {U64_C(0xffffff0703020100), U64_C(0xffffffffffffffff)}, /* 5 */
    {U64_C(0xffff070603020100), U64_C(0xffffffffffffffff)},
    {U64_C(0xff07060503020100), U64_C(0xffffffffffffffff)}, /* 7 */
};
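
/* How the tables above are used (see the partial-fold and less-than-16 code
 * paths below): loading crc32_refl_shuf_shift at byte offset 'inlen' (1..15)
 * yields a pshufb mask that shifts a register left by (16 - inlen) bytes,
 * while offset 'inlen + 16' yields a mask that shifts it right by 'inlen'
 * bytes.  crc32_partial_fold_input_mask at offset 'inlen' keeps only the
 * last 'inlen' bytes of the final (overlapping) 16-byte load.  The merge
 * shuffle tables pack a 9..15 or 5..7 byte tail into a single XMM word. */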

/* PCLMUL functions for reflected CRC32. */
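/* Rough structure of crc32_reflected_bulk() below: for inputs of 128 bytes
 * or more it keeps a 4 x 128-bit state and folds it forward over each
 * 64-byte chunk using k[0]/k[1] (x^(32*17) and x^(32*15) mod P(x)), then
 * folds the four lanes down to one; shorter inputs start directly in the
 * single-lane path.  The single lane is folded 16 bytes at a time with
 * k[2]/k[3], a masked partial fold absorbs any remaining tail, and the
 * final sequence reduces the 128-bit remainder to 32 bits using k[4] and a
 * Barrett reduction with my_p. */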
static inline void crc32_reflected_bulk(u32 *pcrc, const byte *inbuf,
                                        size_t inlen,
                                        const struct crc32_consts_s *consts) {
  if (inlen >= 8 * 16) {
    asm volatile(
        "movd %[crc], %%xmm4\n\t"
        "movdqu %[inbuf_0], %%xmm0\n\t"
        "movdqu %[inbuf_1], %%xmm1\n\t"
        "movdqu %[inbuf_2], %%xmm2\n\t"
        "movdqu %[inbuf_3], %%xmm3\n\t"
        "pxor %%xmm4, %%xmm0\n\t"
        :
        : [inbuf_0] "m"(inbuf[0 * 16]), [inbuf_1] "m"(inbuf[1 * 16]),
          [inbuf_2] "m"(inbuf[2 * 16]), [inbuf_3] "m"(inbuf[3 * 16]),
          [crc] "m"(*pcrc));

    inbuf += 4 * 16;
    inlen -= 4 * 16;

    asm volatile("movdqa %[k1k2], %%xmm4\n\t" : : [k1k2] "m"(consts->k[1 - 1]));

    /* Fold by 4. */
    while (inlen >= 4 * 16) {
      asm volatile(
          "movdqu %[inbuf_0], %%xmm5\n\t"
          "movdqa %%xmm0, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm0\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm0\n\t"
          "pxor %%xmm6, %%xmm0\n\t"

          "movdqu %[inbuf_1], %%xmm5\n\t"
          "movdqa %%xmm1, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm1\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm1\n\t"
          "pxor %%xmm6, %%xmm1\n\t"

          "movdqu %[inbuf_2], %%xmm5\n\t"
          "movdqa %%xmm2, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm2\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm2\n\t"
          "pxor %%xmm6, %%xmm2\n\t"

          "movdqu %[inbuf_3], %%xmm5\n\t"
          "movdqa %%xmm3, %%xmm6\n\t"
          "pclmulqdq $0x00, %%xmm4, %%xmm3\n\t"
          "pclmulqdq $0x11, %%xmm4, %%xmm6\n\t"
          "pxor %%xmm5, %%xmm3\n\t"
          "pxor %%xmm6, %%xmm3\n\t"
          :
          : [inbuf_0] "m"(inbuf[0 * 16]), [inbuf_1] "m"(inbuf[1 * 16]),
            [inbuf_2] "m"(inbuf[2 * 16]), [inbuf_3] "m"(inbuf[3 * 16]));

      inbuf += 4 * 16;
      inlen -= 4 * 16;
    }

    asm volatile(
        "movdqa %[k3k4], %%xmm6\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        :
        : [k3k4] "m"(consts->k[3 - 1]), [my_p] "m"(consts->my_p[0]));

    /* Fold 4 to 1. */

    asm volatile(
        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"

        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm2, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"

        "movdqa %%xmm0, %%xmm4\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm4\n\t"
        "pxor %%xmm3, %%xmm0\n\t"
        "pxor %%xmm4, %%xmm0\n\t"
        :
        :);
  } else {
    asm volatile(
        "movd %[crc], %%xmm1\n\t"
        "movdqu %[inbuf], %%xmm0\n\t"
        "movdqa %[k3k4], %%xmm6\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        :
        : [inbuf] "m"(*inbuf), [crc] "m"(*pcrc), [k3k4] "m"(consts->k[3 - 1]),
          [my_p] "m"(consts->my_p[0]));

    inbuf += 16;
    inlen -= 16;
  }

  /* Fold by 1. */
  if (inlen >= 16) {
    while (inlen >= 16) {
      /* Load next block to XMM2. Fold XMM0 to XMM0:XMM1. */
      asm volatile(
          "movdqu %[inbuf], %%xmm2\n\t"
          "movdqa %%xmm0, %%xmm1\n\t"
          "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
          "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
          "pxor %%xmm2, %%xmm0\n\t"
          "pxor %%xmm1, %%xmm0\n\t"
          :
          : [inbuf] "m"(*inbuf));

      inbuf += 16;
      inlen -= 16;
    }
  }

  /* Partial fold. */
  if (inlen) {
    /* Load last input and add padding zeros. */
    asm volatile(
        "movdqu %[shr_shuf], %%xmm3\n\t"
        "movdqu %[shl_shuf], %%xmm4\n\t"
        "movdqu %[mask], %%xmm2\n\t"

        "movdqa %%xmm0, %%xmm1\n\t"
        "pshufb %%xmm4, %%xmm0\n\t"
        "movdqu %[inbuf], %%xmm4\n\t"
        "pshufb %%xmm3, %%xmm1\n\t"
        "pand %%xmm4, %%xmm2\n\t"
        "por %%xmm1, %%xmm2\n\t"

        "movdqa %%xmm0, %%xmm1\n\t"
        "pclmulqdq $0x00, %%xmm6, %%xmm0\n\t"
        "pclmulqdq $0x11, %%xmm6, %%xmm1\n\t"
        "pxor %%xmm2, %%xmm0\n\t"
        "pxor %%xmm1, %%xmm0\n\t"
        :
        : [inbuf] "m"(*(inbuf - 16 + inlen)),
          [mask] "m"(crc32_partial_fold_input_mask[inlen]),
          [shl_shuf] "m"(crc32_refl_shuf_shift[inlen]),
          [shr_shuf] "m"(crc32_refl_shuf_shift[inlen + 16]));

    inbuf += inlen;
    inlen -= inlen;
  }

  /* Final fold. */
  asm volatile(/* reduce 128-bits to 96-bits */
               "movdqa %%xmm0, %%xmm1\n\t"
               "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
               "psrldq $8, %%xmm1\n\t"
               "pxor %%xmm1, %%xmm0\n\t"

               /* reduce 96-bits to 64-bits */
               "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
               "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
               "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
               "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

               /* barrett reduction */
               "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
               "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
               "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
               "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
               "pxor %%xmm1, %%xmm0\n\t"

               /* store CRC */
               "pextrd $2, %%xmm0, %[out]\n\t"
               : [out] "=m"(*pcrc)
               : [k5] "m"(consts->k[5 - 1]));
}

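/* crc32_reflected_less_than_16() below handles 1..15 byte inputs: up to
 * 3 bytes are mixed into the CRC in scalar code and reduced directly with
 * Barrett reduction, exactly 4 bytes go straight through one Barrett
 * reduction, and 5..15 bytes are merged into a single XMM register via the
 * merge shuffle tables, shifted into place, and reduced with the same final
 * sequence as the bulk path. */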
static inline void crc32_reflected_less_than_16(
    u32 *pcrc, const byte *inbuf, size_t inlen,
    const struct crc32_consts_s *consts) {
  if (inlen < 4) {
    u32 crc = *pcrc;
    u32 data;

    asm volatile("movdqa %[my_p], %%xmm5\n\t" : : [my_p] "m"(consts->my_p[0]));

    if (inlen == 1) {
      data = inbuf[0];
      data ^= crc;
      data <<= 24;
      crc >>= 8;
    } else if (inlen == 2) {
      data = ((const struct u16_unaligned_s *)inbuf)->a;
      data ^= crc;
      data <<= 16;
      crc >>= 16;
    } else {
      data = ((const struct u16_unaligned_s *)inbuf)->a;
      data |= inbuf[2] << 16;
      data ^= crc;
      data <<= 8;
      crc >>= 24;
    }

    /* Barrett reduction */
    asm volatile(
        "movd %[in], %%xmm0\n\t"
        "movd %[crc], %%xmm1\n\t"

        "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "psllq $32, %%xmm1\n\t"
        "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
        "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "pxor %%xmm1, %%xmm0\n\t"

        "pextrd $1, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [in] "rm"(data), [crc] "rm"(crc));
  } else if (inlen == 4) {
    /* Barrett reduction */
    asm volatile(
        "movd %[crc], %%xmm1\n\t"
        "movd %[in], %%xmm0\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        "pxor %%xmm1, %%xmm0\n\t"

        "pclmulqdq $0x00, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */
        "pshufd $0xfc, %%xmm0, %%xmm0\n\t"    /* [00][00][00][x] */
        "pclmulqdq $0x10, %%xmm5, %%xmm0\n\t" /* [00][00][xx][xx] */

        "pextrd $1, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [in] "m"(*inbuf), [crc] "m"(*pcrc), [my_p] "m"(consts->my_p[0]));
  } else {
    asm volatile(
        "movdqu %[shuf], %%xmm4\n\t"
        "movd %[crc], %%xmm1\n\t"
        "movdqa %[my_p], %%xmm5\n\t"
        "movdqa %[k3k4], %%xmm6\n\t"
        :
        : [shuf] "m"(crc32_refl_shuf_shift[inlen]), [crc] "m"(*pcrc),
          [my_p] "m"(consts->my_p[0]), [k3k4] "m"(consts->k[3 - 1]));

    if (inlen >= 8) {
      asm volatile("movq %[inbuf], %%xmm0\n\t" : : [inbuf] "m"(*inbuf));
      if (inlen > 8) {
        asm volatile(/*"pinsrq $1, %[inbuf_tail], %%xmm0\n\t"*/
                     "movq %[inbuf_tail], %%xmm2\n\t"
                     "punpcklqdq %%xmm2, %%xmm0\n\t"
                     "pshufb %[merge_shuf], %%xmm0\n\t"
                     :
                     : [inbuf_tail] "m"(inbuf[inlen - 8]),
                       [merge_shuf] "m"(*crc32_merge9to15_shuf[inlen - 9]));
      }
    } else {
      asm volatile(
          "movd %[inbuf], %%xmm0\n\t"
          "pinsrd $1, %[inbuf_tail], %%xmm0\n\t"
          "pshufb %[merge_shuf], %%xmm0\n\t"
          :
          : [inbuf] "m"(*inbuf), [inbuf_tail] "m"(inbuf[inlen - 4]),
            [merge_shuf] "m"(*crc32_merge5to7_shuf[inlen - 5]));
    }

    /* Final fold. */
    asm volatile(
        "pxor %%xmm1, %%xmm0\n\t"
        "pshufb %%xmm4, %%xmm0\n\t"

        /* reduce 128-bits to 96-bits */
        "movdqa %%xmm0, %%xmm1\n\t"
        "pclmulqdq $0x10, %%xmm6, %%xmm0\n\t"
        "psrldq $8, %%xmm1\n\t"
        "pxor %%xmm1, %%xmm0\n\t" /* top 32-bit are zero */

        /* reduce 96-bits to 64-bits */
        "pshufd $0xfc, %%xmm0, %%xmm1\n\t"   /* [00][00][00][x] */
        "pshufd $0xf9, %%xmm0, %%xmm0\n\t"   /* [00][00][x>>64][x>>32] */
        "pclmulqdq $0x00, %[k5], %%xmm1\n\t" /* [00][00][xx][xx] */
        "pxor %%xmm1, %%xmm0\n\t"            /* top 64-bit are zero */

        /* barrett reduction */
        "pshufd $0xf3, %%xmm0, %%xmm1\n\t"    /* [00][00][x>>32][00] */
        "pslldq $4, %%xmm0\n\t"               /* [??][x>>32][??][??] */
        "pclmulqdq $0x00, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
        "pclmulqdq $0x10, %%xmm5, %%xmm1\n\t" /* [00][xx][xx][00] */
        "pxor %%xmm1, %%xmm0\n\t"

        /* store CRC */
        "pextrd $2, %%xmm0, %[out]\n\t"
        : [out] "=m"(*pcrc)
        : [k5] "m"(consts->k[5 - 1]));
  }
}

void crc32_intel_pclmul(u32 *pcrc, const byte *inbuf, size_t inlen) {
  const struct crc32_consts_s *consts = &crc32_consts;
#if defined(__x86_64__) && defined(__WIN64__)
  char win64tmp[2 * 16];

  /* XMM6-XMM7 need to be restored after use. */
  asm volatile(
      "movdqu %%xmm6, 0*16(%0)\n\t"
      "movdqu %%xmm7, 1*16(%0)\n\t"
      :
      : "r"(win64tmp)
      : "memory");
#endif

  if (!inlen) return;

  if (inlen >= 16)
    crc32_reflected_bulk(pcrc, inbuf, inlen, consts);
  else
    crc32_reflected_less_than_16(pcrc, inbuf, inlen, consts);

#if defined(__x86_64__) && defined(__WIN64__)
  /* Restore used registers. */
  asm volatile(
      "movdqu 0*16(%0), %%xmm6\n\t"
      "movdqu 1*16(%0), %%xmm7\n\t"
      :
      : "r"(win64tmp)
      : "memory");
#endif
}
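
/* A minimal usage sketch (guarded out, not compiled), assuming the caller
 * follows the common CRC-32 (IEEE) convention of starting from 0xffffffff
 * and inverting the final value; the actual init/final convention is up to
 * the call site. */
#if 0
static u32 crc32_pclmul_oneshot(const byte *buf, size_t len) {
  u32 crc = 0xffffffffU;              /* initial value */
  crc32_intel_pclmul(&crc, buf, len); /* updates crc in place */
  return crc ^ 0xffffffffU;           /* final inversion */
}
#endif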

#endif