/* crc-armv8-aarch64-ce.S - ARMv8/CE PMULL accelerated CRC implementation
 * Copyright (C) 2019 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)

.cpu generic+simd+crypto

.text


/* Structure of crc32_consts_s */

#define consts_k(idx)    ((idx) * 8)
#define consts_my_p(idx) (consts_k(6) + (idx) * 8)
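
/* For reference, the offset macros above imply that the C side lays out the
 * constants as six 64-bit fold constants followed by the Barrett pair,
 * roughly:
 *
 *   struct crc32_consts_s
 *   {
 *     u64 k[6];      - folding constants, accessed via consts_k(1..6)
 *     u64 my_p[2];   - Barrett reduction constants, consts_my_p(0..1)
 *   };
 *
 * This is a sketch inferred from the offsets; the authoritative definition
 * lives in the C code that passes `consts` into the functions below.
 */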

/* Constants */

.align 6
.Lcrc32_constants:
.Lcrc32_partial_fold_input_mask:
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
  .byte 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
.Lcrc32_refl_shuf_shift:
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
  .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
.Lcrc32_shuf_shift:
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
.Lcrc32_bswap_shuf:
  .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
  .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
  .byte 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff


/*
 * void _gcry_crc32r_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
 *                                  const struct crc32_consts_s *consts);
 */
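/*
 * Rough flow (inferred from the code below, for orientation only):
 *   - For inlen >= 128, four 128-bit accumulators are folded across
 *     64-byte blocks (.Lcrc32r_fold_by_four) and then collapsed into one.
 *   - Remaining full 16-byte blocks are folded one at a time
 *     (.Lcrc32r_fold_by_one).
 *   - A 1..15 byte tail is absorbed by the partial fold.
 *   - The final fold plus a Barrett reduction yields the 32-bit CRC.
 * The routine assumes inlen >= 16; shorter inputs are presumably handled
 * by the C caller that supplies `consts`.
 */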
.align 3
.globl _gcry_crc32r_armv8_ce_bulk
ELF(.type  _gcry_crc32r_armv8_ce_bulk,%function;)
_gcry_crc32r_armv8_ce_bulk:
  /* input:
   *    x0: pcrc
   *    x1: inbuf
   *    x2: inlen
   *    x3: consts
   */
  CFI_STARTPROC()

  GET_DATA_POINTER(x7, .Lcrc32_constants)
  add x9, x3, #consts_k(5 - 1)
  cmp x2, #128

  b.lo .Lcrc32r_fold_by_one_setup

  eor v4.16b, v4.16b, v4.16b
  add x4, x3, #consts_k(1 - 1)
  ld1 {v4.s}[0], [x0]             /* load pcrc */
  ld1 {v0.16b-v3.16b}, [x1], #64  /* load 64 bytes of input */
  sub x2, x2, #64
  ld1 {v6.16b}, [x4]
  eor v0.16b, v0.16b, v4.16b

  add x4, x3, #consts_k(3 - 1)
  add x5, x3, #consts_my_p(0)

.Lcrc32r_fold_by_four:

  /* Fold by 4. */
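  /* Each accumulator v0..v3 is carry-less multiplied by the fold constant
   * pair from consts_k(1) (low halves via pmull, high halves via pmull2),
   * and both products are XORed with the next 64 bytes of input.  Roughly
   * speaking, the constants are chosen on the C side so that this advances
   * each accumulator across the four interleaved 128-bit blocks modulo the
   * CRC polynomial, keeping four independent folding chains in flight. */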
  ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
  sub x2, x2, #64
  pmull v20.1q, v0.1d, v6.1d
  pmull v21.1q, v1.1d, v6.1d
  pmull v22.1q, v2.1d, v6.1d
  pmull v23.1q, v3.1d, v6.1d
  cmp x2, #64
  pmull2 v24.1q, v0.2d, v6.2d
  pmull2 v25.1q, v1.2d, v6.2d
  pmull2 v26.1q, v2.2d, v6.2d
  pmull2 v27.1q, v3.2d, v6.2d
  eor v0.16b, v20.16b, v16.16b
  eor v1.16b, v21.16b, v17.16b
  eor v2.16b, v22.16b, v18.16b
  eor v3.16b, v23.16b, v19.16b
  eor v0.16b, v0.16b, v24.16b
  eor v1.16b, v1.16b, v25.16b
  eor v2.16b, v2.16b, v26.16b
  eor v3.16b, v3.16b, v27.16b
  b.hs .Lcrc32r_fold_by_four

  ld1 {v6.16b}, [x4]
  ld1 {v5.16b}, [x5]

  cmp x2, #16

  /* Fold 4 to 1. */
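  /* Collapse the four folding chains back into one: v0 is folded forward
   * by one 128-bit block into v1, the result into v2, then into v3, using
   * the k3/k4 constant pair loaded into v6 above. */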

  pmull v16.1q, v0.1d, v6.1d
  pmull2 v4.1q, v0.2d, v6.2d
  eor v0.16b, v16.16b, v1.16b
  eor v0.16b, v0.16b, v4.16b

  pmull v16.1q, v0.1d, v6.1d
  pmull2 v4.1q, v0.2d, v6.2d
  eor v0.16b, v16.16b, v2.16b
  eor v0.16b, v0.16b, v4.16b

  pmull v16.1q, v0.1d, v6.1d
  pmull2 v4.1q, v0.2d, v6.2d
  eor v0.16b, v16.16b, v3.16b
  eor v0.16b, v0.16b, v4.16b

  b.lo .Lcrc32r_fold_by_one_done
  b .Lcrc32r_fold_by_one

.Lcrc32r_fold_by_one_setup:

  eor v1.16b, v1.16b, v1.16b
  add x4, x3, #consts_k(3 - 1)
  add x5, x3, #consts_my_p(0)
  sub x2, x2, #16
  ld1 {v1.s}[0], [x0]             /* load pcrc */
  ld1 {v0.16b}, [x1], #16         /* load 16 bytes of input */
  cmp x2, #16
  ld1 {v6.16b}, [x4]              /* load k3k4 */
  ld1 {v5.16b}, [x5]              /* load my_p */
  eor v0.16b, v0.16b, v1.16b
  b.lo .Lcrc32r_fold_by_one_done

.Lcrc32r_fold_by_one:
  sub x2, x2, #16
  ld1 {v2.16b}, [x1], #16         /* load 16 bytes of input */
  pmull v3.1q, v0.1d, v6.1d
  pmull2 v1.1q, v0.2d, v6.2d
  cmp x2, #16
  eor v0.16b, v3.16b, v2.16b
  eor v0.16b, v0.16b, v1.16b

  b.hs .Lcrc32r_fold_by_one

.Lcrc32r_fold_by_one_done:

  cmp x2, #0
  b.eq .Lcrc32r_final_fold

  /* Partial fold. */
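  /* x2 holds the 1..15 remaining bytes.  The two tbl shuffles (tables at
   * x4/x5) split the accumulator into the part that needs one more fold
   * and the part that stays; the mask from x6 selects the tail bytes out
   * of an overlapping load of the last 16 input bytes (x8), and the fold
   * result is XORed with the recombined data. */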

  add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants
  add x5, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 16
  add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
  sub x8, x2, #16
  add x4, x4, x2
  add x5, x5, x2
  add x6, x6, x2
  add x8, x1, x8

  /* Load last input and add padding zeros. */
  ld1 {v4.16b}, [x4]
  eor x2, x2, x2
  ld1 {v3.16b}, [x5]
  ld1 {v2.16b}, [x6]
  tbl v30.16b, {v0.16b}, v4.16b
  ld1 {v4.16b}, [x8]
  tbl v1.16b, {v0.16b}, v3.16b

  pmull v0.1q, v30.1d, v6.1d
  and v2.16b, v2.16b, v4.16b
  pmull2 v31.1q, v30.2d, v6.2d
  orr v2.16b, v2.16b, v1.16b
  eor v0.16b, v0.16b, v31.16b
  eor v0.16b, v0.16b, v2.16b

.Lcrc32r_final_fold:

  /* Final fold. */

  eor v2.16b, v2.16b, v2.16b      /* zero reg */
  ld1 {v7.16b}, [x9]

  /* reduce 128-bits to 96-bits */
  ext v6.16b, v6.16b, v6.16b, #8  /* swap high and low parts */
  mov v1.16b, v0.16b
  pmull v0.1q, v0.1d, v6.1d
  ext v6.16b, v5.16b, v5.16b, #8  /* swap high and low parts */
  ext v1.16b, v1.16b, v2.16b, #8  /* high to low, high zeroed */
  eor v3.16b, v0.16b, v1.16b

  /* reduce 96-bits to 64-bits */
  eor v1.16b, v1.16b, v1.16b
  ext v0.16b, v3.16b, v2.16b, #4  /* [00][00][x2][x1] */
  mov v1.s[0], v3.s[0]            /* [00][00][00][x0] */
  eor v3.16b, v3.16b, v3.16b
  pmull v1.1q, v1.1d, v7.1d       /* [00][00][xx][xx] */
  eor v0.16b, v0.16b, v1.16b      /* top 64-bit are zero */

  /* barrett reduction */
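  /* Barrett reduction: two more carry-less multiplies by the my_p pair
   * (the precomputed reciprocal of the CRC polynomial and the polynomial
   * itself, set up on the C side) replace the final division; the
   * reflected 32-bit remainder ends up in lane 2 of v0 and is stored
   * below. */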
  mov v3.s[1], v0.s[0]            /* [00][00][x1][00] */
  ext v0.16b, v2.16b, v0.16b, #12 /* [??][x1][??][00] */
  pmull v1.1q, v3.1d, v5.1d       /* [00][xx][xx][00] */
  pmull v1.1q, v1.1d, v6.1d       /* [00][xx][xx][00] */
  eor v0.16b, v0.16b, v1.16b

  /* store CRC */
  st1 {v0.s}[2], [x0]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_crc32r_armv8_ce_bulk,.-_gcry_crc32r_armv8_ce_bulk;)

/*
 * u32 _gcry_crc32r_armv8_ce_reduction_4 (u32 data, u32 crc,
 *                                        const struct crc32_consts_s *consts);
 */
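/*
 * Folds one 32-bit word of input into the running CRC with two carry-less
 * multiplies by the my_p constants and returns the updated CRC in w0.
 * Presumably used by the C wrapper for inputs too short for the bulk
 * routine above.
 */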
.align 3
.globl _gcry_crc32r_armv8_ce_reduction_4
ELF(.type  _gcry_crc32r_armv8_ce_reduction_4,%function;)
_gcry_crc32r_armv8_ce_reduction_4:
  /* input:
   *    w0: data
   *    w1: crc
   *    x2: crc32 constants
   */
  CFI_STARTPROC()

  eor v0.16b, v0.16b, v0.16b
  add x2, x2, #consts_my_p(0)
  eor v1.16b, v1.16b, v1.16b
  ld1 {v5.16b}, [x2]

  mov v0.s[0], w0
  pmull v0.1q, v0.1d, v5.1d     /* [00][00][xx][xx] */
  mov v1.s[1], w1
  mov v0.s[2], v0.s[0]          /* [00][x0][x1][x0] */
  pmull2 v0.1q, v0.2d, v5.2d    /* [00][00][xx][xx] */
  eor v0.16b, v0.16b, v1.16b

  mov w0, v0.s[1]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_crc32r_armv8_ce_reduction_4,.-_gcry_crc32r_armv8_ce_reduction_4;)

/*
 * void _gcry_crc32_armv8_ce_bulk (u32 *pcrc, const byte *inbuf, size_t inlen,
 *                                 const struct crc32_consts_s *consts);
 */
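/*
 * Non-reflected (big-endian bit order) variant.  The folding structure is
 * the same as in _gcry_crc32r_armv8_ce_bulk above; the differences are that
 * each 16-byte block is byte-swapped on load (tbl with the bswap shuffle in
 * v7), the fold constants are used with their 64-bit halves swapped, and the
 * final CRC is byte-swapped back (rev32) before being stored.
 */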
.align 3
.globl _gcry_crc32_armv8_ce_bulk
ELF(.type  _gcry_crc32_armv8_ce_bulk,%function;)
_gcry_crc32_armv8_ce_bulk:
  /* input:
   *    x0: pcrc
   *    x1: inbuf
   *    x2: inlen
   *    x3: consts
   */
  CFI_STARTPROC()

  GET_DATA_POINTER(x7, .Lcrc32_constants)
  add x4, x7, #.Lcrc32_bswap_shuf - .Lcrc32_constants
  cmp x2, #128
  ld1 {v7.16b}, [x4]

  b.lo .Lcrc32_fold_by_one_setup

  eor v4.16b, v4.16b, v4.16b
  add x4, x3, #consts_k(1 - 1)
  ld1 {v4.s}[0], [x0]            /* load pcrc */
  ld1 {v0.16b-v3.16b}, [x1], #64 /* load 64 bytes of input */
  sub x2, x2, #64
  ld1 {v6.16b}, [x4]
  eor v0.16b, v0.16b, v4.16b
  ext v4.16b, v6.16b, v6.16b, #8
  tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
  tbl v1.16b, { v1.16b }, v7.16b /* byte swap */
  tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
  tbl v3.16b, { v3.16b }, v7.16b /* byte swap */

  add x4, x3, #consts_k(3 - 1)
  add x5, x3, #consts_my_p(0)

.Lcrc32_fold_by_four:

  /* Fold by 4. */
  ld1 {v16.16b-v19.16b}, [x1], #64 /* load 64 bytes of input */
  sub x2, x2, #64
  tbl v16.16b, { v16.16b }, v7.16b /* byte swap */
  tbl v17.16b, { v17.16b }, v7.16b /* byte swap */
  tbl v18.16b, { v18.16b }, v7.16b /* byte swap */
  tbl v19.16b, { v19.16b }, v7.16b /* byte swap */
  cmp x2, #64
  pmull2 v20.1q, v0.2d, v4.2d
  pmull2 v21.1q, v1.2d, v4.2d
  pmull2 v22.1q, v2.2d, v4.2d
  pmull2 v23.1q, v3.2d, v4.2d
  pmull v24.1q, v0.1d, v4.1d
  pmull v25.1q, v1.1d, v4.1d
  pmull v26.1q, v2.1d, v4.1d
  pmull v27.1q, v3.1d, v4.1d
  eor v0.16b, v20.16b, v16.16b
  eor v1.16b, v21.16b, v17.16b
  eor v2.16b, v22.16b, v18.16b
  eor v3.16b, v23.16b, v19.16b
  eor v0.16b, v0.16b, v24.16b
  eor v1.16b, v1.16b, v25.16b
  eor v2.16b, v2.16b, v26.16b
  eor v3.16b, v3.16b, v27.16b
  b.hs .Lcrc32_fold_by_four

  ld1 {v6.16b}, [x4]
  ld1 {v5.16b}, [x5]
  ext v6.16b, v6.16b, v6.16b, #8
  ext v5.16b, v5.16b, v5.16b, #8

  cmp x2, #16

  /* Fold 4 to 1. */

  pmull2 v16.1q, v0.2d, v6.2d
  pmull v4.1q, v0.1d, v6.1d
  eor v0.16b, v16.16b, v1.16b
  eor v0.16b, v0.16b, v4.16b

  pmull2 v16.1q, v0.2d, v6.2d
  pmull v4.1q, v0.1d, v6.1d
  eor v0.16b, v16.16b, v2.16b
  eor v0.16b, v0.16b, v4.16b

  pmull2 v16.1q, v0.2d, v6.2d
  pmull v4.1q, v0.1d, v6.1d
  eor v0.16b, v16.16b, v3.16b
  eor v0.16b, v0.16b, v4.16b

  b.lo .Lcrc32_fold_by_one_done
  b .Lcrc32_fold_by_one

.Lcrc32_fold_by_one_setup:

  eor v1.16b, v1.16b, v1.16b
  add x4, x3, #consts_k(3 - 1)
  add x5, x3, #consts_my_p(0)
  ld1 {v1.s}[0], [x0]            /* load pcrc */
  sub x2, x2, #16
  ld1 {v0.16b}, [x1], #16        /* load 16 bytes of input */
  ld1 {v6.16b}, [x4]             /* load k3k4 */
  ld1 {v5.16b}, [x5]             /* load my_p */
  eor v0.16b, v0.16b, v1.16b
  cmp x2, #16
  ext v6.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
  ext v5.16b, v5.16b, v5.16b, #8 /* swap high and low parts */
  tbl v0.16b, { v0.16b }, v7.16b /* byte swap */
  b.lo .Lcrc32_fold_by_one_done

.Lcrc32_fold_by_one:
  sub x2, x2, #16
  ld1 {v2.16b}, [x1], #16        /* load 16 bytes of input */
  pmull2 v3.1q, v0.2d, v6.2d
  tbl v2.16b, { v2.16b }, v7.16b /* byte swap */
  pmull v1.1q, v0.1d, v6.1d
  cmp x2, #16
  eor v0.16b, v3.16b, v2.16b
  eor v0.16b, v0.16b, v1.16b

  b.hs .Lcrc32_fold_by_one

.Lcrc32_fold_by_one_done:

  cmp x2, #0
  b.eq .Lcrc32_final_fold

  /* Partial fold. */

  add x4, x7, #.Lcrc32_refl_shuf_shift - .Lcrc32_constants + 32
  add x5, x7, #.Lcrc32_shuf_shift - .Lcrc32_constants + 16
  add x6, x7, #.Lcrc32_partial_fold_input_mask - .Lcrc32_constants
  sub x8, x2, #16
  sub x4, x4, x2
  add x5, x5, x2
  add x6, x6, x2
  add x8, x1, x8

  /* Load last input and add padding zeros. */
  ld1 {v4.16b}, [x4]
  eor x2, x2, x2
  ld1 {v3.16b}, [x5]
  ld1 {v2.16b}, [x6]
  tbl v30.16b, {v0.16b}, v4.16b
  ld1 {v4.16b}, [x8]
  tbl v1.16b, {v0.16b}, v3.16b
  and v2.16b, v2.16b, v4.16b

  pmull2 v0.1q, v30.2d, v6.2d
  orr v2.16b, v2.16b, v1.16b
  pmull v1.1q, v30.1d, v6.1d
  tbl v2.16b, {v2.16b}, v7.16b   /* byte swap */
  eor v0.16b, v0.16b, v1.16b
  eor v0.16b, v0.16b, v2.16b

.Lcrc32_final_fold:

  /* Final fold. */

  eor v2.16b, v2.16b, v2.16b     /* zero reg */

  /* reduce 128-bits to 96-bits */
  add x4, x3, #consts_k(4)
  ext v3.16b, v6.16b, v6.16b, #8 /* swap high and low parts */
  eor v6.16b, v6.16b, v6.16b
  mov v1.16b, v0.16b
  pmull2 v0.1q, v0.2d, v3.2d
  ld1 {v6.d}[1], [x4]            /* load k4 */
  ext v1.16b, v2.16b, v1.16b, #8 /* low to high, low zeroed */
  eor v3.16b, v0.16b, v1.16b     /* bottom 32-bit are zero */

  /* reduce 96-bits to 64-bits */
  eor v0.16b, v0.16b, v0.16b
  eor v1.16b, v1.16b, v1.16b
  mov v0.s[1], v3.s[1]           /* [00][00][x1][00] */
  mov v1.s[2], v3.s[3]           /* [00][x3][00][00] */
  mov v0.s[2], v3.s[2]           /* [00][x2][x1][00] */
  eor v3.16b, v3.16b, v3.16b
  pmull2 v1.1q, v1.2d, v6.2d     /* [00][xx][xx][00] */
  eor v0.16b, v0.16b, v1.16b     /* top and bottom 32-bit are zero */

  /* barrett reduction */
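  /* Barrett reduction, non-reflected variant: as in the reflected routine,
   * two carry-less multiplies by the my_p constants stand in for the final
   * division; the 32-bit remainder is then byte-swapped back to input
   * endianness before the store. */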
  mov v3.s[0], v0.s[1]           /* [00][00][00][x1] */
  pmull2 v0.1q, v0.2d, v5.2d     /* [00][xx][xx][xx] */
  ext v0.16b, v0.16b, v2.16b, #4 /* [00][00][xx][xx] */
  pmull v0.1q, v0.1d, v5.1d
  eor v0.16b, v0.16b, v3.16b

  /* store CRC in input endian */
  rev32 v0.8b, v0.8b             /* byte swap */
  st1 {v0.s}[0], [x0]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_crc32_armv8_ce_bulk,.-_gcry_crc32_armv8_ce_bulk;)

/*
 * u32 _gcry_crc32_armv8_ce_reduction_4 (u32 data, u32 crc,
 *                                       const struct crc32_consts_s *consts);
 */
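/*
 * Non-reflected counterpart of the reduction helper above: folds one 32-bit
 * word into the running CRC and returns the result in w0, byte-swapped back
 * to input endianness.  Presumably used by the C wrapper for short inputs.
 */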
.align 3
.globl _gcry_crc32_armv8_ce_reduction_4
ELF(.type  _gcry_crc32_armv8_ce_reduction_4,%function;)
_gcry_crc32_armv8_ce_reduction_4:
  /* input:
   *    w0: data
   *    w1: crc
   *    x2: crc32 constants
   */
  CFI_STARTPROC()

  eor v0.16b, v0.16b, v0.16b
  add x2, x2, #consts_my_p(0)
  eor v1.16b, v1.16b, v1.16b
  ld1 {v5.16b}, [x2]

  mov v0.s[1], w0
  pmull v0.1q, v0.1d, v5.1d     /* [00][xx][xx][00] */
  mov v1.s[0], w1
  pmull2 v0.1q, v0.2d, v5.2d    /* [00][00][xx][xx] */
  eor v0.16b, v0.16b, v1.16b

  rev32 v0.8b, v0.8b            /* Return in input endian */
  mov w0, v0.s[0]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_crc32_armv8_ce_reduction_4,.-_gcry_crc32_armv8_ce_reduction_4;)

#endif