/* cipher-gcm-armv8-aarch64-ce.S - ARM/CE accelerated GHASH
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO)

.cpu generic+simd+crypto

.text


/* Constants */

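/* The GHASH field polynomial is x¹²⁸ + x⁷ + x² + x + 1; with the fully
 * bit-reflected representation used in this file (see the RBIT
 * instructions below), the reduction is carried out as carry-less
 * multiplications by 0x87, the encoding of the low part
 * x⁷ + x² + x + 1 (see REDUCTION below). */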
.align 4
gcry_gcm_reduction_constant:
.Lrconst:
  .quad 0x87


/* Register macros */

#define rhash   v0
#define rr0     v1
#define rr1     v2
#define rbuf    v3
#define rbuf1   v4
#define rbuf2   v5
#define rbuf3   v6
#define rbuf4   v7
#define rbuf5   v8
#define rr2     v9
#define rr3     v10
#define rr4     v11
#define rr5     v12
#define rr6     v13
#define rr7     v14
#define rr8     v15
#define rr9     v16

#define rrconst v18
#define rh1     v19
#define rh2     v20
#define rh3     v21
#define rh4     v22
#define rh5     v23
#define rh6     v24
#define t0      v25
#define t1      v26
#define t2      v27
#define t3      v28
#define t4      v29
#define t5      v30
#define vZZ     v31

/* GHASH macros */

/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */

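/* 128x128=>256-bit carry-less multiplication, per the paper above.
 * With a = a1:a0 and b = b1:b0 split into 64-bit halves, the product is
 *
 *   a * b = ((a1 * b1) << 128) ^ ((a1 * b0 ^ a0 * b1) << 64) ^ (a0 * b0)
 *
 * PMULL/PMULL2 produce the two outer products directly; the cross
 * products are formed against a half-swapped copy of 'b' (EXT #8),
 * XORed together, and folded across the r0/r1 boundary using EXT with
 * the zero register vZZ.  'interleave_op' lets the caller hide an
 * unrelated instruction in the PMULL latency. */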
/* Input: 'a' and 'b', Output: 'r0:r1' (low 128 bits in r0, high 128 bits in r1) */
#define PMUL_128x128(r0, r1, a, b, T0, T1, interleave_op) \
        ext T0.16b, b.16b, b.16b, #8; \
        pmull r0.1q, a.1d, b.1d; \
        pmull2 r1.1q, a.2d, b.2d; \
        pmull T1.1q, a.1d, T0.1d; \
        pmull2 T0.1q, a.2d, T0.2d; \
        interleave_op; \
        eor T0.16b, T0.16b, T1.16b; \
        ext T1.16b, vZZ.16b, T0.16b, #8; \
        ext T0.16b, T0.16b, vZZ.16b, #8; \
        eor r0.16b, r0.16b, T1.16b; \
        eor r1.16b, r1.16b, T0.16b;

/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128 bits in r0A, high in r1A)
 * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128 bits in r0B, high in r1B)
 * Input: 'aC' and 'bC', Output: 'r0C:r1C' (low 128 bits in r0C, high in r1C)
 */
#define PMUL_128x128_3(r0A, r1A, aA, bA, t0A, t1A, \
                       r0B, r1B, aB, bB, t0B, t1B, \
                       r0C, r1C, aC, bC, t0C, t1C,  interleave_op) \
        ext t0A.16b, bA.16b, bA.16b, #8; \
        pmull r0A.1q, aA.1d, bA.1d; \
        pmull2 r1A.1q, aA.2d, bA.2d; \
          ext t0B.16b, bB.16b, bB.16b, #8; \
          pmull r0B.1q, aB.1d, bB.1d; \
          pmull2 r1B.1q, aB.2d, bB.2d; \
            ext t0C.16b, bC.16b, bC.16b, #8; \
            pmull r0C.1q, aC.1d, bC.1d; \
            pmull2 r1C.1q, aC.2d, bC.2d; \
        pmull t1A.1q, aA.1d, t0A.1d; \
        pmull2 t0A.1q, aA.2d, t0A.2d; \
          pmull t1B.1q, aB.1d, t0B.1d; \
          pmull2 t0B.1q, aB.2d, t0B.2d; \
            pmull t1C.1q, aC.1d, t0C.1d; \
            pmull2 t0C.1q, aC.2d, t0C.2d; \
        eor t0A.16b, t0A.16b, t1A.16b; \
          eor t0B.16b, t0B.16b, t1B.16b; \
            eor t0C.16b, t0C.16b, t1C.16b; \
              interleave_op; \
        ext t1A.16b, vZZ.16b, t0A.16b, #8; \
        ext t0A.16b, t0A.16b, vZZ.16b, #8; \
          ext t1B.16b, vZZ.16b, t0B.16b, #8; \
          ext t0B.16b, t0B.16b, vZZ.16b, #8; \
            ext t1C.16b, vZZ.16b, t0C.16b, #8; \
            ext t0C.16b, t0C.16b, vZZ.16b, #8; \
        eor r0A.16b, r0A.16b, t1A.16b; \
        eor r1A.16b, r1A.16b, t0A.16b; \
          eor r0B.16b, r0B.16b, t1B.16b; \
          eor r1B.16b, r1B.16b, t0B.16b; \
            eor r0C.16b, r0C.16b, t1C.16b; \
            eor r1C.16b, r1C.16b, t0C.16b;

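/* Reduce the 256-bit value 'r1:r0' modulo the GHASH field polynomial.
 * PMULL2 first folds the upper 64 bits of r1 by the reduction
 * constant; EXT against vZZ realigns that partial product across the
 * r1/r0 boundary, and a final PMULL folds the remaining high 64 bits
 * into r0, leaving the 128-bit residue in 'a'.  The interleave_op
 * slots again allow the caller to hide independent instructions in the
 * multiply latency. */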
/* Input: 'r0:r1', Output: 'a' */
#define REDUCTION(a, r0, r1, rconst, T0, T1, interleave_op1, interleave_op2, \
                  interleave_op3) \
        pmull2 T0.1q, r1.2d, rconst.2d; \
        interleave_op1; \
        ext T1.16b, T0.16b, vZZ.16b, #8; \
        ext T0.16b, vZZ.16b, T0.16b, #8; \
        interleave_op2; \
        eor r1.16b, r1.16b, T1.16b; \
        eor r0.16b, r0.16b, T0.16b; \
        pmull T0.1q, r1.1d, rconst.1d; \
        interleave_op3; \
        eor a.16b, r0.16b, T0.16b;

/* Other functional macros */

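/* '_(...)' wraps a (possibly multi-instruction) statement so it can be
 * passed as a single macro argument; '__' expands to an empty statement
 * for when no interleaved instruction is wanted. */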
#define _(...) __VA_ARGS__
#define __ _()

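/* Registers that held key material or intermediate state are zeroed
 * before returning, so no sensitive data is left behind in vector
 * registers. */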
#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b;

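/* AAPCS64 requires the low 64 bits of v8-v15 to be preserved across
 * calls; the 6-way path uses v8-v15 (rbuf5 and rr2-rr8), so they are
 * saved and restored around it. */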
#define VPUSH_ABI \
        stp d8, d9, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d10, d11, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d12, d13, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16); \
        stp d14, d15, [sp, #-16]!; \
        CFI_ADJUST_CFA_OFFSET(16);

#define VPOP_ABI \
        ldp d14, d15, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d12, d13, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d10, d11, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16); \
        ldp d8, d9, [sp], #16; \
        CFI_ADJUST_CFA_OFFSET(-16);

/*
 * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                          const byte *buf, size_t nblocks,
 *                                          void *gcm_table);
 */
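/* The hash state and all input blocks are bit-reversed with RBIT so
 * that PMULL/PMULL2 operate directly on GHASH's reflected bit order.
 * While at least six blocks remain, the 6-way aggregated path below is
 * taken, using the powers H¹..H⁶ (H¹ from gcm_key, H²..H⁶ from
 * gcm_table). */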
.align 3
.globl _gcry_ghash_armv8_ce_pmull
ELF(.type  _gcry_ghash_armv8_ce_pmull,%function;)
_gcry_ghash_armv8_ce_pmull:
  /* input:
   *    x0: gcm_key
   *    x1: result/hash
   *    x2: buf
   *    x3: nblocks
   *    x4: gcm_table
   */
  CFI_STARTPROC();

  cbz x3, .Ldo_nothing;

  GET_DATA_POINTER(x5, .Lrconst)

  eor vZZ.16b, vZZ.16b, vZZ.16b
  ld1 {rhash.16b}, [x1]
  ld1 {rh1.16b}, [x0]

  rbit rhash.16b, rhash.16b /* bit-swap */
  ld1r {rrconst.2d}, [x5]

  cmp x3, #6
  b.lo .Less_than_6

  add x6, x4, #64
  VPUSH_ABI

  ld1 {rh2.16b-rh5.16b}, [x4]
  ld1 {rh6.16b}, [x6]

  sub x3, x3, #6

  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)

  rbit rbuf.16b, rbuf.16b /* bit-swap */
  rbit rbuf1.16b, rbuf1.16b /* bit-swap */
  rbit rbuf2.16b, rbuf2.16b /* bit-swap */
  rbit rbuf3.16b, rbuf3.16b /* bit-swap */
  rbit rbuf4.16b, rbuf4.16b /* bit-swap */
  rbit rbuf5.16b, rbuf5.16b /* bit-swap */
  eor rhash.16b, rhash.16b, rbuf.16b

  cmp x3, #6
  b.lo .Lend_6

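  /* Main loop: aggregated processing of six blocks per iteration.
   * With the precomputed powers of H, one iteration computes
   *
   *   hash = ((in0 ^ hash) * H⁶) ^ (in1 * H⁵) ^ (in2 * H⁴)
   *          ^ (in3 * H³) ^ (in4 * H²) ^ (in5 * H¹) mod P
   *
   * which matches six sequential 'hash = (hash ^ in) * H mod P' steps
   * but needs only one reduction. */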
.Loop_6:

  /* (in1) * H⁵ => rr0:rr1 */
  /* (in2) * H⁴ => rr2:rr3 */
  /* (in0 ^ hash) * H⁶ => rr4:rr5 */
  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
                 rr2, rr3, rbuf2, rh4, t2, t3,
                 rr4, rr5, rhash, rh6, t4, t5,
                 _(sub x3, x3, #6))

  ld1 {rbuf.16b-rbuf2.16b}, [x2], #(3*16)
  cmp x3, #6

  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b

  /* (in3) * H³ => rr2:rr3 */
  /* (in4) * H² => rr6:rr7 */
  /* (in5) * H¹ => rr8:rr9 */
  PMUL_128x128_3(rr2, rr3, rbuf3, rh3, t0, t1,
                 rr6, rr7, rbuf4, rh2, t2, t3,
                 rr8, rr9, rbuf5, rh1, t4, t5,
                 _(eor rr0.16b, rr0.16b, rr4.16b;
                   eor rr1.16b, rr1.16b, rr5.16b))

  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b
  rbit rbuf.16b, rbuf.16b
  eor rr0.16b, rr0.16b, rr6.16b
  eor rr1.16b, rr1.16b, rr7.16b
  rbit rbuf1.16b, rbuf1.16b
  eor rr0.16b, rr0.16b, rr8.16b
  eor rr1.16b, rr1.16b, rr9.16b
  ld1 {rbuf3.16b-rbuf5.16b}, [x2], #(3*16)

  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(rbit rbuf2.16b, rbuf2.16b),
            _(rbit rbuf3.16b, rbuf3.16b),
            _(rbit rbuf4.16b, rbuf4.16b))

  rbit rbuf5.16b, rbuf5.16b
  eor rhash.16b, rhash.16b, rbuf.16b

  b.hs .Loop_6

.Lend_6:

  /* (in1) * H⁵ => rr0:rr1 */
  /* (in0 ^ hash) * H⁶ => rr2:rr3 */
  /* (in2) * H⁴ => rr4:rr5 */
  PMUL_128x128_3(rr0, rr1, rbuf1, rh5, t0, t1,
                 rr2, rr3, rhash, rh6, t2, t3,
                 rr4, rr5, rbuf2, rh4, t4, t5,
                 __)
  eor rr0.16b, rr0.16b, rr2.16b
  eor rr1.16b, rr1.16b, rr3.16b
  eor rr0.16b, rr0.16b, rr4.16b
  eor rr1.16b, rr1.16b, rr5.16b

  /* (in3) * H³ => rhash:rbuf */
  /* (in4) * H² => rr6:rr7 */
  /* (in5) * H¹ => rr8:rr9 */
  PMUL_128x128_3(rhash, rbuf, rbuf3, rh3, t0, t1,
                 rr6, rr7, rbuf4, rh2, t2, t3,
                 rr8, rr9, rbuf5, rh1, t4, t5,
                 _(CLEAR_REG(rh4);
                   CLEAR_REG(rh5);
                   CLEAR_REG(rh6)))
  eor rr0.16b, rr0.16b, rhash.16b
  eor rr1.16b, rr1.16b, rbuf.16b
  eor rr0.16b, rr0.16b, rr6.16b
  eor rr1.16b, rr1.16b, rr7.16b
  eor rr0.16b, rr0.16b, rr8.16b
  eor rr1.16b, rr1.16b, rr9.16b

  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(CLEAR_REG(rh2);
              CLEAR_REG(rh3);
              CLEAR_REG(rr2);
              CLEAR_REG(rbuf2);
              CLEAR_REG(rbuf3)),
            _(CLEAR_REG(rr3);
              CLEAR_REG(rr4);
              CLEAR_REG(rr5);
              CLEAR_REG(rr6);
              CLEAR_REG(rr7)),
            _(CLEAR_REG(rr8);
              CLEAR_REG(rr9);
              CLEAR_REG(rbuf1);
              CLEAR_REG(rbuf2)))

  CLEAR_REG(rbuf4)
  CLEAR_REG(rbuf5)
  CLEAR_REG(t2)
  CLEAR_REG(t3)
  CLEAR_REG(t4)
  CLEAR_REG(t5)

  VPOP_ABI

  cbz x3, .Ldone

.Less_than_6:
  /* Handle remaining blocks. */
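  /* One block per iteration: hash = (hash ^ in) * H¹ mod P. */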

  ld1 {rbuf.16b}, [x2], #16
  sub x3, x3, #1

  rbit rbuf.16b, rbuf.16b /* bit-swap */

  eor rhash.16b, rhash.16b, rbuf.16b

  cbz x3, .Lend

.Loop:
  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(ld1 {rbuf.16b}, [x2], #16))
  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1,
            _(sub x3, x3, #1),
            _(rbit rbuf.16b, rbuf.16b),
            __)
  eor rhash.16b, rhash.16b, rbuf.16b

  cbnz x3, .Loop

.Lend:
  PMUL_128x128(rr0, rr1, rh1, rhash, t0, t1, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rrconst, t0, t1, __, _(CLEAR_REG(rh1)), __)

.Ldone:
  CLEAR_REG(rr1)
  CLEAR_REG(rr0)
  rbit rhash.16b, rhash.16b /* bit-swap */
  CLEAR_REG(t0)
  CLEAR_REG(t1)

  st1 {rhash.2d}, [x1]
  CLEAR_REG(rhash)

.Ldo_nothing:
  mov x0, #0
  ret
  CFI_ENDPROC()
ELF(.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;)


/*
 * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 */
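/* Precomputes the bit-reflected powers H²..H⁶ for the 6-way GHASH path
 * above.  H¹ (gcm_key) is replaced in place by its bit-reversed form;
 * H² through H⁶ are stored consecutively in gcm_table at offsets 0,
 * 16, 32, 48 and 64. */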
.align 3
.globl _gcry_ghash_setup_armv8_ce_pmull
ELF(.type  _gcry_ghash_setup_armv8_ce_pmull,%function;)
_gcry_ghash_setup_armv8_ce_pmull:
  /* input:
   *    x0: gcm_key
   *    x1: gcm_table
   */
  CFI_STARTPROC()

  GET_DATA_POINTER(x2, .Lrconst)

  eor vZZ.16b, vZZ.16b, vZZ.16b

  /* H¹ */
  ld1 {rh1.16b}, [x0]
  rbit rh1.16b, rh1.16b
  st1 {rh1.16b}, [x0]

  ld1r {rrconst.2d}, [x2]

  /* H² */
  PMUL_128x128(rr0, rr1, rh1, rh1, t0, t1, __)
  REDUCTION(rh2, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H³ */
  PMUL_128x128(rr0, rr1, rh2, rh1, t0, t1, __)
  REDUCTION(rh3, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁴ */
  PMUL_128x128(rr0, rr1, rh2, rh2, t0, t1, __)
  REDUCTION(rh4, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁵ */
  PMUL_128x128(rr0, rr1, rh2, rh3, t0, t1, __)
  REDUCTION(rh5, rr0, rr1, rrconst, t0, t1, __, __, __)

  /* H⁶ */
  PMUL_128x128(rr0, rr1, rh3, rh3, t0, t1, __)
  REDUCTION(rh6, rr0, rr1, rrconst, t0, t1, __, __, __)

  st1 {rh2.16b-rh4.16b}, [x1], #(3*16)
  st1 {rh5.16b-rh6.16b}, [x1]

  ret
  CFI_ENDPROC()
ELF(.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;)

#endif