/* cipher-gcm-armv8-aarch32-ce.S - ARM/CE accelerated GHASH
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH32_CRYPTO)

.syntax unified
.arch armv8-a
.fpu crypto-neon-fp-armv8
.arm

.text

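/* GET_DATA_POINTER loads the address of a local data object.  The PIC
 * variant resolves it PC-relative through the GOT (the "+8" accounts for
 * the ARM-state PC read-ahead); the non-PIC variant is a plain literal
 * pool load. */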
#ifdef __PIC__
#  define GET_DATA_POINTER(reg, name, rtmp) \
		ldr reg, 1f; \
		ldr rtmp, 2f; \
		b 3f; \
	1:	.word _GLOBAL_OFFSET_TABLE_-(3f+8); \
	2:	.word name(GOT); \
	3:	add reg, pc, reg; \
		ldr reg, [reg, rtmp];
#else
#  define GET_DATA_POINTER(reg, name, rtmp) ldr reg, =name
#endif


/* Constants */

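/* 0xc200000000000000 encodes the GHASH field polynomial
 * x^128 + x^7 + x^2 + x + 1 in the bit-reflected, pre-shifted form that
 * matches the "H <<< 1" representation of the hash key. */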
.align 4
gcry_gcm_reduction_constant:
.Lrconst64:
  .quad 0xc200000000000000


/* Register macros */

#define rhash q0
#define rhash_l d0
#define rhash_h d1

#define rh1 q1
#define rh1_l d2
#define rh1_h d3

#define rbuf q2
#define rbuf_l d4
#define rbuf_h d5

#define rbuf1 q3
#define rbuf1_l d6
#define rbuf1_h d7

#define rbuf2 q4
#define rbuf2_l d8
#define rbuf2_h d9

#define rbuf3 q5
#define rbuf3_l d10
#define rbuf3_h d11

#define rh2 q6
#define rh2_l d12
#define rh2_h d13

#define rh3 q7
#define rh3_l d14
#define rh3_h d15

#define rh4 q8
#define rh4_l d16
#define rh4_h d17

#define rr2 q9
#define rr2_l d18
#define rr2_h d19

#define rr3 q10
#define rr3_l d20
#define rr3_h d21

#define rr0 q11
#define rr0_l d22
#define rr0_h d23

#define rr1 q12
#define rr1_l d24
#define rr1_h d25

#define rt0 q13
#define rt0_l d26
#define rt0_h d27

#define rt1 q14
#define rt1_l d28
#define rt1_h d29

#define rrconst q15
#define rrconst_l d30
#define rrconst_h d31

/* GHASH macros */

/* See "Gouvêa, C. P. L. & López, J. Implementing GCM on ARMv8. Topics in
 * Cryptology — CT-RSA 2015" for details.
 */
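
/* PMUL_128x128 computes a 128x128->256 bit carry-less product from three
 * 64x64 bit vmull.p64 multiplies (Karatsuba):
 *   lo  = a_l * b_l
 *   hi  = a_h * b_h
 *   mid = (a_l ^ a_h) * (b_l ^ b_h) ^ lo ^ hi
 * with 'mid' folded into the middle 128 bits of hi:lo.  The interleave_op
 * argument lets the caller schedule unrelated instructions between the
 * multiplies. */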

/* Input: 'a' and 'b', Output: 'r0:r1' (low 128-bits in r0, high in r1)
 *  Note: 'r1' may be 'a' or 'b', 'r0' must not be either 'a' or 'b'.
 */
#define PMUL_128x128(r0, r1, a, b, t, interleave_op) \
        veor t##_h, b##_l, b##_h; \
        veor t##_l, a##_l, a##_h; \
        vmull.p64 r0, a##_l, b##_l; \
        vmull.p64 r1, a##_h, b##_h; \
        vmull.p64 t, t##_h, t##_l; \
        interleave_op; \
        veor t, r0; \
        veor t, r1; \
        veor r0##_h, t##_l; \
        veor r1##_l, t##_h;

/* Input: 'aA' and 'bA', Output: 'r0A:r1A' (low 128-bits in r0A, high in r1A)
 *  Note: 'r1A' may be 'aA' or 'bA', 'r0A' must not be either 'aA' or 'bA'.
 * Input: 'aB' and 'bB', Output: 'r0B:r1B' (low 128-bits in r0B, high in r1B)
 *  Note: 'r1B' may be 'aB' or 'bB', 'r0B' must not be either 'aB' or 'bB'.
 */
#define PMUL_128x128_2(r0A, r1A, aA, bA, r0B, r1B, aB, bB, tA, tB, interleave_op) \
        veor tA##_h, bA##_l, bA##_h; \
        veor tA##_l, aA##_l, aA##_h; \
          veor tB##_h, bB##_l, bB##_h; \
          veor tB##_l, aB##_l, aB##_h; \
        vmull.p64 r0A, aA##_l, bA##_l; \
        vmull.p64 r1A, aA##_h, bA##_h; \
        vmull.p64 tA, tA##_h, tA##_l; \
          vmull.p64 r0B, aB##_l, bB##_l; \
          vmull.p64 r1B, aB##_h, bB##_h; \
          vmull.p64 tB, tB##_h, tB##_l; \
        interleave_op; \
        veor tA, r0A; \
        veor tA, r1A; \
          veor tB, r0B; \
          veor tB, r1B; \
        veor r0A##_h, tA##_l; \
        veor r1A##_l, tA##_h; \
          veor r0B##_h, tB##_l; \
          veor r1B##_l, tB##_h; \

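/* REDUCTION folds the 256-bit product r1:r0 back to 128 bits modulo the
 * GHASH polynomial.  Because H and its powers are kept in the pre-shifted
 * "<<< 1" form (see GCM_LSH_1 below), two vmull.p64 multiplies by the
 * 64-bit constant .Lrconst64 suffice. */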
/* Input: 'r0:r1', Output: 'a' */
#define REDUCTION(a, r0, r1, rconst, t, interleave_op) \
        vmull.p64 t, r0##_l, rconst; \
        veor r0##_h, t##_l; \
        veor r1##_l, t##_h; \
        interleave_op; \
        vmull.p64 t, r0##_h, rconst; \
        veor r1, t; \
        veor a, r0, r1;

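/* _() groups an instruction sequence (which may contain commas) into a
 * single macro argument; __ expands to the empty sequence. */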
#define _(...) __VA_ARGS__
#define __ _()

/* Other functional macros */

#define CLEAR_REG(reg) veor reg, reg;


/*
 * unsigned int _gcry_ghash_armv8_ce_pmull (void *gcm_key, byte *result,
 *                                          const byte *buf, size_t nblocks,
 *                                          void *gcm_table);
 */
.align 3
.globl _gcry_ghash_armv8_ce_pmull
.type  _gcry_ghash_armv8_ce_pmull,%function;
_gcry_ghash_armv8_ce_pmull:
  /* input:
   *    r0: gcm_key
   *    r1: result/hash
   *    r2: buf
   *    r3: nblocks
   *    %st+0: gcm_table
   */
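  /* GHASH update: for every 16-byte block C the state becomes
   *   hash <- (hash ^ C) * H  in GF(2^128),
   * with state and input byte-reversed on load and the result swapped
   * back before it is stored. */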
  push {r4-r6, lr}

  cmp r3, #0
  beq .Ldo_nothing

  GET_DATA_POINTER(r4, .Lrconst64, lr)

  vld1.64 {rhash}, [r1]
  vld1.64 {rh1}, [r0]

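  /* vrev64.8 followed by vext.8 #8 reverses all 16 bytes of a q register,
   * converting between GHASH byte order and the representation used by
   * the carry-less multiplies. */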
  vrev64.8 rhash, rhash /* byte-swap */
  vld1.64 {rrconst_h}, [r4]
  vext.8 rhash, rhash, rhash, #8

  cmp r3, #4
  blo .Less_than_4

  /* Bulk processing of 4 blocks per loop iteration. */
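  /* With input blocks in0..in3, each iteration computes
   *   hash <- ((hash ^ in0)*H⁴ ^ in1*H³ ^ in2*H² ^ in3*H) mod P
   * so only one reduction is needed per four blocks; H², H³ and H⁴ are
   * loaded from gcm_table. */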

  ldr r5, [sp, #(4*4)];
  add r6, r5, #32

  vpush {q4-q7}

  vld1.64 {rh2-rh3}, [r5]
  vld1.64 {rh4}, [r6]

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  vld1.64 {rbuf2-rbuf3}, [r2]!

  cmp r3, #4
  vrev64.8 rbuf, rbuf /* byte-swap */
  vrev64.8 rbuf1, rbuf1 /* byte-swap */
  vrev64.8 rbuf2, rbuf2 /* byte-swap */
  vrev64.8 rbuf3, rbuf3 /* byte-swap */

  vext.8 rbuf, rbuf, rbuf, #8
  vext.8 rbuf1, rbuf1, rbuf1, #8
  vext.8 rbuf2, rbuf2, rbuf2, #8
  vext.8 rbuf3, rbuf3, rbuf3, #8
  veor rhash, rhash, rbuf /* in0 ^ hash */

  blo .Lend_4

.Loop_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  vld1.64 {rbuf-rbuf1}, [r2]!
  sub r3, r3, #4
  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  /* (in2) * H² => rr2:rr3 */
  /* (in3) * H¹ => rhash:rbuf3 */
  PMUL_128x128_2(rr2, rr3, rbuf2, rh2, rhash, rbuf3, rbuf3, rh1, rt0, rt1,
                 _(vrev64.8 rbuf, rbuf))

  vld1.64 {rbuf2}, [r2]!

  vrev64.8 rbuf1, rbuf1
  veor rr0, rr0, rr2
  veor rr1, rr1, rr3

  cmp r3, #4
  vext.8 rbuf, rbuf, rbuf, #8
  vext.8 rbuf1, rbuf1, rbuf1, #8

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf3

  vld1.64 {rbuf3}, [r2]!

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
            _(vrev64.8 rbuf2, rbuf2;
              vrev64.8 rbuf3, rbuf3))

  vext.8 rbuf2, rbuf2, rbuf2, #8
  vext.8 rbuf3, rbuf3, rbuf3, #8
  veor rhash, rhash, rbuf /* in0 ^ hash */

  bhs .Loop_4

.Lend_4:
  /* (in0 ^ hash) * H⁴ => rr2:rr3 */
  /* (in1) * H³ => rr0:rr1 */
  PMUL_128x128_2(rr0, rr1, rbuf1, rh3, rr2, rr3, rhash, rh4, rt1, rt0, __)

  /* (in2) * H² => rhash:rbuf */
  /* (in3) * H¹ => rbuf1:rbuf2 */
  PMUL_128x128_2(rhash, rbuf, rbuf2, rh2, rbuf1, rbuf2, rbuf3, rh1, rt0, rt1,
                 _(veor rr0, rr0, rr2;
                   veor rr1, rr1, rr3))

  veor rr0, rr0, rhash
  veor rr1, rr1, rbuf

  veor rr0, rr0, rbuf1
  veor rr1, rr1, rbuf2

  REDUCTION(rhash, rr0, rr1, rrconst_h, rt1,
            _(CLEAR_REG(rr2);
              CLEAR_REG(rr3);
              CLEAR_REG(rbuf1);
              CLEAR_REG(rbuf2);
              CLEAR_REG(rbuf3);
              CLEAR_REG(rh2);
              CLEAR_REG(rh3);
              CLEAR_REG(rh4)))

  vpop {q4-q7}

  cmp r3, #0
  beq .Ldone

.Less_than_4:
  /* Handle remaining blocks. */

  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1

  vrev64.8 rbuf, rbuf /* byte-swap */
  vext.8 rbuf, rbuf, rbuf, #8

  veor rhash, rhash, rbuf

  beq .Lend

.Loop:
  vld1.64 {rbuf}, [r2]!
  subs r3, r3, #1
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(vrev64.8 rbuf, rbuf))
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(vext.8 rbuf, rbuf, rbuf, #8))
  veor rhash, rhash, rbuf

  bne .Loop

.Lend:
  PMUL_128x128(rr0, rr1, rhash, rh1, rt0, _(CLEAR_REG(rbuf)))
  REDUCTION(rhash, rr0, rr1, rrconst_h, rt0, _(CLEAR_REG(rh1)))

.Ldone:
  CLEAR_REG(rr1)
  vrev64.8 rhash, rhash /* byte-swap */
  CLEAR_REG(rt0)
  CLEAR_REG(rr0)
  vext.8 rhash, rhash, rhash, #8
  CLEAR_REG(rt1)
  vst1.64 {rhash}, [r1]
  CLEAR_REG(rhash)

.Ldo_nothing:
  mov r0, #0
  pop {r4-r6, pc}
.size _gcry_ghash_armv8_ce_pmull,.-_gcry_ghash_armv8_ce_pmull;


/*
 * void _gcry_ghash_setup_armv8_ce_pmull (void *gcm_key, void *gcm_table);
 */
.align 3
.globl _gcry_ghash_setup_armv8_ce_pmull
.type  _gcry_ghash_setup_armv8_ce_pmull,%function;
_gcry_ghash_setup_armv8_ce_pmull:
  /* input:
   *	r0: gcm_key
   *	r1: gcm_table
   */
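  /* Writes the pre-shifted hash key material used by the bulk GHASH code:
   * H<<<1 back into gcm_key, and H²<<<1, H³<<<1 and H⁴<<<1 into gcm_table
   * at offsets 0, 16 and 32. */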

  vpush {q4-q7}

  GET_DATA_POINTER(r2, .Lrconst64, r3)

  vld1.64 {rrconst_h}, [r2]

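/* GCM_LSH_1 computes "H <<< 1": the 128-bit value ib:ia is shifted left by
 * one bit, the bit shifted out at the top is folded back in modulo the
 * field polynomial, and the result is stored to [r_out].  Keeping H and its
 * powers in this pre-shifted form is what allows REDUCTION above to use a
 * single 64-bit constant. */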
#define GCM_LSH_1(r_out, ia, ib, const_d, oa, ob, ma) \
        /* H <<< 1 */ \
        vshr.s64 ma, ib, #63; \
        vshr.u64 oa, ib, #63; \
        vshr.u64 ob, ia, #63; \
        vand ma, const_d; \
        vshl.u64 ib, ib, #1; \
        vshl.u64 ia, ia, #1; \
        vorr ob, ib; \
        vorr oa, ia; \
        veor ob, ma; \
        vst1.64 {oa, ob}, [r_out]

  vld1.64 {rhash}, [r0]
  vrev64.8 rhash, rhash /* byte-swap */
  vext.8 rhash, rhash, rhash, #8

  vmov rbuf1, rhash
  GCM_LSH_1(r0, rhash_l, rhash_h, rrconst_h, rh1_l, rh1_h, rt1_l) /* H<<<1 */

  /* H² */
  PMUL_128x128(rr0, rr1, rbuf1, rh1, rt0, __)
  REDUCTION(rh2, rr0, rr1, rrconst_h, rt0, __)
  vmov rhash, rh2
  GCM_LSH_1(r1, rh2_l, rh2_h, rrconst_h, rbuf1_l, rbuf1_h, rt1_l) /* H²<<<1 */
  add r1, r1, #16

  /* H³ */
  PMUL_128x128(rr0, rr1, rhash, rh1, rt1, __)
  REDUCTION(rh3, rr0, rr1, rrconst_h, rt1, __)

  /* H⁴ */
  PMUL_128x128(rr0, rr1, rhash, rbuf1, rt0, __)
  REDUCTION(rh4, rr0, rr1, rrconst_h, rt0, __)

  GCM_LSH_1(r1, rh3_l, rh3_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H³<<<1 */
  add r1, r1, #16
  GCM_LSH_1(r1, rh4_l, rh4_h, rrconst_h, rt0_l, rt0_h, rt1_l) /* H⁴<<<1 */

  CLEAR_REG(rt0)
  CLEAR_REG(rt1)
  CLEAR_REG(rr1)
  CLEAR_REG(rr0)
  CLEAR_REG(rh1)
  CLEAR_REG(rh2)
  CLEAR_REG(rh3)
  CLEAR_REG(rh4)
  CLEAR_REG(rhash)
  CLEAR_REG(rbuf1)
  CLEAR_REG(rrconst)
  vpop {q4-q7}
  bx lr
.size _gcry_ghash_setup_armv8_ce_pmull,.-_gcry_ghash_setup_armv8_ce_pmull;

#endif