/* sha256-armv8-aarch64-ce.S - ARM/CE accelerated SHA-256 transform function
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include "asm-common-aarch64.h"

#if defined(__AARCH64EL__) && \
    defined(HAVE_COMPATIBLE_GCC_AARCH64_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_AARCH64_CRYPTO) && defined(USE_SHA256)

.cpu generic+simd+crypto

.text


/* Constants */
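/* The 64 SHA-256 round constants K[0..63] from FIPS 180-4: the first 32 bits
 * of the fractional parts of the cube roots of the first 64 primes. */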

.align 4
gcry_sha256_aarch64_ce_K:
.LK:
  .long 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
  .long 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
  .long 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
  .long 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
  .long 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
  .long 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
  .long 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
  .long 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
  .long 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
  .long 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
  .long 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
  .long 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
  .long 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
  .long 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
  .long 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
  .long 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2


/* Register macros */
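/* Rough register map, as used below: vH0123/vH4567 hold the chaining state
 * H[0..3]/H[4..7]; vABCD0/vEFGH are the working variables consumed by
 * sha256h/sha256h2 (vABCD1 keeps the pre-round ABCD copy that sha256h2
 * needs); vW0-vW3 carry the 16-word message schedule; vK0-vK3 stage the
 * round constants, into which do_add folds the message words before use. */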

#define vH0123 v0
#define vH4567 v1

#define vABCD0 v2
#define qABCD0 q2
#define vABCD1 v3
#define qABCD1 q3
#define vEFGH  v4
#define qEFGH  q4

#define vT0 v5
#define vT1 v6

#define vW0 v16
#define vW1 v17
#define vW2 v18
#define vW3 v19

#define vK0 v20
#define vK1 v21
#define vK2 v22
#define vK3 v23


/* Round macros */

#define _(...) /*_*/

#define do_loadk(nk0, nk1) ld1 {nk0.16b-nk1.16b}, [x3], #32;
#define do_add(a, b) add a.4s, a.4s, b.4s;
#define do_sha256su0(w0, w1) sha256su0 w0.4s, w1.4s;
#define do_sha256su1(w0, w2, w3) sha256su1 w0.4s, w2.4s, w3.4s;

#define do_rounds(k, nk0, nk1, w0, w1, w2, w3, loadk_fn, add_fn, su0_fn, su1_fn) \
        loadk_fn(   v##nk0, v##nk1     ); \
        su0_fn(     v##w0, v##w1       ); \
        mov         vABCD1.16b, vABCD0.16b; \
        sha256h     qABCD0, qEFGH, v##k.4s; \
        sha256h2    qEFGH, qABCD1, v##k.4s; \
        add_fn(     v##nk0, v##w2      ); \
        su1_fn(     v##w0, v##w2, v##w3   );
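
/* One do_rounds invocation performs four SHA-256 rounds: sha256h/sha256h2
 * consume the pre-added K+W value in v<k>, while (optionally) the next pair
 * of constant vectors is loaded, the next K+W sum is formed, and the message
 * schedule is extended with sha256su0/sha256su1.  Passing `_' for a step
 * expands to nothing, so callers can drop steps near block boundaries. */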


/* Other functional macros */

#define CLEAR_REG(reg) eor reg.16b, reg.16b, reg.16b; /* zero a vector register */


/*
 * unsigned int
 * _gcry_sha256_transform_armv8_ce (u32 state[8], const void *input_data,
 *                                  size_t num_blks)
 */
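/*
 * A minimal caller sketch (C; `buf' and `nblks' are hypothetical names, not
 * from this file; the initial values are the standard SHA-256 IV from
 * FIPS 180-4):
 *
 *   u32 state[8] = {
 *     0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
 *     0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
 *   };
 *   // buf must point to nblks complete 64-byte message blocks
 *   _gcry_sha256_transform_armv8_ce(state, buf, nblks);
 */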
.align 3
.globl _gcry_sha256_transform_armv8_ce
ELF(.type  _gcry_sha256_transform_armv8_ce,%function;)
_gcry_sha256_transform_armv8_ce:
  /* input:
   *	x0: ctx, CTX
   *	x1: data (64*nblks bytes)
   *	x2: nblks
   */
  CFI_STARTPROC();

  cbz x2, .Ldo_nothing;

  GET_DATA_POINTER(x3, .LK);
  mov x4, x3 /* keep start of K table for per-block rewind */

  ld1 {vH0123.4s-vH4567.4s}, [x0]  /* load state */

  ld1 {vW0.16b-vW1.16b}, [x1], #32
  do_loadk(vK0, vK1)
  ld1 {vW2.16b-vW3.16b}, [x1], #32
  mov vABCD0.16b, vH0123.16b
  mov vEFGH.16b, vH4567.16b

  rev32 vW0.16b, vW0.16b
  rev32 vW1.16b, vW1.16b
  rev32 vW2.16b, vW2.16b
  do_add(vK0, vW0)
  rev32 vW3.16b, vW3.16b
  do_add(vK1, vW1)

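/* The first two K+W sums were formed above, so each iteration can issue
 * sha256h immediately; the do_loadk/do_add steps inside the loop stay one
 * step ahead of the rounds that consume their results. */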
.Loop:
  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  sub x2, x2, #1
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)

  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)

  do_rounds(K0, K2, K3, W0, W1, W2, W3, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K1, K3, _ , W1, W2, W3, W0, _       , do_add, do_sha256su0, do_sha256su1)
  do_rounds(K2, K0, K1, W2, W3, W0, W1, do_loadk, do_add, do_sha256su0, do_sha256su1)
  do_rounds(K3, K1, _ , W3, W0, W1, W2, _       , do_add, do_sha256su0, do_sha256su1)
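
/* 48 of the 64 rounds are done at this point (12 do_rounds x 4 rounds each).
 * The final 16 rounds follow: interleaved with loading and byte-swapping the
 * next block when more data remains, or standalone at .Lend otherwise. */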

  cbz x2, .Lend /* last block? finish without reloading */

  do_rounds(K0, K2, K3, W0, _  , W2, W3, do_loadk, do_add, _, _)
  ld1 {vW0.16b}, [x1], #16
  mov x3, x4 /* rewind K table pointer for the next block */
  do_rounds(K1, K3, _ , W1, _  , W3, _  , _       , do_add, _, _)
  ld1 {vW1.16b}, [x1], #16
  rev32 vW0.16b, vW0.16b
  do_rounds(K2, K0, K1, W2, _  , W0, _  , do_loadk, do_add, _, _)
  rev32 vW1.16b, vW1.16b
  ld1 {vW2.16b}, [x1], #16
  do_rounds(K3, K1, _ , W3, _  , W1, _  , _       , do_add, _, _)
  ld1 {vW3.16b}, [x1], #16

  do_add(vH0123, vABCD0) /* merge this block into the chaining state */
  do_add(vH4567, vEFGH)

  rev32 vW2.16b, vW2.16b
  mov vABCD0.16b, vH0123.16b
  rev32 vW3.16b, vW3.16b
  mov vEFGH.16b, vH4567.16b

  b .Loop

.Lend:

  do_rounds(K0, K2, K3, W0, _  , W2, W3, do_loadk, do_add, _, _)
  do_rounds(K1, K3, _ , W1, _  , W3, _  , _       , do_add, _, _)
  do_rounds(K2, _ , _ , W2, _  , _  , _  , _       , _, _, _)
  do_rounds(K3, _ , _ , W3, _  , _  , _  , _       , _, _, _)

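/* Wipe the message schedule and staged constants so no input-derived data
 * lingers in vector registers. */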
  CLEAR_REG(vW0)
  CLEAR_REG(vW1)
  CLEAR_REG(vW2)
  CLEAR_REG(vW3)
  CLEAR_REG(vK0)
  CLEAR_REG(vK1)
  CLEAR_REG(vK2)
  CLEAR_REG(vK3)

  do_add(vH0123, vABCD0)
  do_add(vH4567, vEFGH)

  CLEAR_REG(vABCD0)
  CLEAR_REG(vABCD1)
  CLEAR_REG(vEFGH)

  st1 {vH0123.4s-vH4567.4s}, [x0] /* store state */

  CLEAR_REG(vH0123)
  CLEAR_REG(vH4567)

.Ldo_nothing:
  mov x0, #0 /* return zero: no stack space used that would need burning */
  ret
  CFI_ENDPROC();
ELF(.size _gcry_sha256_transform_armv8_ce,.-_gcry_sha256_transform_armv8_ce;)

#endif