/* sha512-armv7-neon.S  -  ARM/NEON assembly implementation of SHA-512 transform
 *
 * Copyright (C) 2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 *
 * This file is part of Libgcrypt.
 *
 * Libgcrypt is free software; you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation; either version 2.1 of
 * the License, or (at your option) any later version.
 *
 * Libgcrypt is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this program; if not, see <http://www.gnu.org/licenses/>.
 */

#include <config.h>

#if defined(HAVE_ARM_ARCH_V6) && defined(__ARMEL__) && \
    defined(HAVE_COMPATIBLE_GCC_ARM_PLATFORM_AS) && \
    defined(HAVE_GCC_INLINE_ASM_NEON)

.text

.syntax unified
.fpu neon
.arm

/* structure of SHA512_CONTEXT */
#define hd_a 0
#define hd_b ((hd_a) + 8)
#define hd_c ((hd_b) + 8)
#define hd_d ((hd_c) + 8)
#define hd_e ((hd_d) + 8)
#define hd_f ((hd_e) + 8)
#define hd_g ((hd_f) + 8)
/* register macros */
#define RK %r2

#define RA d0
#define RB d1
#define RC d2
#define RD d3
#define RE d4
#define RF d5
#define RG d6
#define RH d7

#define RT0 d8
#define RT1 d9
#define RT2 d10
#define RT3 d11
#define RT4 d12
#define RT5 d13
#define RT6 d14
#define RT7 d15

#define RT01q q4
#define RT23q q5
#define RT45q q6
#define RT67q q7

#define RW0 d16
#define RW1 d17
#define RW2 d18
#define RW3 d19
#define RW4 d20
#define RW5 d21
#define RW6 d22
#define RW7 d23
#define RW8 d24
#define RW9 d25
#define RW10 d26
#define RW11 d27
#define RW12 d28
#define RW13 d29
#define RW14 d30
#define RW15 d31

#define RW01q q8
#define RW23q q9
#define RW45q q10
#define RW67q q11
#define RW89q q12
#define RW1011q q13
#define RW1213q q14
#define RW1415q q15
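
/* Register allocation: d0-d7 hold the working state a..h, d8-d15 (q4-q7)
 * are scratch for the round computation (callee-saved, hence the
 * vpush/vpop around the main loop), and d16-d31 (q8-q15) hold the 16-entry
 * message schedule w[0..15]. The qN aliases name pairs of adjacent
 * d-registers so two 64-bit lanes can be processed with a single NEON op.
 */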

/***********************************************************************
 * ARM assembly implementation of sha512 transform
 ***********************************************************************/
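
/* Each rounds2_* macro below performs two SHA-512 rounds. Per FIPS 180-4,
 * a single round computes (rough C sketch; rotr64() here is just notation
 * for a 64-bit rotate right, not a symbol used elsewhere in this file):
 *
 *   t1 = h + Sum1(e) + Ch(e, f, g) + k[t] + w[t];
 *   t2 = Sum0(a) + Maj(a, b, c);
 *   d += t1;
 *   h  = t1 + t2;
 *
 * with Sum1(e) = rotr64(e,14) ^ rotr64(e,18) ^ rotr64(e,41),
 *      Sum0(a) = rotr64(a,28) ^ rotr64(a,34) ^ rotr64(a,39),
 *      Ch(e,f,g) = (e & f) ^ (~e & g),
 *      Maj(a,b,c) = (a & b) ^ (a & c) ^ (b & c).
 * Instead of rotating the eight variables after each round, the calls
 * below rotate their register arguments, so each round only writes two
 * of the state registers.
 */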
#define rounds2_0_63(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, rw01q, rw2, rw23q, rw1415q, rw9, rw10, interleave_op, arg1) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3; \
	\
	/* w[0] += S1 (w[14]) + w[9] + S0 (w[1]); */ \
	/* w[1] += S1 (w[15]) + w[10] + S0 (w[2]); */ \
	\
	/**** S0(w[1:2]) */ \
	\
	/* w[0:1] += w[9:10] */ \
	/* RT23q = rw1:rw2 */ \
	vext.u64 RT23q, rw01q, rw23q, #1; \
	vadd.u64 rw0, rw9; \
	vadd.u64 rg, rg, RT0; \
	vadd.u64 rw1, rw10; \
	vadd.u64 rg, rg, RT1; /* g+=t1; */ \
	\
	vshr.u64 RT45q, RT23q, #1; \
	vshl.u64 RT67q, RT23q, #64 - 1; \
	vshr.u64 RT01q, RT23q, #8; \
	veor.u64 RT45q, RT45q, RT67q; \
	vshl.u64 RT67q, RT23q, #64 - 8; \
	veor.u64 RT45q, RT45q, RT01q; \
	vshr.u64 RT01q, RT23q, #7; \
	veor.u64 RT45q, RT45q, RT67q; \
	\
	/**** S1(w[14:15]) */ \
	vshr.u64 RT23q, rw1415q, #6; \
	veor.u64 RT01q, RT01q, RT45q; \
	vshr.u64 RT45q, rw1415q, #19; \
	vshl.u64 RT67q, rw1415q, #64 - 19; \
	veor.u64 RT23q, RT23q, RT45q; \
	vshr.u64 RT45q, rw1415q, #61; \
	veor.u64 RT23q, RT23q, RT67q; \
	vshl.u64 RT67q, rw1415q, #64 - 61; \
	veor.u64 RT23q, RT23q, RT45q; \
	vadd.u64 rw01q, RT01q; /* w[0:1] += S(w[1:2]) */ \
	veor.u64 RT01q, RT23q, RT67q;
#define vadd_RT01q(rw01q) \
	/* w[0:1] += S(w[14:15]) */ \
	vadd.u64 rw01q, RT01q;

#define dummy(_) /*_*/
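
/* The two rounds above also advance the message schedule for two words:
 * w[t] += S1(w[t+14]) + w[t+9] + S0(w[t+1]) (indices mod 16), where
 * S0(x) = rotr64(x,1) ^ rotr64(x,8) ^ (x >> 7) and
 * S1(x) = rotr64(x,19) ^ rotr64(x,61) ^ (x >> 6) (same rotr64 notation as
 * above). The S1 contribution is left in RT01q and only folded in by the
 * *next* macro invocation through its interleave_op (vadd_RT01q); dummy()
 * is passed on the very first call, when no result is pending yet.
 */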

#define rounds2_64_79(ra, rb, rc, rd, re, rf, rg, rh, rw0, rw1, interleave_op1, arg1, interleave_op2, arg2) \
	/* t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t]; */ \
	vshr.u64 RT2, re, #14; \
	vshl.u64 RT3, re, #64 - 14; \
	interleave_op1(arg1); \
	vshr.u64 RT4, re, #18; \
	vshl.u64 RT5, re, #64 - 18; \
	interleave_op2(arg2); \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, re, #41; \
	vshl.u64 RT5, re, #64 - 41; \
	vadd.u64 RT0, RT0, rw0; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, re; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, rf, rg; \
	\
	vadd.u64 RT1, RT1, rh; \
	vshr.u64 RT2, ra, #28; \
	vshl.u64 RT3, ra, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, ra, #34; \
	vshl.u64 RT5, ra, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* h = Sum0 (a) + Maj (a, b, c); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, ra, #39; \
	vshl.u64 RT5, ra, #64 - 39; \
	veor.64 RT0, ra, rb; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rc, rb; \
	vadd.u64 rd, rd, RT1; /* d+=t1; */ \
	veor.64 rh, RT2, RT3; \
	\
	/* t1 = g + Sum1 (d) + Ch (d, e, f) + k[t] + w[t]; */ \
	vshr.u64 RT2, rd, #14; \
	vshl.u64 RT3, rd, #64 - 14; \
	vadd.u64 rh, rh, RT0; \
	vshr.u64 RT4, rd, #18; \
	vshl.u64 RT5, rd, #64 - 18; \
	vadd.u64 rh, rh, RT1; /* h+=t1; */ \
	vld1.64 {RT0}, [RK]!; \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rd, #41; \
	vshl.u64 RT5, rd, #64 - 41; \
	vadd.u64 RT0, RT0, rw1; \
	veor.64 RT23q, RT23q, RT45q; \
	vmov.64 RT7, rd; \
	veor.64 RT1, RT2, RT3; \
	vbsl.64 RT7, re, rf; \
	\
	vadd.u64 RT1, RT1, rg; \
	vshr.u64 RT2, rh, #28; \
	vshl.u64 RT3, rh, #64 - 28; \
	vadd.u64 RT1, RT1, RT0; \
	vshr.u64 RT4, rh, #34; \
	vshl.u64 RT5, rh, #64 - 34; \
	vadd.u64 RT1, RT1, RT7; \
	\
	/* g = Sum0 (h) + Maj (h, a, b); */ \
	veor.64 RT23q, RT23q, RT45q; \
	vshr.u64 RT4, rh, #39; \
	vshl.u64 RT5, rh, #64 - 39; \
	veor.64 RT0, rh, ra; \
	veor.64 RT23q, RT23q, RT45q; \
	vbsl.64 RT0, rb, ra; \
	vadd.u64 rc, rc, RT1; /* c+=t1; */ \
	veor.64 rg, RT2, RT3;
#define vadd_rg_RT0(rg) \
	vadd.u64 rg, rg, RT0;
#define vadd_rg_RT1(rg) \
	vadd.u64 rg, rg, RT1; /* g+=t1; */
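
/* rounds2_64_79 is the same two-round step without the message schedule
 * update (rounds 64..79 need no new w[] words). Unlike rounds2_0_63, it
 * leaves the final RT0/RT1 additions into g pending; the callers below use
 * the two interleave hooks (vadd_rg_RT0/vadd_rg_RT1) to fold them into the
 * previous pair's g register, or vadd_RT01q to fold in the last schedule
 * result.
 */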

.align 3
.globl _gcry_sha512_transform_armv7_neon
.type  _gcry_sha512_transform_armv7_neon,%function;

_gcry_sha512_transform_armv7_neon:
	/* Input:
	 *	%r0: SHA512_CONTEXT
	 *	%r1: data
	 *	%r2: u64 k[] constants
	 *	%r3: nblks
	 */
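	/* Roughly equivalent C prototype (the exact libgcrypt declaration may
	 * differ):
	 *
	 *   void _gcry_sha512_transform_armv7_neon(SHA512_CONTEXT *hd,
	 *                                          const byte *data,
	 *                                          const u64 k[80],
	 *                                          size_t nblks);
	 *
	 * k[] is expected to be the 80-entry SHA-512 round-constant table;
	 * RK (%r2) walks through it with post-incremented loads.
	 */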
	push {%lr};

	mov %lr, #0;

	/* Load context to d0-d7 */
	vld1.64 {RA-RD}, [%r0]!;
	vld1.64 {RE-RH}, [%r0];
	sub %r0, #(4*8);

	/* Load input to w[16], d16-d31 */
	/* NOTE: Assumes that on ARMv7 unaligned accesses are always allowed. */
	vld1.64 {RW0-RW3}, [%r1]!;
	vld1.64 {RW4-RW7}, [%r1]!;
	vld1.64 {RW8-RW11}, [%r1]!;
	vld1.64 {RW12-RW15}, [%r1]!;
#ifdef __ARMEL__
	/* byteswap */
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif
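	/* SHA-512 message words are big-endian, so on little-endian ARM each
	 * 64-bit lane is byte-reversed with vrev64.8 right after loading. */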

	/* EABI says that d8-d15 must be preserved by callee. */
	vpush {RT0-RT7};

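	/* Each pass through the round loop below executes eight two-round
	 * macros (16 rounds) and updates the message schedule in place;
	 * %lr counts rounds, so the loop covers rounds 0..63 and rounds
	 * 64..79 follow afterwards without schedule updates. */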
.Loop:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, dummy, _);
	b .Lenter_rounds;

.Loop_rounds:
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, RW01q, RW2, RW23q, RW1415q, RW9, RW10, vadd_RT01q, RW1415q);
.Lenter_rounds:
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, RW23q, RW4, RW45q, RW01q, RW11, RW12, vadd_RT01q, RW01q);
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, RW45q, RW6, RW67q, RW23q, RW13, RW14, vadd_RT01q, RW23q);
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, RW67q, RW8, RW89q, RW45q, RW15, RW0, vadd_RT01q, RW45q);
	rounds2_0_63(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, RW89q, RW10, RW1011q, RW67q, RW1, RW2, vadd_RT01q, RW67q);
	rounds2_0_63(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, RW1011q, RW12, RW1213q, RW89q, RW3, RW4, vadd_RT01q, RW89q);
	add %lr, #16;
	rounds2_0_63(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, RW1213q, RW14, RW1415q, RW1011q, RW5, RW6, vadd_RT01q, RW1011q);
	cmp %lr, #64;
	rounds2_0_63(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, RW1415q, RW0, RW01q, RW1213q, RW7, RW8, vadd_RT01q, RW1213q);
	bne .Loop_rounds;

	subs %r3, #1;

	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW0, RW1, vadd_RT01q, RW1415q, dummy, _);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW2, RW3, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	beq .Lhandle_tail;
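	/* More than one block left: finish rounds 64..79 while already
	 * loading and byte-swapping the next block's 16 message words. */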
	vld1.64 {RW0-RW3}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW01q, RW01q;
	vrev64.8 RW23q, RW23q;
#endif
	vld1.64 {RW4-RW7}, [%r1]!;
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
#ifdef __ARMEL__
	vrev64.8 RW45q, RW45q;
	vrev64.8 RW67q, RW67q;
#endif
	vld1.64 {RW8-RW11}, [%r1]!;
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
#ifdef __ARMEL__
	vrev64.8 RW89q, RW89q;
	vrev64.8 RW1011q, RW1011q;
#endif
	vld1.64 {RW12-RW15}, [%r1]!;
	vadd_rg_RT0(RA);
	vadd_rg_RT1(RA);

	/* Load context */
	vld1.64 {RT0-RT3}, [%r0]!;
	vld1.64 {RT4-RT7}, [%r0];
	sub %r0, #(4*8);

#ifdef __ARMEL__
	vrev64.8 RW1213q, RW1213q;
	vrev64.8 RW1415q, RW1415q;
#endif

	vadd.u64 RA, RT0;
	vadd.u64 RB, RT1;
	vadd.u64 RC, RT2;
	vadd.u64 RD, RT3;
	vadd.u64 RE, RT4;
	vadd.u64 RF, RT5;
	vadd.u64 RG, RT6;
	vadd.u64 RH, RT7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;
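	/* Rewind RK: the 80 rounds consumed all 80 u64 constants through
	 * post-incremented loads, so step the pointer back for the next block. */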
	sub RK, $(8*80);
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	mov %lr, #0;
	sub %r0, #(4*8);

	b .Loop;
.ltorg

.Lhandle_tail:
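	/* Last block: finish rounds 64..79 without touching the data pointer,
	 * then add the saved state back in and clear the sensitive registers. */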
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW4, RW5, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW6, RW7, vadd_rg_RT0, RC, vadd_rg_RT1, RC);
	rounds2_64_79(RA, RB, RC, RD, RE, RF, RG, RH, RW8, RW9, vadd_rg_RT0, RA, vadd_rg_RT1, RA);
	rounds2_64_79(RG, RH, RA, RB, RC, RD, RE, RF, RW10, RW11, vadd_rg_RT0, RG, vadd_rg_RT1, RG);
	rounds2_64_79(RE, RF, RG, RH, RA, RB, RC, RD, RW12, RW13, vadd_rg_RT0, RE, vadd_rg_RT1, RE);
	rounds2_64_79(RC, RD, RE, RF, RG, RH, RA, RB, RW14, RW15, vadd_rg_RT0, RC, vadd_rg_RT1, RC);

	/* Load context to d16-d23 */
	vld1.64 {RW0-RW3}, [%r0]!;
	vadd_rg_RT0(RA);
	vld1.64 {RW4-RW7}, [%r0];
	vadd_rg_RT1(RA);
	sub %r0, #(4*8);

	vadd.u64 RA, RW0;
	vadd.u64 RB, RW1;
	vadd.u64 RC, RW2;
	vadd.u64 RD, RW3;
	vadd.u64 RE, RW4;
	vadd.u64 RF, RW5;
	vadd.u64 RG, RW6;
	vadd.u64 RH, RW7;

	/* Store the first half of context */
	vst1.64 {RA-RD}, [%r0]!;

	/* Clear used registers */
	/* d16-d31 */
	veor.u64 RW01q, RW01q;
	veor.u64 RW23q, RW23q;
	veor.u64 RW45q, RW45q;
	veor.u64 RW67q, RW67q;
	vst1.64 {RE-RH}, [%r0]; /* Store the last half of context */
	veor.u64 RW89q, RW89q;
	veor.u64 RW1011q, RW1011q;
	veor.u64 RW1213q, RW1213q;
	veor.u64 RW1415q, RW1415q;
	/* d8-d15 */
	vpop {RT0-RT7};
	/* d0-d7 (q0-q3) */
	veor.u64 %q0, %q0;
	veor.u64 %q1, %q1;
	veor.u64 %q2, %q2;
	veor.u64 %q3, %q3;

	eor %r0, %r0;
	pop {%pc};
.size _gcry_sha512_transform_armv7_neon,.-_gcry_sha512_transform_armv7_neon;

#endif
