1/* Do not modify. This file is auto-generated from sha512-armv8.pl. */
2// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
3//
4// Licensed under the Apache License 2.0 (the "License").  You may not use
5// this file except in compliance with the License.  You can obtain a copy
6// in the file LICENSE in the source distribution or at
7// https://www.openssl.org/source/license.html
8
9// ====================================================================
10// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
11// project. The module is, however, dual licensed under OpenSSL and
12// CRYPTOGAMS licenses depending on where you obtain it. For further
13// details see http://www.openssl.org/~appro/cryptogams/.
14//
15// Permission to use under GPLv2 terms is granted.
16// ====================================================================
17//
18// SHA256/512 for ARMv8.
19//
20// Performance in cycles per processed byte and improvement coefficient
21// over code generated with "default" compiler:
22//
23//		SHA256-hw	SHA256(*)	SHA512
24// Apple A7	1.97		10.5 (+33%)	6.73 (-1%(**))
25// Cortex-A53	2.38		15.5 (+115%)	10.0 (+150%(***))
26// Cortex-A57	2.31		11.6 (+86%)	7.51 (+260%(***))
27// Denver	2.01		10.5 (+26%)	6.70 (+8%)
28// X-Gene			20.0 (+100%)	12.8 (+300%(***))
29// Mongoose	2.36		13.0 (+50%)	8.36 (+33%)
30// Kryo		1.92		17.4 (+30%)	11.2 (+8%)
31// ThunderX2	2.54		13.2 (+40%)	8.40 (+18%)
32//
33// (*)	Software SHA256 results are of lesser relevance, presented
34//	mostly for informational purposes.
35// (**)	The result is a trade-off: it's possible to improve it by
36//	10% (or by 1 cycle per round), but at the cost of 20% loss
37//	on Cortex-A53 (or by 4 cycles per round).
38// (***)	Super-impressive coefficients over gcc-generated code are
39//	indication of some compiler "pathology", most notably code
40//	generated with -mgeneral-regs-only is significantly faster
41//	and the gap is only 40-90%.
42//
43// October 2016.
44//
45// Originally it was reckoned that it makes no sense to implement NEON
46// version of SHA256 for 64-bit processors. This is because performance
47// improvement on most wide-spread Cortex-A5x processors was observed
48// to be marginal, same on Cortex-A53 and ~10% on A57. But then it was
49// observed that 32-bit NEON SHA256 performs significantly better than
50// 64-bit scalar version on *some* of the more recent processors. As a
51// result, a 64-bit NEON version of SHA256 was added to provide the best
52// all-round performance. For example, it executes ~30% faster on X-Gene
53// and Mongoose. [For reference, NEON version of SHA512 is bound to
54// deliver much less improvement, likely *negative* on Cortex-A5x.
55// Which is why NEON support is limited to SHA256.]
56
57// $output is the last argument if it looks like a file (it has an extension)
58// $flavour is the first argument if it doesn't look like a file
59#ifndef	__KERNEL__
60# include "arm_arch.h"
61
62.hidden	OPENSSL_armcap_P
63#endif
64
65.text
66
// sha512_block_data_order: SHA-512 block function, general-purpose-register
// code path.
//   x0 - pointer to the context; eight 64-bit words are loaded from it
//        into x20-x27
//   x1 - input pointer
//   x2 - block count on entry; converted below into the end-of-input
//        address x1 + (x2 << 7), i.e. 128 bytes per block
// Unless __KERNEL__ is defined, OPENSSL_armcap_P is tested for ARMV8_SHA512
// first and, when the bit is set, control goes to .Lv8_entry (defined
// outside this view).
// Frame (128 bytes, addressed through x29):
//   +0        x29/x30
//   +16..+95  x19-x28
//   +96       copy of x0 (context pointer)
//   +104      end-of-input address
//   +112      used by the round loop to keep the input pointer
// A further 4*8 bytes are reserved below the frame at sp as scratch slots.
67.globl	sha512_block_data_order
68.type	sha512_block_data_order,%function
69.align	6
70sha512_block_data_order:
71#ifndef	__KERNEL__
	// Capability check: x16/w16 is only a temporary here.
72	adrp	x16,OPENSSL_armcap_P
73	ldr	w16,[x16,#:lo12:OPENSSL_armcap_P]
74	tst	w16,#ARMV8_SHA512
75	b.ne	.Lv8_entry
76#endif
	// The pointer-authentication instruction is emitted as a raw opcode via
	// .inst rather than by mnemonic.
77.inst	0xd503233f				// paciasp
	// Allocate the 128-byte frame and make x29 point at it.
78	stp	x29,x30,[sp,#-128]!
79	add	x29,sp,#0
80
	// Save x19-x28; all of them are overwritten by the round code.
81	stp	x19,x20,[sp,#16]
82	stp	x21,x22,[sp,#32]
83	stp	x23,x24,[sp,#48]
84	stp	x25,x26,[sp,#64]
85	stp	x27,x28,[sp,#80]
	// Four 8-byte scratch slots at sp+0..sp+24, below the frame.
86	sub	sp,sp,#4*8
87
88	ldp	x20,x21,[x0]				// load context
89	ldp	x22,x23,[x0,#2*8]
90	ldp	x24,x25,[x0,#4*8]
91	add	x2,x1,x2,lsl#7	// end of input
92	ldp	x26,x27,[x0,#6*8]
	// x30 (already saved in the frame) becomes the pointer into .LK512.
93	adr	x30,.LK512
	// x0 and x2 are reused for message words later, so keep copies.
94	stp	x0,x2,[x29,#96]
95
// .Loop: entered once per 128-byte input block. Rounds 0-15 take the message
// words straight from the input; each word is byte-swapped with rev unless
// __AARCH64EB__ is defined.
// Register use in the rounds:
//   x20-x27  working variables a..h; the roles rotate by one register per
//            round instead of the values being moved
//   x30      running pointer into .LK512, post-incremented ("*K++")
//   x19/x28  alternate between the K word just loaded and the a^b value
//            that the following round reuses as b^c
//   x16/x17  temporaries for Sigma1(e), Ch(e,f,g) and Sigma0(a)
//   x3-x15, x0, x1, x2  message words X[0]..X[15], in that order
// The "h+=Sigma0(a)" addition of each round is deferred into the next round
// (the commented-out add marks where it logically belongs).
96.Loop:
97	ldp	x3,x4,[x1],#2*8
98	ldr	x19,[x30],#8			// *K++
99	eor	x28,x21,x22				// magic seed
	// Keep the input pointer (already advanced by 16 bytes) in the frame;
	// x1 itself is overwritten by a message word further down.
100	str	x1,[x29,#112]
101#ifndef	__AARCH64EB__
102	rev	x3,x3			// 0
103#endif
104	ror	x16,x24,#14
105	add	x27,x27,x19			// h+=K[i]
106	eor	x6,x24,x24,ror#23
107	and	x17,x25,x24
108	bic	x19,x26,x24
109	add	x27,x27,x3			// h+=X[i]
110	orr	x17,x17,x19			// Ch(e,f,g)
111	eor	x19,x20,x21			// a^b, b^c in next round
112	eor	x16,x16,x6,ror#18	// Sigma1(e)
113	ror	x6,x20,#28
114	add	x27,x27,x17			// h+=Ch(e,f,g)
115	eor	x17,x20,x20,ror#5
116	add	x27,x27,x16			// h+=Sigma1(e)
117	and	x28,x28,x19			// (b^c)&=(a^b)
118	add	x23,x23,x27			// d+=h
119	eor	x28,x28,x21			// Maj(a,b,c)
120	eor	x17,x6,x17,ror#34	// Sigma0(a)
121	add	x27,x27,x28			// h+=Maj(a,b,c)
122	ldr	x28,[x30],#8		// *K++, x19 in next round
123	//add	x27,x27,x17			// h+=Sigma0(a)
124#ifndef	__AARCH64EB__
125	rev	x4,x4			// 1
126#endif
127	ldp	x5,x6,[x1],#2*8
128	add	x27,x27,x17			// h+=Sigma0(a)
129	ror	x16,x23,#14
130	add	x26,x26,x28			// h+=K[i]
131	eor	x7,x23,x23,ror#23
132	and	x17,x24,x23
133	bic	x28,x25,x23
134	add	x26,x26,x4			// h+=X[i]
135	orr	x17,x17,x28			// Ch(e,f,g)
136	eor	x28,x27,x20			// a^b, b^c in next round
137	eor	x16,x16,x7,ror#18	// Sigma1(e)
138	ror	x7,x27,#28
139	add	x26,x26,x17			// h+=Ch(e,f,g)
140	eor	x17,x27,x27,ror#5
141	add	x26,x26,x16			// h+=Sigma1(e)
142	and	x19,x19,x28			// (b^c)&=(a^b)
143	add	x22,x22,x26			// d+=h
144	eor	x19,x19,x20			// Maj(a,b,c)
145	eor	x17,x7,x17,ror#34	// Sigma0(a)
146	add	x26,x26,x19			// h+=Maj(a,b,c)
147	ldr	x19,[x30],#8		// *K++, x28 in next round
148	//add	x26,x26,x17			// h+=Sigma0(a)
149#ifndef	__AARCH64EB__
150	rev	x5,x5			// 2
151#endif
152	add	x26,x26,x17			// h+=Sigma0(a)
153	ror	x16,x22,#14
154	add	x25,x25,x19			// h+=K[i]
155	eor	x8,x22,x22,ror#23
156	and	x17,x23,x22
157	bic	x19,x24,x22
158	add	x25,x25,x5			// h+=X[i]
159	orr	x17,x17,x19			// Ch(e,f,g)
160	eor	x19,x26,x27			// a^b, b^c in next round
161	eor	x16,x16,x8,ror#18	// Sigma1(e)
162	ror	x8,x26,#28
163	add	x25,x25,x17			// h+=Ch(e,f,g)
164	eor	x17,x26,x26,ror#5
165	add	x25,x25,x16			// h+=Sigma1(e)
166	and	x28,x28,x19			// (b^c)&=(a^b)
167	add	x21,x21,x25			// d+=h
168	eor	x28,x28,x27			// Maj(a,b,c)
169	eor	x17,x8,x17,ror#34	// Sigma0(a)
170	add	x25,x25,x28			// h+=Maj(a,b,c)
171	ldr	x28,[x30],#8		// *K++, x19 in next round
172	//add	x25,x25,x17			// h+=Sigma0(a)
173#ifndef	__AARCH64EB__
174	rev	x6,x6			// 3
175#endif
176	ldp	x7,x8,[x1],#2*8
177	add	x25,x25,x17			// h+=Sigma0(a)
178	ror	x16,x21,#14
179	add	x24,x24,x28			// h+=K[i]
180	eor	x9,x21,x21,ror#23
181	and	x17,x22,x21
182	bic	x28,x23,x21
183	add	x24,x24,x6			// h+=X[i]
184	orr	x17,x17,x28			// Ch(e,f,g)
185	eor	x28,x25,x26			// a^b, b^c in next round
186	eor	x16,x16,x9,ror#18	// Sigma1(e)
187	ror	x9,x25,#28
188	add	x24,x24,x17			// h+=Ch(e,f,g)
189	eor	x17,x25,x25,ror#5
190	add	x24,x24,x16			// h+=Sigma1(e)
191	and	x19,x19,x28			// (b^c)&=(a^b)
192	add	x20,x20,x24			// d+=h
193	eor	x19,x19,x26			// Maj(a,b,c)
194	eor	x17,x9,x17,ror#34	// Sigma0(a)
195	add	x24,x24,x19			// h+=Maj(a,b,c)
196	ldr	x19,[x30],#8		// *K++, x28 in next round
197	//add	x24,x24,x17			// h+=Sigma0(a)
198#ifndef	__AARCH64EB__
199	rev	x7,x7			// 4
200#endif
201	add	x24,x24,x17			// h+=Sigma0(a)
202	ror	x16,x20,#14
203	add	x23,x23,x19			// h+=K[i]
204	eor	x10,x20,x20,ror#23
205	and	x17,x21,x20
206	bic	x19,x22,x20
207	add	x23,x23,x7			// h+=X[i]
208	orr	x17,x17,x19			// Ch(e,f,g)
209	eor	x19,x24,x25			// a^b, b^c in next round
210	eor	x16,x16,x10,ror#18	// Sigma1(e)
211	ror	x10,x24,#28
212	add	x23,x23,x17			// h+=Ch(e,f,g)
213	eor	x17,x24,x24,ror#5
214	add	x23,x23,x16			// h+=Sigma1(e)
215	and	x28,x28,x19			// (b^c)&=(a^b)
216	add	x27,x27,x23			// d+=h
217	eor	x28,x28,x25			// Maj(a,b,c)
218	eor	x17,x10,x17,ror#34	// Sigma0(a)
219	add	x23,x23,x28			// h+=Maj(a,b,c)
220	ldr	x28,[x30],#8		// *K++, x19 in next round
221	//add	x23,x23,x17			// h+=Sigma0(a)
222#ifndef	__AARCH64EB__
223	rev	x8,x8			// 5
224#endif
225	ldp	x9,x10,[x1],#2*8
226	add	x23,x23,x17			// h+=Sigma0(a)
227	ror	x16,x27,#14
228	add	x22,x22,x28			// h+=K[i]
229	eor	x11,x27,x27,ror#23
230	and	x17,x20,x27
231	bic	x28,x21,x27
232	add	x22,x22,x8			// h+=X[i]
233	orr	x17,x17,x28			// Ch(e,f,g)
234	eor	x28,x23,x24			// a^b, b^c in next round
235	eor	x16,x16,x11,ror#18	// Sigma1(e)
236	ror	x11,x23,#28
237	add	x22,x22,x17			// h+=Ch(e,f,g)
238	eor	x17,x23,x23,ror#5
239	add	x22,x22,x16			// h+=Sigma1(e)
240	and	x19,x19,x28			// (b^c)&=(a^b)
241	add	x26,x26,x22			// d+=h
242	eor	x19,x19,x24			// Maj(a,b,c)
243	eor	x17,x11,x17,ror#34	// Sigma0(a)
244	add	x22,x22,x19			// h+=Maj(a,b,c)
245	ldr	x19,[x30],#8		// *K++, x28 in next round
246	//add	x22,x22,x17			// h+=Sigma0(a)
247#ifndef	__AARCH64EB__
248	rev	x9,x9			// 6
249#endif
250	add	x22,x22,x17			// h+=Sigma0(a)
251	ror	x16,x26,#14
252	add	x21,x21,x19			// h+=K[i]
253	eor	x12,x26,x26,ror#23
254	and	x17,x27,x26
255	bic	x19,x20,x26
256	add	x21,x21,x9			// h+=X[i]
257	orr	x17,x17,x19			// Ch(e,f,g)
258	eor	x19,x22,x23			// a^b, b^c in next round
259	eor	x16,x16,x12,ror#18	// Sigma1(e)
260	ror	x12,x22,#28
261	add	x21,x21,x17			// h+=Ch(e,f,g)
262	eor	x17,x22,x22,ror#5
263	add	x21,x21,x16			// h+=Sigma1(e)
264	and	x28,x28,x19			// (b^c)&=(a^b)
265	add	x25,x25,x21			// d+=h
266	eor	x28,x28,x23			// Maj(a,b,c)
267	eor	x17,x12,x17,ror#34	// Sigma0(a)
268	add	x21,x21,x28			// h+=Maj(a,b,c)
269	ldr	x28,[x30],#8		// *K++, x19 in next round
270	//add	x21,x21,x17			// h+=Sigma0(a)
271#ifndef	__AARCH64EB__
272	rev	x10,x10			// 7
273#endif
274	ldp	x11,x12,[x1],#2*8
275	add	x21,x21,x17			// h+=Sigma0(a)
276	ror	x16,x25,#14
277	add	x20,x20,x28			// h+=K[i]
278	eor	x13,x25,x25,ror#23
279	and	x17,x26,x25
280	bic	x28,x27,x25
281	add	x20,x20,x10			// h+=X[i]
282	orr	x17,x17,x28			// Ch(e,f,g)
283	eor	x28,x21,x22			// a^b, b^c in next round
284	eor	x16,x16,x13,ror#18	// Sigma1(e)
285	ror	x13,x21,#28
286	add	x20,x20,x17			// h+=Ch(e,f,g)
287	eor	x17,x21,x21,ror#5
288	add	x20,x20,x16			// h+=Sigma1(e)
289	and	x19,x19,x28			// (b^c)&=(a^b)
290	add	x24,x24,x20			// d+=h
291	eor	x19,x19,x22			// Maj(a,b,c)
292	eor	x17,x13,x17,ror#34	// Sigma0(a)
293	add	x20,x20,x19			// h+=Maj(a,b,c)
294	ldr	x19,[x30],#8		// *K++, x28 in next round
295	//add	x20,x20,x17			// h+=Sigma0(a)
296#ifndef	__AARCH64EB__
297	rev	x11,x11			// 8
298#endif
299	add	x20,x20,x17			// h+=Sigma0(a)
300	ror	x16,x24,#14
301	add	x27,x27,x19			// h+=K[i]
302	eor	x14,x24,x24,ror#23
303	and	x17,x25,x24
304	bic	x19,x26,x24
305	add	x27,x27,x11			// h+=X[i]
306	orr	x17,x17,x19			// Ch(e,f,g)
307	eor	x19,x20,x21			// a^b, b^c in next round
308	eor	x16,x16,x14,ror#18	// Sigma1(e)
309	ror	x14,x20,#28
310	add	x27,x27,x17			// h+=Ch(e,f,g)
311	eor	x17,x20,x20,ror#5
312	add	x27,x27,x16			// h+=Sigma1(e)
313	and	x28,x28,x19			// (b^c)&=(a^b)
314	add	x23,x23,x27			// d+=h
315	eor	x28,x28,x21			// Maj(a,b,c)
316	eor	x17,x14,x17,ror#34	// Sigma0(a)
317	add	x27,x27,x28			// h+=Maj(a,b,c)
318	ldr	x28,[x30],#8		// *K++, x19 in next round
319	//add	x27,x27,x17			// h+=Sigma0(a)
320#ifndef	__AARCH64EB__
321	rev	x12,x12			// 9
322#endif
323	ldp	x13,x14,[x1],#2*8
324	add	x27,x27,x17			// h+=Sigma0(a)
325	ror	x16,x23,#14
326	add	x26,x26,x28			// h+=K[i]
327	eor	x15,x23,x23,ror#23
328	and	x17,x24,x23
329	bic	x28,x25,x23
330	add	x26,x26,x12			// h+=X[i]
331	orr	x17,x17,x28			// Ch(e,f,g)
332	eor	x28,x27,x20			// a^b, b^c in next round
333	eor	x16,x16,x15,ror#18	// Sigma1(e)
334	ror	x15,x27,#28
335	add	x26,x26,x17			// h+=Ch(e,f,g)
336	eor	x17,x27,x27,ror#5
337	add	x26,x26,x16			// h+=Sigma1(e)
338	and	x19,x19,x28			// (b^c)&=(a^b)
339	add	x22,x22,x26			// d+=h
340	eor	x19,x19,x20			// Maj(a,b,c)
341	eor	x17,x15,x17,ror#34	// Sigma0(a)
342	add	x26,x26,x19			// h+=Maj(a,b,c)
343	ldr	x19,[x30],#8		// *K++, x28 in next round
344	//add	x26,x26,x17			// h+=Sigma0(a)
345#ifndef	__AARCH64EB__
346	rev	x13,x13			// 10
347#endif
348	add	x26,x26,x17			// h+=Sigma0(a)
349	ror	x16,x22,#14
350	add	x25,x25,x19			// h+=K[i]
	// x0 serves as a temporary here; its original value (the context
	// pointer) was copied to [x29,#96] before the loop.
351	eor	x0,x22,x22,ror#23
352	and	x17,x23,x22
353	bic	x19,x24,x22
354	add	x25,x25,x13			// h+=X[i]
355	orr	x17,x17,x19			// Ch(e,f,g)
356	eor	x19,x26,x27			// a^b, b^c in next round
357	eor	x16,x16,x0,ror#18	// Sigma1(e)
358	ror	x0,x26,#28
359	add	x25,x25,x17			// h+=Ch(e,f,g)
360	eor	x17,x26,x26,ror#5
361	add	x25,x25,x16			// h+=Sigma1(e)
362	and	x28,x28,x19			// (b^c)&=(a^b)
363	add	x21,x21,x25			// d+=h
364	eor	x28,x28,x27			// Maj(a,b,c)
365	eor	x17,x0,x17,ror#34	// Sigma0(a)
366	add	x25,x25,x28			// h+=Maj(a,b,c)
367	ldr	x28,[x30],#8		// *K++, x19 in next round
368	//add	x25,x25,x17			// h+=Sigma0(a)
369#ifndef	__AARCH64EB__
370	rev	x14,x14			// 11
371#endif
	// x0 now receives message word 13.
372	ldp	x15,x0,[x1],#2*8
373	add	x25,x25,x17			// h+=Sigma0(a)
	// From here on every register that could serve as a temporary already
	// holds a message word, so the word is spilled to a scratch slot first
	// (x6 here, then x7, x8, x9, x10 in the following rounds).
374	str	x6,[sp,#24]
375	ror	x16,x21,#14
376	add	x24,x24,x28			// h+=K[i]
377	eor	x6,x21,x21,ror#23
378	and	x17,x22,x21
379	bic	x28,x23,x21
380	add	x24,x24,x14			// h+=X[i]
381	orr	x17,x17,x28			// Ch(e,f,g)
382	eor	x28,x25,x26			// a^b, b^c in next round
383	eor	x16,x16,x6,ror#18	// Sigma1(e)
384	ror	x6,x25,#28
385	add	x24,x24,x17			// h+=Ch(e,f,g)
386	eor	x17,x25,x25,ror#5
387	add	x24,x24,x16			// h+=Sigma1(e)
388	and	x19,x19,x28			// (b^c)&=(a^b)
389	add	x20,x20,x24			// d+=h
390	eor	x19,x19,x26			// Maj(a,b,c)
391	eor	x17,x6,x17,ror#34	// Sigma0(a)
392	add	x24,x24,x19			// h+=Maj(a,b,c)
393	ldr	x19,[x30],#8		// *K++, x28 in next round
394	//add	x24,x24,x17			// h+=Sigma0(a)
395#ifndef	__AARCH64EB__
396	rev	x15,x15			// 12
397#endif
398	add	x24,x24,x17			// h+=Sigma0(a)
399	str	x7,[sp,#0]
400	ror	x16,x20,#14
401	add	x23,x23,x19			// h+=K[i]
402	eor	x7,x20,x20,ror#23
403	and	x17,x21,x20
404	bic	x19,x22,x20
405	add	x23,x23,x15			// h+=X[i]
406	orr	x17,x17,x19			// Ch(e,f,g)
407	eor	x19,x24,x25			// a^b, b^c in next round
408	eor	x16,x16,x7,ror#18	// Sigma1(e)
409	ror	x7,x24,#28
410	add	x23,x23,x17			// h+=Ch(e,f,g)
411	eor	x17,x24,x24,ror#5
412	add	x23,x23,x16			// h+=Sigma1(e)
413	and	x28,x28,x19			// (b^c)&=(a^b)
414	add	x27,x27,x23			// d+=h
415	eor	x28,x28,x25			// Maj(a,b,c)
416	eor	x17,x7,x17,ror#34	// Sigma0(a)
417	add	x23,x23,x28			// h+=Maj(a,b,c)
418	ldr	x28,[x30],#8		// *K++, x19 in next round
419	//add	x23,x23,x17			// h+=Sigma0(a)
420#ifndef	__AARCH64EB__
421	rev	x0,x0			// 13
422#endif
	// Last two message words. This overwrites the input pointer in x1 and
	// the end-of-input address in x2; both have copies in the frame.
423	ldp	x1,x2,[x1]
424	add	x23,x23,x17			// h+=Sigma0(a)
425	str	x8,[sp,#8]
426	ror	x16,x27,#14
427	add	x22,x22,x28			// h+=K[i]
428	eor	x8,x27,x27,ror#23
429	and	x17,x20,x27
430	bic	x28,x21,x27
431	add	x22,x22,x0			// h+=X[i]
432	orr	x17,x17,x28			// Ch(e,f,g)
433	eor	x28,x23,x24			// a^b, b^c in next round
434	eor	x16,x16,x8,ror#18	// Sigma1(e)
435	ror	x8,x23,#28
436	add	x22,x22,x17			// h+=Ch(e,f,g)
437	eor	x17,x23,x23,ror#5
438	add	x22,x22,x16			// h+=Sigma1(e)
439	and	x19,x19,x28			// (b^c)&=(a^b)
440	add	x26,x26,x22			// d+=h
441	eor	x19,x19,x24			// Maj(a,b,c)
442	eor	x17,x8,x17,ror#34	// Sigma0(a)
443	add	x22,x22,x19			// h+=Maj(a,b,c)
444	ldr	x19,[x30],#8		// *K++, x28 in next round
445	//add	x22,x22,x17			// h+=Sigma0(a)
446#ifndef	__AARCH64EB__
447	rev	x1,x1			// 14
448#endif
	// Reload the word spilled from x6 before its slot (sp+24) is reused.
449	ldr	x6,[sp,#24]
450	add	x22,x22,x17			// h+=Sigma0(a)
451	str	x9,[sp,#16]
452	ror	x16,x26,#14
453	add	x21,x21,x19			// h+=K[i]
454	eor	x9,x26,x26,ror#23
455	and	x17,x27,x26
456	bic	x19,x20,x26
457	add	x21,x21,x1			// h+=X[i]
458	orr	x17,x17,x19			// Ch(e,f,g)
459	eor	x19,x22,x23			// a^b, b^c in next round
460	eor	x16,x16,x9,ror#18	// Sigma1(e)
461	ror	x9,x22,#28
462	add	x21,x21,x17			// h+=Ch(e,f,g)
463	eor	x17,x22,x22,ror#5
464	add	x21,x21,x16			// h+=Sigma1(e)
465	and	x28,x28,x19			// (b^c)&=(a^b)
466	add	x25,x25,x21			// d+=h
467	eor	x28,x28,x23			// Maj(a,b,c)
468	eor	x17,x9,x17,ror#34	// Sigma0(a)
469	add	x21,x21,x28			// h+=Maj(a,b,c)
470	ldr	x28,[x30],#8		// *K++, x19 in next round
471	//add	x21,x21,x17			// h+=Sigma0(a)
	// Round 15 has the same instruction shape as the rounds in .Loop_16_xx:
	// besides the round itself it builds the first expanded message word,
	// x3 = x3 + x12 + sigma0(x4) + sigma1(x1), using x9 and x8 as the
	// sigma0/sigma1 accumulators.
472#ifndef	__AARCH64EB__
473	rev	x2,x2			// 15
474#endif
475	ldr	x7,[sp,#0]
476	add	x21,x21,x17			// h+=Sigma0(a)
477	str	x10,[sp,#24]
478	ror	x16,x25,#14
479	add	x20,x20,x28			// h+=K[i]
480	ror	x9,x4,#1
481	and	x17,x26,x25
482	ror	x8,x1,#19
483	bic	x28,x27,x25
484	ror	x10,x21,#28
485	add	x20,x20,x2			// h+=X[i]
486	eor	x16,x16,x25,ror#18
487	eor	x9,x9,x4,ror#8
488	orr	x17,x17,x28			// Ch(e,f,g)
489	eor	x28,x21,x22			// a^b, b^c in next round
490	eor	x16,x16,x25,ror#41	// Sigma1(e)
491	eor	x10,x10,x21,ror#34
492	add	x20,x20,x17			// h+=Ch(e,f,g)
493	and	x19,x19,x28			// (b^c)&=(a^b)
494	eor	x8,x8,x1,ror#61
495	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
496	add	x20,x20,x16			// h+=Sigma1(e)
497	eor	x19,x19,x22			// Maj(a,b,c)
498	eor	x17,x10,x21,ror#39	// Sigma0(a)
499	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
500	add	x3,x3,x12
501	add	x24,x24,x20			// d+=h
502	add	x20,x20,x19			// h+=Maj(a,b,c)
503	ldr	x19,[x30],#8		// *K++, x28 in next round
504	add	x3,x3,x9
505	add	x20,x20,x17			// h+=Sigma0(a)
506	add	x3,x3,x8
// .Loop_16_xx: 16 unrolled rounds per pass. The pass is repeated until the
// K word fetched for the following round is zero (cbnz at the bottom).
// Each round
//   - adds the message word prepared by the previous round ("h+=X[i]"), and
//   - prepares the next word in place:
//       X[i] += X[i+9] + sigma0(X[i+1]) + sigma1(X[i+14])   (indices mod 16)
//     where sigma0(x) = ror(x,1)^ror(x,8)^(x>>7) and
//           sigma1(x) = ror(x,19)^ror(x,61)^(x>>6).
// The sixteen words occupy x3-x15, x0, x1, x2 in that order. They are
// rotated through the four 8-byte scratch slots at sp+0..sp+24: every round
// reloads one word (ldr) and spills another (str); the register just spilled
// then serves as the Sigma0(a) temporary for that round.
// Working variables a..h stay in x20-x27 with rotating roles; x19/x28
// alternate as K word and a^b carrier; x16/x17 are temporaries.
507.Loop_16_xx:
	// -- round uses x3, prepares x4 --
508	ldr	x8,[sp,#8]
509	str	x11,[sp,#0]
510	ror	x16,x24,#14
511	add	x27,x27,x19			// h+=K[i]
512	ror	x10,x5,#1
513	and	x17,x25,x24
514	ror	x9,x2,#19
515	bic	x19,x26,x24
516	ror	x11,x20,#28
517	add	x27,x27,x3			// h+=X[i]
518	eor	x16,x16,x24,ror#18
519	eor	x10,x10,x5,ror#8
520	orr	x17,x17,x19			// Ch(e,f,g)
521	eor	x19,x20,x21			// a^b, b^c in next round
522	eor	x16,x16,x24,ror#41	// Sigma1(e)
523	eor	x11,x11,x20,ror#34
524	add	x27,x27,x17			// h+=Ch(e,f,g)
525	and	x28,x28,x19			// (b^c)&=(a^b)
526	eor	x9,x9,x2,ror#61
527	eor	x10,x10,x5,lsr#7	// sigma0(X[i+1])
528	add	x27,x27,x16			// h+=Sigma1(e)
529	eor	x28,x28,x21			// Maj(a,b,c)
530	eor	x17,x11,x20,ror#39	// Sigma0(a)
531	eor	x9,x9,x2,lsr#6	// sigma1(X[i+14])
532	add	x4,x4,x13
533	add	x23,x23,x27			// d+=h
534	add	x27,x27,x28			// h+=Maj(a,b,c)
535	ldr	x28,[x30],#8		// *K++, x19 in next round
536	add	x4,x4,x10
537	add	x27,x27,x17			// h+=Sigma0(a)
538	add	x4,x4,x9
	// -- round uses x4, prepares x5 --
539	ldr	x9,[sp,#16]
540	str	x12,[sp,#8]
541	ror	x16,x23,#14
542	add	x26,x26,x28			// h+=K[i]
543	ror	x11,x6,#1
544	and	x17,x24,x23
545	ror	x10,x3,#19
546	bic	x28,x25,x23
547	ror	x12,x27,#28
548	add	x26,x26,x4			// h+=X[i]
549	eor	x16,x16,x23,ror#18
550	eor	x11,x11,x6,ror#8
551	orr	x17,x17,x28			// Ch(e,f,g)
552	eor	x28,x27,x20			// a^b, b^c in next round
553	eor	x16,x16,x23,ror#41	// Sigma1(e)
554	eor	x12,x12,x27,ror#34
555	add	x26,x26,x17			// h+=Ch(e,f,g)
556	and	x19,x19,x28			// (b^c)&=(a^b)
557	eor	x10,x10,x3,ror#61
558	eor	x11,x11,x6,lsr#7	// sigma0(X[i+1])
559	add	x26,x26,x16			// h+=Sigma1(e)
560	eor	x19,x19,x20			// Maj(a,b,c)
561	eor	x17,x12,x27,ror#39	// Sigma0(a)
562	eor	x10,x10,x3,lsr#6	// sigma1(X[i+14])
563	add	x5,x5,x14
564	add	x22,x22,x26			// d+=h
565	add	x26,x26,x19			// h+=Maj(a,b,c)
566	ldr	x19,[x30],#8		// *K++, x28 in next round
567	add	x5,x5,x11
568	add	x26,x26,x17			// h+=Sigma0(a)
569	add	x5,x5,x10
	// -- round uses x5, prepares x6 --
570	ldr	x10,[sp,#24]
571	str	x13,[sp,#16]
572	ror	x16,x22,#14
573	add	x25,x25,x19			// h+=K[i]
574	ror	x12,x7,#1
575	and	x17,x23,x22
576	ror	x11,x4,#19
577	bic	x19,x24,x22
578	ror	x13,x26,#28
579	add	x25,x25,x5			// h+=X[i]
580	eor	x16,x16,x22,ror#18
581	eor	x12,x12,x7,ror#8
582	orr	x17,x17,x19			// Ch(e,f,g)
583	eor	x19,x26,x27			// a^b, b^c in next round
584	eor	x16,x16,x22,ror#41	// Sigma1(e)
585	eor	x13,x13,x26,ror#34
586	add	x25,x25,x17			// h+=Ch(e,f,g)
587	and	x28,x28,x19			// (b^c)&=(a^b)
588	eor	x11,x11,x4,ror#61
589	eor	x12,x12,x7,lsr#7	// sigma0(X[i+1])
590	add	x25,x25,x16			// h+=Sigma1(e)
591	eor	x28,x28,x27			// Maj(a,b,c)
592	eor	x17,x13,x26,ror#39	// Sigma0(a)
593	eor	x11,x11,x4,lsr#6	// sigma1(X[i+14])
594	add	x6,x6,x15
595	add	x21,x21,x25			// d+=h
596	add	x25,x25,x28			// h+=Maj(a,b,c)
597	ldr	x28,[x30],#8		// *K++, x19 in next round
598	add	x6,x6,x12
599	add	x25,x25,x17			// h+=Sigma0(a)
600	add	x6,x6,x11
	// -- round uses x6, prepares x7 --
601	ldr	x11,[sp,#0]
602	str	x14,[sp,#24]
603	ror	x16,x21,#14
604	add	x24,x24,x28			// h+=K[i]
605	ror	x13,x8,#1
606	and	x17,x22,x21
607	ror	x12,x5,#19
608	bic	x28,x23,x21
609	ror	x14,x25,#28
610	add	x24,x24,x6			// h+=X[i]
611	eor	x16,x16,x21,ror#18
612	eor	x13,x13,x8,ror#8
613	orr	x17,x17,x28			// Ch(e,f,g)
614	eor	x28,x25,x26			// a^b, b^c in next round
615	eor	x16,x16,x21,ror#41	// Sigma1(e)
616	eor	x14,x14,x25,ror#34
617	add	x24,x24,x17			// h+=Ch(e,f,g)
618	and	x19,x19,x28			// (b^c)&=(a^b)
619	eor	x12,x12,x5,ror#61
620	eor	x13,x13,x8,lsr#7	// sigma0(X[i+1])
621	add	x24,x24,x16			// h+=Sigma1(e)
622	eor	x19,x19,x26			// Maj(a,b,c)
623	eor	x17,x14,x25,ror#39	// Sigma0(a)
624	eor	x12,x12,x5,lsr#6	// sigma1(X[i+14])
625	add	x7,x7,x0
626	add	x20,x20,x24			// d+=h
627	add	x24,x24,x19			// h+=Maj(a,b,c)
628	ldr	x19,[x30],#8		// *K++, x28 in next round
629	add	x7,x7,x13
630	add	x24,x24,x17			// h+=Sigma0(a)
631	add	x7,x7,x12
	// -- round uses x7, prepares x8 --
632	ldr	x12,[sp,#8]
633	str	x15,[sp,#0]
634	ror	x16,x20,#14
635	add	x23,x23,x19			// h+=K[i]
636	ror	x14,x9,#1
637	and	x17,x21,x20
638	ror	x13,x6,#19
639	bic	x19,x22,x20
640	ror	x15,x24,#28
641	add	x23,x23,x7			// h+=X[i]
642	eor	x16,x16,x20,ror#18
643	eor	x14,x14,x9,ror#8
644	orr	x17,x17,x19			// Ch(e,f,g)
645	eor	x19,x24,x25			// a^b, b^c in next round
646	eor	x16,x16,x20,ror#41	// Sigma1(e)
647	eor	x15,x15,x24,ror#34
648	add	x23,x23,x17			// h+=Ch(e,f,g)
649	and	x28,x28,x19			// (b^c)&=(a^b)
650	eor	x13,x13,x6,ror#61
651	eor	x14,x14,x9,lsr#7	// sigma0(X[i+1])
652	add	x23,x23,x16			// h+=Sigma1(e)
653	eor	x28,x28,x25			// Maj(a,b,c)
654	eor	x17,x15,x24,ror#39	// Sigma0(a)
655	eor	x13,x13,x6,lsr#6	// sigma1(X[i+14])
656	add	x8,x8,x1
657	add	x27,x27,x23			// d+=h
658	add	x23,x23,x28			// h+=Maj(a,b,c)
659	ldr	x28,[x30],#8		// *K++, x19 in next round
660	add	x8,x8,x14
661	add	x23,x23,x17			// h+=Sigma0(a)
662	add	x8,x8,x13
	// -- round uses x8, prepares x9 --
663	ldr	x13,[sp,#16]
664	str	x0,[sp,#8]
665	ror	x16,x27,#14
666	add	x22,x22,x28			// h+=K[i]
667	ror	x15,x10,#1
668	and	x17,x20,x27
669	ror	x14,x7,#19
670	bic	x28,x21,x27
671	ror	x0,x23,#28
672	add	x22,x22,x8			// h+=X[i]
673	eor	x16,x16,x27,ror#18
674	eor	x15,x15,x10,ror#8
675	orr	x17,x17,x28			// Ch(e,f,g)
676	eor	x28,x23,x24			// a^b, b^c in next round
677	eor	x16,x16,x27,ror#41	// Sigma1(e)
678	eor	x0,x0,x23,ror#34
679	add	x22,x22,x17			// h+=Ch(e,f,g)
680	and	x19,x19,x28			// (b^c)&=(a^b)
681	eor	x14,x14,x7,ror#61
682	eor	x15,x15,x10,lsr#7	// sigma0(X[i+1])
683	add	x22,x22,x16			// h+=Sigma1(e)
684	eor	x19,x19,x24			// Maj(a,b,c)
685	eor	x17,x0,x23,ror#39	// Sigma0(a)
686	eor	x14,x14,x7,lsr#6	// sigma1(X[i+14])
687	add	x9,x9,x2
688	add	x26,x26,x22			// d+=h
689	add	x22,x22,x19			// h+=Maj(a,b,c)
690	ldr	x19,[x30],#8		// *K++, x28 in next round
691	add	x9,x9,x15
692	add	x22,x22,x17			// h+=Sigma0(a)
693	add	x9,x9,x14
	// -- round uses x9, prepares x10 --
694	ldr	x14,[sp,#24]
695	str	x1,[sp,#16]
696	ror	x16,x26,#14
697	add	x21,x21,x19			// h+=K[i]
698	ror	x0,x11,#1
699	and	x17,x27,x26
700	ror	x15,x8,#19
701	bic	x19,x20,x26
702	ror	x1,x22,#28
703	add	x21,x21,x9			// h+=X[i]
704	eor	x16,x16,x26,ror#18
705	eor	x0,x0,x11,ror#8
706	orr	x17,x17,x19			// Ch(e,f,g)
707	eor	x19,x22,x23			// a^b, b^c in next round
708	eor	x16,x16,x26,ror#41	// Sigma1(e)
709	eor	x1,x1,x22,ror#34
710	add	x21,x21,x17			// h+=Ch(e,f,g)
711	and	x28,x28,x19			// (b^c)&=(a^b)
712	eor	x15,x15,x8,ror#61
713	eor	x0,x0,x11,lsr#7	// sigma0(X[i+1])
714	add	x21,x21,x16			// h+=Sigma1(e)
715	eor	x28,x28,x23			// Maj(a,b,c)
716	eor	x17,x1,x22,ror#39	// Sigma0(a)
717	eor	x15,x15,x8,lsr#6	// sigma1(X[i+14])
718	add	x10,x10,x3
719	add	x25,x25,x21			// d+=h
720	add	x21,x21,x28			// h+=Maj(a,b,c)
721	ldr	x28,[x30],#8		// *K++, x19 in next round
722	add	x10,x10,x0
723	add	x21,x21,x17			// h+=Sigma0(a)
724	add	x10,x10,x15
	// -- round uses x10, prepares x11 --
725	ldr	x15,[sp,#0]
726	str	x2,[sp,#24]
727	ror	x16,x25,#14
728	add	x20,x20,x28			// h+=K[i]
729	ror	x1,x12,#1
730	and	x17,x26,x25
731	ror	x0,x9,#19
732	bic	x28,x27,x25
733	ror	x2,x21,#28
734	add	x20,x20,x10			// h+=X[i]
735	eor	x16,x16,x25,ror#18
736	eor	x1,x1,x12,ror#8
737	orr	x17,x17,x28			// Ch(e,f,g)
738	eor	x28,x21,x22			// a^b, b^c in next round
739	eor	x16,x16,x25,ror#41	// Sigma1(e)
740	eor	x2,x2,x21,ror#34
741	add	x20,x20,x17			// h+=Ch(e,f,g)
742	and	x19,x19,x28			// (b^c)&=(a^b)
743	eor	x0,x0,x9,ror#61
744	eor	x1,x1,x12,lsr#7	// sigma0(X[i+1])
745	add	x20,x20,x16			// h+=Sigma1(e)
746	eor	x19,x19,x22			// Maj(a,b,c)
747	eor	x17,x2,x21,ror#39	// Sigma0(a)
748	eor	x0,x0,x9,lsr#6	// sigma1(X[i+14])
749	add	x11,x11,x4
750	add	x24,x24,x20			// d+=h
751	add	x20,x20,x19			// h+=Maj(a,b,c)
752	ldr	x19,[x30],#8		// *K++, x28 in next round
753	add	x11,x11,x1
754	add	x20,x20,x17			// h+=Sigma0(a)
755	add	x11,x11,x0
	// -- round uses x11, prepares x12 --
756	ldr	x0,[sp,#8]
757	str	x3,[sp,#0]
758	ror	x16,x24,#14
759	add	x27,x27,x19			// h+=K[i]
760	ror	x2,x13,#1
761	and	x17,x25,x24
762	ror	x1,x10,#19
763	bic	x19,x26,x24
764	ror	x3,x20,#28
765	add	x27,x27,x11			// h+=X[i]
766	eor	x16,x16,x24,ror#18
767	eor	x2,x2,x13,ror#8
768	orr	x17,x17,x19			// Ch(e,f,g)
769	eor	x19,x20,x21			// a^b, b^c in next round
770	eor	x16,x16,x24,ror#41	// Sigma1(e)
771	eor	x3,x3,x20,ror#34
772	add	x27,x27,x17			// h+=Ch(e,f,g)
773	and	x28,x28,x19			// (b^c)&=(a^b)
774	eor	x1,x1,x10,ror#61
775	eor	x2,x2,x13,lsr#7	// sigma0(X[i+1])
776	add	x27,x27,x16			// h+=Sigma1(e)
777	eor	x28,x28,x21			// Maj(a,b,c)
778	eor	x17,x3,x20,ror#39	// Sigma0(a)
779	eor	x1,x1,x10,lsr#6	// sigma1(X[i+14])
780	add	x12,x12,x5
781	add	x23,x23,x27			// d+=h
782	add	x27,x27,x28			// h+=Maj(a,b,c)
783	ldr	x28,[x30],#8		// *K++, x19 in next round
784	add	x12,x12,x2
785	add	x27,x27,x17			// h+=Sigma0(a)
786	add	x12,x12,x1
	// -- round uses x12, prepares x13 --
787	ldr	x1,[sp,#16]
788	str	x4,[sp,#8]
789	ror	x16,x23,#14
790	add	x26,x26,x28			// h+=K[i]
791	ror	x3,x14,#1
792	and	x17,x24,x23
793	ror	x2,x11,#19
794	bic	x28,x25,x23
795	ror	x4,x27,#28
796	add	x26,x26,x12			// h+=X[i]
797	eor	x16,x16,x23,ror#18
798	eor	x3,x3,x14,ror#8
799	orr	x17,x17,x28			// Ch(e,f,g)
800	eor	x28,x27,x20			// a^b, b^c in next round
801	eor	x16,x16,x23,ror#41	// Sigma1(e)
802	eor	x4,x4,x27,ror#34
803	add	x26,x26,x17			// h+=Ch(e,f,g)
804	and	x19,x19,x28			// (b^c)&=(a^b)
805	eor	x2,x2,x11,ror#61
806	eor	x3,x3,x14,lsr#7	// sigma0(X[i+1])
807	add	x26,x26,x16			// h+=Sigma1(e)
808	eor	x19,x19,x20			// Maj(a,b,c)
809	eor	x17,x4,x27,ror#39	// Sigma0(a)
810	eor	x2,x2,x11,lsr#6	// sigma1(X[i+14])
811	add	x13,x13,x6
812	add	x22,x22,x26			// d+=h
813	add	x26,x26,x19			// h+=Maj(a,b,c)
814	ldr	x19,[x30],#8		// *K++, x28 in next round
815	add	x13,x13,x3
816	add	x26,x26,x17			// h+=Sigma0(a)
817	add	x13,x13,x2
	// -- round uses x13, prepares x14 --
818	ldr	x2,[sp,#24]
819	str	x5,[sp,#16]
820	ror	x16,x22,#14
821	add	x25,x25,x19			// h+=K[i]
822	ror	x4,x15,#1
823	and	x17,x23,x22
824	ror	x3,x12,#19
825	bic	x19,x24,x22
826	ror	x5,x26,#28
827	add	x25,x25,x13			// h+=X[i]
828	eor	x16,x16,x22,ror#18
829	eor	x4,x4,x15,ror#8
830	orr	x17,x17,x19			// Ch(e,f,g)
831	eor	x19,x26,x27			// a^b, b^c in next round
832	eor	x16,x16,x22,ror#41	// Sigma1(e)
833	eor	x5,x5,x26,ror#34
834	add	x25,x25,x17			// h+=Ch(e,f,g)
835	and	x28,x28,x19			// (b^c)&=(a^b)
836	eor	x3,x3,x12,ror#61
837	eor	x4,x4,x15,lsr#7	// sigma0(X[i+1])
838	add	x25,x25,x16			// h+=Sigma1(e)
839	eor	x28,x28,x27			// Maj(a,b,c)
840	eor	x17,x5,x26,ror#39	// Sigma0(a)
841	eor	x3,x3,x12,lsr#6	// sigma1(X[i+14])
842	add	x14,x14,x7
843	add	x21,x21,x25			// d+=h
844	add	x25,x25,x28			// h+=Maj(a,b,c)
845	ldr	x28,[x30],#8		// *K++, x19 in next round
846	add	x14,x14,x4
847	add	x25,x25,x17			// h+=Sigma0(a)
848	add	x14,x14,x3
	// -- round uses x14, prepares x15 --
849	ldr	x3,[sp,#0]
850	str	x6,[sp,#24]
851	ror	x16,x21,#14
852	add	x24,x24,x28			// h+=K[i]
853	ror	x5,x0,#1
854	and	x17,x22,x21
855	ror	x4,x13,#19
856	bic	x28,x23,x21
857	ror	x6,x25,#28
858	add	x24,x24,x14			// h+=X[i]
859	eor	x16,x16,x21,ror#18
860	eor	x5,x5,x0,ror#8
861	orr	x17,x17,x28			// Ch(e,f,g)
862	eor	x28,x25,x26			// a^b, b^c in next round
863	eor	x16,x16,x21,ror#41	// Sigma1(e)
864	eor	x6,x6,x25,ror#34
865	add	x24,x24,x17			// h+=Ch(e,f,g)
866	and	x19,x19,x28			// (b^c)&=(a^b)
867	eor	x4,x4,x13,ror#61
868	eor	x5,x5,x0,lsr#7	// sigma0(X[i+1])
869	add	x24,x24,x16			// h+=Sigma1(e)
870	eor	x19,x19,x26			// Maj(a,b,c)
871	eor	x17,x6,x25,ror#39	// Sigma0(a)
872	eor	x4,x4,x13,lsr#6	// sigma1(X[i+14])
873	add	x15,x15,x8
874	add	x20,x20,x24			// d+=h
875	add	x24,x24,x19			// h+=Maj(a,b,c)
876	ldr	x19,[x30],#8		// *K++, x28 in next round
877	add	x15,x15,x5
878	add	x24,x24,x17			// h+=Sigma0(a)
879	add	x15,x15,x4
	// -- round uses x15, prepares x0 --
880	ldr	x4,[sp,#8]
881	str	x7,[sp,#0]
882	ror	x16,x20,#14
883	add	x23,x23,x19			// h+=K[i]
884	ror	x6,x1,#1
885	and	x17,x21,x20
886	ror	x5,x14,#19
887	bic	x19,x22,x20
888	ror	x7,x24,#28
889	add	x23,x23,x15			// h+=X[i]
890	eor	x16,x16,x20,ror#18
891	eor	x6,x6,x1,ror#8
892	orr	x17,x17,x19			// Ch(e,f,g)
893	eor	x19,x24,x25			// a^b, b^c in next round
894	eor	x16,x16,x20,ror#41	// Sigma1(e)
895	eor	x7,x7,x24,ror#34
896	add	x23,x23,x17			// h+=Ch(e,f,g)
897	and	x28,x28,x19			// (b^c)&=(a^b)
898	eor	x5,x5,x14,ror#61
899	eor	x6,x6,x1,lsr#7	// sigma0(X[i+1])
900	add	x23,x23,x16			// h+=Sigma1(e)
901	eor	x28,x28,x25			// Maj(a,b,c)
902	eor	x17,x7,x24,ror#39	// Sigma0(a)
903	eor	x5,x5,x14,lsr#6	// sigma1(X[i+14])
904	add	x0,x0,x9
905	add	x27,x27,x23			// d+=h
906	add	x23,x23,x28			// h+=Maj(a,b,c)
907	ldr	x28,[x30],#8		// *K++, x19 in next round
908	add	x0,x0,x6
909	add	x23,x23,x17			// h+=Sigma0(a)
910	add	x0,x0,x5
	// -- round uses x0, prepares x1 --
911	ldr	x5,[sp,#16]
912	str	x8,[sp,#8]
913	ror	x16,x27,#14
914	add	x22,x22,x28			// h+=K[i]
915	ror	x7,x2,#1
916	and	x17,x20,x27
917	ror	x6,x15,#19
918	bic	x28,x21,x27
919	ror	x8,x23,#28
920	add	x22,x22,x0			// h+=X[i]
921	eor	x16,x16,x27,ror#18
922	eor	x7,x7,x2,ror#8
923	orr	x17,x17,x28			// Ch(e,f,g)
924	eor	x28,x23,x24			// a^b, b^c in next round
925	eor	x16,x16,x27,ror#41	// Sigma1(e)
926	eor	x8,x8,x23,ror#34
927	add	x22,x22,x17			// h+=Ch(e,f,g)
928	and	x19,x19,x28			// (b^c)&=(a^b)
929	eor	x6,x6,x15,ror#61
930	eor	x7,x7,x2,lsr#7	// sigma0(X[i+1])
931	add	x22,x22,x16			// h+=Sigma1(e)
932	eor	x19,x19,x24			// Maj(a,b,c)
933	eor	x17,x8,x23,ror#39	// Sigma0(a)
934	eor	x6,x6,x15,lsr#6	// sigma1(X[i+14])
935	add	x1,x1,x10
936	add	x26,x26,x22			// d+=h
937	add	x22,x22,x19			// h+=Maj(a,b,c)
938	ldr	x19,[x30],#8		// *K++, x28 in next round
939	add	x1,x1,x7
940	add	x22,x22,x17			// h+=Sigma0(a)
941	add	x1,x1,x6
	// -- round uses x1, prepares x2 --
942	ldr	x6,[sp,#24]
943	str	x9,[sp,#16]
944	ror	x16,x26,#14
945	add	x21,x21,x19			// h+=K[i]
946	ror	x8,x3,#1
947	and	x17,x27,x26
948	ror	x7,x0,#19
949	bic	x19,x20,x26
950	ror	x9,x22,#28
951	add	x21,x21,x1			// h+=X[i]
952	eor	x16,x16,x26,ror#18
953	eor	x8,x8,x3,ror#8
954	orr	x17,x17,x19			// Ch(e,f,g)
955	eor	x19,x22,x23			// a^b, b^c in next round
956	eor	x16,x16,x26,ror#41	// Sigma1(e)
957	eor	x9,x9,x22,ror#34
958	add	x21,x21,x17			// h+=Ch(e,f,g)
959	and	x28,x28,x19			// (b^c)&=(a^b)
960	eor	x7,x7,x0,ror#61
961	eor	x8,x8,x3,lsr#7	// sigma0(X[i+1])
962	add	x21,x21,x16			// h+=Sigma1(e)
963	eor	x28,x28,x23			// Maj(a,b,c)
964	eor	x17,x9,x22,ror#39	// Sigma0(a)
965	eor	x7,x7,x0,lsr#6	// sigma1(X[i+14])
966	add	x2,x2,x11
967	add	x25,x25,x21			// d+=h
968	add	x21,x21,x28			// h+=Maj(a,b,c)
969	ldr	x28,[x30],#8		// *K++, x19 in next round
970	add	x2,x2,x8
971	add	x21,x21,x17			// h+=Sigma0(a)
972	add	x2,x2,x7
	// -- round uses x2, prepares x3 (consumed at the top of the next pass) --
973	ldr	x7,[sp,#0]
974	str	x10,[sp,#24]
975	ror	x16,x25,#14
976	add	x20,x20,x28			// h+=K[i]
977	ror	x9,x4,#1
978	and	x17,x26,x25
979	ror	x8,x1,#19
980	bic	x28,x27,x25
981	ror	x10,x21,#28
982	add	x20,x20,x2			// h+=X[i]
983	eor	x16,x16,x25,ror#18
984	eor	x9,x9,x4,ror#8
985	orr	x17,x17,x28			// Ch(e,f,g)
986	eor	x28,x21,x22			// a^b, b^c in next round
987	eor	x16,x16,x25,ror#41	// Sigma1(e)
988	eor	x10,x10,x21,ror#34
989	add	x20,x20,x17			// h+=Ch(e,f,g)
990	and	x19,x19,x28			// (b^c)&=(a^b)
991	eor	x8,x8,x1,ror#61
992	eor	x9,x9,x4,lsr#7	// sigma0(X[i+1])
993	add	x20,x20,x16			// h+=Sigma1(e)
994	eor	x19,x19,x22			// Maj(a,b,c)
995	eor	x17,x10,x21,ror#39	// Sigma0(a)
996	eor	x8,x8,x1,lsr#6	// sigma1(X[i+14])
997	add	x3,x3,x12
998	add	x24,x24,x20			// d+=h
999	add	x20,x20,x19			// h+=Maj(a,b,c)
1000	ldr	x19,[x30],#8		// *K++, x28 in next round
1001	add	x3,x3,x9
1002	add	x20,x20,x17			// h+=Sigma0(a)
1003	add	x3,x3,x8
	// x19 holds the K word fetched for the following round; a zero word
	// ends the rounds for this block.
	// NOTE(review): relies on .LK512 ending in a zero quadword; the end of
	// the table is outside this view, confirm there.
1004	cbnz	x19,.Loop_16_xx
1005
// Rounds for this block are finished: x0, x1 and x2 were used for message
// words, so reload the context pointer, end-of-input address and input
// pointer from the frame, then add the working variables x20-x27 into the
// context words and store the sums back.
1006	ldp	x0,x2,[x29,#96]
1007	ldr	x1,[x29,#112]
	// 648 = 81*8: moves x30 back by 81 quadwords so that it points at
	// .LK512 again for the next block.
1008	sub	x30,x30,#648		// rewind
1009
1010	ldp	x3,x4,[x0]
1011	ldp	x5,x6,[x0,#2*8]
	// The saved pointer was already 2*8 bytes into the block, so adding
	// 14*8 lands on the start of the next 128-byte block.
1012	add	x1,x1,#14*8			// advance input pointer
1013	ldp	x7,x8,[x0,#4*8]
1014	add	x20,x20,x3
1015	ldp	x9,x10,[x0,#6*8]
1016	add	x21,x21,x4
1017	add	x22,x22,x5
1018	add	x23,x23,x6
1019	stp	x20,x21,[x0]
1020	add	x24,x24,x7
1021	add	x25,x25,x8
1022	stp	x22,x23,[x0,#2*8]
1023	add	x26,x26,x9
1024	add	x27,x27,x10
	// Continue while the input pointer differs from the end address. The
	// stp instructions between cmp and b.ne do not modify the flags.
	// NOTE(review): the test is for inequality and runs only after a block
	// has been processed, so a block count of 0 on entry is presumably never
	// passed by callers - confirm.
1025	cmp	x1,x2
1026	stp	x24,x25,[x0,#4*8]
1027	stp	x26,x27,[x0,#6*8]
1028	b.ne	.Loop
1029
	// Epilogue: restore x19-x28 through x29, release the 4*8-byte scratch
	// area, pop the 128-byte frame (restoring x29/x30) and return.
1030	ldp	x19,x20,[x29,#16]
1031	add	sp,sp,#4*8
1032	ldp	x21,x22,[x29,#32]
1033	ldp	x23,x24,[x29,#48]
1034	ldp	x25,x26,[x29,#64]
1035	ldp	x27,x28,[x29,#80]
1036	ldp	x29,x30,[sp],#128
1037.inst	0xd50323bf				// autiasp
1038	ret
1039.size	sha512_block_data_order,.-sha512_block_data_order
1040
// ---------------------------------------------------------------------
// .LK512 - SHA-512 round-constant table.
//
// Layout: eighty 64-bit constants (the 40 two-value .quad lines below),
// followed by one zero quad, 81*8 = 648 bytes in total.
//
// Consumers visible in this file:
//   * the scalar loop above reads one constant at a time with a
//     post-incremented ldr ("*K++"), leaves the loop when the value it
//     loaded is zero (cbnz ...,.Loop_16_xx) and then rewinds its
//     pointer by #648 - i.e. the zero quad is an end-of-table sentinel
//     (presumably that pointer is set to .LK512 outside this view;
//     TODO confirm);
//   * sha512_block_armv8 below takes the address with "adr" and reads
//     the constants 16 bytes at a time, rewinding by #80*8, so it never
//     reads the sentinel.
// Both rewind amounts hard-code the table size: adding or removing an
// entry requires updating them.
//
// On AArch64 gas ".align n" aligns to 2^n bytes, so the table starts on
// a 64-byte boundary.
// ---------------------------------------------------------------------
1041.align	6
1042.type	.LK512,%object
1043.LK512:
1044.quad	0x428a2f98d728ae22,0x7137449123ef65cd
1045.quad	0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
1046.quad	0x3956c25bf348b538,0x59f111f1b605d019
1047.quad	0x923f82a4af194f9b,0xab1c5ed5da6d8118
1048.quad	0xd807aa98a3030242,0x12835b0145706fbe
1049.quad	0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
1050.quad	0x72be5d74f27b896f,0x80deb1fe3b1696b1
1051.quad	0x9bdc06a725c71235,0xc19bf174cf692694
1052.quad	0xe49b69c19ef14ad2,0xefbe4786384f25e3
1053.quad	0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
1054.quad	0x2de92c6f592b0275,0x4a7484aa6ea6e483
1055.quad	0x5cb0a9dcbd41fbd4,0x76f988da831153b5
1056.quad	0x983e5152ee66dfab,0xa831c66d2db43210
1057.quad	0xb00327c898fb213f,0xbf597fc7beef0ee4
1058.quad	0xc6e00bf33da88fc2,0xd5a79147930aa725
1059.quad	0x06ca6351e003826f,0x142929670a0e6e70
1060.quad	0x27b70a8546d22ffc,0x2e1b21385c26c926
1061.quad	0x4d2c6dfc5ac42aed,0x53380d139d95b3df
1062.quad	0x650a73548baf63de,0x766a0abb3c77b2a8
1063.quad	0x81c2c92e47edaee6,0x92722c851482353b
1064.quad	0xa2bfe8a14cf10364,0xa81a664bbc423001
1065.quad	0xc24b8b70d0f89791,0xc76c51a30654be30
1066.quad	0xd192e819d6ef5218,0xd69906245565a910
1067.quad	0xf40e35855771202a,0x106aa07032bbd1b8
1068.quad	0x19a4c116b8d2d0c8,0x1e376c085141ab53
1069.quad	0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
1070.quad	0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
1071.quad	0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
1072.quad	0x748f82ee5defb2fc,0x78a5636f43172f60
1073.quad	0x84c87814a1f0ab72,0x8cc702081a6439ec
1074.quad	0x90befffa23631e28,0xa4506cebde82bde9
1075.quad	0xbef9a3f7b2c67915,0xc67178f2e372532b
1076.quad	0xca273eceea26619c,0xd186b8c721c0c207
1077.quad	0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
1078.quad	0x06f067aa72176fba,0x0a637dc5a2c898a6
1079.quad	0x113f9804bef90dae,0x1b710b35131c471b
1080.quad	0x28db77f523047d84,0x32caab7b40c72493
1081.quad	0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
1082.quad	0x4cc5d4becb3e42b6,0x597f299cfc657e2a
1083.quad	0x5fcb6fab3ad6faec,0x6c44198c4a475817
1084.quad	0	// terminator
1085.size	.LK512,.-.LK512
// NUL-terminated ASCII identification string. The bytes spell
// "SHA512 block transform for ARMv8, CRYPTOGAMS by " followed by the
// author's e-mail address in angle brackets. It is emitted after the
// .size directive, so it is not counted as part of the .LK512 object.
1086.byte	83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align to 4 bytes after the odd-length string so that whatever
// follows is instruction-aligned.
// NOTE(review): the second ".align 2" is a no-op once the first has
// been applied; it looks like a generator artifact and is harmless.
1087.align	2
1088.align	2
// ---------------------------------------------------------------------
// sha512_block_armv8 - SHA-512 block function using the Armv8 SHA-512
// extension instructions (SHA512H, SHA512H2, SHA512SU0, SHA512SU1).
// Those instructions are emitted as raw .inst words; the trailing
// comment on each .inst line gives the intended mnemonic (presumably so
// the file still assembles with toolchains that lack the mnemonics).
// The whole function is assembled only when __KERNEL__ is not defined.
//
// Register use, as established by the code below:
//   x0      - pointer to the 64-byte hash state: loaded into v0-v3 on
//             entry, stored back once after the last block
//   x1      - input pointer; 128 bytes are consumed per block
//   x2      - block count; decremented once per .Loop_hw pass, the
//             loop ends when it reaches zero. A count of zero on entry
//             is not checked and would wrap around - presumably callers
//             guarantee at least one block; TODO confirm.
//   x3      - cursor into .LK512 (16 bytes per load, rewound per block)
//   x4      - scratch: x1-128, used for the last-block rewind of x1
//   v0-v4   - working state; the five registers rotate roles with a
//             period of five groups
//   v5-v7   - temporaries produced by ext
//   v16-v23 - the 16-word (128-byte) message schedule window
//   v24,v25 - K constants plus message words, used alternately
//   v26-v29 - copy of v0-v3 taken at the top of each block
// Only registers listed above are written; x30 is saved in the prologue
// but never modified, so the epilogue reloads x29 only.
// ---------------------------------------------------------------------
1089#ifndef	__KERNEL__
1090.type	sha512_block_armv8,%function
1091.align	6
1092sha512_block_armv8:
1093.Lv8_entry:
	// Prologue: push a 16-byte frame record and point x29 at it.
1094	stp	x29,x30,[sp,#-16]!
1095	add	x29,sp,#0
1096
	// Fetch the first 128-byte block and the hash state, and point x3
	// at the constant table. "adr" is PC-relative, so .LK512 has to stay
	// within ADR range of this code.
1097	ld1	{v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64	// load input
1098	ld1	{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64
1099
1100	ld1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// load context
1101	adr	x3,.LK512
1102
	// The input was loaded as bytes; reverse the bytes inside every
	// 64-bit lane (SHA-512 message words are big-endian).
1103	rev64	v16.16b,v16.16b
1104	rev64	v17.16b,v17.16b
1105	rev64	v18.16b,v18.16b
1106	rev64	v19.16b,v19.16b
1107	rev64	v20.16b,v20.16b
1108	rev64	v21.16b,v21.16b
1109	rev64	v22.16b,v22.16b
1110	rev64	v23.16b,v23.16b
	// Branch over whatever padding the alignment directive below inserts.
1111	b	.Loop_hw
1112
// Per-block loop. On entry to every pass: v0-v3 hold the current hash
// state, v16-v23 hold the byte-swapped message block and x3 points at
// the start of .LK512.
//
// Top of the pass: decrement the block count (the flags feed the csel
// below), copy v0-v3 to v26-v29 for the final accumulate, and - when
// this is the last block (x2 has just reached zero) - move x1 back by
// 128 so that the "load next input" loads at the bottom of the pass
// re-read the block that was just loaded instead of the bytes that
// follow it.
1113.align	4
1114.Loop_hw:
1115	ld1	{v24.2d},[x3],#16
1116	subs	x2,x2,#1
1117	sub	x4,x1,#128
1118	orr	v26.16b,v0.16b,v0.16b			// offload
1119	orr	v27.16b,v1.16b,v1.16b
1120	orr	v28.16b,v2.16b,v2.16b
1121	orr	v29.16b,v3.16b,v3.16b
1122	csel	x1,x1,x4,ne			// conditional rewind
	// Groups 0..31 (rounds 0-63). Each group consumes one 16-byte pair of
	// K constants and one message vector: it adds them, swaps the two
	// 64-bit halves of the sum (ext #8), feeds the sum into the state via
	// sha512h/sha512h2, and extends the message schedule in place with
	// sha512su0/sha512su1. The K pair for the following group is loaded
	// early into the other of v24/v25.
	// rounds 0-1 (W in v16)
1123	add	v24.2d,v24.2d,v16.2d
1124	ld1	{v25.2d},[x3],#16
1125	ext	v24.16b,v24.16b,v24.16b,#8
1126	ext	v5.16b,v2.16b,v3.16b,#8
1127	ext	v6.16b,v1.16b,v2.16b,#8
1128	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1129.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1130	ext	v7.16b,v20.16b,v21.16b,#8
1131.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1132.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1133	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1134.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 2-3 (W in v17)
1135	add	v25.2d,v25.2d,v17.2d
1136	ld1	{v24.2d},[x3],#16
1137	ext	v25.16b,v25.16b,v25.16b,#8
1138	ext	v5.16b,v4.16b,v2.16b,#8
1139	ext	v6.16b,v0.16b,v4.16b,#8
1140	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1141.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1142	ext	v7.16b,v21.16b,v22.16b,#8
1143.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1144.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1145	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1146.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 4-5 (W in v18)
1147	add	v24.2d,v24.2d,v18.2d
1148	ld1	{v25.2d},[x3],#16
1149	ext	v24.16b,v24.16b,v24.16b,#8
1150	ext	v5.16b,v1.16b,v4.16b,#8
1151	ext	v6.16b,v3.16b,v1.16b,#8
1152	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1153.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1154	ext	v7.16b,v22.16b,v23.16b,#8
1155.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1156.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1157	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1158.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 6-7 (W in v19)
1159	add	v25.2d,v25.2d,v19.2d
1160	ld1	{v24.2d},[x3],#16
1161	ext	v25.16b,v25.16b,v25.16b,#8
1162	ext	v5.16b,v0.16b,v1.16b,#8
1163	ext	v6.16b,v2.16b,v0.16b,#8
1164	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1165.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1166	ext	v7.16b,v23.16b,v16.16b,#8
1167.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1168.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1169	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1170.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 8-9 (W in v20)
1171	add	v24.2d,v24.2d,v20.2d
1172	ld1	{v25.2d},[x3],#16
1173	ext	v24.16b,v24.16b,v24.16b,#8
1174	ext	v5.16b,v3.16b,v0.16b,#8
1175	ext	v6.16b,v4.16b,v3.16b,#8
1176	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1177.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1178	ext	v7.16b,v16.16b,v17.16b,#8
1179.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1180.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1181	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1182.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 10-11 (W in v21)
1183	add	v25.2d,v25.2d,v21.2d
1184	ld1	{v24.2d},[x3],#16
1185	ext	v25.16b,v25.16b,v25.16b,#8
1186	ext	v5.16b,v2.16b,v3.16b,#8
1187	ext	v6.16b,v1.16b,v2.16b,#8
1188	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1189.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1190	ext	v7.16b,v17.16b,v18.16b,#8
1191.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1192.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1193	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1194.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 12-13 (W in v22)
1195	add	v24.2d,v24.2d,v22.2d
1196	ld1	{v25.2d},[x3],#16
1197	ext	v24.16b,v24.16b,v24.16b,#8
1198	ext	v5.16b,v4.16b,v2.16b,#8
1199	ext	v6.16b,v0.16b,v4.16b,#8
1200	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1201.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1202	ext	v7.16b,v18.16b,v19.16b,#8
1203.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1204.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1205	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1206.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 14-15 (W in v23)
1207	add	v25.2d,v25.2d,v23.2d
1208	ld1	{v24.2d},[x3],#16
1209	ext	v25.16b,v25.16b,v25.16b,#8
1210	ext	v5.16b,v1.16b,v4.16b,#8
1211	ext	v6.16b,v3.16b,v1.16b,#8
1212	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1213.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1214	ext	v7.16b,v19.16b,v20.16b,#8
1215.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1216.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1217	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1218.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 16-17 (W in v16)
1219	add	v24.2d,v24.2d,v16.2d
1220	ld1	{v25.2d},[x3],#16
1221	ext	v24.16b,v24.16b,v24.16b,#8
1222	ext	v5.16b,v0.16b,v1.16b,#8
1223	ext	v6.16b,v2.16b,v0.16b,#8
1224	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1225.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1226	ext	v7.16b,v20.16b,v21.16b,#8
1227.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1228.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1229	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1230.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 18-19 (W in v17)
1231	add	v25.2d,v25.2d,v17.2d
1232	ld1	{v24.2d},[x3],#16
1233	ext	v25.16b,v25.16b,v25.16b,#8
1234	ext	v5.16b,v3.16b,v0.16b,#8
1235	ext	v6.16b,v4.16b,v3.16b,#8
1236	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1237.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1238	ext	v7.16b,v21.16b,v22.16b,#8
1239.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1240.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1241	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1242.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 20-21 (W in v18)
1243	add	v24.2d,v24.2d,v18.2d
1244	ld1	{v25.2d},[x3],#16
1245	ext	v24.16b,v24.16b,v24.16b,#8
1246	ext	v5.16b,v2.16b,v3.16b,#8
1247	ext	v6.16b,v1.16b,v2.16b,#8
1248	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1249.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1250	ext	v7.16b,v22.16b,v23.16b,#8
1251.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1252.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1253	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1254.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 22-23 (W in v19)
1255	add	v25.2d,v25.2d,v19.2d
1256	ld1	{v24.2d},[x3],#16
1257	ext	v25.16b,v25.16b,v25.16b,#8
1258	ext	v5.16b,v4.16b,v2.16b,#8
1259	ext	v6.16b,v0.16b,v4.16b,#8
1260	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1261.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1262	ext	v7.16b,v23.16b,v16.16b,#8
1263.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1264.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1265	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1266.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 24-25 (W in v20)
1267	add	v24.2d,v24.2d,v20.2d
1268	ld1	{v25.2d},[x3],#16
1269	ext	v24.16b,v24.16b,v24.16b,#8
1270	ext	v5.16b,v1.16b,v4.16b,#8
1271	ext	v6.16b,v3.16b,v1.16b,#8
1272	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1273.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1274	ext	v7.16b,v16.16b,v17.16b,#8
1275.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1276.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1277	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1278.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 26-27 (W in v21)
1279	add	v25.2d,v25.2d,v21.2d
1280	ld1	{v24.2d},[x3],#16
1281	ext	v25.16b,v25.16b,v25.16b,#8
1282	ext	v5.16b,v0.16b,v1.16b,#8
1283	ext	v6.16b,v2.16b,v0.16b,#8
1284	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1285.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1286	ext	v7.16b,v17.16b,v18.16b,#8
1287.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1288.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1289	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1290.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 28-29 (W in v22)
1291	add	v24.2d,v24.2d,v22.2d
1292	ld1	{v25.2d},[x3],#16
1293	ext	v24.16b,v24.16b,v24.16b,#8
1294	ext	v5.16b,v3.16b,v0.16b,#8
1295	ext	v6.16b,v4.16b,v3.16b,#8
1296	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1297.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1298	ext	v7.16b,v18.16b,v19.16b,#8
1299.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1300.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1301	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1302.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 30-31 (W in v23)
1303	add	v25.2d,v25.2d,v23.2d
1304	ld1	{v24.2d},[x3],#16
1305	ext	v25.16b,v25.16b,v25.16b,#8
1306	ext	v5.16b,v2.16b,v3.16b,#8
1307	ext	v6.16b,v1.16b,v2.16b,#8
1308	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1309.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1310	ext	v7.16b,v19.16b,v20.16b,#8
1311.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1312.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1313	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1314.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 32-33 (W in v16)
1315	add	v24.2d,v24.2d,v16.2d
1316	ld1	{v25.2d},[x3],#16
1317	ext	v24.16b,v24.16b,v24.16b,#8
1318	ext	v5.16b,v4.16b,v2.16b,#8
1319	ext	v6.16b,v0.16b,v4.16b,#8
1320	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1321.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1322	ext	v7.16b,v20.16b,v21.16b,#8
1323.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1324.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1325	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1326.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 34-35 (W in v17)
1327	add	v25.2d,v25.2d,v17.2d
1328	ld1	{v24.2d},[x3],#16
1329	ext	v25.16b,v25.16b,v25.16b,#8
1330	ext	v5.16b,v1.16b,v4.16b,#8
1331	ext	v6.16b,v3.16b,v1.16b,#8
1332	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1333.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1334	ext	v7.16b,v21.16b,v22.16b,#8
1335.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1336.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1337	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1338.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 36-37 (W in v18)
1339	add	v24.2d,v24.2d,v18.2d
1340	ld1	{v25.2d},[x3],#16
1341	ext	v24.16b,v24.16b,v24.16b,#8
1342	ext	v5.16b,v0.16b,v1.16b,#8
1343	ext	v6.16b,v2.16b,v0.16b,#8
1344	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1345.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1346	ext	v7.16b,v22.16b,v23.16b,#8
1347.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1348.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1349	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1350.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 38-39 (W in v19)
1351	add	v25.2d,v25.2d,v19.2d
1352	ld1	{v24.2d},[x3],#16
1353	ext	v25.16b,v25.16b,v25.16b,#8
1354	ext	v5.16b,v3.16b,v0.16b,#8
1355	ext	v6.16b,v4.16b,v3.16b,#8
1356	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1357.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1358	ext	v7.16b,v23.16b,v16.16b,#8
1359.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1360.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1361	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1362.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 40-41 (W in v20)
1363	add	v24.2d,v24.2d,v20.2d
1364	ld1	{v25.2d},[x3],#16
1365	ext	v24.16b,v24.16b,v24.16b,#8
1366	ext	v5.16b,v2.16b,v3.16b,#8
1367	ext	v6.16b,v1.16b,v2.16b,#8
1368	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1369.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1370	ext	v7.16b,v16.16b,v17.16b,#8
1371.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1372.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1373	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1374.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 42-43 (W in v21)
1375	add	v25.2d,v25.2d,v21.2d
1376	ld1	{v24.2d},[x3],#16
1377	ext	v25.16b,v25.16b,v25.16b,#8
1378	ext	v5.16b,v4.16b,v2.16b,#8
1379	ext	v6.16b,v0.16b,v4.16b,#8
1380	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1381.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1382	ext	v7.16b,v17.16b,v18.16b,#8
1383.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1384.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1385	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1386.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 44-45 (W in v22)
1387	add	v24.2d,v24.2d,v22.2d
1388	ld1	{v25.2d},[x3],#16
1389	ext	v24.16b,v24.16b,v24.16b,#8
1390	ext	v5.16b,v1.16b,v4.16b,#8
1391	ext	v6.16b,v3.16b,v1.16b,#8
1392	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1393.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1394	ext	v7.16b,v18.16b,v19.16b,#8
1395.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1396.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1397	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1398.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 46-47 (W in v23)
1399	add	v25.2d,v25.2d,v23.2d
1400	ld1	{v24.2d},[x3],#16
1401	ext	v25.16b,v25.16b,v25.16b,#8
1402	ext	v5.16b,v0.16b,v1.16b,#8
1403	ext	v6.16b,v2.16b,v0.16b,#8
1404	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1405.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1406	ext	v7.16b,v19.16b,v20.16b,#8
1407.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1408.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1409	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1410.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 48-49 (W in v16)
1411	add	v24.2d,v24.2d,v16.2d
1412	ld1	{v25.2d},[x3],#16
1413	ext	v24.16b,v24.16b,v24.16b,#8
1414	ext	v5.16b,v3.16b,v0.16b,#8
1415	ext	v6.16b,v4.16b,v3.16b,#8
1416	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1417.inst	0xcec08230	//sha512su0 v16.16b,v17.16b
1418	ext	v7.16b,v20.16b,v21.16b,#8
1419.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1420.inst	0xce678af0	//sha512su1 v16.16b,v23.16b,v7.16b
1421	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1422.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 50-51 (W in v17)
1423	add	v25.2d,v25.2d,v17.2d
1424	ld1	{v24.2d},[x3],#16
1425	ext	v25.16b,v25.16b,v25.16b,#8
1426	ext	v5.16b,v2.16b,v3.16b,#8
1427	ext	v6.16b,v1.16b,v2.16b,#8
1428	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1429.inst	0xcec08251	//sha512su0 v17.16b,v18.16b
1430	ext	v7.16b,v21.16b,v22.16b,#8
1431.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1432.inst	0xce678a11	//sha512su1 v17.16b,v16.16b,v7.16b
1433	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1434.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 52-53 (W in v18)
1435	add	v24.2d,v24.2d,v18.2d
1436	ld1	{v25.2d},[x3],#16
1437	ext	v24.16b,v24.16b,v24.16b,#8
1438	ext	v5.16b,v4.16b,v2.16b,#8
1439	ext	v6.16b,v0.16b,v4.16b,#8
1440	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1441.inst	0xcec08272	//sha512su0 v18.16b,v19.16b
1442	ext	v7.16b,v22.16b,v23.16b,#8
1443.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1444.inst	0xce678a32	//sha512su1 v18.16b,v17.16b,v7.16b
1445	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1446.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 54-55 (W in v19)
1447	add	v25.2d,v25.2d,v19.2d
1448	ld1	{v24.2d},[x3],#16
1449	ext	v25.16b,v25.16b,v25.16b,#8
1450	ext	v5.16b,v1.16b,v4.16b,#8
1451	ext	v6.16b,v3.16b,v1.16b,#8
1452	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1453.inst	0xcec08293	//sha512su0 v19.16b,v20.16b
1454	ext	v7.16b,v23.16b,v16.16b,#8
1455.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1456.inst	0xce678a53	//sha512su1 v19.16b,v18.16b,v7.16b
1457	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1458.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 56-57 (W in v20)
1459	add	v24.2d,v24.2d,v20.2d
1460	ld1	{v25.2d},[x3],#16
1461	ext	v24.16b,v24.16b,v24.16b,#8
1462	ext	v5.16b,v0.16b,v1.16b,#8
1463	ext	v6.16b,v2.16b,v0.16b,#8
1464	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1465.inst	0xcec082b4	//sha512su0 v20.16b,v21.16b
1466	ext	v7.16b,v16.16b,v17.16b,#8
1467.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1468.inst	0xce678a74	//sha512su1 v20.16b,v19.16b,v7.16b
1469	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1470.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 58-59 (W in v21)
1471	add	v25.2d,v25.2d,v21.2d
1472	ld1	{v24.2d},[x3],#16
1473	ext	v25.16b,v25.16b,v25.16b,#8
1474	ext	v5.16b,v3.16b,v0.16b,#8
1475	ext	v6.16b,v4.16b,v3.16b,#8
1476	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1477.inst	0xcec082d5	//sha512su0 v21.16b,v22.16b
1478	ext	v7.16b,v17.16b,v18.16b,#8
1479.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1480.inst	0xce678a95	//sha512su1 v21.16b,v20.16b,v7.16b
1481	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1482.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 60-61 (W in v22)
1483	add	v24.2d,v24.2d,v22.2d
1484	ld1	{v25.2d},[x3],#16
1485	ext	v24.16b,v24.16b,v24.16b,#8
1486	ext	v5.16b,v2.16b,v3.16b,#8
1487	ext	v6.16b,v1.16b,v2.16b,#8
1488	add	v3.2d,v3.2d,v24.2d			// "T1 + H + K512[i]"
1489.inst	0xcec082f6	//sha512su0 v22.16b,v23.16b
1490	ext	v7.16b,v18.16b,v19.16b,#8
1491.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1492.inst	0xce678ab6	//sha512su1 v22.16b,v21.16b,v7.16b
1493	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1494.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 62-63 (W in v23)
1495	add	v25.2d,v25.2d,v23.2d
1496	ld1	{v24.2d},[x3],#16
1497	ext	v25.16b,v25.16b,v25.16b,#8
1498	ext	v5.16b,v4.16b,v2.16b,#8
1499	ext	v6.16b,v0.16b,v4.16b,#8
1500	add	v2.2d,v2.2d,v25.2d			// "T1 + H + K512[i]"
1501.inst	0xcec08217	//sha512su0 v23.16b,v16.16b
1502	ext	v7.16b,v19.16b,v20.16b,#8
1503.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1504.inst	0xce678ad7	//sha512su1 v23.16b,v22.16b,v7.16b
1505	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1506.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// Groups 32..39 (rounds 64-79). No schedule update here: once a
	// message vector has been added to its K pair, the register is
	// reloaded with the matching 16 bytes of the next block (or, on the
	// last pass, of the same block again - see the rewind of x1 at the
	// top of the loop) and byte-swapped ready for the next pass.
	// rounds 64-65 (W in v16)
1507	ld1	{v25.2d},[x3],#16
1508	add	v24.2d,v24.2d,v16.2d
1509	ld1	{v16.16b},[x1],#16		// load next input
1510	ext	v24.16b,v24.16b,v24.16b,#8
1511	ext	v5.16b,v1.16b,v4.16b,#8
1512	ext	v6.16b,v3.16b,v1.16b,#8
1513	add	v4.2d,v4.2d,v24.2d			// "T1 + H + K512[i]"
1514.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1515	rev64	v16.16b,v16.16b
1516	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1517.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 66-67 (W in v17)
1518	ld1	{v24.2d},[x3],#16
1519	add	v25.2d,v25.2d,v17.2d
1520	ld1	{v17.16b},[x1],#16		// load next input
1521	ext	v25.16b,v25.16b,v25.16b,#8
1522	ext	v5.16b,v0.16b,v1.16b,#8
1523	ext	v6.16b,v2.16b,v0.16b,#8
1524	add	v1.2d,v1.2d,v25.2d			// "T1 + H + K512[i]"
1525.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1526	rev64	v17.16b,v17.16b
1527	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1528.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 68-69 (W in v18)
1529	ld1	{v25.2d},[x3],#16
1530	add	v24.2d,v24.2d,v18.2d
1531	ld1	{v18.16b},[x1],#16		// load next input
1532	ext	v24.16b,v24.16b,v24.16b,#8
1533	ext	v5.16b,v3.16b,v0.16b,#8
1534	ext	v6.16b,v4.16b,v3.16b,#8
1535	add	v0.2d,v0.2d,v24.2d			// "T1 + H + K512[i]"
1536.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1537	rev64	v18.16b,v18.16b
1538	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1539.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// rounds 70-71 (W in v19)
1540	ld1	{v24.2d},[x3],#16
1541	add	v25.2d,v25.2d,v19.2d
1542	ld1	{v19.16b},[x1],#16		// load next input
1543	ext	v25.16b,v25.16b,v25.16b,#8
1544	ext	v5.16b,v2.16b,v3.16b,#8
1545	ext	v6.16b,v1.16b,v2.16b,#8
1546	add	v3.2d,v3.2d,v25.2d			// "T1 + H + K512[i]"
1547.inst	0xce6680a3	//sha512h v3.16b,v5.16b,v6.16b
1548	rev64	v19.16b,v19.16b
1549	add	v4.2d,v1.2d,v3.2d		// "D + T1"
1550.inst	0xce608423	//sha512h2 v3.16b,v1.16b,v0.16b
	// rounds 72-73 (W in v20)
1551	ld1	{v25.2d},[x3],#16
1552	add	v24.2d,v24.2d,v20.2d
1553	ld1	{v20.16b},[x1],#16		// load next input
1554	ext	v24.16b,v24.16b,v24.16b,#8
1555	ext	v5.16b,v4.16b,v2.16b,#8
1556	ext	v6.16b,v0.16b,v4.16b,#8
1557	add	v2.2d,v2.2d,v24.2d			// "T1 + H + K512[i]"
1558.inst	0xce6680a2	//sha512h v2.16b,v5.16b,v6.16b
1559	rev64	v20.16b,v20.16b
1560	add	v1.2d,v0.2d,v2.2d		// "D + T1"
1561.inst	0xce638402	//sha512h2 v2.16b,v0.16b,v3.16b
	// rounds 74-75 (W in v21)
1562	ld1	{v24.2d},[x3],#16
1563	add	v25.2d,v25.2d,v21.2d
1564	ld1	{v21.16b},[x1],#16		// load next input
1565	ext	v25.16b,v25.16b,v25.16b,#8
1566	ext	v5.16b,v1.16b,v4.16b,#8
1567	ext	v6.16b,v3.16b,v1.16b,#8
1568	add	v4.2d,v4.2d,v25.2d			// "T1 + H + K512[i]"
1569.inst	0xce6680a4	//sha512h v4.16b,v5.16b,v6.16b
1570	rev64	v21.16b,v21.16b
1571	add	v0.2d,v3.2d,v4.2d		// "D + T1"
1572.inst	0xce628464	//sha512h2 v4.16b,v3.16b,v2.16b
	// rounds 76-77 (W in v22)
1573	ld1	{v25.2d},[x3],#16
1574	add	v24.2d,v24.2d,v22.2d
1575	ld1	{v22.16b},[x1],#16		// load next input
1576	ext	v24.16b,v24.16b,v24.16b,#8
1577	ext	v5.16b,v0.16b,v1.16b,#8
1578	ext	v6.16b,v2.16b,v0.16b,#8
1579	add	v1.2d,v1.2d,v24.2d			// "T1 + H + K512[i]"
1580.inst	0xce6680a1	//sha512h v1.16b,v5.16b,v6.16b
1581	rev64	v22.16b,v22.16b
1582	add	v3.2d,v2.2d,v1.2d		// "D + T1"
1583.inst	0xce648441	//sha512h2 v1.16b,v2.16b,v4.16b
	// rounds 78-79 (W in v23). All 40 K loads (40*16 = 80*8 bytes) are
	// done, so x3 is moved back to the start of .LK512 here.
1584	sub	x3,x3,#80*8	// rewind
1585	add	v25.2d,v25.2d,v23.2d
1586	ld1	{v23.16b},[x1],#16		// load next input
1587	ext	v25.16b,v25.16b,v25.16b,#8
1588	ext	v5.16b,v3.16b,v0.16b,#8
1589	ext	v6.16b,v4.16b,v3.16b,#8
1590	add	v0.2d,v0.2d,v25.2d			// "T1 + H + K512[i]"
1591.inst	0xce6680a0	//sha512h v0.16b,v5.16b,v6.16b
1592	rev64	v23.16b,v23.16b
1593	add	v2.2d,v4.2d,v0.2d		// "D + T1"
1594.inst	0xce618480	//sha512h2 v0.16b,v4.16b,v1.16b
	// The state rotation has come back to v0-v3 after 40 groups; add the
	// values saved in v26-v29 at the top of the pass.
1595	add	v0.2d,v0.2d,v26.2d			// accumulate
1596	add	v1.2d,v1.2d,v27.2d
1597	add	v2.2d,v2.2d,v28.2d
1598	add	v3.2d,v3.2d,v29.2d
1599
	// x2 was decremented at the top of the pass; loop while blocks remain.
1600	cbnz	x2,.Loop_hw
1601
1602	st1	{v0.2d,v1.2d,v2.2d,v3.2d},[x0]		// store context
1603
	// Epilogue: x30 is never written in this function, so only x29 is
	// reloaded; the post-increment releases the 16-byte frame.
1604	ldr	x29,[sp],#16
1605	ret
1606.size	sha512_block_armv8,.-sha512_block_armv8
1607#endif
1608