// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Based on CRYPTOGAMS code with the following comment:
// # ====================================================================
// # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
// # project. The module is, however, dual licensed under OpenSSL and
// # CRYPTOGAMS licenses depending on where you obtain it. For further
// # details see http://www.openssl.org/~appro/cryptogams/.
// # ====================================================================
12
13#include "textflag.h"
14
// SHA512 block routine. See sha512block.go for Go equivalent.
//
// The algorithm is detailed in FIPS 180-4:
//
//  https://csrc.nist.gov/publications/fips/fips180-4/fips-180-4.pdf
//
// Wt = Mt; for 0 <= t <= 15
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16; for 16 <= t <= 79
//
// a = H0
// b = H1
// c = H2
// d = H3
// e = H4
// f = H5
// g = H6
// h = H7
//
// for t = 0 to 79 {
//    T1 = h + BIGSIGMA1(e) + Ch(e,f,g) + Kt + Wt
//    T2 = BIGSIGMA0(a) + Maj(a,b,c)
//    h = g
//    g = f
//    f = e
//    e = d + T1
//    d = c
//    c = b
//    b = a
//    a = T1 + T2
// }
//
// H0 = a + H0
// H1 = b + H1
// H2 = c + H2
// H3 = d + H3
// H4 = e + H4
// H5 = f + H5
// H6 = g + H6
// H7 = h + H7
54
// General-purpose register roles inside ·block.
#define CTX	R3	// address of the hash state (dig); eight 64-bit words are read and written here
#define INP	R4	// read cursor into p; advanced by 16 after every vector load
#define END	R5	// INP + rounded-down length; the block loop runs while INP < END
#define TBL	R6	// base address of the ·kcon table
#define IDX	R7	// byte offset of the next ·kcon entry to fetch
#define CNT	R8	// cleared on entry; not otherwise read by ·block
#define LEN	R9	// length of p in bytes, rounded down before END is formed
#define OFFLOAD	R11	// base of the 128-byte area where a-h are saved for the duration of one block
#define TEMP	R12	// scratch; carries the inner loop count into CTR

// Byte offsets held in registers for the indexed vector loads and stores.
// HEX00 is R0 itself and is never loaded by ·block, so it depends on R0
// already containing zero.
#define HEX00	R0
#define HEX10	R10
#define HEX20	R25
#define HEX30	R26
#define HEX40	R27
#define HEX50	R28
#define HEX60	R29
#define HEX70	R31

// Vector register roles.
// V0-V7 are A-H
// V8-V23 are used for the message schedule
#define KI	V24	// round constant fetched one round ahead from ·kcon
#define FUNC	V25	// scratch: Ch(e,f,g), then Maj(a,b,c)
#define S0	V26	// BIGSIGMA0(a), later T2
#define S1	V27	// BIGSIGMA1(e)
#define s0	V28	// SIGMA0 term of the message schedule
#define s1	V29	// SIGMA1 term of the message schedule
#define LEMASK	V31	// Permutation control register for little endian
83
// ·kcon is the round-constant table read by ·block, one 16-byte entry per
// vector load.
//
// Entries 0-79 (offsets 0x000-0x4F0):
// 2 copies of each Kt, to fill both doublewords of a vector register
//
// Entry 80 (offset 0x500) is all zero. The round macros always add the
// constant for the following round, so the 80th round adds this entry.
//
// Entry 81 (offset 0x510) is not a round constant: it is the value left in KI
// after the 80th round, and ·block passes it to VPERM as the control vector
// when it packs a-h back into pairs before storing them.
DATA  ·kcon+0x000(SB)/8, $0x428a2f98d728ae22
DATA  ·kcon+0x008(SB)/8, $0x428a2f98d728ae22
DATA  ·kcon+0x010(SB)/8, $0x7137449123ef65cd
DATA  ·kcon+0x018(SB)/8, $0x7137449123ef65cd
DATA  ·kcon+0x020(SB)/8, $0xb5c0fbcfec4d3b2f
DATA  ·kcon+0x028(SB)/8, $0xb5c0fbcfec4d3b2f
DATA  ·kcon+0x030(SB)/8, $0xe9b5dba58189dbbc
DATA  ·kcon+0x038(SB)/8, $0xe9b5dba58189dbbc
DATA  ·kcon+0x040(SB)/8, $0x3956c25bf348b538
DATA  ·kcon+0x048(SB)/8, $0x3956c25bf348b538
DATA  ·kcon+0x050(SB)/8, $0x59f111f1b605d019
DATA  ·kcon+0x058(SB)/8, $0x59f111f1b605d019
DATA  ·kcon+0x060(SB)/8, $0x923f82a4af194f9b
DATA  ·kcon+0x068(SB)/8, $0x923f82a4af194f9b
DATA  ·kcon+0x070(SB)/8, $0xab1c5ed5da6d8118
DATA  ·kcon+0x078(SB)/8, $0xab1c5ed5da6d8118
DATA  ·kcon+0x080(SB)/8, $0xd807aa98a3030242
DATA  ·kcon+0x088(SB)/8, $0xd807aa98a3030242
DATA  ·kcon+0x090(SB)/8, $0x12835b0145706fbe
DATA  ·kcon+0x098(SB)/8, $0x12835b0145706fbe
DATA  ·kcon+0x0A0(SB)/8, $0x243185be4ee4b28c
DATA  ·kcon+0x0A8(SB)/8, $0x243185be4ee4b28c
DATA  ·kcon+0x0B0(SB)/8, $0x550c7dc3d5ffb4e2
DATA  ·kcon+0x0B8(SB)/8, $0x550c7dc3d5ffb4e2
DATA  ·kcon+0x0C0(SB)/8, $0x72be5d74f27b896f
DATA  ·kcon+0x0C8(SB)/8, $0x72be5d74f27b896f
DATA  ·kcon+0x0D0(SB)/8, $0x80deb1fe3b1696b1
DATA  ·kcon+0x0D8(SB)/8, $0x80deb1fe3b1696b1
DATA  ·kcon+0x0E0(SB)/8, $0x9bdc06a725c71235
DATA  ·kcon+0x0E8(SB)/8, $0x9bdc06a725c71235
DATA  ·kcon+0x0F0(SB)/8, $0xc19bf174cf692694
DATA  ·kcon+0x0F8(SB)/8, $0xc19bf174cf692694
DATA  ·kcon+0x100(SB)/8, $0xe49b69c19ef14ad2
DATA  ·kcon+0x108(SB)/8, $0xe49b69c19ef14ad2
DATA  ·kcon+0x110(SB)/8, $0xefbe4786384f25e3
DATA  ·kcon+0x118(SB)/8, $0xefbe4786384f25e3
DATA  ·kcon+0x120(SB)/8, $0x0fc19dc68b8cd5b5
DATA  ·kcon+0x128(SB)/8, $0x0fc19dc68b8cd5b5
DATA  ·kcon+0x130(SB)/8, $0x240ca1cc77ac9c65
DATA  ·kcon+0x138(SB)/8, $0x240ca1cc77ac9c65
DATA  ·kcon+0x140(SB)/8, $0x2de92c6f592b0275
DATA  ·kcon+0x148(SB)/8, $0x2de92c6f592b0275
DATA  ·kcon+0x150(SB)/8, $0x4a7484aa6ea6e483
DATA  ·kcon+0x158(SB)/8, $0x4a7484aa6ea6e483
DATA  ·kcon+0x160(SB)/8, $0x5cb0a9dcbd41fbd4
DATA  ·kcon+0x168(SB)/8, $0x5cb0a9dcbd41fbd4
DATA  ·kcon+0x170(SB)/8, $0x76f988da831153b5
DATA  ·kcon+0x178(SB)/8, $0x76f988da831153b5
DATA  ·kcon+0x180(SB)/8, $0x983e5152ee66dfab
DATA  ·kcon+0x188(SB)/8, $0x983e5152ee66dfab
DATA  ·kcon+0x190(SB)/8, $0xa831c66d2db43210
DATA  ·kcon+0x198(SB)/8, $0xa831c66d2db43210
DATA  ·kcon+0x1A0(SB)/8, $0xb00327c898fb213f
DATA  ·kcon+0x1A8(SB)/8, $0xb00327c898fb213f
DATA  ·kcon+0x1B0(SB)/8, $0xbf597fc7beef0ee4
DATA  ·kcon+0x1B8(SB)/8, $0xbf597fc7beef0ee4
DATA  ·kcon+0x1C0(SB)/8, $0xc6e00bf33da88fc2
DATA  ·kcon+0x1C8(SB)/8, $0xc6e00bf33da88fc2
DATA  ·kcon+0x1D0(SB)/8, $0xd5a79147930aa725
DATA  ·kcon+0x1D8(SB)/8, $0xd5a79147930aa725
DATA  ·kcon+0x1E0(SB)/8, $0x06ca6351e003826f
DATA  ·kcon+0x1E8(SB)/8, $0x06ca6351e003826f
DATA  ·kcon+0x1F0(SB)/8, $0x142929670a0e6e70
DATA  ·kcon+0x1F8(SB)/8, $0x142929670a0e6e70
DATA  ·kcon+0x200(SB)/8, $0x27b70a8546d22ffc
DATA  ·kcon+0x208(SB)/8, $0x27b70a8546d22ffc
DATA  ·kcon+0x210(SB)/8, $0x2e1b21385c26c926
DATA  ·kcon+0x218(SB)/8, $0x2e1b21385c26c926
DATA  ·kcon+0x220(SB)/8, $0x4d2c6dfc5ac42aed
DATA  ·kcon+0x228(SB)/8, $0x4d2c6dfc5ac42aed
DATA  ·kcon+0x230(SB)/8, $0x53380d139d95b3df
DATA  ·kcon+0x238(SB)/8, $0x53380d139d95b3df
DATA  ·kcon+0x240(SB)/8, $0x650a73548baf63de
DATA  ·kcon+0x248(SB)/8, $0x650a73548baf63de
DATA  ·kcon+0x250(SB)/8, $0x766a0abb3c77b2a8
DATA  ·kcon+0x258(SB)/8, $0x766a0abb3c77b2a8
DATA  ·kcon+0x260(SB)/8, $0x81c2c92e47edaee6
DATA  ·kcon+0x268(SB)/8, $0x81c2c92e47edaee6
DATA  ·kcon+0x270(SB)/8, $0x92722c851482353b
DATA  ·kcon+0x278(SB)/8, $0x92722c851482353b
DATA  ·kcon+0x280(SB)/8, $0xa2bfe8a14cf10364
DATA  ·kcon+0x288(SB)/8, $0xa2bfe8a14cf10364
DATA  ·kcon+0x290(SB)/8, $0xa81a664bbc423001
DATA  ·kcon+0x298(SB)/8, $0xa81a664bbc423001
DATA  ·kcon+0x2A0(SB)/8, $0xc24b8b70d0f89791
DATA  ·kcon+0x2A8(SB)/8, $0xc24b8b70d0f89791
DATA  ·kcon+0x2B0(SB)/8, $0xc76c51a30654be30
DATA  ·kcon+0x2B8(SB)/8, $0xc76c51a30654be30
DATA  ·kcon+0x2C0(SB)/8, $0xd192e819d6ef5218
DATA  ·kcon+0x2C8(SB)/8, $0xd192e819d6ef5218
DATA  ·kcon+0x2D0(SB)/8, $0xd69906245565a910
DATA  ·kcon+0x2D8(SB)/8, $0xd69906245565a910
DATA  ·kcon+0x2E0(SB)/8, $0xf40e35855771202a
DATA  ·kcon+0x2E8(SB)/8, $0xf40e35855771202a
DATA  ·kcon+0x2F0(SB)/8, $0x106aa07032bbd1b8
DATA  ·kcon+0x2F8(SB)/8, $0x106aa07032bbd1b8
DATA  ·kcon+0x300(SB)/8, $0x19a4c116b8d2d0c8
DATA  ·kcon+0x308(SB)/8, $0x19a4c116b8d2d0c8
DATA  ·kcon+0x310(SB)/8, $0x1e376c085141ab53
DATA  ·kcon+0x318(SB)/8, $0x1e376c085141ab53
DATA  ·kcon+0x320(SB)/8, $0x2748774cdf8eeb99
DATA  ·kcon+0x328(SB)/8, $0x2748774cdf8eeb99
DATA  ·kcon+0x330(SB)/8, $0x34b0bcb5e19b48a8
DATA  ·kcon+0x338(SB)/8, $0x34b0bcb5e19b48a8
DATA  ·kcon+0x340(SB)/8, $0x391c0cb3c5c95a63
DATA  ·kcon+0x348(SB)/8, $0x391c0cb3c5c95a63
DATA  ·kcon+0x350(SB)/8, $0x4ed8aa4ae3418acb
DATA  ·kcon+0x358(SB)/8, $0x4ed8aa4ae3418acb
DATA  ·kcon+0x360(SB)/8, $0x5b9cca4f7763e373
DATA  ·kcon+0x368(SB)/8, $0x5b9cca4f7763e373
DATA  ·kcon+0x370(SB)/8, $0x682e6ff3d6b2b8a3
DATA  ·kcon+0x378(SB)/8, $0x682e6ff3d6b2b8a3
DATA  ·kcon+0x380(SB)/8, $0x748f82ee5defb2fc
DATA  ·kcon+0x388(SB)/8, $0x748f82ee5defb2fc
DATA  ·kcon+0x390(SB)/8, $0x78a5636f43172f60
DATA  ·kcon+0x398(SB)/8, $0x78a5636f43172f60
DATA  ·kcon+0x3A0(SB)/8, $0x84c87814a1f0ab72
DATA  ·kcon+0x3A8(SB)/8, $0x84c87814a1f0ab72
DATA  ·kcon+0x3B0(SB)/8, $0x8cc702081a6439ec
DATA  ·kcon+0x3B8(SB)/8, $0x8cc702081a6439ec
DATA  ·kcon+0x3C0(SB)/8, $0x90befffa23631e28
DATA  ·kcon+0x3C8(SB)/8, $0x90befffa23631e28
DATA  ·kcon+0x3D0(SB)/8, $0xa4506cebde82bde9
DATA  ·kcon+0x3D8(SB)/8, $0xa4506cebde82bde9
DATA  ·kcon+0x3E0(SB)/8, $0xbef9a3f7b2c67915
DATA  ·kcon+0x3E8(SB)/8, $0xbef9a3f7b2c67915
DATA  ·kcon+0x3F0(SB)/8, $0xc67178f2e372532b
DATA  ·kcon+0x3F8(SB)/8, $0xc67178f2e372532b
DATA  ·kcon+0x400(SB)/8, $0xca273eceea26619c
DATA  ·kcon+0x408(SB)/8, $0xca273eceea26619c
DATA  ·kcon+0x410(SB)/8, $0xd186b8c721c0c207
DATA  ·kcon+0x418(SB)/8, $0xd186b8c721c0c207
DATA  ·kcon+0x420(SB)/8, $0xeada7dd6cde0eb1e
DATA  ·kcon+0x428(SB)/8, $0xeada7dd6cde0eb1e
DATA  ·kcon+0x430(SB)/8, $0xf57d4f7fee6ed178
DATA  ·kcon+0x438(SB)/8, $0xf57d4f7fee6ed178
DATA  ·kcon+0x440(SB)/8, $0x06f067aa72176fba
DATA  ·kcon+0x448(SB)/8, $0x06f067aa72176fba
DATA  ·kcon+0x450(SB)/8, $0x0a637dc5a2c898a6
DATA  ·kcon+0x458(SB)/8, $0x0a637dc5a2c898a6
DATA  ·kcon+0x460(SB)/8, $0x113f9804bef90dae
DATA  ·kcon+0x468(SB)/8, $0x113f9804bef90dae
DATA  ·kcon+0x470(SB)/8, $0x1b710b35131c471b
DATA  ·kcon+0x478(SB)/8, $0x1b710b35131c471b
DATA  ·kcon+0x480(SB)/8, $0x28db77f523047d84
DATA  ·kcon+0x488(SB)/8, $0x28db77f523047d84
DATA  ·kcon+0x490(SB)/8, $0x32caab7b40c72493
DATA  ·kcon+0x498(SB)/8, $0x32caab7b40c72493
DATA  ·kcon+0x4A0(SB)/8, $0x3c9ebe0a15c9bebc
DATA  ·kcon+0x4A8(SB)/8, $0x3c9ebe0a15c9bebc
DATA  ·kcon+0x4B0(SB)/8, $0x431d67c49c100d4c
DATA  ·kcon+0x4B8(SB)/8, $0x431d67c49c100d4c
DATA  ·kcon+0x4C0(SB)/8, $0x4cc5d4becb3e42b6
DATA  ·kcon+0x4C8(SB)/8, $0x4cc5d4becb3e42b6
DATA  ·kcon+0x4D0(SB)/8, $0x597f299cfc657e2a
DATA  ·kcon+0x4D8(SB)/8, $0x597f299cfc657e2a
DATA  ·kcon+0x4E0(SB)/8, $0x5fcb6fab3ad6faec
DATA  ·kcon+0x4E8(SB)/8, $0x5fcb6fab3ad6faec
DATA  ·kcon+0x4F0(SB)/8, $0x6c44198c4a475817
DATA  ·kcon+0x4F8(SB)/8, $0x6c44198c4a475817
// Entry 80: zero, absorbed by the 80th round.
DATA  ·kcon+0x500(SB)/8, $0x0000000000000000
DATA  ·kcon+0x508(SB)/8, $0x0000000000000000
// Entry 81: VPERM control used when the state is written back.
DATA  ·kcon+0x510(SB)/8, $0x1011121314151617
DATA  ·kcon+0x518(SB)/8, $0x0001020304050607
// 82 entries * 16 bytes = 1312 bytes.
GLOBL ·kcon(SB), RODATA, $1312
250
// SHA512ROUND0 performs one compression round whose message word xi needs no
// schedule update (the rounds fed directly from the input block).
//
// a-h are the vector registers currently holding the working variables; the
// caller rotates the argument list by one position per round instead of
// moving values between registers.
//
// Entry conditions:
//   h  already contains h + Kt for this round.
//   KI holds the constant for the next round; IDX is the ·kcon offset of the
//      constant after that.
//
// Effects:
//   h = T1 + T2, where T1 = h + xi + Ch(e,f,g) + BIGSIGMA1(e)
//                  and T2 = BIGSIGMA0(a) + Maj(a,b,c)
//   d = d + T1
//   g = g + KI (g is the next round's h, so its constant is pre-added)
//   KI is reloaded from (TBL)(IDX) and IDX advances by 16.
//
// Ch is formed with VSEL g, f, e and Maj with VSEL b, c, (a XOR b).
// Clobbers FUNC, S0 and S1.
//
// Comments are kept out of the macro body: a // comment on a continued line
// would also swallow the trailing backslash.
#define SHA512ROUND0(a, b, c, d, e, f, g, h, xi) \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAD	$15, e, $1, S1; \
	VADDUDM		xi, h, h; \
	VSHASIGMAD	$0, a, $1, S0; \
	VADDUDM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUDM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUDM		KI, g, g; \
	VADDUDM		h, d, d; \
	VADDUDM		FUNC, S0, S0; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUDM		S0, h, h
266
// SHA512ROUND1 performs one compression round with message word xi and, in
// the same instruction stream, advances the message schedule by one word.
//
// The compression part is the same as SHA512ROUND0: on entry h must already
// include Kt and KI must hold the next round's constant; on exit
// h = T1 + T2, d = d + T1, g = g + KI, KI is reloaded from (TBL)(IDX) and IDX
// advances by 16.
//
// Schedule part: the sixteen schedule registers are used as a ring. xj is
// the register that follows xi in the ring and is overwritten in place:
//
//   xj = xj + SIGMA0(xj_1) + xj_9 + SIGMA1(xj_14)
//
// where xj_1, xj_9 and xj_14 are the ring registers 1, 9 and 14 positions
// after xj. With xj holding Wt-16 this is the FIPS 180-4 recurrence
// Wt = SIGMA1(Wt-2) + Wt-7 + SIGMA0(Wt-15) + Wt-16.
//
// Clobbers FUNC, S0, S1, s0 and s1.
//
// Comments are kept out of the macro body: a // comment on a continued line
// would also swallow the trailing backslash.
#define SHA512ROUND1(a, b, c, d, e, f, g, h, xi, xj, xj_1, xj_9, xj_14) \
	VSHASIGMAD	$0, xj_1, $0, s0; \
	VSEL		g, f, e, FUNC; \
	VSHASIGMAD	$15, e, $1, S1; \
	VADDUDM		xi, h, h; \
	VSHASIGMAD	$0, a, $1, S0; \
	VSHASIGMAD	$15, xj_14, $0, s1; \
	VADDUDM		FUNC, h, h; \
	VXOR		b, a, FUNC; \
	VADDUDM		xj_9, xj, xj; \
	VADDUDM		S1, h, h; \
	VSEL		b, c, FUNC, FUNC; \
	VADDUDM		KI, g, g; \
	VADDUDM		h, d, d; \
	VADDUDM		FUNC, S0, S0; \
	VADDUDM		s0, xj, xj; \
	LVX		(TBL)(IDX), KI; \
	ADD		$16, IDX; \
	VADDUDM		S0, h, h; \
	VADDUDM		s1, xj, xj
287
// func block(dig *digest, p []byte)
//
// block folds every complete 128-byte chunk of p into the eight 64-bit hash
// words stored at the start of *dig. Trailing bytes that do not fill a whole
// chunk are ignored, and if p holds no complete chunk *dig is left untouched.
//
// Register roles are given by the #defines at the top of the file. The 128
// bytes of local frame hold a copy of a-h while one chunk is processed.
TEXT ·block(SB),0,$128-32
	MOVD	dig+0(FP), CTX
	MOVD	p_base+8(FP), INP
	MOVD	p_len+16(FP), LEN

	// Round the length down to a multiple of 128: each pass of the loop below
	// consumes 128 bytes (eight 16-byte LXVD2X loads), so a partial chunk must
	// not be counted or the loop would read past the end of p.
	SRD	$7, LEN
	SLD	$7, LEN

	ADD	INP, LEN, END

	CMP	INP, END
	BEQ	end		// no complete chunk

	MOVD	$·kcon(SB), TBL	// TBL = address of the constant table
	// The save area for a-h starts at R1 itself.
	// NOTE(review): confirm this does not overlap anything the toolchain
	// keeps at the bottom of the frame.
	MOVD	R1, OFFLOAD

	MOVD	R0, CNT		// CNT is not read again in this routine
	MOVWZ	$0x10, HEX10
	MOVWZ	$0x20, HEX20
	MOVWZ	$0x30, HEX30
	MOVWZ	$0x40, HEX40
	MOVWZ	$0x50, HEX50
	MOVWZ	$0x60, HEX60
	MOVWZ	$0x70, HEX70

	// LEMASK = lvsl(8) XOR 0x0F in every byte. It is the control vector for
	// the VPERMs applied to each loaded pair of message words below
	// (presumably the big-endian to host byte-order fix-up - confirm).
	MOVWZ	$8, IDX
	LVSL	(IDX)(R0), LEMASK
	VSPLTISB	$0x0F, KI
	VXOR	KI, LEMASK, LEMASK

	// Each LXVD2X brings in two adjacent hash words; VSLDOI $8 then rotates a
	// copy by one doubleword so that each of a-h gets a register of its own.
	LXVD2X	(CTX)(HEX00), VS32	// v0 = vs32
	LXVD2X	(CTX)(HEX10), VS34	// v2 = vs34
	LXVD2X	(CTX)(HEX20), VS36	// v4 = vs36
	// unpack the input values into vector registers
	VSLDOI	$8, V0, V0, V1
	LXVD2X	(CTX)(HEX30), VS38	// v6 = vs38
	VSLDOI	$8, V2, V2, V3
	VSLDOI	$8, V4, V4, V5
	VSLDOI	$8, V6, V6, V7

loop:
	// One pass per 128-byte chunk. Restart the constant stream at K0.
	LVX	(TBL)(HEX00), KI
	MOVWZ	$16, IDX

	LXVD2X	(INP)(R0), VS40	// load v8 (=vs40) in advance
	ADD	$16, INP

	// Save a-h; they are added back in after the 80 rounds.
	STVX	V0, (OFFLOAD+HEX00)
	STVX	V1, (OFFLOAD+HEX10)
	STVX	V2, (OFFLOAD+HEX20)
	STVX	V3, (OFFLOAD+HEX30)
	STVX	V4, (OFFLOAD+HEX40)
	STVX	V5, (OFFLOAD+HEX50)
	STVX	V6, (OFFLOAD+HEX60)
	STVX	V7, (OFFLOAD+HEX70)

	// Prime the pipeline expected by the round macros: h already includes K0
	// and KI holds K1.
	VADDUDM	KI, V7, V7	// h+K[i]
	LVX	(TBL)(IDX), KI
	ADD	$16, IDX

	// Rounds 0-14. Every 16-byte load supplies two message words: VPERM with
	// LEMASK fixes the first in place, VSLDOI $8 moves the second into the
	// next schedule register. The a-h arguments rotate by one each round.
	VPERM	V8, V8, LEMASK, V8
	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V8)
	LXVD2X	(INP)(R0), VS42	// load v10 (=vs42) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V8, V8, V9
	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V9)
	VPERM	V10, V10, LEMASK, V10
	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V10)
	LXVD2X	(INP)(R0), VS44	// load v12 (=vs44) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V10, V10, V11
	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V11)
	VPERM	V12, V12, LEMASK, V12
	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V12)
	LXVD2X	(INP)(R0), VS46	// load v14 (=vs46) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V12, V12, V13
	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V13)
	VPERM	V14, V14, LEMASK, V14
	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V14)
	LXVD2X	(INP)(R0), VS48	// load v16 (=vs48) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V14, V14, V15
	SHA512ROUND0(V1, V2, V3, V4, V5, V6, V7, V0, V15)
	VPERM	V16, V16, LEMASK, V16
	SHA512ROUND0(V0, V1, V2, V3, V4, V5, V6, V7, V16)
	LXVD2X	(INP)(R0), VS50	// load v18 (=vs50) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V16, V16, V17
	SHA512ROUND0(V7, V0, V1, V2, V3, V4, V5, V6, V17)
	VPERM	V18, V18, LEMASK, V18
	SHA512ROUND0(V6, V7, V0, V1, V2, V3, V4, V5, V18)
	LXVD2X	(INP)(R0), VS52	// load v20 (=vs52) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V18, V18, V19
	SHA512ROUND0(V5, V6, V7, V0, V1, V2, V3, V4, V19)
	VPERM	V20, V20, LEMASK, V20
	SHA512ROUND0(V4, V5, V6, V7, V0, V1, V2, V3, V20)
	LXVD2X	(INP)(R0), VS54	// load v22 (=vs54) in advance
	ADD	$16, INP, INP
	VSLDOI	$8, V20, V20, V21
	SHA512ROUND0(V3, V4, V5, V6, V7, V0, V1, V2, V21)
	VPERM	V22, V22, LEMASK, V22
	SHA512ROUND0(V2, V3, V4, V5, V6, V7, V0, V1, V22)
	VSLDOI	$8, V22, V22, V23
	// Round 15: consumes the last input word (V23) and starts the schedule
	// by producing the word for round 16 in V8.
	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	// Rounds 16-79: four passes over sixteen unrolled rounds, counted in CTR.
	MOVWZ	$4, TEMP
	MOVWZ	TEMP, CTR

L16_xx:
	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V18, V23)
	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V9, V10, V11, V19, V8)
	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V10, V11, V12, V20, V9)
	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V11, V12, V13, V21, V10)
	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V12, V13, V14, V22, V11)
	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V13, V14, V15, V23, V12)
	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V14, V15, V16, V8, V13)
	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V15, V16, V17, V9, V14)
	SHA512ROUND1(V0, V1, V2, V3, V4, V5, V6, V7, V16, V17, V18, V10, V15)
	SHA512ROUND1(V7, V0, V1, V2, V3, V4, V5, V6, V17, V18, V19, V11, V16)
	SHA512ROUND1(V6, V7, V0, V1, V2, V3, V4, V5, V18, V19, V20, V12, V17)
	SHA512ROUND1(V5, V6, V7, V0, V1, V2, V3, V4, V19, V20, V21, V13, V18)
	SHA512ROUND1(V4, V5, V6, V7, V0, V1, V2, V3, V20, V21, V22, V14, V19)
	SHA512ROUND1(V3, V4, V5, V6, V7, V0, V1, V2, V21, V22, V23, V15, V20)
	SHA512ROUND1(V2, V3, V4, V5, V6, V7, V0, V1, V22, V23, V8, V16, V21)
	SHA512ROUND1(V1, V2, V3, V4, V5, V6, V7, V0, V23, V8, V9, V17, V22)

	BC	0x10, 0, L16_xx		// bdnz

	// Add the values saved at the top of the loop back into a-h. V10-V17 are
	// free to use as scratch here because the schedule is finished.
	LVX	(OFFLOAD)(HEX00), V10

	LVX	(OFFLOAD)(HEX10), V11
	VADDUDM	V10, V0, V0
	LVX	(OFFLOAD)(HEX20), V12
	VADDUDM	V11, V1, V1
	LVX	(OFFLOAD)(HEX30), V13
	VADDUDM	V12, V2, V2
	LVX	(OFFLOAD)(HEX40), V14
	VADDUDM	V13, V3, V3
	LVX	(OFFLOAD)(HEX50), V15
	VADDUDM	V14, V4, V4
	LVX	(OFFLOAD)(HEX60), V16
	VADDUDM	V15, V5, V5
	LVX	(OFFLOAD)(HEX70), V17
	VADDUDM	V16, V6, V6
	VADDUDM	V17, V7, V7

	CMPU	INP, END
	BLT	loop

	// After the 80th round KI holds the last ·kcon entry (offset 0x510). It is
	// used here as the VPERM control that merges each register pair back into
	// one two-word vector for the store.
	VPERM	V0, V1, KI, V0
	VPERM	V2, V3, KI, V2
	VPERM	V4, V5, KI, V4
	VPERM	V6, V7, KI, V6
	STXVD2X	VS32, (CTX+HEX00)	// v0 = vs32
	STXVD2X	VS34, (CTX+HEX10)	// v2 = vs34
	STXVD2X	VS36, (CTX+HEX20)	// v4 = vs36
	STXVD2X	VS38, (CTX+HEX30)	// v6 = vs38

end:
	RET
451
452