1/* $OpenBSD: sha512_amd64_generic.S,v 1.1 2024/11/16 14:56:39 jsing Exp $ */
2/*
3 * Copyright (c) 2024 Joel Sing <jsing@openbsd.org>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17
18#ifdef __CET__
19#include <cet.h>
20#else
21#define _CET_ENDBR
22#endif
23
24#define	ctx		%rdi
25#define	in		%rsi
26#define	num		%rdx
27
28#define	round		%rdi
29
30#define	hs0		%r8
31#define	hs1		%r9
32#define	hs2		%r10
33#define	hs3		%r11
34#define	hs4		%r12
35#define	hs5		%r13
36#define	hs6		%r14
37#define	hs7		%r15
38
39#define	k512		%rbp
40
41#define	tmp0		%rax
42#define	tmp1		%rbx
43#define	tmp2		%rcx
44#define	tmp3		%rdx
45
46/*
47 * Load message into wt, storing a copy in the message schedule:
48 *
49 *  Wt = Mt
50 */
51#define sha512_message_schedule_load(idx, m, w, wt) \
52	movq	(m, round, 8), wt;				\
53	bswapq	wt;						\
54	movq	wt, ((idx&0xf)*8)(w);
55
56/*
57 * Update message schedule and return current value in wt:
58 *
59 *  Wt = sigma1(W(t-2)) + W(t-7) + sigma0(W(t-15)) + W(t-16)
60 *
61 *  sigma0(x) = ror(x, 1) ^ ror(x, 8) ^ (x >> 7)
62 *  sigma1(x) = ror(x, 19) ^ ror(x, 61) ^ (x >> 6)
63 *
64 */
65#define sha512_message_schedule_update(idx, w, wt) \
66	movq	(((idx-2)&0xf)*8)(w), wt;	/* sigma1 */	\
67	movq	wt, tmp1;			/* sigma1 */	\
68	rorq	$(61-19), tmp1;			/* sigma1 */	\
69	xorq	wt, tmp1;			/* sigma1 */	\
70	rorq	$19, tmp1;			/* sigma1 */	\
71	shrq	$6, wt;				/* sigma1 */	\
72	xorq	tmp1, wt;			/* sigma1 */	\
73	\
74	addq	(((idx-7)&0xf)*8)(w), wt;	/* Wt-7 */	\
75	addq	(((idx-16)&0xf)*8)(w), wt;	/* Wt-16 */	\
76	\
77	movq	(((idx-15)&0xf)*8)(w), tmp2;	/* sigma0 */	\
78	movq	tmp2, tmp3;			/* sigma0 */	\
79	rorq	$(8-1), tmp2;			/* sigma0 */	\
80	xorq	tmp3, tmp2;			/* sigma0 */	\
81	rorq	$1, tmp2;			/* sigma0 */	\
82	shrq	$7, tmp3;			/* sigma0 */	\
83	xorq	tmp3, tmp2;			/* sigma0 */	\
84	addq	tmp2, wt;			/* sigma0 */	\
85	\
86	movq	wt, ((idx&0xf)*8)(w);
87
88/*
89 * Compute a SHA-512 round:
90 *
91 *  T1 = h + Sigma1(e) + Ch(e, f, g) + Kt + Wt
92 *  T2 = Sigma0(a) + Maj(a, b, c)
93 *
94 *  Sigma0(x) = ror(x, 28) ^ ror(x, 34) ^ ror(x, 39)
95 *  Sigma1(x) = ror(x, 14) ^ ror(x, 18) ^ ror(x, 41)
96 *  Ch(x, y, z) = (x & y) ^ (~x & z) = ((y ^ z) & x) ^ z
97 *  Maj(x, y, z) = (x & y) ^ (x & z) ^ (y & z) = ((y ^ z) & x) ^ (y & z)
98 *
99 * Upon completion d = d + T1, h = T1 + T2, pending rotation.
100 */
101#define sha512_round(idx, a, b, c, d, e, f, g, h, k, w, wt) \
102	addq	wt, h;				/* T1 Wt */	\
103	addq	(k512, round, 8), h;		/* T1 Kt */	\
104	\
105	movq	e, tmp1;			/* T1 Sigma1 */	\
106	rorq	$(41-18), tmp1;			/* T1 Sigma1 */	\
107	xorq	e, tmp1;			/* T1 Sigma1 */ \
108	rorq	$(18-14), tmp1;			/* T1 Sigma1 */	\
109	xorq	e, tmp1;			/* T1 Sigma1 */ \
110	rorq	$14, tmp1;			/* T1 Sigma1 */	\
111	addq	tmp1, h;			/* T1 Sigma1 */	\
112	\
113	movq	f, tmp2;			/* T1 Ch */	\
114	xorq	g, tmp2;			/* T1 Ch */	\
115	andq	e, tmp2;			/* T1 Ch */	\
116	xorq	g, tmp2;			/* T1 Ch */	\
117	addq	tmp2, h;			/* T1 Ch */	\
118	\
119	addq	h, d;				/* d += T1 */	\
120	\
121	movq	a, tmp1;			/* T2 Sigma0 */	\
122	rorq	$(39-34), tmp1;			/* T2 Sigma0 */	\
123	xorq	a, tmp1;			/* T2 Sigma0 */	\
124	rorq	$(34-28), tmp1;			/* T2 Sigma0 */	\
125	xorq	a, tmp1;			/* T2 Sigma0 */	\
126	rorq	$28, tmp1;			/* T2 Sigma0 */	\
127	addq	tmp1, h;			/* T2 Sigma0 */	\
128	\
129	movq	b, tmp2;			/* T2 Maj */	\
130	xorq	c, tmp2;			/* T2 Maj */	\
131	andq	a, tmp2;			/* T2 Maj */	\
132	movq	b, tmp3;			/* T2 Maj */	\
133	andq	c, tmp3;			/* T2 Maj */	\
134	xorq	tmp2, tmp3;			/* T2 Maj */	\
135	addq	tmp3, h;			/* T2 Maj */	\
136	\
137	addq	$1, round;
138
139#define sha512_round_load(idx, a, b, c, d, e, f, g, h) \
140	sha512_message_schedule_load(idx, in, %rsp, tmp0) \
141	sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
142
143#define sha512_round_update(idx, a, b, c, d, e, f, g, h) \
144	sha512_message_schedule_update(idx, %rsp, tmp0) \
145	sha512_round(idx, a, b, c, d, e, f, g, h, k512, %rsp, tmp0)
146
147.text
148
149/*
150 * void sha512_block_generic(SHA512_CTX *ctx, const void *in, size_t num);
151 *
152 * Standard x86-64 ABI: rdi = ctx, rsi = in, rdx = num
153 */
154.align 16
155.globl	sha512_block_generic
156.type	sha512_block_generic,@function
157sha512_block_generic:
158	_CET_ENDBR
159
160	/* Save callee save registers. */
161	pushq	%rbx
162	pushq	%rbp
163	pushq	%r12
164	pushq	%r13
165	pushq	%r14
166	pushq	%r15
167
168	/* Allocate space for message schedule and context pointer. */
169	movq	%rsp, %rax
170	subq	$(128+3*8), %rsp
171	andq	$~63, %rsp
172	movq	%rax, (128+2*8)(%rsp)
173	movq	ctx, (128+1*8)(%rsp)
174
175	/* Compute and store end of message. */
176	shlq	$7, num
177	leaq	(in, num, 1), %rbx
178	movq	%rbx, (128+0*8)(%rsp)
179
180	/* Address of SHA-512 constants. */
181	leaq	K512(%rip), k512
182
183	/* Load current hash state from context. */
184	movq	(0*8)(ctx), hs0
185	movq	(1*8)(ctx), hs1
186	movq	(2*8)(ctx), hs2
187	movq	(3*8)(ctx), hs3
188	movq	(4*8)(ctx), hs4
189	movq	(5*8)(ctx), hs5
190	movq	(6*8)(ctx), hs6
191	movq	(7*8)(ctx), hs7
192
193	jmp	.Lblock_loop0
194
195.align 16
196.Lblock_loop0:
197	mov	$0, round
198
199	/* Round 0 through 15. */
200	sha512_round_load(0, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
201	sha512_round_load(1, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
202	sha512_round_load(2, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
203	sha512_round_load(3, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
204	sha512_round_load(4, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
205	sha512_round_load(5, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
206	sha512_round_load(6, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
207	sha512_round_load(7, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
208	sha512_round_load(8, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
209	sha512_round_load(9, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
210	sha512_round_load(10, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
211	sha512_round_load(11, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
212	sha512_round_load(12, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
213	sha512_round_load(13, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
214	sha512_round_load(14, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
215	sha512_round_load(15, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
216
217	jmp	.Lblock_loop16
218
219.align 16
220.Lblock_loop16:
221	/* Round 16 through 79. */
222	sha512_round_update(16, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
223	sha512_round_update(17, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
224	sha512_round_update(18, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
225	sha512_round_update(19, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
226	sha512_round_update(20, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
227	sha512_round_update(21, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
228	sha512_round_update(22, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
229	sha512_round_update(23, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
230	sha512_round_update(24, hs0, hs1, hs2, hs3, hs4, hs5, hs6, hs7)
231	sha512_round_update(25, hs7, hs0, hs1, hs2, hs3, hs4, hs5, hs6)
232	sha512_round_update(26, hs6, hs7, hs0, hs1, hs2, hs3, hs4, hs5)
233	sha512_round_update(27, hs5, hs6, hs7, hs0, hs1, hs2, hs3, hs4)
234	sha512_round_update(28, hs4, hs5, hs6, hs7, hs0, hs1, hs2, hs3)
235	sha512_round_update(29, hs3, hs4, hs5, hs6, hs7, hs0, hs1, hs2)
236	sha512_round_update(30, hs2, hs3, hs4, hs5, hs6, hs7, hs0, hs1)
237	sha512_round_update(31, hs1, hs2, hs3, hs4, hs5, hs6, hs7, hs0)
238
239	cmp	$80, round
240	jb	.Lblock_loop16
241
242	movq	(128+1*8)(%rsp), ctx
243
244	/* Add intermediate state to hash state. */
245	addq	(0*8)(ctx), hs0
246	addq	(1*8)(ctx), hs1
247	addq	(2*8)(ctx), hs2
248	addq	(3*8)(ctx), hs3
249	addq	(4*8)(ctx), hs4
250	addq	(5*8)(ctx), hs5
251	addq	(6*8)(ctx), hs6
252	addq	(7*8)(ctx), hs7
253
254	/* Store new hash state to context. */
255	movq	hs0, (0*8)(ctx)
256	movq	hs1, (1*8)(ctx)
257	movq	hs2, (2*8)(ctx)
258	movq	hs3, (3*8)(ctx)
259	movq	hs4, (4*8)(ctx)
260	movq	hs5, (5*8)(ctx)
261	movq	hs6, (6*8)(ctx)
262	movq	hs7, (7*8)(ctx)
263
264	addq	$128, in
265	cmpq	(128+0*8)(%rsp), in
266	jb	.Lblock_loop0
267
268	movq	(128+2*8)(%rsp), %rsp
269
270	/* Restore callee save registers. */
271	popq	%r15
272	popq	%r14
273	popq	%r13
274	popq	%r12
275	popq	%rbp
276	popq	%rbx
277
278	ret
279
280/*
281 * SHA-512 constants - see FIPS 180-4 section 4.2.3.
282 */
283.rodata
284.align	64
285.type	K512,@object
286K512:
287.quad	0x428a2f98d728ae22, 0x7137449123ef65cd, 0xb5c0fbcfec4d3b2f, 0xe9b5dba58189dbbc
288.quad	0x3956c25bf348b538, 0x59f111f1b605d019, 0x923f82a4af194f9b, 0xab1c5ed5da6d8118
289.quad	0xd807aa98a3030242, 0x12835b0145706fbe, 0x243185be4ee4b28c, 0x550c7dc3d5ffb4e2
290.quad	0x72be5d74f27b896f, 0x80deb1fe3b1696b1, 0x9bdc06a725c71235, 0xc19bf174cf692694
291.quad	0xe49b69c19ef14ad2, 0xefbe4786384f25e3, 0x0fc19dc68b8cd5b5, 0x240ca1cc77ac9c65
292.quad	0x2de92c6f592b0275, 0x4a7484aa6ea6e483, 0x5cb0a9dcbd41fbd4, 0x76f988da831153b5
293.quad	0x983e5152ee66dfab, 0xa831c66d2db43210, 0xb00327c898fb213f, 0xbf597fc7beef0ee4
294.quad	0xc6e00bf33da88fc2, 0xd5a79147930aa725, 0x06ca6351e003826f, 0x142929670a0e6e70
295.quad	0x27b70a8546d22ffc, 0x2e1b21385c26c926, 0x4d2c6dfc5ac42aed, 0x53380d139d95b3df
296.quad	0x650a73548baf63de, 0x766a0abb3c77b2a8, 0x81c2c92e47edaee6, 0x92722c851482353b
297.quad	0xa2bfe8a14cf10364, 0xa81a664bbc423001, 0xc24b8b70d0f89791, 0xc76c51a30654be30
298.quad	0xd192e819d6ef5218, 0xd69906245565a910, 0xf40e35855771202a, 0x106aa07032bbd1b8
299.quad	0x19a4c116b8d2d0c8, 0x1e376c085141ab53, 0x2748774cdf8eeb99, 0x34b0bcb5e19b48a8
300.quad	0x391c0cb3c5c95a63, 0x4ed8aa4ae3418acb, 0x5b9cca4f7763e373, 0x682e6ff3d6b2b8a3
301.quad	0x748f82ee5defb2fc, 0x78a5636f43172f60, 0x84c87814a1f0ab72, 0x8cc702081a6439ec
302.quad	0x90befffa23631e28, 0xa4506cebde82bde9, 0xbef9a3f7b2c67915, 0xc67178f2e372532b
303.quad	0xca273eceea26619c, 0xd186b8c721c0c207, 0xeada7dd6cde0eb1e, 0xf57d4f7fee6ed178
304.quad	0x06f067aa72176fba, 0x0a637dc5a2c898a6, 0x113f9804bef90dae, 0x1b710b35131c471b
305.quad	0x28db77f523047d84, 0x32caab7b40c72493, 0x3c9ebe0a15c9bebc, 0x431d67c49c100d4c
306.quad	0x4cc5d4becb3e42b6, 0x597f299cfc657e2a, 0x5fcb6fab3ad6faec, 0x6c44198c4a475817
307.size	K512,.-K512
308