;;
;; Copyright (c) 2012-2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; code to compute quad SHA256 using AVX
;; outer calling routine takes care of save and restore of XMM registers
;; Logic designed/laid out by JDG

;; Stack must be aligned to 16 bytes before call
;; Windows clobbers:  rax rbx     rdx             r8 r9 r10 r11 r12
;; Windows preserves:         rcx     rsi rdi rbp                   r13 r14 r15
;;
;; Linux clobbers:    rax rbx         rsi         r8 r9 r10 r11 r12
;; Linux preserves:           rcx rdx     rdi rbp                   r13 r14 r15
;;
;; clobbers xmm0-15

%include "include/os.asm"
%include "mb_mgr_datastruct.asm"
%include "include/clear_regs.asm"

extern K256_4

%ifdef LINUX
 %define arg1	rdi
 %define arg2	rsi
%else
 ; Windows definitions
 %define arg1	rcx
 %define arg2	rdx
%endif

; Common definitions
%define STATE    arg1
%define INP_SIZE arg2

%define IDX     rax
%define ROUND	rbx
%define TBL	r12

%define inp0 r8
%define inp1 r9
%define inp2 r10
%define inp3 r11

%define a xmm0
%define b xmm1
%define c xmm2
%define d xmm3
%define e xmm4
%define f xmm5
%define g xmm6
%define h xmm7

%define a0 xmm8
%define a1 xmm9
%define a2 xmm10

%define TT0 xmm14
%define TT1 xmm13
%define TT2 xmm12
%define TT3 xmm11
%define TT4 xmm10
%define TT5 xmm9

%define T1  xmm14
%define TMP xmm15

%define SZ4	4*SHA256_DIGEST_WORD_SIZE	; Size of one vector register
%define ROUNDS 64*SZ4

; Define stack usage
struc STACK
_DATA:		resb	SZ4 * 16
_DIGEST:	resb	SZ4 * NUM_SHA256_DIGEST_WORDS
		resb	8	; for alignment, must be odd multiple of 8
endstruc
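; Note on the pad above: _DATA (16*SZ4) plus _DIGEST (8*SZ4) is a multiple of
; 16, so the extra 8 bytes make STACK_size an odd multiple of 8.  The caller
; guarantees a 16-byte aligned stack before the call; the call itself pushes
; 8 bytes, so subtracting an odd multiple of 8 restores 16-byte alignment for
; the aligned vmovdqa accesses to the stack frame.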

%define VMOVPS	vmovups

; transpose r0, r1, r2, r3, t0, t1
; "transpose" data in {r0..r3} using temps {t0..t1}
; Input looks like: {r0 r1 r2 r3}
; r0 = {a3 a2 a1 a0}
; r1 = {b3 b2 b1 b0}
; r2 = {c3 c2 c1 c0}
; r3 = {d3 d2 d1 d0}
;
; output looks like: {t0 r1 r0 r3}
; t0 = {d0 c0 b0 a0}
; r1 = {d1 c1 b1 a1}
; r0 = {d2 c2 b2 a2}
; r3 = {d3 c3 b3 a3}
;
%macro TRANSPOSE 6
%define %%r0 %1
%define %%r1 %2
%define %%r2 %3
%define %%r3 %4
%define %%t0 %5
%define %%t1 %6
	vshufps	%%t0, %%r0, %%r1, 0x44	; t0 = {b1 b0 a1 a0}
	vshufps	%%r0, %%r0, %%r1, 0xEE	; r0 = {b3 b2 a3 a2}

	vshufps	%%t1, %%r2, %%r3, 0x44	; t1 = {d1 d0 c1 c0}
	vshufps	%%r2, %%r2, %%r3, 0xEE	; r2 = {d3 d2 c3 c2}

	vshufps	%%r1, %%t0, %%t1, 0xDD	; r1 = {d1 c1 b1 a1}

	vshufps	%%r3, %%r0, %%r2, 0xDD	; r3 = {d3 c3 b3 a3}

	vshufps	%%r0, %%r0, %%r2, 0x88	; r0 = {d2 c2 b2 a2}
	vshufps	%%t0, %%t0, %%t1, 0x88	; t0 = {d0 c0 b0 a0}
%endmacro
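; vshufps selector reference for the immediates above: the two low dwords of
; the destination come from the first source and the two high dwords from the
; second source, two selector bits per dword.  Hence 0x44 interleaves the low
; halves of the sources, 0xEE the high halves, 0x88 gathers the even-indexed
; dwords and 0xDD the odd-indexed dwords.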



%macro ROTATE_ARGS 0
%xdefine TMP_ h
%xdefine h g
%xdefine g f
%xdefine f e
%xdefine e d
%xdefine d c
%xdefine c b
%xdefine b a
%xdefine a TMP_
%endm
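; ROTATE_ARGS renames the working variables instead of moving data: after a
; round the old h (which accumulated T1 + T2) becomes the new a, and the old
; d (which received d + T1) becomes the new e, giving the standard SHA-256
; state rotation without any register-to-register copies.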

; PRORD reg, imm, tmp
%macro PRORD 3
%define %%reg %1
%define %%imm %2
%define %%tmp %3
	vpslld	%%tmp, %%reg, (32-(%%imm))
	vpsrld	%%reg, %%reg, %%imm
	vpor	%%reg, %%reg, %%tmp
%endmacro

; non-destructive
; PRORD_nd reg, imm, tmp, src
%macro PRORD_nd 4
%define %%reg %1
%define %%imm %2
%define %%tmp %3
%define %%src %4
	;vmovdqa	%%tmp, %%reg
	vpslld	%%tmp, %%src, (32-(%%imm))
	vpsrld	%%reg, %%src, %%imm
	vpor	%%reg, %%reg, %%tmp
%endmacro

; PRORD dst/src, amt
%macro PRORD 2
	PRORD	%1, %2, TMP
%endmacro

; PRORD_nd dst, src, amt
%macro PRORD_nd 3
	PRORD_nd	%1, %3, TMP, %2
%endmacro
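; AVX has no packed dword rotate instruction, so PRORD builds a rotate right
; by N from a left shift by (32-N), a right shift by N and an OR.  The _nd
; (non-destructive) form writes the result to a separate register so the
; source operand survives for later use in the round.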

;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_00_15 2
%define %%T1 %1
%define %%i  %2
	PRORD_nd	a0, e, (11-6)	; sig1: a0 = (e >> 5)

	vpxor	a2, f, g	; ch: a2 = f^g
	vpand	a2, a2, e	; ch: a2 = (f^g)&e
	vpxor	a2, a2, g	; a2 = ch

	PRORD_nd	a1, e, 25		; sig1: a1 = (e >> 25)
	vmovdqa	[SZ4*(%%i&0xf) + rsp + _DATA], %%T1
	vpaddd	%%T1, %%T1, [TBL + ROUND]	; T1 = W + K
	vpxor	a0, a0, e	; sig1: a0 = e ^ (e >> 5)
	PRORD	a0, 6		; sig1: a0 = (e >> 6) ^ (e >> 11)
	vpaddd	h, h, a2	; h = h + ch
	PRORD_nd	a2, a, (13-2)	; sig0: a2 = (a >> 11)
	vpaddd	h, h, %%T1	; h = h + ch + W + K
	vpxor	a0, a0, a1	; a0 = sigma1
	PRORD_nd	a1, a, 22	; sig0: a1 = (a >> 22)
	vpxor	%%T1, a, c	; maj: T1 = a^c
	add	ROUND, SZ4	; ROUND++
	vpand	%%T1, %%T1, b	; maj: T1 = (a^c)&b
	vpaddd	h, h, a0

	vpaddd	d, d, h

	vpxor	a2, a2, a	; sig0: a2 = a ^ (a >> 11)
	PRORD	a2, 2		; sig0: a2 = (a >> 2) ^ (a >> 13)
	vpxor	a2, a2, a1	; a2 = sig0
	vpand	a1, a, c	; maj: a1 = a&c
	vpor	a1, a1, %%T1	; a1 = maj
	vpaddd	h, h, a1	; h = h + ch + W + K + maj
	vpaddd	h, h, a2	; h = h + ch + W + K + maj + sigma0

	ROTATE_ARGS
%endm
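;; ROUND_00_15 performs one SHA-256 round per 32-bit lane:
;;   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
;;   T2 = Sigma0(a) + Maj(a,b,c)
;;   d += T1, h = T1 + T2   (ROTATE_ARGS then turns h into a and d into e)
;; The big sigmas are evaluated with a shared rotate, which is why the rotate
;; amounts above appear as (11-6) and (13-2):
;;   Sigma1(e) = ror(e,6) ^ ror(e,11) ^ ror(e,25) = ror(ror(e,5)  ^ e, 6) ^ ror(e,25)
;;   Sigma0(a) = ror(a,2) ^ ror(a,13) ^ ror(a,22) = ror(ror(a,11) ^ a, 2) ^ ror(a,22)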


;; arguments passed implicitly in preprocessor symbols i, a...h
%macro ROUND_16_XX 2
%define %%T1 %1
%define %%i  %2
	vmovdqa	%%T1, [SZ4*((%%i-15)&0xf) + rsp + _DATA]
	vmovdqa	a1, [SZ4*((%%i-2)&0xf) + rsp + _DATA]
	vmovdqa	a0, %%T1
	PRORD	%%T1, 18-7
	vmovdqa	a2, a1
	PRORD	a1, 19-17
	vpxor	%%T1, %%T1, a0
	PRORD	%%T1, 7
	vpxor	a1, a1, a2
	PRORD	a1, 17
	vpsrld	a0, a0, 3
	vpxor	%%T1, %%T1, a0
	vpsrld	a2, a2, 10
	vpxor	a1, a1, a2
	vpaddd	%%T1, %%T1, [SZ4*((%%i-16)&0xf) + rsp + _DATA]
	vpaddd	a1, a1, [SZ4*((%%i-7)&0xf) + rsp + _DATA]
	vpaddd	%%T1, %%T1, a1

	ROUND_00_15 %%T1, %%i
%endm
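;; ROUND_16_XX extends the message schedule in the 16-entry _DATA ring buffer
;; and then runs a normal round on the new word:
;;   W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]
;;   sigma0(x) = ror(x,7)  ^ ror(x,18) ^ (x >> 3)
;;   sigma1(x) = ror(x,17) ^ ror(x,19) ^ (x >> 10)
;; The small sigmas use the same shared-rotate form as the big sigmas, hence
;; the 18-7 and 19-17 rotate amounts above.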
249
250section .data
251default rel
252align 16
253PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
254	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
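; vpshufb with this mask byte-swaps each 32-bit word, converting the
; little-endian loads into the big-endian word order SHA-256 operates on.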

section .text

;; SHA256_ARGS:
;;   UINT128 digest[8];  // transposed digests
;;   UINT8  *data_ptr[4];
;;

;; void sha_256_mult_avx(SHA256_ARGS *args, UINT64 num_blocks);
;; arg 1 : STATE    : pointer to args structure
;; arg 2 : INP_SIZE : size of data in blocks (assumed >= 1)
;;
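;; The digest is stored transposed: digest row j holds state word j for each
;; lane, one dword per lane, and this 4-lane kernel touches only the first
;; 16 bytes of each row.  The data pointers need not be 16-byte aligned since
;; message blocks are loaded with vmovups.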
MKGLOBAL(sha_256_mult_avx,function,internal)
align 16
sha_256_mult_avx:
	; general registers preserved in outer calling routine
	; outer calling routine saves all the XMM registers
	sub	rsp, STACK_size

	;; Load the pre-transposed incoming digest.
	vmovdqa	a,[STATE+0*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	b,[STATE+1*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	c,[STATE+2*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	d,[STATE+3*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	e,[STATE+4*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	f,[STATE+5*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	g,[STATE+6*SHA256_DIGEST_ROW_SIZE]
	vmovdqa	h,[STATE+7*SHA256_DIGEST_ROW_SIZE]

	lea	TBL,[rel K256_4]

	;; load the address of each of the 4 message lanes
	;; getting ready to transpose input onto stack
	mov	inp0,[STATE + _data_ptr_sha256 + 0*PTR_SZ]
	mov	inp1,[STATE + _data_ptr_sha256 + 1*PTR_SZ]
	mov	inp2,[STATE + _data_ptr_sha256 + 2*PTR_SZ]
	mov	inp3,[STATE + _data_ptr_sha256 + 3*PTR_SZ]

	xor	IDX, IDX
lloop:
	xor	ROUND, ROUND

	;; save old digest
	vmovdqa	[rsp + _DIGEST + 0*SZ4], a
	vmovdqa	[rsp + _DIGEST + 1*SZ4], b
	vmovdqa	[rsp + _DIGEST + 2*SZ4], c
	vmovdqa	[rsp + _DIGEST + 3*SZ4], d
	vmovdqa	[rsp + _DIGEST + 4*SZ4], e
	vmovdqa	[rsp + _DIGEST + 5*SZ4], f
	vmovdqa	[rsp + _DIGEST + 6*SZ4], g
	vmovdqa	[rsp + _DIGEST + 7*SZ4], h

%assign i 0
%rep 4
	vmovdqa	TMP, [rel PSHUFFLE_BYTE_FLIP_MASK]
	VMOVPS	TT2,[inp0+IDX+i*16]
	VMOVPS	TT1,[inp1+IDX+i*16]
	VMOVPS	TT4,[inp2+IDX+i*16]
	VMOVPS	TT3,[inp3+IDX+i*16]
	TRANSPOSE	TT2, TT1, TT4, TT3, TT0, TT5
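	;; after the transpose TT0/TT1/TT2/TT3 hold message words
	;; (i*4+0)..(i*4+3), each register carrying one dword per lane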
	vpshufb	TT0, TT0, TMP
	vpshufb	TT1, TT1, TMP
	vpshufb	TT2, TT2, TMP
	vpshufb	TT3, TT3, TMP
	ROUND_00_15	TT0,(i*4+0)
	ROUND_00_15	TT1,(i*4+1)
	ROUND_00_15	TT2,(i*4+2)
	ROUND_00_15	TT3,(i*4+3)
%assign i (i+1)
%endrep
	add	IDX, 4*4*4

%assign i (i*4)
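	;; 4*16 = 64 message bytes per lane have been consumed (the IDX advance
	;; above) and rounds 0-15 are done; i is now 4, so i*4 = 16 is the first
	;; round index for the schedule-extending rounds below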

	jmp	Lrounds_16_xx
align 16
Lrounds_16_xx:
%rep 16
	ROUND_16_XX	T1, i
%assign i (i+1)
%endrep

	cmp	ROUND,ROUNDS
	jb	Lrounds_16_xx

	;; add old digest
	vpaddd	a, a, [rsp + _DIGEST + 0*SZ4]
	vpaddd	b, b, [rsp + _DIGEST + 1*SZ4]
	vpaddd	c, c, [rsp + _DIGEST + 2*SZ4]
	vpaddd	d, d, [rsp + _DIGEST + 3*SZ4]
	vpaddd	e, e, [rsp + _DIGEST + 4*SZ4]
	vpaddd	f, f, [rsp + _DIGEST + 5*SZ4]
	vpaddd	g, g, [rsp + _DIGEST + 6*SZ4]
	vpaddd	h, h, [rsp + _DIGEST + 7*SZ4]

	sub	INP_SIZE, 1  ;; unit is blocks
	jne	lloop

	; write back to memory (state object) the transposed digest
	vmovdqa	[STATE+0*SHA256_DIGEST_ROW_SIZE],a
	vmovdqa	[STATE+1*SHA256_DIGEST_ROW_SIZE],b
	vmovdqa	[STATE+2*SHA256_DIGEST_ROW_SIZE],c
	vmovdqa	[STATE+3*SHA256_DIGEST_ROW_SIZE],d
	vmovdqa	[STATE+4*SHA256_DIGEST_ROW_SIZE],e
	vmovdqa	[STATE+5*SHA256_DIGEST_ROW_SIZE],f
	vmovdqa	[STATE+6*SHA256_DIGEST_ROW_SIZE],g
	vmovdqa	[STATE+7*SHA256_DIGEST_ROW_SIZE],h

	; update input pointers
	add	inp0, IDX
	mov	[STATE + _data_ptr_sha256 + 0*8], inp0
	add	inp1, IDX
	mov	[STATE + _data_ptr_sha256 + 1*8], inp1
	add	inp2, IDX
	mov	[STATE + _data_ptr_sha256 + 2*8], inp2
	add	inp3, IDX
	mov	[STATE + _data_ptr_sha256 + 3*8], inp3

	;;;;;;;;;;;;;;;;
	;; Postamble

%ifdef SAFE_DATA
        ;; Clear stack frame ((16 + 8)*16 bytes)
        clear_all_xmms_avx_asm
%assign i 0
%rep (16+NUM_SHA256_DIGEST_WORDS)
        vmovdqa [rsp + i*SZ4], xmm0
%assign i (i+1)
%endrep
%endif

	add	rsp, STACK_size
	; outer calling routine restores XMM and other GP registers
	ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif