;;
;; Copyright (c) 2012-2020, Intel Corporation
;;
;; Redistribution and use in source and binary forms, with or without
;; modification, are permitted provided that the following conditions are met:
;;
;;     * Redistributions of source code must retain the above copyright notice,
;;       this list of conditions and the following disclaimer.
;;     * Redistributions in binary form must reproduce the above copyright
;;       notice, this list of conditions and the following disclaimer in the
;;       documentation and/or other materials provided with the distribution.
;;     * Neither the name of Intel Corporation nor the names of its contributors
;;       may be used to endorse or promote products derived from this software
;;       without specific prior written permission.
;;
;; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
;; AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
;; IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
;; DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
;; FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
;; DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
;; SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
;; CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
;; OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
;; OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
;;

;; Stack must be aligned to 32 bytes before call
;;
;; Registers:		RAX RBX RCX RDX RBP RSI RDI R8  R9  R10 R11 R12 R13 R14 R15
;;			-----------------------------------------------------------
;; Windows clobbers:	RAX         RDX             R8  R9  R10 R11 R12 R13 R14 R15
;; Windows preserves:	    RBX RCX     RBP RSI RDI
;;			-----------------------------------------------------------
;; Linux clobbers:	RAX         RDX     RSI         R9  R10 R11 R12 R13 R14 R15
;; Linux preserves:	    RBX RCX     RBP     RDI R8
;;			-----------------------------------------------------------
;; Clobbers ZMM0-31

%include "include/os.asm"
;%define DO_DBGPRINT
%include "include/dbgprint.asm"
%include "mb_mgr_datastruct.asm"
%include "include/transpose_avx512.asm"
%include "include/reg_sizes.asm"
%include "include/clear_regs.asm"

section .data
default rel
align 64
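;; SHA-1 round constants, one 64-byte vector per 20-round group, with the
;; 32-bit constant broadcast across all sixteen lanes.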
K00_19:	;ddq 0x5A8279995A8279995A8279995A827999
	;ddq 0x5A8279995A8279995A8279995A827999
	;ddq 0x5A8279995A8279995A8279995A827999
	;ddq 0x5A8279995A8279995A8279995A827999
	dq 0x5A8279995A827999, 0x5A8279995A827999
	dq 0x5A8279995A827999, 0x5A8279995A827999
	dq 0x5A8279995A827999, 0x5A8279995A827999
	dq 0x5A8279995A827999, 0x5A8279995A827999
K20_39: ;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
	;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
	;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
	;ddq 0x6ED9EBA16ED9EBA16ED9EBA16ED9EBA1
	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
	dq 0x6ED9EBA16ED9EBA1, 0x6ED9EBA16ED9EBA1
K40_59: ;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
	;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
	;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
	;ddq 0x8F1BBCDC8F1BBCDC8F1BBCDC8F1BBCDC
	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
	dq 0x8F1BBCDC8F1BBCDC, 0x8F1BBCDC8F1BBCDC
K60_79: ;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
	;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
	;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
	;ddq 0xCA62C1D6CA62C1D6CA62C1D6CA62C1D6
	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6
	dq 0xCA62C1D6CA62C1D6, 0xCA62C1D6CA62C1D6

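;; vpshufb mask to byte-swap each 32-bit word: blocks are loaded
;; little-endian, while SHA-1 operates on big-endian words.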
PSHUFFLE_BYTE_FLIP_MASK: ;ddq 0x0c0d0e0f08090a0b0405060700010203
			 ;ddq 0x0c0d0e0f08090a0b0405060700010203
			 ;ddq 0x0c0d0e0f08090a0b0405060700010203
			 ;ddq 0x0c0d0e0f08090a0b0405060700010203
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b
	dq 0x0405060700010203, 0x0c0d0e0f08090a0b

section .text

%define APPEND(a,b) a %+ b

%ifdef LINUX
%define arg1	rdi
%define arg2	rsi
%define arg3	rdx
%define arg4	rcx
%else
%define arg1	rcx
%define arg2	rdx
%define arg3	r8
%define arg4	r9
%endif

%define state	arg1
%define SIZE	arg2
%define IDX	arg3

%define A	zmm0
%define B	zmm1
%define C	zmm2
%define D	zmm3
%define E	zmm4
%define KT	zmm5
%define AA	zmm6
%define BB	zmm7
%define CC	zmm8
%define DD	zmm9
%define EE	zmm10
%define TMP0	zmm11
%define TMP1	zmm12
%define TMP2	zmm13
%define TMP3	zmm14
%define TMP4	zmm15

%define W0	zmm16
%define W1	zmm17
%define W2	zmm18
%define W3	zmm19
%define W4	zmm20
%define W5	zmm21
%define W6	zmm22
%define W7	zmm23
%define W8	zmm24
%define W9	zmm25
%define W10	zmm26
%define W11	zmm27
%define W12	zmm28
%define W13	zmm29
%define W14	zmm30
%define W15	zmm31

%define inp0	r9
%define inp1	r10
%define inp2	r11
%define inp3	r12
%define inp4	r13
%define inp5	r14
%define inp6	r15
%define inp7	rax

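;; Rotate the A-E working-variable aliases. This is a textual rotation done
;; by the preprocessor at assembly time, so no register moves are executed.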
%macro ROTATE_ARGS 0
%xdefine TMP_ E
%xdefine E D
%xdefine D C
%xdefine C B
%xdefine B A
%xdefine A TMP_
%endm

%macro PROCESS_LOOP 2
%define %%WT		%1
%define %%F_IMMED	%2

	; T = ROTL_5(A) + Ft(B,C,D) + E + Kt + Wt
	; E=D, D=C, C=ROTL_30(B), B=A, A=T

	; Ft
	;  0-19 Ch(B,C,D) = (B&C) ^ (~B&D)
	; 20-39, 60-79 Parity(B,C,D) = B ^ C ^ D
	; 40-59 Maj(B,C,D) = (B&C) ^ (B&D) ^ (C&D)
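	;
	; %%F_IMMED is the vpternlogd truth table for Ft: bit i of the
	; immediate is Ft evaluated on input bits (B,C,D) = binary i,
	; giving Ch -> 0xCA, Parity -> 0x96, Maj -> 0xE8. For example,
	; Parity: B^C^D = 1 for inputs 001, 010, 100, 111 -> 10010110b = 0x96.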

	vmovdqa32	TMP1, B			; Copy B
	vpaddd		E, E, %%WT		; E = E + Wt
	vpternlogd	TMP1, C, D, %%F_IMMED	; TMP1 = Ft(B,C,D)
	vpaddd		E, E, KT		; E = E + Wt + Kt
	vprold		TMP0, A, 5		; TMP0 = ROTL_5(A)
	vpaddd		E, E, TMP1		; E = Ft(B,C,D) + E + Kt + Wt
	vprold		B, B, 30		; B = ROTL_30(B)
	vpaddd		E, E, TMP0		; E = T

	ROTATE_ARGS
%endmacro

%macro MSG_SCHED_ROUND_16_79 4
%define %%WT	%1
%define %%WTp2	%2
%define %%WTp8	%3
%define %%WTp13	%4
	; Wt = ROTL_1(Wt-3 ^ Wt-8 ^ Wt-14 ^ Wt-16)
	; Wt+16 = ROTL_1(Wt+13 ^ Wt+8 ^ Wt+2 ^ Wt)
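	; vpternlogd with immediate 0x96 is a three-input XOR, so the two
	; instructions below compute the four-way XOR before the rotate by 1.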
	vpternlogd	%%WT, %%WTp2, %%WTp8, 0x96
	vpxord		%%WT, %%WT, %%WTp13
	vprold		%%WT, %%WT, 1
%endmacro


; Note: across the 16 invocations of this macro, the next 64-byte block of
; each lane is read in 32-byte chunks, in preparation for the upcoming
; transpose used to build the message schedule.
; Each register will contain 32 bytes from one lane plus 32 bytes
; from another lane.
; The first 8 registers will contain the first 32 bytes of all lanes:
; register X (0 <= X <= 7) holds bytes 0-31 from lane X in its lower half
; and bytes 0-31 from lane X+8 in its upper half.
; The last 8 registers will contain the last 32 bytes of all lanes:
; register Y (8 <= Y <= 15) holds bytes 32-63 from lane Y-8 in its lower half
; and bytes 32-63 from lane Y in its upper half.
; This method reduces the number of shuffles required to transpose the data.
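; For example (illustrative layout only):
;   W0 = { lane0 bytes 0-31  | lane8 bytes 0-31  }
;   W8 = { lane0 bytes 32-63 | lane8 bytes 32-63 }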
%macro MSG_SCHED_ROUND_00_15 6
%define %%Wt         %1 ; [out] zmm register to load the next block
%define %%LANE_IDX   %2 ; [in] lane index (0-15)
%define %%BASE_PTR   %3 ; [in] base address of the input data
%define %%OFFSET_PTR %4 ; [in] offset to get next block of data from the lane
%define %%TMP1       %5 ; [clobbered] temporary gp register
%define %%TMP2       %6 ; [clobbered] temporary gp register
%if (%%LANE_IDX < 8)
	mov	      %%TMP1,	   [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
	mov	      %%TMP2,      [%%BASE_PTR + (%%LANE_IDX+8)*PTR_SZ]
	vmovups       YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR]
	vinserti64x4  %%Wt, %%Wt,  [%%TMP2+%%OFFSET_PTR], 0x01
%else
	mov	     %%TMP1,      [%%BASE_PTR + (%%LANE_IDX-8)*PTR_SZ]
	mov	     %%TMP2,      [%%BASE_PTR + %%LANE_IDX*PTR_SZ]
	vmovups      YWORD(%%Wt), [%%TMP1+%%OFFSET_PTR+32]
	vinserti64x4 %%Wt, %%Wt,  [%%TMP2+%%OFFSET_PTR+32], 0x01
%endif
%endmacro

align 64
; void sha1_x16_avx512(SHA1_ARGS *args, UINT64 size_in_blocks)
; arg 1 : state : pointer to SHA1 args structure
; arg 2 : SIZE  : size (in blocks) ;; assumed to be >= 1
MKGLOBAL(sha1_x16_avx512,function,internal)
sha1_x16_avx512:
	;; Initialize digests
	vmovdqu32	A, [state + 0*SHA1_DIGEST_ROW_SIZE]
	vmovdqu32	B, [state + 1*SHA1_DIGEST_ROW_SIZE]
	vmovdqu32	C, [state + 2*SHA1_DIGEST_ROW_SIZE]
	vmovdqu32	D, [state + 3*SHA1_DIGEST_ROW_SIZE]
	vmovdqu32	E, [state + 4*SHA1_DIGEST_ROW_SIZE]
	DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed digest", A, B, C, D, E
	DBGPRINTL64   "SIZE", SIZE

	xor IDX, IDX

	;; Load first blocks of data into ZMM registers before
	;; performing a 16x16 32-bit transpose.
	;; To speed up the transpose, data is loaded in chunks of 32 bytes,
	;; interleaving data between lane X and lane X+8.
	;; This way, final shuffles between top half and bottom half
	;; of the matrix are avoided.
	mov	inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
	mov	inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
	mov	inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
	mov	inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
	mov	inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
	mov	inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
	mov	inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
	mov	inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]

	TRANSPOSE16_U32_LOAD_FIRST8 W0, W1, W2,  W3,  W4,  W5,  W6,  W7, \
				    W8, W9, W10, W11, W12, W13, W14, W15, \
				    inp0, inp1, inp2, inp3, inp4, inp5, \
				    inp6, inp7, IDX

	mov	inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
	mov	inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
	mov	inp2, [state + _data_ptr_sha1 +10*PTR_SZ]
	mov	inp3, [state + _data_ptr_sha1 +11*PTR_SZ]
	mov	inp4, [state + _data_ptr_sha1 +12*PTR_SZ]
	mov	inp5, [state + _data_ptr_sha1 +13*PTR_SZ]
	mov	inp6, [state + _data_ptr_sha1 +14*PTR_SZ]
	mov	inp7, [state + _data_ptr_sha1 +15*PTR_SZ]

	TRANSPOSE16_U32_LOAD_LAST8 W0, W1, W2,  W3,  W4,  W5,  W6,  W7, \
				   W8, W9, W10, W11, W12, W13, W14, W15, \
				   inp0, inp1, inp2, inp3, inp4, inp5, \
				   inp6, inp7, IDX
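
;; Main loop: each iteration processes one 64-byte block from each of the
;; 16 lanes. The first block was preloaded above; for every block except
;; the last one, the next block is loaded during rounds 64-79.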
lloop:
	vmovdqa32	TMP2, [rel PSHUFFLE_BYTE_FLIP_MASK]

	add	IDX, 64

	TRANSPOSE16_U32_PRELOADED W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, \
                                  W11, W12, W13, W14, W15, TMP0, TMP1, TMP3, TMP4
	DBGPRINTL_ZMM "Sha1-AVX512 incoming transposed input", W0, W1, W2, W3, W4, W5, \
                                  W6, W7, W8, W9, W10, W11, W12, W13, W14, W15

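	;; Byte-swap each 32-bit word of the message: the loads above are
	;; little-endian, while SHA-1 operates on big-endian words.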
%assign I 0
%rep 16
	vpshufb	APPEND(W,I), APPEND(W,I), TMP2
%assign I (I+1)
%endrep

	; Save digests for later addition
	vmovdqa32	AA, A
	vmovdqa32	BB, B
	vmovdqa32	CC, C
	vmovdqa32	DD, D
	vmovdqa32	EE, E

	vmovdqa32	KT, [rel K00_19]
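
	;; Rounds 0-63: each PROCESS_LOOP consumes W[J] while
	;; MSG_SCHED_ROUND_16_79 overwrites it with the word needed 16 rounds
	;; later, so W0-W15 act as a circular message-schedule buffer.
	;; I is the vpternlogd immediate for Ft; J, K, L, M index Wt, Wt+2,
	;; Wt+8, Wt+13 in the recurrence; N is the round counter.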
%assign I 0xCA
%assign J 0
%assign K 2
%assign L 8
%assign M 13
%assign N 0
%rep 64
	PROCESS_LOOP  APPEND(W,J),  I
	MSG_SCHED_ROUND_16_79  APPEND(W,J), APPEND(W,K), APPEND(W,L), APPEND(W,M)
	%if N = 19
		vmovdqa32	KT, [rel K20_39]
		%assign I 0x96
	%elif N = 39
		vmovdqa32	KT, [rel K40_59]
		%assign I 0xE8
	%elif N = 59
		vmovdqa32	KT, [rel K60_79]
		%assign I 0x96
	%endif
%assign J ((J+1) % 16)
%assign K ((K+1) % 16)
%assign L ((L+1) % 16)
%assign M ((M+1) % 16)
%assign N (N+1)
%endrep

	; Check if this is the last block
	sub	SIZE, 1
	je	lastLoop

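	;; Rounds 64-79 (Parity): interleave the final 16 rounds with loading
	;; the next 64-byte block of each lane into W0-W15 (IDX already points
	;; at the next block after the "add IDX, 64" at the top of the loop).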
%assign I 0x96
%assign J 0
%rep 16
	PROCESS_LOOP  APPEND(W,J),  I
	MSG_SCHED_ROUND_00_15 APPEND(W,J), J, state + _data_ptr_sha1, IDX, inp0, inp1
%assign J (J+1)
%endrep

	; Add old digest
	vpaddd		A,A,AA
	vpaddd		B,B,BB
	vpaddd		C,C,CC
	vpaddd		D,D,DD
	vpaddd		E,E,EE

	jmp lloop

lastLoop:
; Reset the A-E name rotation to its Round 64 state: the %rep 16 block above
; was expanded at assembly time even though it is skipped at run time for the
; last block, so its net name rotation must be undone here.
%xdefine TMP_ A
%xdefine A B
%xdefine B C
%xdefine C D
%xdefine D E
%xdefine E TMP_

	; Process last 16 rounds
%assign I 0x96
%assign J 0
%rep 16
	PROCESS_LOOP  APPEND(W,J), I
%assign J (J+1)
%endrep

	; Add old digest
	vpaddd		A,A,AA
	vpaddd		B,B,BB
	vpaddd		C,C,CC
	vpaddd		D,D,DD
	vpaddd		E,E,EE

	; Write out digest
	; Digests stay in the same transposed row layout they were read in,
	; so no untranspose is needed.
	vmovdqu32	[state + 0*SHA1_DIGEST_ROW_SIZE], A
	vmovdqu32	[state + 1*SHA1_DIGEST_ROW_SIZE], B
	vmovdqu32	[state + 2*SHA1_DIGEST_ROW_SIZE], C
	vmovdqu32	[state + 3*SHA1_DIGEST_ROW_SIZE], D
	vmovdqu32	[state + 4*SHA1_DIGEST_ROW_SIZE], E
	DBGPRINTL_ZMM "Sha1-AVX512 outgoing transposed digest", A, B, C, D, E

	;; Update input pointers: advance each lane past the IDX bytes
	;; just processed.
	mov	inp0, [state + _data_ptr_sha1 + 0*PTR_SZ]
	mov	inp1, [state + _data_ptr_sha1 + 1*PTR_SZ]
	mov	inp2, [state + _data_ptr_sha1 + 2*PTR_SZ]
	mov	inp3, [state + _data_ptr_sha1 + 3*PTR_SZ]
	mov	inp4, [state + _data_ptr_sha1 + 4*PTR_SZ]
	mov	inp5, [state + _data_ptr_sha1 + 5*PTR_SZ]
	mov	inp6, [state + _data_ptr_sha1 + 6*PTR_SZ]
	mov	inp7, [state + _data_ptr_sha1 + 7*PTR_SZ]
	add	inp0, IDX
	add	inp1, IDX
	add	inp2, IDX
	add	inp3, IDX
	add	inp4, IDX
	add	inp5, IDX
	add	inp6, IDX
	add	inp7, IDX
	mov	[state + _data_ptr_sha1 + 0*PTR_SZ], inp0
	mov	[state + _data_ptr_sha1 + 1*PTR_SZ], inp1
	mov	[state + _data_ptr_sha1 + 2*PTR_SZ], inp2
	mov	[state + _data_ptr_sha1 + 3*PTR_SZ], inp3
	mov	[state + _data_ptr_sha1 + 4*PTR_SZ], inp4
	mov	[state + _data_ptr_sha1 + 5*PTR_SZ], inp5
	mov	[state + _data_ptr_sha1 + 6*PTR_SZ], inp6
	mov	[state + _data_ptr_sha1 + 7*PTR_SZ], inp7

	mov	inp0, [state + _data_ptr_sha1 + 8*PTR_SZ]
	mov	inp1, [state + _data_ptr_sha1 + 9*PTR_SZ]
	mov	inp2, [state + _data_ptr_sha1 + 10*PTR_SZ]
	mov	inp3, [state + _data_ptr_sha1 + 11*PTR_SZ]
	mov	inp4, [state + _data_ptr_sha1 + 12*PTR_SZ]
	mov	inp5, [state + _data_ptr_sha1 + 13*PTR_SZ]
	mov	inp6, [state + _data_ptr_sha1 + 14*PTR_SZ]
	mov	inp7, [state + _data_ptr_sha1 + 15*PTR_SZ]
	add	inp0, IDX
	add	inp1, IDX
	add	inp2, IDX
	add	inp3, IDX
	add	inp4, IDX
	add	inp5, IDX
	add	inp6, IDX
	add	inp7, IDX
	mov	[state + _data_ptr_sha1 + 8*PTR_SZ], inp0
	mov	[state + _data_ptr_sha1 + 9*PTR_SZ], inp1
	mov	[state + _data_ptr_sha1 + 10*PTR_SZ], inp2
	mov	[state + _data_ptr_sha1 + 11*PTR_SZ], inp3
	mov	[state + _data_ptr_sha1 + 12*PTR_SZ], inp4
	mov	[state + _data_ptr_sha1 + 13*PTR_SZ], inp5
	mov	[state + _data_ptr_sha1 + 14*PTR_SZ], inp6
	mov	[state + _data_ptr_sha1 + 15*PTR_SZ], inp7

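;; For SAFE_DATA builds, clear all ZMM registers on exit so no message or
;; digest material is left behind in vector registers.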
%ifdef SAFE_DATA
	clear_all_zmms_asm
%endif ;; SAFE_DATA

	ret

%ifdef LINUX
section .note.GNU-stack noalloc noexec nowrite progbits
%endif
