#!/usr/bin/env perl
# Copyright 2017-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX512VL.
#
# December 2017.
#
# This is an adaptation of AVX2 module that reuses register data
# layout, but utilizes new 256-bit AVX512VL instructions. See AVX2
# module for further information on layout.
#
########################################################################
# Numbers are cycles per processed byte out of large message.
#
#			r=1088(*)
#
# Skylake-X		6.4/+47%
#
# (*)	Corresponds to SHA3-256. Percentage after slash is improvement
#	coefficient in comparison to scalar keccak1600-x86_64.pl.

# Digits in variables' names denote right-most coordinates:
#
# Each state register holds four 64-bit lanes; the bracketed pairs list
# the A[y][x] lane kept in each slot, highest slot first.  $A00 carries
# the single lane A[0][0] replicated into all four slots.

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within structure:
#
# Entry 5*y+x describes lane A[y][x] as [register number, slot within
# that register].  The second statement flattens each pair into a byte
# offset, 8*(4*register + slot), i.e. the position the lane would have
# if %ymm0..%ymm6 were stored back to back as seven 32-byte rows.

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($_->[0]*4+$_->[1]), @A_jagged);	# ... and now linear

# Scratch registers %ymm7..%ymm15.  The Theta intermediates are aliases
# of @T[5..8] (%ymm12..%ymm15), so those four registers are shared
# between the named Theta values and the later @T[5..8] uses.
my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];
# Per-register rotation counts for Rho live in %ymm16..%ymm21, one
# register per state register other than $A00.
my ($R20,$R01,$R31,$R21,$R41,$R11) = map("%ymm$_",(16..21));

# __KeccakF1600: internal permutation routine.
#
# Operates entirely on registers: the state is expected in $A00..$A11
# and the Rho rotation counts in $R20..$R11 (both are loaded by the
# callers below).  Runs 24 rounds (counter in %eax), each consisting of
# the Theta, Rho+Pi, Chi and Iota sections marked in the assembly.
# %r10 walks the iotas table in 32-byte steps, one row per round.
# Clobbers %eax, %r10 and the @T scratch registers.
$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx512vl

.align	32
.Loop_avx512vl:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpternlogq	\$0x96,$A01,@T[2],$C14	# C[1..4]

	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpermq		\$0b10010011,$C14,@T[4]
	vprolq		\$1,$C14,@T[1]		# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpternlogq	\$0x96,@T[0],$A00,$C00	# C[0..0]
	vprolq		\$1,$C00,@T[1]		# ROL64(C[0..0],1)

	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[0]

	######################################### Rho + Pi + pre-Chi shuffle
	 vpxor		$D00,$A20,$A20		# ^= D[0..0] from Theta
	vprolvq		$R20,$A20,$A20

	 vpternlogq	\$0x96,@T[0],$D14,$A31	# ^= D[1..4] from Theta
	vprolvq		$R31,$A31,$A31

	 vpternlogq	\$0x96,@T[0],$D14,$A21	# ^= D[1..4] from Theta
	vprolvq		$R21,$A21,$A21

	 vpternlogq	\$0x96,@T[0],$D14,$A41	# ^= D[1..4] from Theta
	vprolvq		$R41,$A41,$A41

	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	 vpternlogq	\$0x96,@T[0],$D14,$A11	# ^= D[1..4] from Theta
	vprolvq		$R11,$A11,@T[1]		# $A11 -> future $A01

	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	 vpternlogq	\$0x96,@T[0],$D14,$A01	# ^= D[1..4] from Theta
	vprolvq		$R01,$A01,@T[2]		# $A01 -> future $A20

	######################################### Chi
	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpternlogq	\$0xC6,@T[8],@T[3],$A31		# [3][1] [1][2] [4][3] [2][4]
	 vpternlogq	\$0xC6,@T[7],@T[5],$A41		# [3][2] [1][4] [4][1] [2][3]

	vpsrldq		\$8,@T[1],@T[0]
	vpandn		@T[0],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpternlogq	\$0xC6,@T[8],@T[6],$A11		# [3][3] [1][1] [4][4] [2][2]

	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpternlogq	\$0xC6,@T[7],@T[2],$A20		# [3][0] [1][0] [4][0] [2][0]

	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	 vpermq		\$0b10001101,$A41,$A41
	 vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]

	vpternlogq	\$0xC6,@T[8],@T[1],$A01		# [0][4] [0][3] [0][2] [0][1]
	vpternlogq	\$0xC6,@T[7],@T[4],$A21		# [3][4] [1][3] [4][2] [2][1]

	######################################### Iota
	vpternlogq	\$0x96,(%r10),@T[0],$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx512vl

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
# Argument registers of SHA3_absorb/SHA3_squeeze.  %rdi, %rsi, %rdx and
# %rcx are the first four integer argument registers of the System V
# AMD64 calling convention; presumably they carry the state pointer,
# the data pointer, the byte length and the block (rate) size in that
# order -- confirm against the C prototypes.
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

# SHA3_absorb, part 1: prologue and start of the per-block loop.
#
# Saves the incoming stack pointer in %r11, carves out a 32-byte aligned
# scratch area, biases $A_flat/$inp/%r10 by +96 (every later
# displacement subtracts 96 again), loads the state and the rotation
# counts into registers and zeroes the stack transfer area once.  Each
# loop pass stops when fewer than $bsz bytes remain; otherwise lane 0 is
# broadcast into @T[0], lanes 1..4 are loaded into @T[1], and %eax is
# left holding ($bsz/8 - 4) for the unrolled copy emitted next.
$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10
	lea	rhotates_left(%rip),%r8

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx512vl:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx512vl

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
# SHA3_absorb, part 2: unrolled copy of input lanes 5..24 into the stack
# transfer area.  Each step first decrements the remaining-lane counter
# in %eax and leaves for .Labsorved_avx512vl once the block is consumed;
# otherwise it moves one 64-bit lane from the input to its "jagged"
# position so that the area can be XORed into the state registers with
# plain 256-bit loads.
for my $i (5 .. 24) {
$code.=<<___;
	dec	%eax
	jz	.Labsorved_avx512vl
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
# SHA3_absorb, part 3, followed by the SHA3_squeeze prologue.
#
# Absorb: advances $inp by one block, XORs the block (lane 0 from @T[0],
# lanes 1..4 from @T[1], the rest from the stack transfer area) into the
# state, permutes and loops.  On exit the state registers are written
# back to $A_flat, the stack pointer is restored from %r11 and the
# number of unprocessed bytes is returned in %rax.
#
# Squeeze: loads the state and rotation counts, converts $bsz to a lane
# count and preloads lane 0 into %r8 for the unrolled store sequence
# emitted next.  A zero-length request exits immediately; without that
# check the byte tail loop would store a byte and then decrement a
# zero counter.
$code.=<<___;
.Labsorved_avx512vl:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx512vl

.Ldone_absorb_avx512vl:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	test	$len,$len		# nothing requested?
	jz	.Ldone_squeeze_avx512vl

	lea	96($A_flat),$A_flat
	lea	rhotates_left(%rip),%r8
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vmovdqa64	0*32(%r8),$R20		# load "rhotate" indices
	vmovdqa64	1*32(%r8),$R01
	vmovdqa64	2*32(%r8),$R31
	vmovdqa64	3*32(%r8),$R21
	vmovdqa64	4*32(%r8),$R41
	vmovdqa64	5*32(%r8),$R11

	mov	$bsz,%rax

.Loop_squeeze_avx512vl:
	mov	$A_jagged[0]-96($A_flat),%r8
___
# SHA3_squeeze, part 2: unrolled output of lanes 0..24.
#
# Each step expects the current lane in %r8.  It branches to the byte
# tail when fewer than 8 bytes are still wanted, stores the lane,
# finishes when the request is exactly satisfied, and branches to
# .Lextend_output_avx512vl once the block's lane budget in %eax is used
# up.  Otherwise it preloads the next lane.  For every lane but [0][0]
# the flat offset equals the jagged offset minus 24, which together with
# the +96 bias on $A_flat gives the -120 displacement.
#
# No preload is emitted after the last lane: there is no 26th lane, and
# an empty interpolation there would produce a load from below the
# state buffer.  Control then falls through to the extend-output code.
for my $i (0 .. 24) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx512vl
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx512vl
	dec	%eax
	je	.Lextend_output_avx512vl
___
next if $i == 24;
$code.=<<___;
	mov	$A_jagged[$i+1]-120($A_flat),%r8
___
}
# SHA3_squeeze, part 3, and the read-only tables.
#
# .Lextend_output runs the permutation, writes the state registers back
# to $A_flat (the store sequence above reads lanes from memory), reloads
# the lane budget and restarts the store sequence.  The tail loop emits
# the low bytes of %r8 one at a time for a final partial lane.
#
# rhotates_left holds one row of four rotation counts per state register
# in the order the callers load them into $R20..$R11.  iotas holds the
# 24 round constants, each replicated four times so that a full 256-bit
# row can be combined with the replicated $A00.
$code.=<<___;
.Lextend_output_avx512vl:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx512vl


.Ltail_squeeze_avx512vl:
	add	\$8,$len
.Loop_tail_avx512vl:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx512vl

.Ldone_squeeze_avx512vl:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.section .rodata
.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX512VL, CRYPTOGAMS by <appro\@openssl.org>"
___

# Emit the generated assembly.  The last command-line argument, when
# present and true, names the output file; otherwise the text goes to
# the inherited STDOUT.  The three-argument open keeps characters in the
# file name from being read as an open mode, and a failed open is
# reported instead of silently writing to the inherited STDOUT.
$output = pop;
if ($output) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}
print $code;
close STDOUT or die "error closing STDOUT: $!";