#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized in quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This, however, is highly impractical for Theta and Chi. What would help
# Theta is if x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is loaded
# into a register of its own, broadcast to all lanes. [A[0][0] is not part
# of the Pi permutation or Rho.] Digits in the variables' names denote the
# right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear
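
# The offsets above, 8*(row*4+lane), address the 32-byte-per-row transfer
# area that SHA3_absorb builds on the stack (row r starting at byte 32*r).
# The flat state passed by the caller is packed tighter: one 8-byte word
# for A[0][0] followed by six 32-byte register rows, so a word that the
# jagged map places at 8*(4*r+l) lives at 8+32*(r-1)+8*l in the flat
# state, i.e. 24 bytes lower (for rows 1..6; [0][0] itself is the lone
# word at offset 0).  SHA3_squeeze below accounts for that with its
# -120 (= -96-24) displacement.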

# But on the other hand, Chi is much better off if the y indices are
# aligned column-wise, not x. For this reason we have to shuffle the data
# prior to Chi and revert it afterwards. The pre-Chi shuffle is naturally
# merged with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
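#
# [The combined immediates quoted after "=" follow from how vpermq reads
# its imm8: bits 2k+1:2k pick the source lane for destination lane k, so
# chaining two vpermq's means looking the second immediate's fields up in
# the first.  For the first pair: 0b01110010 maps destinations 3,2,1,0 to
# sources 1,3,0,2 and 0b00011011 reverses the lanes, which composes to
# sources 2,0,3,1, i.e. the single 0b10001101 shown above.]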
#
# And reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Haswell		8.7/+10%
# Skylake		7.8/+20%
# Ryzen			17(**)
#
# (*)	Corresponds to SHA3-256. The percentage after the slash is the
#	improvement over the scalar keccak1600-x86_64.pl.
# (**)	Ryzen is expected to perform poorly, because its instruction
#	issue rate is limited to two AVX2 instructions per cycle and,
#	in addition, vpblendd is reportedly bound to a specific port.
#	Obviously this code path should not be executed on Ryzen.

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
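	# Scalar reference for what follows (Theta):
	#   C[x]     = A[0][x] ^ A[1][x] ^ A[2][x] ^ A[3][x] ^ A[4][x]
	#   D[x]     = C[x-1] ^ ROL64(C[x+1], 1)
	#   A[y][x] ^= D[x]
	# C[1..4] is accumulated across the five registers holding the
	# x=1..4 lanes, C[0] by folding the x=0 register onto itself and
	# xoring in A[0][0].  ROL64(v,1) is realized as (v >> 63) | (v + v),
	# with vpaddq standing in for a left shift by one.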
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
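	# Rho rotates every lane by its own constant, so ROL64 becomes a
	# pair of per-lane variable shifts, vpsllvq/vpsrlvq, with the counts
	# taken from the rhotates_left/rhotates_right rows addressed off
	# %r8/%r9, followed by vpor.  The interleaved vpermq's begin the
	# Pi/pre-Chi shuffle worked out in the header comment.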
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	 vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	 vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	 vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	 vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	 vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	 vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	 vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	 vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	 vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
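	# Scalar reference for what follows (Chi):
	#   A[y][x] ^= ~A[y][x+1] & A[y][x+2]
	# vpblendd gathers, for every target lane, the x+1 and x+2 lanes
	# that share its y coordinate (possible thanks to the pre-Chi
	# shuffle), vpandn forms the "not-and" term, and the later vpxor
	# instructions fold it into the shuffled state rows.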
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	 vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	 vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	 vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	 vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	 vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	 vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	 vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	 vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	 vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	  vpermq	\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	  vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	  vpermq	\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	  vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	  vpandn	@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	 vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	 vpermq		\$0b00011011,$A31,$A31	# post-Chi shuffle
	 vpermq		\$0b10001101,$A41,$A41
	 vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
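	# Iota touches only A[0][0]; the round constant comes pre-broadcast
	# to all four lanes in the iotas table, 32 bytes per round, so a
	# plain vpxor against (%r10) does the job.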
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze
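
# For orientation, the C prototypes these entry points are compiled
# against look roughly like this (a sketch only; the authoritative
# declarations live with the C callers, e.g. OpenSSL's keccak1600.c):
#
#	size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp,
#	                   size_t len, size_t bsz);
#	void SHA3_squeeze(uint64_t A[5][5], unsigned char *out,
#	                  size_t len, size_t bsz);
#
# SHA3_absorb returns the number of trailing bytes (< bsz) it could not
# absorb, which the caller is expected to buffer for the next call.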

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[0][0] and broadcast
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
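	# eax holds the block size in 8-byte words.  The first five words
	# (one broadcast for [0][0], four for [0][1..4]) were consumed
	# above; only 4 are subtracted because the unrolled loop that
	# follows decrements and tests eax before copying each further word
	# into the jagged transfer area.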
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec	%eax
	jz	.Labsorbed_avx2
	mov	8*$i-96($inp),%r8
	mov	%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx2:
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax		# return value
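	# len went negative in the final sub above, so len+bsz is the number
	# of unprocessed trailing bytes handed back to the caller.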
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	$A_jagged[0]-96($A_flat),%r8
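	# The unrolled loop below stores one state word per step in jagged
	# order; eax counts the words left in the current block, len the
	# output bytes still wanted.  Words after [0][0] sit 24 bytes below
	# their A_jagged offset in the flat state (which packs [0][0] as a
	# single 8-byte word), hence the -120 = -96-8*3 displacement used
	# further down.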
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2

.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";