#!/usr/bin/env perl
# Copyright 2017-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for AVX2.
#
# July 2017.
#
# To paraphrase Gilles Van Assche, if you contemplate Fig. 2.3 on page
# 20 of The Keccak reference [or Fig. 5 of FIPS PUB 202], and load data
# other than A[0][0] in magic order into 6 [256-bit] registers, *each
# dedicated to one axis*, the Pi permutation is reduced to intra-register
# shuffles...
#
# It makes other steps more intricate, but overall, is it a win? To be
# more specific, the index permutations organized by quadruples are:
#
#       [4][4] [3][3] [2][2] [1][1]<-+
#       [0][4] [0][3] [0][2] [0][1]<-+
#       [3][0] [1][0] [4][0] [2][0]  |
#       [4][3] [3][1] [2][4] [1][2]  |
#       [3][4] [1][3] [4][2] [2][1]  |
#       [2][3] [4][1] [1][4] [3][2]  |
#       [2][2] [4][4] [1][1] [3][3] -+
#
# This however is highly impractical for Theta and Chi. What would help
# Theta is if the x indices were aligned column-wise, or in other words:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#       [4][4] [3][3] [2][2] [1][1]
#
# So here we have it: lines not marked with vpermq() represent the magic
# order in which data is to be loaded and maintained. [And lines marked
# with vpermq() represent the Pi circular permutation in the chosen
# layout. Note that the first step is permutation-free.] A[0][0] is
# loaded into a register of its own, broadcast to all lanes. [A[0][0] is
# not part of the Pi permutation or Rho.] Digits in the variables' names
# denote the right-most coordinates:

my ($A00,	# [0][0] [0][0] [0][0] [0][0]		# %ymm0
    $A01,	# [0][4] [0][3] [0][2] [0][1]		# %ymm1
    $A20,	# [3][0] [1][0] [4][0] [2][0]		# %ymm2
    $A31,	# [2][4] [4][3] [1][2] [3][1]		# %ymm3
    $A21,	# [3][4] [1][3] [4][2] [2][1]		# %ymm4
    $A41,	# [1][4] [2][3] [3][2] [4][1]		# %ymm5
    $A11) =	# [4][4] [3][3] [2][2] [1][1]		# %ymm6
    map("%ymm$_",(0..6));

# We also need to map the magic order into offsets within the structure:

my @A_jagged = ([0,0], [1,0], [1,1], [1,2], [1,3],	# [0][0..4]
		[2,2], [6,0], [3,1], [4,2], [5,3],	# [1][0..4]
		[2,0], [4,0], [6,1], [5,2], [3,3],	# [2][0..4]
		[2,3], [3,0], [5,1], [6,2], [4,3],	# [3][0..4]
		[2,1], [5,0], [4,1], [3,2], [6,3]);	# [4][0..4]
   @A_jagged = map(8*($$_[0]*4+$$_[1]), @A_jagged);	# ... and now linear

# But on the other hand Chi is much better off if the y indices were
# aligned column-wise, not the x. For this reason we have to shuffle the
# data prior to Chi and revert it afterwards.
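# [A note on reading the vpermq() annotations above and below: destination
# lane i of the result takes source lane (imm >> 2*i) & 3, lane 0 being
# the right-most column in these diagrams.]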
# The pre-Chi shuffle is naturally merged with Pi itself:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([4][3] [3][1] [2][4] [1][2], 0b01110010)
#vpermq([2][4] [4][3] [1][2] [3][1], 0b00011011) = 0b10001101
#       [3][1] [1][2] [4][3] [2][4]
#vpermq([4][2] [3][4] [2][1] [1][3], 0b10001101)
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = 0b10001101
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([2][3] [4][1] [1][4] [3][2], 0b01110010)
#vpermq([1][4] [2][3] [3][2] [4][1], 0b01110010) = 0b00011011
#       [3][2] [1][4] [4][1] [2][3]
#vpermq([1][1] [2][2] [3][3] [4][4], 0b00011011)
#vpermq([4][4] [3][3] [2][2] [1][1], 0b10001101) = 0b01110010
#       [3][3] [1][1] [4][4] [2][2]
#
# And the reverse post-Chi permutation:
#
#       [0][4] [0][3] [0][2] [0][1]
#       [3][0] [1][0] [4][0] [2][0]
#vpermq([3][1] [1][2] [4][3] [2][4], 0b00011011)
#       [2][4] [4][3] [1][2] [3][1]
#vpermq([3][4] [1][3] [4][2] [2][1], 0b11100100) = nop :-)
#       [3][4] [1][3] [4][2] [2][1]
#vpermq([3][2] [1][4] [4][1] [2][3], 0b10001101)
#       [1][4] [2][3] [3][2] [4][1]
#vpermq([3][3] [1][1] [4][4] [2][2], 0b01110010)
#       [4][4] [3][3] [2][2] [1][1]
#
########################################################################
# Numbers are cycles per processed byte out of a large message.
#
#			r=1088(*)
#
# Haswell		8.7/+10%
# Skylake		7.8/+20%
# Ryzen			17(**)
#
# (*)	Corresponds to SHA3-256. The percentage after the slash is the
#	improvement relative to the scalar keccak1600-x86_64.pl module.
# (**)	Ryzen is expected to perform poorly, because its instruction
#	issue rate is limited to two AVX2 instructions per cycle and,
#	in addition, vpblendd is reportedly bound to a specific port.
#	Obviously this code path should not be executed on Ryzen.
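
# As a reading aid for the absorb/squeeze code below: the element labelled
# [i][j] in the @A_jagged table above lives at the following byte offset
# within the 7x4 transfer area. [Illustrative helper only; it is not
# referenced by the code generator.]
sub A_jagged_offset {
    my ($i, $j) = @_;			# indices as labelled in @A_jagged
    return $A_jagged[5*$i + $j];	# linear byte offset, e.g. [0][0] -> 0
}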

my @T = map("%ymm$_",(7..15));
my ($C14,$C00,$D00,$D14) = @T[5..8];

$code.=<<___;
.text

.type	__KeccakF1600,\@function
.align	32
__KeccakF1600:
	lea		rhotates_left+96(%rip),%r8
	lea		rhotates_right+96(%rip),%r9
	lea		iotas(%rip),%r10
	mov		\$24,%eax
	jmp		.Loop_avx2

.align	32
.Loop_avx2:
	######################################### Theta
	vpshufd		\$0b01001110,$A20,$C00
	vpxor		$A31,$A41,$C14
	vpxor		$A11,$A21,@T[2]
	vpxor		$A01,$C14,$C14
	vpxor		@T[2],$C14,$C14		# C[1..4]

	vpermq		\$0b10010011,$C14,@T[4]
	vpxor		$A20,$C00,$C00
	vpermq		\$0b01001110,$C00,@T[0]

	vpsrlq		\$63,$C14,@T[1]
	vpaddq		$C14,$C14,@T[2]
	vpor		@T[2],@T[1],@T[1]	# ROL64(C[1..4],1)

	vpermq		\$0b00111001,@T[1],$D14
	vpxor		@T[4],@T[1],$D00
	vpermq		\$0b00000000,$D00,$D00	# D[0..0] = ROL64(C[1],1) ^ C[4]

	vpxor		$A00,$C00,$C00
	vpxor		@T[0],$C00,$C00		# C[0..0]

	vpsrlq		\$63,$C00,@T[0]
	vpaddq		$C00,$C00,@T[1]
	vpor		@T[0],@T[1],@T[1]	# ROL64(C[0..0],1)

	vpxor		$D00,$A20,$A20		# ^= D[0..0]
	vpxor		$D00,$A00,$A00		# ^= D[0..0]

	vpblendd	\$0b11000000,@T[1],$D14,$D14
	vpblendd	\$0b00000011,$C00,@T[4],@T[4]
	vpxor		@T[4],$D14,$D14		# D[1..4] = ROL64(C[2..4,0],1) ^ C[0..3]

	######################################### Rho + Pi + pre-Chi shuffle
	vpsllvq		0*32-96(%r8),$A20,@T[3]
	vpsrlvq		0*32-96(%r9),$A20,$A20
	vpor		@T[3],$A20,$A20

	vpxor		$D14,$A31,$A31		# ^= D[1..4] from Theta
	vpsllvq		2*32-96(%r8),$A31,@T[4]
	vpsrlvq		2*32-96(%r9),$A31,$A31
	vpor		@T[4],$A31,$A31

	vpxor		$D14,$A21,$A21		# ^= D[1..4] from Theta
	vpsllvq		3*32-96(%r8),$A21,@T[5]
	vpsrlvq		3*32-96(%r9),$A21,$A21
	vpor		@T[5],$A21,$A21

	vpxor		$D14,$A41,$A41		# ^= D[1..4] from Theta
	vpsllvq		4*32-96(%r8),$A41,@T[6]
	vpsrlvq		4*32-96(%r9),$A41,$A41
	vpor		@T[6],$A41,$A41

	vpxor		$D14,$A11,$A11		# ^= D[1..4] from Theta
	vpermq		\$0b10001101,$A20,@T[3]	# $A20 -> future $A31
	vpermq		\$0b10001101,$A31,@T[4]	# $A31 -> future $A21
	vpsllvq		5*32-96(%r8),$A11,@T[7]
	vpsrlvq		5*32-96(%r9),$A11,@T[1]
	vpor		@T[7],@T[1],@T[1]	# $A11 -> future $A01

	vpxor		$D14,$A01,$A01		# ^= D[1..4] from Theta
	vpermq		\$0b00011011,$A21,@T[5]	# $A21 -> future $A41
	vpermq		\$0b01110010,$A41,@T[6]	# $A41 -> future $A11
	vpsllvq		1*32-96(%r8),$A01,@T[8]
	vpsrlvq		1*32-96(%r9),$A01,@T[2]
	vpor		@T[8],@T[2],@T[2]	# $A01 -> future $A20

	######################################### Chi
	vpsrldq		\$8,@T[1],@T[7]
	vpandn		@T[7],@T[1],@T[0]	# tgting  [0][0] [0][0] [0][0] [0][0]

	vpblendd	\$0b00001100,@T[6],@T[2],$A31	#               [4][4] [2][0]
	vpblendd	\$0b00001100,@T[2],@T[4],@T[8]	#               [4][0] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[3],$A41	#               [4][2] [2][4]
	vpblendd	\$0b00001100,@T[3],@T[2],@T[7]	#               [4][3] [2][0]
	vpblendd	\$0b00110000,@T[4],$A31,$A31	#        [1][3] [4][4] [2][0]
	vpblendd	\$0b00110000,@T[5],@T[8],@T[8]	#        [1][4] [4][0] [2][1]
	vpblendd	\$0b00110000,@T[2],$A41,$A41	#        [1][0] [4][2] [2][4]
	vpblendd	\$0b00110000,@T[6],@T[7],@T[7]	#        [1][1] [4][3] [2][0]
	vpblendd	\$0b11000000,@T[5],$A31,$A31	# [3][2] [1][3] [4][4] [2][0]
	vpblendd	\$0b11000000,@T[6],@T[8],@T[8]	# [3][3] [1][4] [4][0] [2][1]
	vpblendd	\$0b11000000,@T[6],$A41,$A41	# [3][3] [1][0] [4][2] [2][4]
	vpblendd	\$0b11000000,@T[4],@T[7],@T[7]	# [3][4] [1][1] [4][3] [2][0]
	vpandn		@T[8],$A31,$A31		# tgting  [3][1] [1][2] [4][3] [2][4]
	vpandn		@T[7],$A41,$A41		# tgting  [3][2] [1][4] [4][1] [2][3]
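
	# Chi, continued: assemble the remaining rows the same way, fold in
	# the Rho+Pi results, and undo the pre-Chi shuffle further below.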

	vpblendd	\$0b00001100,@T[2],@T[5],$A11	#               [4][0] [2][3]
	vpblendd	\$0b00001100,@T[5],@T[3],@T[8]	#               [4][1] [2][4]
	 vpxor		@T[3],$A31,$A31
	vpblendd	\$0b00110000,@T[3],$A11,$A11	#        [1][2] [4][0] [2][3]
	vpblendd	\$0b00110000,@T[4],@T[8],@T[8]	#        [1][3] [4][1] [2][4]
	 vpxor		@T[5],$A41,$A41
	vpblendd	\$0b11000000,@T[4],$A11,$A11	# [3][4] [1][2] [4][0] [2][3]
	vpblendd	\$0b11000000,@T[2],@T[8],@T[8]	# [3][0] [1][3] [4][1] [2][4]
	vpandn		@T[8],$A11,$A11		# tgting  [3][3] [1][1] [4][4] [2][2]
	vpxor		@T[6],$A11,$A11

	vpermq		\$0b00011110,@T[1],$A21		# [0][1] [0][2] [0][4] [0][3]
	vpblendd	\$0b00110000,$A00,$A21,@T[8]	# [0][1] [0][0] [0][4] [0][3]
	vpermq		\$0b00111001,@T[1],$A01		# [0][1] [0][4] [0][3] [0][2]
	vpblendd	\$0b11000000,$A00,$A01,$A01	# [0][0] [0][4] [0][3] [0][2]
	vpandn		@T[8],$A01,$A01		# tgting  [0][4] [0][3] [0][2] [0][1]

	vpblendd	\$0b00001100,@T[5],@T[4],$A20	#               [4][1] [2][1]
	vpblendd	\$0b00001100,@T[4],@T[6],@T[7]	#               [4][2] [2][2]
	vpblendd	\$0b00110000,@T[6],$A20,$A20	#        [1][1] [4][1] [2][1]
	vpblendd	\$0b00110000,@T[3],@T[7],@T[7]	#        [1][2] [4][2] [2][2]
	vpblendd	\$0b11000000,@T[3],$A20,$A20	# [3][1] [1][1] [4][1] [2][1]
	vpblendd	\$0b11000000,@T[5],@T[7],@T[7]	# [3][2] [1][2] [4][2] [2][2]
	vpandn		@T[7],$A20,$A20		# tgting  [3][0] [1][0] [4][0] [2][0]
	vpxor		@T[2],$A20,$A20

	vpermq		\$0b00000000,@T[0],@T[0]	# [0][0] [0][0] [0][0] [0][0]
	vpermq		\$0b00011011,$A31,$A31		# post-Chi shuffle
	vpermq		\$0b10001101,$A41,$A41
	vpermq		\$0b01110010,$A11,$A11

	vpblendd	\$0b00001100,@T[3],@T[6],$A21	#               [4][3] [2][2]
	vpblendd	\$0b00001100,@T[6],@T[5],@T[7]	#               [4][4] [2][3]
	vpblendd	\$0b00110000,@T[5],$A21,$A21	#        [1][4] [4][3] [2][2]
	vpblendd	\$0b00110000,@T[2],@T[7],@T[7]	#        [1][0] [4][4] [2][3]
	vpblendd	\$0b11000000,@T[2],$A21,$A21	# [3][0] [1][4] [4][3] [2][2]
	vpblendd	\$0b11000000,@T[3],@T[7],@T[7]	# [3][1] [1][0] [4][4] [2][3]
	vpandn		@T[7],$A21,$A21		# tgting  [3][4] [1][3] [4][2] [2][1]

	vpxor		@T[0],$A00,$A00
	vpxor		@T[1],$A01,$A01
	vpxor		@T[4],$A21,$A21

	######################################### Iota
	vpxor		(%r10),$A00,$A00
	lea		32(%r10),%r10

	dec		%eax
	jnz		.Loop_avx2

	ret
.size	__KeccakF1600,.-__KeccakF1600
___
my ($A_flat,$inp,$len,$bsz) = ("%rdi","%rsi","%rdx","%rcx");
my  $out = $inp;	# in squeeze

$code.=<<___;
.globl	SHA3_absorb
.type	SHA3_absorb,\@function
.align	32
SHA3_absorb:
	mov	%rsp,%r11

	lea	-240(%rsp),%rsp
	and	\$-32,%rsp

	lea	96($A_flat),$A_flat
	lea	96($inp),$inp
	lea	96(%rsp),%r10

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00	# load A[5][5]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	vpxor		@T[0],@T[0],@T[0]
	vmovdqa		@T[0],32*2-96(%r10)	# zero transfer area on stack
	vmovdqa		@T[0],32*3-96(%r10)
	vmovdqa		@T[0],32*4-96(%r10)
	vmovdqa		@T[0],32*5-96(%r10)
	vmovdqa		@T[0],32*6-96(%r10)

.Loop_absorb_avx2:
	mov		$bsz,%rax
	sub		$bsz,$len
	jc		.Ldone_absorb_avx2

	shr		\$3,%eax
	vpbroadcastq	0-96($inp),@T[0]
	vmovdqu		8-96($inp),@T[1]
	sub		\$4,%eax
___
for(my $i=5; $i<25; $i++) {
$code.=<<___
	dec		%eax
	jz		.Labsorbed_avx2
	mov		8*$i-96($inp),%r8
	mov		%r8,$A_jagged[$i]-96(%r10)
___
}
$code.=<<___;
.Labsorbed_avx2:
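	# advance the input pointer, xor the gathered block into the state
	# and apply the permutation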
	lea	($inp,$bsz),$inp

	vpxor	@T[0],$A00,$A00
	vpxor	@T[1],$A01,$A01
	vpxor	32*2-96(%r10),$A20,$A20
	vpxor	32*3-96(%r10),$A31,$A31
	vpxor	32*4-96(%r10),$A21,$A21
	vpxor	32*5-96(%r10),$A41,$A41
	vpxor	32*6-96(%r10),$A11,$A11

	call	__KeccakF1600

	lea	96(%rsp),%r10
	jmp	.Loop_absorb_avx2

.Ldone_absorb_avx2:
	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	vzeroupper

	lea	(%r11),%rsp
	lea	($len,$bsz),%rax	# return value
	ret
.size	SHA3_absorb,.-SHA3_absorb

.globl	SHA3_squeeze
.type	SHA3_squeeze,\@function
.align	32
SHA3_squeeze:
	mov	%rsp,%r11

	lea	96($A_flat),$A_flat
	shr	\$3,$bsz

	vzeroupper

	vpbroadcastq	-96($A_flat),$A00
	vpxor		@T[0],@T[0],@T[0]
	vmovdqu		8+32*0-96($A_flat),$A01
	vmovdqu		8+32*1-96($A_flat),$A20
	vmovdqu		8+32*2-96($A_flat),$A31
	vmovdqu		8+32*3-96($A_flat),$A21
	vmovdqu		8+32*4-96($A_flat),$A41
	vmovdqu		8+32*5-96($A_flat),$A11

	mov	$bsz,%rax

.Loop_squeeze_avx2:
	mov	@A_jagged[$i]-96($A_flat),%r8
___
for (my $i=0; $i<25; $i++) {
$code.=<<___;
	sub	\$8,$len
	jc	.Ltail_squeeze_avx2
	mov	%r8,($out)
	lea	8($out),$out
	je	.Ldone_squeeze_avx2
	dec	%eax
	je	.Lextend_output_avx2
	mov	@A_jagged[$i+1]-120($A_flat),%r8
___
}
$code.=<<___;
.Lextend_output_avx2:
	call	__KeccakF1600

	vmovq	%xmm0,-96($A_flat)
	vmovdqu	$A01,8+32*0-96($A_flat)
	vmovdqu	$A20,8+32*1-96($A_flat)
	vmovdqu	$A31,8+32*2-96($A_flat)
	vmovdqu	$A21,8+32*3-96($A_flat)
	vmovdqu	$A41,8+32*4-96($A_flat)
	vmovdqu	$A11,8+32*5-96($A_flat)

	mov	$bsz,%rax
	jmp	.Loop_squeeze_avx2


.Ltail_squeeze_avx2:
	add	\$8,$len
.Loop_tail_avx2:
	mov	%r8b,($out)
	lea	1($out),$out
	shr	\$8,%r8
	dec	$len
	jnz	.Loop_tail_avx2

.Ldone_squeeze_avx2:
	vzeroupper

	lea	(%r11),%rsp
	ret
.size	SHA3_squeeze,.-SHA3_squeeze

.align	64
rhotates_left:
	.quad	3,	18,	36,	41	# [2][0] [4][0] [1][0] [3][0]
	.quad	1,	62,	28,	27	# [0][1] [0][2] [0][3] [0][4]
	.quad	45,	6,	56,	39	# [3][1] [1][2] [4][3] [2][4]
	.quad	10,	61,	55,	8	# [2][1] [4][2] [1][3] [3][4]
	.quad	2,	15,	25,	20	# [4][1] [3][2] [2][3] [1][4]
	.quad	44,	43,	21,	14	# [1][1] [2][2] [3][3] [4][4]
rhotates_right:
	.quad	64-3,	64-18,	64-36,	64-41
	.quad	64-1,	64-62,	64-28,	64-27
	.quad	64-45,	64-6,	64-56,	64-39
	.quad	64-10,	64-61,	64-55,	64-8
	.quad	64-2,	64-15,	64-25,	64-20
	.quad	64-44,	64-43,	64-21,	64-14
iotas:
	.quad	0x0000000000000001, 0x0000000000000001, 0x0000000000000001, 0x0000000000000001
	.quad	0x0000000000008082, 0x0000000000008082, 0x0000000000008082, 0x0000000000008082
	.quad	0x800000000000808a, 0x800000000000808a, 0x800000000000808a, 0x800000000000808a
	.quad	0x8000000080008000, 0x8000000080008000, 0x8000000080008000, 0x8000000080008000
	.quad	0x000000000000808b, 0x000000000000808b, 0x000000000000808b, 0x000000000000808b
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008009, 0x8000000000008009, 0x8000000000008009, 0x8000000000008009
	.quad	0x000000000000008a, 0x000000000000008a, 0x000000000000008a, 0x000000000000008a
	.quad	0x0000000000000088, 0x0000000000000088, 0x0000000000000088, 0x0000000000000088
	.quad	0x0000000080008009, 0x0000000080008009, 0x0000000080008009, 0x0000000080008009
	.quad	0x000000008000000a, 0x000000008000000a, 0x000000008000000a, 0x000000008000000a
	.quad	0x000000008000808b, 0x000000008000808b, 0x000000008000808b, 0x000000008000808b
	.quad	0x800000000000008b, 0x800000000000008b, 0x800000000000008b, 0x800000000000008b
	.quad	0x8000000000008089, 0x8000000000008089, 0x8000000000008089, 0x8000000000008089
	.quad	0x8000000000008003, 0x8000000000008003, 0x8000000000008003, 0x8000000000008003
	.quad	0x8000000000008002, 0x8000000000008002, 0x8000000000008002, 0x8000000000008002
	.quad	0x8000000000000080, 0x8000000000000080, 0x8000000000000080, 0x8000000000000080
	.quad	0x000000000000800a, 0x000000000000800a, 0x000000000000800a, 0x000000000000800a
	.quad	0x800000008000000a, 0x800000008000000a, 0x800000008000000a, 0x800000008000000a
	.quad	0x8000000080008081, 0x8000000080008081, 0x8000000080008081, 0x8000000080008081
	.quad	0x8000000000008080, 0x8000000000008080, 0x8000000000008080, 0x8000000000008080
	.quad	0x0000000080000001, 0x0000000080000001, 0x0000000080000001, 0x0000000080000001
	.quad	0x8000000080008008, 0x8000000080008008, 0x8000000080008008, 0x8000000080008008

.asciz	"Keccak-1600 absorb and squeeze for AVX2, CRYPTOGAMS by <appro\@openssl.org>"
___

$output=pop and open STDOUT,">$output";
print $code;
close STDOUT or die "error closing STDOUT: $!";