#!/usr/bin/env perl
# Copyright 2017-2018 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# Keccak-1600 for ARMv8.
#
# June 2017.
#
# This is straightforward KECCAK_1X_ALT implementation. It makes no
# sense to attempt SIMD/NEON implementation for following reason.
# 64-bit lanes of vector registers can't be addressed as easily as in
# 32-bit mode. This means that 64-bit NEON is bound to be slower than
# 32-bit NEON, and this implementation is faster than 32-bit NEON on
# same processor. Even though it takes more scalar xor's and andn's,
# it gets compensated by availability of rotate. Not to forget that
# most processors achieve higher issue rate with scalar instructions.
#
# February 2018.
#
# Add hardware-assisted ARMv8.2 implementation. It's KECCAK_1X_ALT
# variant with register permutation/rotation twist that allows to
# eliminate copies to temporary registers. If you look closely you'll
# notice that it uses only one lane of vector registers. The new
# instructions effectively facilitate parallel hashing, which we don't
# support [yet?]. But lowest-level core procedure is prepared for it.
# The inner round is 67 [vector] instructions, so it's not actually
# obvious that it will provide performance improvement [in serial
# hash] as long as vector instructions issue rate is limited to 1 per
# cycle...
#
######################################################################
# Numbers are cycles per processed byte.
#
#		r=1088(*)
#
# Cortex-A53	13
# Cortex-A57	12
# X-Gene	14
# Mongoose	10
# Kryo		12
# Denver	7.8
# Apple A7	7.2
#
# (*)	Corresponds to SHA3-256. No improvement coefficients are listed
#	because they vary too much from compiler to compiler. Newer
#	compiler does much better and improvement varies from 5% on
#	Cortex-A57 to 25% on Cortex-A53. While in comparison to older
#	compiler this code is at least 2x faster...

# Command line: <flavour> <output>; both are passed through verbatim to
# arm-xlate.pl, which receives the generated code on its standard input.
$flavour = shift;
$output  = shift;

# Locate arm-xlate.pl either next to this script or in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator. Fail loudly if the translator cannot be started.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Rotation amounts for the Rho step, indexed the same way as @A below,
# i.e. $rhotates[i][j] belongs to lane $A[i][j].
my @rhotates = ([  0,  1, 62, 28, 27 ],
                [ 36, 44,  6, 55, 20 ],
                [  3, 10, 43, 25, 39 ],
                [ 41, 45, 15, 21,  8 ],
                [ 18,  2, 61, 56, 14 ]);

# Round constant table. It is placed 64 bytes past a 256-byte boundary
# and is 24*8 = 192 bytes long, so the address just past the last entry
# has its low 8 bits clear; KeccakF1600_int relies on that (tst ...,#255)
# to detect the end of the 24 rounds without a separate counter.
$code.=<<___;
.text

.align 8	// strategic alignment and padding that allows to use
		// address value as loop termination condition...
	.quad	0,0,0,0,0,0,0,0
.type	iotas,%object
iotas:
	.quad	0x0000000000000001
	.quad	0x0000000000008082
	.quad	0x800000000000808a
	.quad	0x8000000080008000
	.quad	0x000000000000808b
	.quad	0x0000000080000001
	.quad	0x8000000080008081
	.quad	0x8000000000008009
	.quad	0x000000000000008a
	.quad	0x0000000000000088
	.quad	0x0000000080008009
	.quad	0x000000008000000a
	.quad	0x000000008000808b
	.quad	0x800000000000008b
	.quad	0x8000000000008089
	.quad	0x8000000000008003
	.quad	0x8000000000008002
	.quad	0x8000000000000080
	.quad	0x000000000000800a
	.quad	0x800000008000000a
	.quad	0x8000000080008081
	.quad	0x8000000000008080
	.quad	0x0000000080000001
	.quad	0x8000000080008008
.size	iotas,.-iotas
___
						{{{
######################################################################
# Scalar implementation.
#
# @A maps the 25 state lanes onto general-purpose registers x0..x24,
# with x25 standing in for x18. @C names the four scratch registers.
my @A = map([ "x$_", "x".($_+1), "x".($_+2), "x".($_+3), "x".($_+4) ],
            (0, 5, 10, 15, 20));
   $A[3][3] = "x25"; # x18 is reserved

my @C = map("x$_", (26,27,28,30));

$code.=<<___;
.type	KeccakF1600_int,%function
.align	5
KeccakF1600_int:
	adr	$C[2],iotas
	stp	$C[2],x30,[sp,#16]		// 32 bytes on top are mine
	b	.Loop
.align	4
.Loop:
	////////////////////////////////////////// Theta
	eor	$C[0],$A[0][0],$A[1][0]
	stp	$A[0][4],$A[1][4],[sp,#0]	// offload pair...
	eor	$C[1],$A[0][1],$A[1][1]
	eor	$C[2],$A[0][2],$A[1][2]
	eor	$C[3],$A[0][3],$A[1][3]
___
	# A[0][4] and A[1][4] were just saved on the stack, so their
	# registers double as two extra temporaries until the reload below.
	$C[4]=$A[0][4];
	$C[5]=$A[1][4];
$code.=<<___;
	eor	$C[4],$A[0][4],$A[1][4]
	eor	$C[0],$C[0],$A[2][0]
	eor	$C[1],$C[1],$A[2][1]
	eor	$C[2],$C[2],$A[2][2]
	eor	$C[3],$C[3],$A[2][3]
	eor	$C[4],$C[4],$A[2][4]
	eor	$C[0],$C[0],$A[3][0]
	eor	$C[1],$C[1],$A[3][1]
	eor	$C[2],$C[2],$A[3][2]
	eor	$C[3],$C[3],$A[3][3]
	eor	$C[4],$C[4],$A[3][4]
	eor	$C[0],$C[0],$A[4][0]
	eor	$C[2],$C[2],$A[4][2]
	eor	$C[1],$C[1],$A[4][1]
	eor	$C[3],$C[3],$A[4][3]
	eor	$C[4],$C[4],$A[4][4]

	eor	$C[5],$C[0],$C[2],ror#63

	eor	$A[0][1],$A[0][1],$C[5]
	eor	$A[1][1],$A[1][1],$C[5]
	eor	$A[2][1],$A[2][1],$C[5]
	eor	$A[3][1],$A[3][1],$C[5]
	eor	$A[4][1],$A[4][1],$C[5]

	eor	$C[5],$C[1],$C[3],ror#63
	eor	$C[2],$C[2],$C[4],ror#63
	eor	$C[3],$C[3],$C[0],ror#63
	eor	$C[4],$C[4],$C[1],ror#63

	eor	$C[1],   $A[0][2],$C[5]		// mov	$C[1],$A[0][2]
	eor	$A[1][2],$A[1][2],$C[5]
	eor	$A[2][2],$A[2][2],$C[5]
	eor	$A[3][2],$A[3][2],$C[5]
	eor	$A[4][2],$A[4][2],$C[5]

	eor	$A[0][0],$A[0][0],$C[4]
	eor	$A[1][0],$A[1][0],$C[4]
	eor	$A[2][0],$A[2][0],$C[4]
	eor	$A[3][0],$A[3][0],$C[4]
	eor	$A[4][0],$A[4][0],$C[4]
___
	# The borrowed registers go back to holding state lanes.
	$C[4]=undef;
	$C[5]=undef;
$code.=<<___;
	ldp	$A[0][4],$A[1][4],[sp,#0]	// re-load offloaded data
	eor	$C[0],   $A[0][3],$C[2]		// mov	$C[0],$A[0][3]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[4][3],$A[4][3],$C[2]

	eor	$C[2],   $A[0][4],$C[3]		// mov	$C[2],$A[0][4]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[4][4],$A[4][4],$C[3]

	////////////////////////////////////////// Rho+Pi
	mov	$C[3],$A[0][1]
	ror	$A[0][1],$A[1][1],#64-$rhotates[1][1]
	//mov	$C[1],$A[0][2]
	ror	$A[0][2],$A[2][2],#64-$rhotates[2][2]
	//mov	$C[0],$A[0][3]
	ror	$A[0][3],$A[3][3],#64-$rhotates[3][3]
	//mov	$C[2],$A[0][4]
	ror	$A[0][4],$A[4][4],#64-$rhotates[4][4]

	ror	$A[1][1],$A[1][4],#64-$rhotates[1][4]
	ror	$A[2][2],$A[2][3],#64-$rhotates[2][3]
	ror	$A[3][3],$A[3][2],#64-$rhotates[3][2]
	ror	$A[4][4],$A[4][1],#64-$rhotates[4][1]

	ror	$A[1][4],$A[4][2],#64-$rhotates[4][2]
	ror	$A[2][3],$A[3][4],#64-$rhotates[3][4]
	ror	$A[3][2],$A[2][1],#64-$rhotates[2][1]
	ror	$A[4][1],$A[1][3],#64-$rhotates[1][3]

	ror	$A[4][2],$A[2][4],#64-$rhotates[2][4]
	ror	$A[3][4],$A[4][3],#64-$rhotates[4][3]
	ror	$A[2][1],$A[1][2],#64-$rhotates[1][2]
	ror	$A[1][3],$A[3][1],#64-$rhotates[3][1]

	ror	$A[2][4],$A[4][0],#64-$rhotates[4][0]
	ror	$A[4][3],$A[3][0],#64-$rhotates[3][0]
	ror	$A[1][2],$A[2][0],#64-$rhotates[2][0]
	ror	$A[3][1],$A[1][0],#64-$rhotates[1][0]

	ror	$A[1][0],$C[0],#64-$rhotates[0][3]
	ror	$A[2][0],$C[3],#64-$rhotates[0][1]
	ror	$A[3][0],$C[2],#64-$rhotates[0][4]
	ror	$A[4][0],$C[1],#64-$rhotates[0][2]

	////////////////////////////////////////// Chi+Iota
	bic	$C[0],$A[0][2],$A[0][1]
	bic	$C[1],$A[0][3],$A[0][2]
	bic	$C[2],$A[0][0],$A[0][4]
	bic	$C[3],$A[0][1],$A[0][0]
	eor	$A[0][0],$A[0][0],$C[0]
	bic	$C[0],$A[0][4],$A[0][3]
	eor	$A[0][1],$A[0][1],$C[1]
	 ldr	$C[1],[sp,#16]
	eor	$A[0][3],$A[0][3],$C[2]
	eor	$A[0][4],$A[0][4],$C[3]
	eor	$A[0][2],$A[0][2],$C[0]
	 ldr	$C[3],[$C[1]],#8		// Iota[i++]

	bic	$C[0],$A[1][2],$A[1][1]
	 tst	$C[1],#255			// are we done?
	 str	$C[1],[sp,#16]
	bic	$C[1],$A[1][3],$A[1][2]
	bic	$C[2],$A[1][0],$A[1][4]
	 eor	$A[0][0],$A[0][0],$C[3]		// A[0][0] ^= Iota
	bic	$C[3],$A[1][1],$A[1][0]
	eor	$A[1][0],$A[1][0],$C[0]
	bic	$C[0],$A[1][4],$A[1][3]
	eor	$A[1][1],$A[1][1],$C[1]
	eor	$A[1][3],$A[1][3],$C[2]
	eor	$A[1][4],$A[1][4],$C[3]
	eor	$A[1][2],$A[1][2],$C[0]

	bic	$C[0],$A[2][2],$A[2][1]
	bic	$C[1],$A[2][3],$A[2][2]
	bic	$C[2],$A[2][0],$A[2][4]
	bic	$C[3],$A[2][1],$A[2][0]
	eor	$A[2][0],$A[2][0],$C[0]
	bic	$C[0],$A[2][4],$A[2][3]
	eor	$A[2][1],$A[2][1],$C[1]
	eor	$A[2][3],$A[2][3],$C[2]
	eor	$A[2][4],$A[2][4],$C[3]
	eor	$A[2][2],$A[2][2],$C[0]

	bic	$C[0],$A[3][2],$A[3][1]
	bic	$C[1],$A[3][3],$A[3][2]
	bic	$C[2],$A[3][0],$A[3][4]
	bic	$C[3],$A[3][1],$A[3][0]
	eor	$A[3][0],$A[3][0],$C[0]
	bic	$C[0],$A[3][4],$A[3][3]
	eor	$A[3][1],$A[3][1],$C[1]
	eor	$A[3][3],$A[3][3],$C[2]
	eor	$A[3][4],$A[3][4],$C[3]
	eor	$A[3][2],$A[3][2],$C[0]

	bic	$C[0],$A[4][2],$A[4][1]
	bic	$C[1],$A[4][3],$A[4][2]
	bic	$C[2],$A[4][0],$A[4][4]
	bic	$C[3],$A[4][1],$A[4][0]
	eor	$A[4][0],$A[4][0],$C[0]
	bic	$C[0],$A[4][4],$A[4][3]
	eor	$A[4][1],$A[4][1],$C[1]
	eor	$A[4][3],$A[4][3],$C[2]
	eor	$A[4][4],$A[4][4],$C[3]
	eor	$A[4][2],$A[4][2],$C[0]

	bne	.Loop

	ldr	x30,[sp,#24]
	ret
.size	KeccakF1600_int,.-KeccakF1600_int

.type	KeccakF1600,%function
.align	5
KeccakF1600:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#48

	str	x0,[sp,#32]			// offload argument
	mov	$C[0],x0
	ldp	$A[0][0],$A[0][1],[x0,#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]

	bl	KeccakF1600_int

	ldr	$C[0],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[0],#16*0]
	stp	$A[0][2],$A[0][3],[$C[0],#16*1]
	stp	$A[0][4],$A[1][0],[$C[0],#16*2]
	stp	$A[1][1],$A[1][2],[$C[0],#16*3]
	stp	$A[1][3],$A[1][4],[$C[0],#16*4]
	stp	$A[2][0],$A[2][1],[$C[0],#16*5]
	stp	$A[2][2],$A[2][3],[$C[0],#16*6]
	stp	$A[2][4],$A[3][0],[$C[0],#16*7]
	stp	$A[3][1],$A[3][2],[$C[0],#16*8]
	stp	$A[3][3],$A[3][4],[$C[0],#16*9]
	stp	$A[4][0],$A[4][1],[$C[0],#16*10]
	stp	$A[4][2],$A[4][3],[$C[0],#16*11]
	str	$A[4][4],[$C[0],#16*12]

	ldp	x19,x20,[x29,#16]
	add	sp,sp,#48
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	KeccakF1600,.-KeccakF1600

.globl	SHA3_absorb
.type	SHA3_absorb,%function
.align	5
SHA3_absorb:
	stp	x29,x30,[sp,#-128]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]
	stp	x23,x24,[sp,#48]
	stp	x25,x26,[sp,#64]
	stp	x27,x28,[sp,#80]
	sub	sp,sp,#64

	stp	x0,x1,[sp,#32]			// offload arguments
	stp	x2,x3,[sp,#48]

	mov	$C[0],x0			// uint64_t A[5][5]
	mov	$C[1],x1			// const void *inp
	mov	$C[2],x2			// size_t len
	mov	$C[3],x3			// size_t bsz
	ldp	$A[0][0],$A[0][1],[$C[0],#16*0]
	ldp	$A[0][2],$A[0][3],[$C[0],#16*1]
	ldp	$A[0][4],$A[1][0],[$C[0],#16*2]
	ldp	$A[1][1],$A[1][2],[$C[0],#16*3]
	ldp	$A[1][3],$A[1][4],[$C[0],#16*4]
	ldp	$A[2][0],$A[2][1],[$C[0],#16*5]
	ldp	$A[2][2],$A[2][3],[$C[0],#16*6]
	ldp	$A[2][4],$A[3][0],[$C[0],#16*7]
	ldp	$A[3][1],$A[3][2],[$C[0],#16*8]
	ldp	$A[3][3],$A[3][4],[$C[0],#16*9]
	ldp	$A[4][0],$A[4][1],[$C[0],#16*10]
	ldp	$A[4][2],$A[4][3],[$C[0],#16*11]
	ldr	$A[4][4],[$C[0],#16*12]
	b	.Loop_absorb

.align	4
.Loop_absorb:
	subs	$C[0],$C[2],$C[3]		// len - bsz
	blo	.Labsorbed

	str	$C[0],[sp,#48]			// save len - bsz
___
# XOR one input block into the state, two lanes per iteration. After every
# lane the running byte count is compared with bsz so that absorption
# stops as soon as a full block of bsz bytes has been consumed.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],$C[0]
	cmp	$C[3],#8*($i+2)
	blo	.Lprocess_block
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],$C[0]
	beq	.Lprocess_block
___
}
$code.=<<___;
	ldr	$C[0],[$C[1]],#8		// *inp++
#ifdef	__AARCH64EB__
	rev	$C[0],$C[0]
#endif
	eor	$A[4][4],$A[4][4],$C[0]

.Lprocess_block:
	str	$C[1],[sp,#40]			// save inp

	bl	KeccakF1600_int

	ldr	$C[1],[sp,#40]			// restore arguments
	ldp	$C[2],$C[3],[sp,#48]
	b	.Loop_absorb

.align	4
.Labsorbed:
	ldr	$C[1],[sp,#32]
	stp	$A[0][0],$A[0][1],[$C[1],#16*0]
	stp	$A[0][2],$A[0][3],[$C[1],#16*1]
	stp	$A[0][4],$A[1][0],[$C[1],#16*2]
	stp	$A[1][1],$A[1][2],[$C[1],#16*3]
	stp	$A[1][3],$A[1][4],[$C[1],#16*4]
	stp	$A[2][0],$A[2][1],[$C[1],#16*5]
	stp	$A[2][2],$A[2][3],[$C[1],#16*6]
	stp	$A[2][4],$A[3][0],[$C[1],#16*7]
	stp	$A[3][1],$A[3][2],[$C[1],#16*8]
	stp	$A[3][3],$A[3][4],[$C[1],#16*9]
	stp	$A[4][0],$A[4][1],[$C[1],#16*10]
	stp	$A[4][2],$A[4][3],[$C[1],#16*11]
	str	$A[4][4],[$C[1],#16*12]

	mov	x0,$C[2]			// return value
	ldp	x19,x20,[x29,#16]
	add	sp,sp,#64
	ldp	x21,x22,[x29,#32]
	ldp	x23,x24,[x29,#48]
	ldp	x25,x26,[x29,#64]
	ldp	x27,x28,[x29,#80]
	ldp	x29,x30,[sp],#128
	ret
.size	SHA3_absorb,.-SHA3_absorb
___
{
# SHA3_squeeze keeps its four arguments in x19-x22, which it saves and
# restores itself, so that they survive the call to KeccakF1600.
my ($A_flat,$out,$len,$bsz) = map("x$_",(19..22));
$code.=<<___;
.globl	SHA3_squeeze
.type	SHA3_squeeze,%function
.align	5
SHA3_squeeze:
	stp	x29,x30,[sp,#-48]!
	add	x29,sp,#0
	stp	x19,x20,[sp,#16]
	stp	x21,x22,[sp,#32]

	mov	$A_flat,x0			// put aside arguments
	mov	$out,x1
	mov	$len,x2
	mov	$bsz,x3

.Loop_squeeze:
	ldr	x4,[x0],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	subs	$len,$len,#8
	beq	.Lsqueeze_done

	subs	x3,x3,#8
	bhi	.Loop_squeeze

	mov	x0,$A_flat
	bl	KeccakF1600
	mov	x0,$A_flat
	mov	x3,$bsz
	b	.Loop_squeeze

.align	4
.Lsqueeze_tail:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done
	strb	w4,[$out],#1

.Lsqueeze_done:
	ldp	x19,x20,[sp,#16]
	ldp	x21,x22,[sp,#32]
	ldp	x29,x30,[sp],#48
	ret
.size	SHA3_squeeze,.-SHA3_squeeze
___
}								}}}
								{{{
######################################################################
# ARMv8.2 SHA3-extension implementation.
#
# @A maps the 25 state lanes onto v0..v24 and @C names v25..v31 as
# temporaries. The eor3/rax1/xar/bcax mnemonics emitted below are turned
# into .inst words by unsha3() at the bottom of this file.
my @A = map([ "v".$_.".16b", "v".($_+1).".16b", "v".($_+2).".16b",
                             "v".($_+3).".16b", "v".($_+4).".16b" ],
            (0, 5, 10, 15, 20));

my @C = map("v$_.16b", (25..31));

$code.=<<___;
.type	KeccakF1600_ce,%function
.align	5
KeccakF1600_ce:
	mov	x9,#12
	adr	x10,iotas
	b	.Loop_ce
.align	4
.Loop_ce:
___
# The loop body is emitted twice, so 12 iterations cover 24 rounds. The
# Perl-level register swaps at the end of each pass rename registers
# instead of copying them; two passes bring the names back in line.
for($i=0; $i<2; $i++) {
$code.=<<___;
	////////////////////////////////////////////////// Theta
	eor3	$C[0],$A[0][0],$A[1][0],$A[2][0]
	eor3	$C[1],$A[0][1],$A[1][1],$A[2][1]
	eor3	$C[2],$A[0][2],$A[1][2],$A[2][2]
	eor3	$C[3],$A[0][3],$A[1][3],$A[2][3]
	eor3	$C[4],$A[0][4],$A[1][4],$A[2][4]
	eor3	$C[0],$C[0],   $A[3][0],$A[4][0]
	eor3	$C[1],$C[1],   $A[3][1],$A[4][1]
	eor3	$C[2],$C[2],   $A[3][2],$A[4][2]
	eor3	$C[3],$C[3],   $A[3][3],$A[4][3]
	eor3	$C[4],$C[4],   $A[3][4],$A[4][4]

	rax1	$C[5],$C[0],$C[2]			// D[1]
	rax1	$C[6],$C[1],$C[3]			// D[2]
	rax1	$C[2],$C[2],$C[4]			// D[3]
	rax1	$C[3],$C[3],$C[0]			// D[4]
	rax1	$C[4],$C[4],$C[1]			// D[0]

	////////////////////////////////////////////////// Theta+Rho+Pi
	xar	$C[0],   $A[1][1],$C[5],#64-$rhotates[1][1]	// C[0]=A[0][1]
	xar	$A[1][1],$A[1][4],$C[3],#64-$rhotates[1][4]
	xar	$A[1][4],$A[4][2],$C[6],#64-$rhotates[4][2]
	xar	$A[4][2],$A[2][4],$C[3],#64-$rhotates[2][4]
	xar	$A[2][4],$A[4][0],$C[4],#64-$rhotates[4][0]

	xar	$A[4][0],$A[0][2],$C[6],#64-$rhotates[0][2]

	xar	$A[0][2],$A[2][2],$C[6],#64-$rhotates[2][2]
	xar	$A[2][2],$A[2][3],$C[2],#64-$rhotates[2][3]
	xar	$A[2][3],$A[3][4],$C[3],#64-$rhotates[3][4]
	xar	$A[3][4],$A[4][3],$C[2],#64-$rhotates[4][3]
	xar	$A[4][3],$A[3][0],$C[4],#64-$rhotates[3][0]

	xar	$A[3][0],$A[0][4],$C[3],#64-$rhotates[0][4]

	eor	$A[0][0],$A[0][0],$C[4]
	ldr	x11,[x10],#8

	xar	$C[1],   $A[3][3],$C[2],#64-$rhotates[3][3]	// C[1]=A[0][3]
	xar	$A[3][3],$A[3][2],$C[6],#64-$rhotates[3][2]
	xar	$A[3][2],$A[2][1],$C[5],#64-$rhotates[2][1]
	xar	$A[2][1],$A[1][2],$C[6],#64-$rhotates[1][2]
	xar	$A[1][2],$A[2][0],$C[4],#64-$rhotates[2][0]

	xar	$A[2][0],$A[0][1],$C[5],#64-$rhotates[0][1]	// *

	xar	$A[0][4],$A[4][4],$C[3],#64-$rhotates[4][4]
	xar	$A[4][4],$A[4][1],$C[5],#64-$rhotates[4][1]
	xar	$A[4][1],$A[1][3],$C[2],#64-$rhotates[1][3]
	xar	$A[1][3],$A[3][1],$C[5],#64-$rhotates[3][1]
	xar	$A[3][1],$A[1][0],$C[4],#64-$rhotates[1][0]

	xar	$C[2],   $A[0][3],$C[2],#64-$rhotates[0][3]	// C[2]=A[1][0]

	////////////////////////////////////////////////// Chi+Iota
	dup	$C[6],x11				// borrow C[6]
	bcax	$C[3],   $A[0][0],$A[0][2],$C[0]	// *
	bcax	$A[0][1],$C[0],   $C[1],   $A[0][2]	// *
	bcax	$A[0][2],$A[0][2],$A[0][4],$C[1]
	bcax	$A[0][3],$C[1],   $A[0][0],$A[0][4]
	bcax	$A[0][4],$A[0][4],$C[0],   $A[0][0]

	bcax	$A[1][0],$C[2],   $A[1][2],$A[1][1]	// *
	bcax	$C[0],   $A[1][1],$A[1][3],$A[1][2]	// *
	bcax	$A[1][2],$A[1][2],$A[1][4],$A[1][3]
	bcax	$A[1][3],$A[1][3],$C[2],   $A[1][4]
	bcax	$A[1][4],$A[1][4],$A[1][1],$C[2]

	eor	$A[0][0],$C[3],$C[6]			// Iota

	bcax	$C[1],   $A[2][0],$A[2][2],$A[2][1]	// *
	bcax	$C[2],   $A[2][1],$A[2][3],$A[2][2]	// *
	bcax	$A[2][2],$A[2][2],$A[2][4],$A[2][3]
	bcax	$A[2][3],$A[2][3],$A[2][0],$A[2][4]
	bcax	$A[2][4],$A[2][4],$A[2][1],$A[2][0]

	bcax	$C[3],   $A[3][0],$A[3][2],$A[3][1]	// *
	bcax	$C[4],   $A[3][1],$A[3][3],$A[3][2]	// *
	bcax	$A[3][2],$A[3][2],$A[3][4],$A[3][3]
	bcax	$A[3][3],$A[3][3],$A[3][0],$A[3][4]
	bcax	$A[3][4],$A[3][4],$A[3][1],$A[3][0]

	bcax	$C[5],   $A[4][0],$A[4][2],$A[4][1]	// *
	bcax	$C[6],   $A[4][1],$A[4][3],$A[4][2]	// *
	bcax	$A[4][2],$A[4][2],$A[4][4],$A[4][3]
	bcax	$A[4][3],$A[4][3],$A[4][0],$A[4][4]
	bcax	$A[4][4],$A[4][4],$A[4][1],$A[4][0]
___
	# Results that landed in temporaries become the new state registers
	# (and vice versa) for the next pass.
	(                   $A[1][1],       $C[0]) = (      $C[0],          $A[1][1]);
	($A[2][0],$A[2][1], $C[1],$C[2]) = ($C[1],$C[2], $A[2][0],$A[2][1]);
	($A[3][0],$A[3][1], $C[3],$C[4]) = ($C[3],$C[4], $A[3][0],$A[3][1]);
	($A[4][0],$A[4][1], $C[5],$C[6]) = ($C[5],$C[6], $A[4][0],$A[4][1]);
}
$code.=<<___;
	subs	x9,x9,#1
	bne	.Loop_ce

	ret
.size	KeccakF1600_ce,.-KeccakF1600_ce

.type	KeccakF1600_cext,%function
.align	5
KeccakF1600_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
# $i is 24 here, so the statement below handles the 25th lane.
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	bl	KeccakF1600_ce
	ldr	x30,[sp,#8]
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldr	x29,[sp],#80
	ret
.size	KeccakF1600_cext,.-KeccakF1600_cext
___

{
my ($ctx,$inp,$len,$bsz) = map("x$_",(0..3));

$code.=<<___;
.globl	SHA3_absorb_cext
.type	SHA3_absorb_cext,%function
.align	5
SHA3_absorb_cext:
	stp	x29,x30,[sp,#-80]!
	add	x29,sp,#0
	stp	d8,d9,[sp,#16]		// per ABI requirement
	stp	d10,d11,[sp,#32]
	stp	d12,d13,[sp,#48]
	stp	d14,d15,[sp,#64]
___
for($i=0; $i<24; $i+=2) {		# load A[5][5]
my $j=$i+1;
$code.=<<___;
	ldp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	ldr	d24,[x0,#8*$i]
	b	.Loop_absorb_ce

.align	4
.Loop_absorb_ce:
	subs	$len,$len,$bsz		// len - bsz
	blo	.Labsorbed_ce
___
# XOR one input block into the state, two lanes per iteration. On
# big-endian targets every loaded lane is byte-swapped with rev64, the
# vector form; plain "rev" only exists for general-purpose registers.
for (my $i=0; $i<24; $i+=2) {
my $j = $i+1;
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$i/5][$i%5],$A[$i/5][$i%5],v31.16b
	cmp	$bsz,#8*($i+2)
	blo	.Lprocess_block_ce
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[$j/5][$j%5],$A[$j/5][$j%5],v31.16b
	beq	.Lprocess_block_ce
___
}
$code.=<<___;
	ldr	d31,[$inp],#8		// *inp++
#ifdef	__AARCH64EB__
	rev64	v31.16b,v31.16b
#endif
	eor	$A[4][4],$A[4][4],v31.16b

.Lprocess_block_ce:

	bl	KeccakF1600_ce

	b	.Loop_absorb_ce

.align	4
.Labsorbed_ce:
___
for($i=0; $i<24; $i+=2) {		# store A[5][5]
my $j=$i+1;
$code.=<<___;
	stp	d$i,d$j,[x0,#8*$i]
___
}
$code.=<<___;
	str	d24,[x0,#8*$i]
	add	x0,$len,$bsz		// return value

	ldp	d8,d9,[sp,#16]
	ldp	d10,d11,[sp,#32]
	ldp	d12,d13,[sp,#48]
	ldp	d14,d15,[sp,#64]
	ldp	x29,x30,[sp],#80
	ret
.size	SHA3_absorb_cext,.-SHA3_absorb_cext
___
}
{
my ($ctx,$out,$len,$bsz) = map("x$_",(0..3));
$code.=<<___;
.globl	SHA3_squeeze_cext
.type	SHA3_squeeze_cext,%function
.align	5
SHA3_squeeze_cext:
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	mov	x9,$ctx
	mov	x10,$bsz

.Loop_squeeze_ce:
	ldr	x4,[x9],#8
	cmp	$len,#8
	blo	.Lsqueeze_tail_ce
#ifdef	__AARCH64EB__
	rev	x4,x4
#endif
	str	x4,[$out],#8
	beq	.Lsqueeze_done_ce

	sub	$len,$len,#8
	subs	x10,x10,#8
	bhi	.Loop_squeeze_ce

	bl	KeccakF1600_cext
	ldr	x30,[sp,#8]
	mov	x9,$ctx
	mov	x10,$bsz
	b	.Loop_squeeze_ce

.align	4
.Lsqueeze_tail_ce:
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1
	lsr	x4,x4,#8
	subs	$len,$len,#1
	beq	.Lsqueeze_done_ce
	strb	w4,[$out],#1

.Lsqueeze_done_ce:
	ldr	x29,[sp],#16
	ret
.size	SHA3_squeeze_cext,.-SHA3_squeeze_cext
___
}								}}}
$code.=<<___;
.asciz	"Keccak-1600 absorb and squeeze for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

{   # Base opcodes of the four SHA3-extension instructions used above.
    my	%opcode = (
	"rax1"	=> 0xce608c00,	"eor3"	=> 0xce000000,
	"bcax"	=> 0xce200000,	"xar"	=> 0xce800000	);

    # unsha3(MNEMONIC, ARGS)
    #
    # Hand-assemble one eor3/rax1/xar/bcax instruction. ARGS is the operand
    # text: two to four comma-separated operands, the last of which may be a
    # "#expr" immediate made of digits and minus signs (it is eval'ed).
    # The first three operand numbers are OR-ed into the opcode at bit
    # positions 0, 5 and 16, the fourth at bit 10.
    #
    # Returns a ".inst" line carrying the original text as a comment, or a
    # false value when ARGS does not match the expected operand pattern.
    sub unsha3 {
	my ($mnemonic,$arg)=@_;

	$arg =~ m/[qv]([0-9]+)[^,]*,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv]([0-9]+)[^,]*(?:,\s*[qv#]([0-9\-]+))?)?/
	&&
	sprintf ".inst\t0x%08x\t//%s %s",
			$opcode{$mnemonic}|$1|($2<<5)|($3<<16)|(eval($4)<<10),
			$mnemonic,$arg;
    }
}

# Post-process the generated text line by line: evaluate `...` expressions,
# give "dup" its .2d arrangement, and replace the SHA3-extension mnemonics
# with .inst words.
foreach(split("\n",$code)) {

	s/\`([^\`]*)\`/eval($1)/ge;

	m/\bdup\b/ and s/\.16b/.2d/g	or
	s/\b(eor3|rax1|xar|bcax)\s+(v.*)/unsha3($1,$2)/ge;

	print $_,"\n";
}

# STDOUT is the pipe to arm-xlate.pl; close reports a failing translator
# or a buffered write error, so do not ignore its result.
close STDOUT or die "error closing STDOUT: $!";