#! /usr/bin/env perl
# Copyright 2007-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
#
# Permission to use under GPL terms is granted.
# ====================================================================

# SHA256 block procedure for ARMv4. May 2007.

# Performance is ~2x better than gcc 3.4 generated code and in "abso-
# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
# byte [on single-issue Xscale PXA250 core].

# July 2010.
#
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.

# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~15.4 cycles per processed byte.

# September 2013.
#
# Add NEON implementation. On Cortex A8 it was measured to process one
# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
# code (meaning that latter performs sub-optimally, nothing was done
# about it).

# May 2014.
#
# Add ARMv8 code path performing at 2.0 cpb on Apple A7.

# $output is the last argument if it looks like a file (it has an extension)
# $flavour is the first argument if it doesn't look like a file
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# When a flavour is given, pipe everything we print through arm-xlate.pl,
# which translates the "perlasm" dialect into the target assembler syntax.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open STDOUT,"| \"$^X\" $xlate $flavour \"$output\""
        or die "can't call $xlate: $!";
} else {
    $output and open STDOUT,">$output";
}

# Register allocation for the integer-only code path. Note the aliasing:
# r0-r3 double as the function arguments ($ctx/$inp/$len) on entry and as
# scratch temporaries ($t0/$t4/$t1/$t3) once the arguments have been
# stashed on the stack; see the "pull ctx"/"pull inp" loads below.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# Rotate/shift amounts of the SHA-256 Sigma/sigma functions (FIPS 180-4).
# sigma0/sigma1 list the two rotates followed by the logical shift.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# Emit one round of the SHA-256 compression function for rounds 0..31 as
# generated code uses (0..15 also load/byte-swap the input word; the text
# is appended to $code). The Maj() contribution is deliberately deferred
# to the *next* round ("from the past") to improve dual-issue scheduling,
# which is why $t2/$t3 are swapped on exit.
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# Swap the two Maj() temporaries: the just-computed Maj lives in the
	# register the *next* round will add in as "from the past".
	($t2,$t3)=($t3,$t2);
}

# Emit the message-schedule expansion X[i] = sigma1(X[i-2])+X[i-7]+
# sigma0(X[i-15])+X[i-16] for rounds 16+, then fall through to the common
# round body (BODY_00_15). X[] is kept as a 16-word circular buffer on sp.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

# Start the generated-assembly buffer: common prologue, the K256 constant
# table and the integer-only sha256_block_data_order entry point (which
# also dispatches to the NEON/ARMv8 paths when OPENSSL_armcap_P says so).
$code=<<___;
#ifndef __KERNEL__
# include "arm_arch.h"
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.text

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.LOPENSSL_armcap:
# ifdef	_WIN32
.word	OPENSSL_armcap_P
# else
.word	OPENSSL_armcap_P-.Lsha256_block_data_order
# endif
#endif
.align	5

.global	sha256_block_data_order
.type	sha256_block_data_order,%function
sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
# if !defined(_WIN32)
	ldr	r12,[r3,r12]		@ OPENSSL_armcap_P
# endif
# if defined(__APPLE__) || defined(_WIN32)
	ldr	r12,[r12]
# endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Unroll rounds 0-15 (with input load) and 16-31 (schedule expansion);
# rounds 32-63 re-execute the 16-31 code via the .Lrounds_16_xx loop,
# terminated by the "cmp $t2,#0xf2" check against the last K256 byte.
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#ifdef	__thumb2__
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	sha256_block_data_order,.-sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Map a NEON quad register name ("qN") to its low/high double-register
# half ("d2N"/"d2N+1"); returns "" for anything that is not a q-register.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# Any undefined &mnemonic_suffix(...) call lands here and is emitted as a
# literal instruction, with '_' turned into '.' (vadd_i32 -> vadd.i32) and
# a purely numeric last argument prefixed with '#' as an immediate.
sub AUTOLOAD()          # thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Expand four message-schedule words with NEON while interleaving the
# scalar round instructions supplied by $body (four rounds' worth),
# so the integer and NEON pipelines advance in parallel.
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Last 16 rounds: no more schedule expansion, just byte-swap the next
# block's input and pre-add the round constants, again interleaved with
# four scalar rounds from $body.
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# One scalar SHA-256 round as a list of instruction-generating snippets;
# Xupdate/Xpreload interleave these between NEON instructions. $j counts
# rounds so the X[i]/K prefetch source can be chosen per round.
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")				if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")			if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.global	sha256_block_data_order_neon
.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia		$ctx,{$A-$H}
	sub		$Xfer,$Xfer,#64
	ldr		$t1,[sp,#0]
	eor		$t2,$t2,$t2
	eor		$t3,$B,$C
	b		.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";
my $_byte = ($flavour =~ /win/ ? "DCB" : ".byte");

# ARMv8 Crypto Extension path. The sha256* instructions are emitted as
# raw bytes through the INST() macro (see unsha256 below) so the file
# assembles even with toolchains that predate these mnemonics.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	$_byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	$_byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# Twelve quad-rounds with message-schedule update (sha256su0/su1),
# rotating the four MSG registers and ping-ponging W0/W1.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align	2
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.comm   OPENSSL_armcap_P,4,4
#endif
___

# Re-read this very script and copy its leading comment block (license,
# attribution) into the assembly output, turning '#' comments into '@'
# assembler comments; stop at the first non-comment, non-empty line.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

# Hand-encode the ARMv8 sha256* instructions into their 32-bit opcodes
# (split into bytes for the INST() macro), for assemblers that lack both
# the mnemonics and the .inst directive.
{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Post-process the accumulated $code: evaluate `...` arithmetic, replace
# sha256* mnemonics with raw encodings, and translate ret/bx so the
# result still assembles with -march=armv4.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush