1#! /usr/bin/env perl 2# Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. 3# 4# Licensed under the OpenSSL license (the "License"). You may not use 5# this file except in compliance with the License. You can obtain a copy 6# in the file LICENSE in the source distribution or at 7# https://www.openssl.org/source/license.html 8 9 10# ==================================================================== 11# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL 12# project. The module is, however, dual licensed under OpenSSL and 13# CRYPTOGAMS licenses depending on where you obtain it. For further 14# details see http://www.openssl.org/~appro/cryptogams/. 15# 16# Permission to use under GPL terms is granted. 17# ==================================================================== 18 19# SHA256 block procedure for ARMv4. May 2007. 20 21# Performance is ~2x better than gcc 3.4 generated code and in "abso- 22# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per 23# byte [on single-issue Xscale PXA250 core]. 24 25# July 2010. 26# 27# Rescheduling for dual-issue pipeline resulted in 22% improvement on 28# Cortex A8 core and ~20 cycles per processed byte. 29 30# February 2011. 31# 32# Profiler-assisted and platform-specific optimization resulted in 16% 33# improvement on Cortex A8 core and ~15.4 cycles per processed byte. 34 35# September 2013. 36# 37# Add NEON implementation. On Cortex A8 it was measured to process one 38# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon 39# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only 40# code (meaning that latter performs sub-optimally, nothing was done 41# about it). 42 43# May 2014. 44# 45# Add ARMv8 code path performing at 2.0 cpb on Apple A7. 

$flavour = shift;
if ($flavour=~/\w[\w\-]*\.\w+$/) { $output=$flavour; undef $flavour; }
else { while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {} }

# When a flavour is given, pipe the generated source through arm-xlate.pl
# (searched next to this script, then in the shared perlasm directory);
# otherwise write the raw output file directly.
if ($flavour && $flavour ne "void") {
    $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
    ( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
    ( $xlate="${dir}../../../perlasm/arm-xlate.pl" and -f $xlate) or
    die "can't locate arm-xlate.pl";

    open OUT,"| \"$^X\" $xlate $flavour $output";
    *STDOUT=*OUT;
} else {
    open OUT,">$output";
    *STDOUT=*OUT;
}

# Register assignment for the integer-only code path.  Note the aliasing:
# $t0/$t1/$t3/$t4 reuse the argument registers ($ctx/$inp saved to the stack
# where needed), which is why BODY_00_15 spills $inp at i==15.
$ctx="r0";	$t0="r0";
$inp="r1";	$t4="r1";
$len="r2";	$t1="r2";
$T1="r3";	$t3="r3";
$A="r4";
$B="r5";
$C="r6";
$D="r7";
$E="r8";
$F="r9";
$G="r10";
$H="r11";
@V=($A,$B,$C,$D,$E,$F,$G,$H);
$t2="r12";
$Ktbl="r14";

# SHA-256 rotation/shift constants (FIPS 180-4): Sigma for the round
# function, sigma for the message schedule.
@Sigma0=( 2,13,22);
@Sigma1=( 6,11,25);
@sigma0=( 7,18, 3);
@sigma1=(17,19,10);

# One round of the integer-only compression function for rounds 0..15
# (also the tail of rounds 16+ via BODY_16_XX).  For i<16 the input word is
# loaded (byte-by-byte and assembled on pre-v7/BE-unaware cores); the
# Maj(a,b,c) addition is deferred one round ($t2/$t3 swap at the bottom).
sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
	@ ldr	$t1,[$inp],#4			@ $i
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
# ifndef __ARMEB__
	rev	$t1,$t1
# endif
#else
	@ ldrb	$t1,[$inp,#3]			@ $i
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	ldrb	$t2,[$inp,#2]
	ldrb	$t0,[$inp,#1]
	orr	$t1,$t1,$t2,lsl#8
	ldrb	$t2,[$inp],#4
	orr	$t1,$t1,$t0,lsl#16
# if $i==15
	str	$inp,[sp,#17*4]			@ make room for $t4
# endif
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
	orr	$t1,$t1,$t2,lsl#24
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
#endif
___
$code.=<<___;
	ldr	$t2,[$Ktbl],#4			@ *K256++
	add	$h,$h,$t1			@ h+=X[i]
	str	$t1,[sp,#`$i%16`*4]
	eor	$t1,$f,$g
	add	$h,$h,$t0,ror#$Sigma1[0]	@ h+=Sigma1(e)
	and	$t1,$t1,$e
	add	$h,$h,$t2			@ h+=K256[i]
	eor	$t1,$t1,$g			@ Ch(e,f,g)
	eor	$t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
	add	$h,$h,$t1			@ h+=Ch(e,f,g)
#if $i==31
	and	$t2,$t2,#0xff
	cmp	$t2,#0xf2			@ done?
#endif
#if $i<15
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4			@ prefetch
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t2,$a,$b			@ a^b, b^c in next round
#else
	ldr	$t1,[sp,#`($i+2)%16`*4]		@ from future BODY_16_xx
	eor	$t2,$a,$b			@ a^b, b^c in next round
	ldr	$t4,[sp,#`($i+15)%16`*4]	@ from future BODY_16_xx
#endif
	eor	$t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]`	@ Sigma0(a)
	and	$t3,$t3,$t2			@ (b^c)&=(a^b)
	add	$d,$d,$h			@ d+=h
	eor	$t3,$t3,$b			@ Maj(a,b,c)
	add	$h,$h,$t0,ror#$Sigma0[0]	@ h+=Sigma0(a)
	@ add	$h,$h,$t3			@ h+=Maj(a,b,c)
___
	# Swap the deferred-Maj temporaries so the next round picks up $t3.
	($t2,$t3)=($t3,$t2);
}

# Message-schedule expansion for rounds 16..63:
# X[i] = sigma1(X[i-2]) + X[i-7] + sigma0(X[i-15]) + X[i-16],
# interleaved with the first Sigma1(e) computation, then falls through to
# the common round body in BODY_00_15.
sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;

$code.=<<___;
	@ ldr	$t1,[sp,#`($i+1)%16`*4]		@ $i
	@ ldr	$t4,[sp,#`($i+14)%16`*4]
	mov	$t0,$t1,ror#$sigma0[0]
	add	$a,$a,$t2			@ h+=Maj(a,b,c) from the past
	mov	$t2,$t4,ror#$sigma1[0]
	eor	$t0,$t0,$t1,ror#$sigma0[1]
	eor	$t2,$t2,$t4,ror#$sigma1[1]
	eor	$t0,$t0,$t1,lsr#$sigma0[2]	@ sigma0(X[i+1])
	ldr	$t1,[sp,#`($i+0)%16`*4]
	eor	$t2,$t2,$t4,lsr#$sigma1[2]	@ sigma1(X[i+14])
	ldr	$t4,[sp,#`($i+9)%16`*4]

	add	$t2,$t2,$t0
	eor	$t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`	@ from BODY_00_15
	add	$t1,$t1,$t2
	eor	$t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]`	@ Sigma1(e)
	add	$t1,$t1,$t4			@ X[i]
___
	&BODY_00_15(@_);
}

# Assembly preamble: K256 table, armcap-based dispatch to the NEON/ARMv8
# paths, and the integer-only GFp_sha256_block_data_order entry point.
$code=<<___;
#ifndef __KERNEL__
# include <GFp/arm_arch.h>
#else
# define __ARM_ARCH__ __LINUX_ARM_ARCH__
# define __ARM_MAX_ARCH__ 7
#endif

@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both
@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those
@ instructions are manually-encoded. (See unsha256.)
.arch	armv7-a

.text
#if defined(__thumb2__)
.syntax	unified
.thumb
#else
.code	32
#endif

.type	K256,%object
.align	5
K256:
.word	0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
.word	0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
.word	0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
.word	0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
.word	0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
.word	0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
.word	0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
.word	0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
.word	0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
.word	0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
.word	0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
.word	0xd192e819,0xd6990624,0xf40e3585,0x106aa070
.word	0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
.word	0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
.word	0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
.word	0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
.size	K256,.-K256
.word	0				@ terminator
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
.extern	GFp_armcap_P
.hidden	GFp_armcap_P
.LOPENSSL_armcap:
.word	GFp_armcap_P-.Lsha256_block_data_order
#endif
.align	5

.global	GFp_sha256_block_data_order
.type	GFp_sha256_block_data_order,%function
GFp_sha256_block_data_order:
.Lsha256_block_data_order:
#if __ARM_ARCH__<7 && !defined(__thumb2__)
	sub	r3,pc,#8		@ GFp_sha256_block_data_order
#else
	adr	r3,.Lsha256_block_data_order
#endif
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
	ldr	r12,.LOPENSSL_armcap
	ldr	r12,[r3,r12]		@ GFp_armcap_P
#ifdef	__APPLE__
	ldr	r12,[r12]
#endif
	tst	r12,#ARMV8_SHA256
	bne	.LARMv8
	tst	r12,#ARMV7_NEON
	bne	.LNEON
#endif
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	stmdb	sp!,{$ctx,$inp,$len,r4-r11,lr}
	ldmia	$ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
	sub	$Ktbl,r3,#256+32	@ K256
	sub	sp,sp,#16*4		@ alloca(X[16])
.Loop:
# if __ARM_ARCH__>=7
	ldr	$t1,[$inp],#4
# else
	ldrb	$t1,[$inp,#3]
# endif
	eor	$t3,$B,$C		@ magic
	eor	$t2,$t2,$t2
___
# Emit the 64 rounds: 16 with fresh input words, then a loop over the
# remaining 48 in groups of 16 (the in-assembly .Lrounds_16_xx loop).
for($i=0;$i<16;$i++)	{ &BODY_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=".Lrounds_16_xx:\n";
for (;$i<32;$i++)	{ &BODY_16_XX($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
#if __ARM_ARCH__>=7
	ite	eq			@ Thumb2 thing, sanity check in ARM
#endif
	ldreq	$t3,[sp,#16*4]		@ pull ctx
	bne	.Lrounds_16_xx

	add	$A,$A,$t2		@ h+=Maj(a,b,c) from the past
	ldr	$t0,[$t3,#0]
	ldr	$t1,[$t3,#4]
	ldr	$t2,[$t3,#8]
	add	$A,$A,$t0
	ldr	$t0,[$t3,#12]
	add	$B,$B,$t1
	ldr	$t1,[$t3,#16]
	add	$C,$C,$t2
	ldr	$t2,[$t3,#20]
	add	$D,$D,$t0
	ldr	$t0,[$t3,#24]
	add	$E,$E,$t1
	ldr	$t1,[$t3,#28]
	add	$F,$F,$t2
	ldr	$inp,[sp,#17*4]		@ pull inp
	ldr	$t2,[sp,#18*4]		@ pull inp+len
	add	$G,$G,$t0
	add	$H,$H,$t1
	stmia	$t3,{$A,$B,$C,$D,$E,$F,$G,$H}
	cmp	$inp,$t2
	sub	$Ktbl,$Ktbl,#256	@ rewind Ktbl
	bne	.Loop

	add	sp,sp,#`16+3`*4	@ destroy frame
#if __ARM_ARCH__>=5
	ldmia	sp!,{r4-r11,pc}
#else
	ldmia	sp!,{r4-r11,lr}
	tst	lr,#1
	moveq	pc,lr			@ be binary compatible with V4, yet
	bx	lr			@ interoperable with Thumb ISA:-)
#endif
.size	GFp_sha256_block_data_order,.-GFp_sha256_block_data_order
___
######################################################################
# NEON stuff
#
{{{
my @X=map("q$_",(0..3));
my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
my $Xfer=$t4;
my $j=0;

# Map a NEON q-register name to its low/high d-register half.
sub Dlo()   { shift=~m|q([1]?[0-9])|?"d".($1*2):"";     }
sub Dhi()   { shift=~m|q([1]?[0-9])|?"d".($1*2+1):"";   }

# Any undefined sub call like &vshr_u32(...) is turned into the matching
# NEON mnemonic (underscore -> dot), with a bare-number final argument
# prefixed by '#'.
sub AUTOLOAD()		# thunk [simplified] x86-style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
  my $arg = pop;
    $arg = "#$arg" if ($arg*1 eq $arg);
    $code .= "\t$opcode\t".join(',',@_,$arg)."\n";
}

# Expand four message-schedule words with NEON while interleaving the
# scalar round instructions supplied by $body (4 rounds' worth).
sub Xupdate()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	&vext_8		($T0,@X[0],@X[1],4);	# X[1..4]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vext_8		($T1,@X[2],@X[3],4);	# X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T2,$T0,$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += X[9..12]
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T1,$T0,$sigma0[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T2,$T0,32-$sigma0[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T3,$T0,$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T2);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T3,$T0,32-$sigma0[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T1,$T1,$T3);		# sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dhi(@X[3]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(@X[0],@X[0],$T1);	# X[0..3] += sigma0(X[1..4])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dhi(@X[3]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dhi(@X[3]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T5,&Dlo(@X[0]),$sigma1[2]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vshr_u32	($T4,&Dlo(@X[0]),$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vsli_32	($T4,&Dlo(@X[0]),32-$sigma1[1]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	&veor		($T5,$T5,$T4);		# sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	(&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 while($#insns>=2) { eval(shift(@insns)); }
	&vst1_32	("{$T0}","[$Xfer,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));

	push(@X,shift(@X));		# "rotate" X[]
}

# Load + byte-swap the next input block and add K, interleaved with the
# scalar round instructions (used for the final 16 rounds of each block).
sub Xpreload()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);
  my ($a,$b,$c,$d,$e,$f,$g,$h);

	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vld1_32	("{$T0}","[$Ktbl,:128]!");
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vrev32_8	(@X[0],@X[0]);
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	 eval(shift(@insns));
	&vadd_i32	($T0,$T0,@X[0]);
	 foreach (@insns) { eval; }	# remaining instructions
	&vst1_32	("{$T0}","[$Xfer,:128]!");

	push(@X,shift(@X));		# "rotate" X[]
}

# One scalar SHA-256 round as a list of code strings, eval'ed one at a time
# so Xupdate/Xpreload can interleave them with NEON instructions.  Uses the
# same deferred-Maj trick as BODY_00_15 ($t2/$t3 swap, $j round counter).
sub body_00_15 () {
	(
	'($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
	'&add	($h,$h,$t1)',			# h+=X[i]+K[i]
	'&eor	($t1,$f,$g)',
	'&eor	($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
	'&add	($a,$a,$t2)',			# h+=Maj(a,b,c) from the past
	'&and	($t1,$t1,$e)',
	'&eor	($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))',	# Sigma1(e)
	'&eor	($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
	'&eor	($t1,$t1,$g)',			# Ch(e,f,g)
	'&add	($h,$h,$t2,"ror#$Sigma1[0]")',	# h+=Sigma1(e)
	'&eor	($t2,$a,$b)',			# a^b, b^c in next round
	'&eor	($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))',	# Sigma0(a)
	'&add	($h,$h,$t1)',			# h+=Ch(e,f,g)
	'&ldr	($t1,sprintf "[sp,#%d]",4*(($j+1)&15))	if (($j&15)!=15);'.
	'&ldr	($t1,"[$Ktbl]")			if ($j==15);'.
	'&ldr	($t1,"[sp,#64]")		if ($j==31)',
	'&and	($t3,$t3,$t2)',			# (b^c)&=(a^b)
	'&add	($d,$d,$h)',			# d+=h
	'&add	($h,$h,$t0,"ror#$Sigma0[0]");'.	# h+=Sigma0(a)
	'&eor	($t3,$t3,$b)',			# Maj(a,b,c)
	'$j++;	unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
	)
}

# NEON entry point: 16-round groups of Xupdate, terminated when the zero
# word after K256 is loaded, then 16 rounds of Xpreload for the next block.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7
.arch	armv7-a
.fpu	neon

.type	sha256_block_data_order_neon,%function
.align	5
.skip	16
sha256_block_data_order_neon:
.LNEON:
	stmdb	sp!,{r4-r12,lr}

	sub	$H,sp,#16*4+16
	adr	$Ktbl,K256
	bic	$H,$H,#15		@ align for 128-bit stores
	mov	$t2,sp
	mov	sp,$H			@ alloca
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp

	vld1.8		{@X[0]},[$inp]!
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	vld1.32		{$T0},[$Ktbl,:128]!
	vld1.32		{$T1},[$Ktbl,:128]!
	vld1.32		{$T2},[$Ktbl,:128]!
	vld1.32		{$T3},[$Ktbl,:128]!
	vrev32.8	@X[0],@X[0]		@ yes, even on
	str		$ctx,[sp,#64]
	vrev32.8	@X[1],@X[1]		@ big-endian
	str		$inp,[sp,#68]
	mov		$Xfer,sp
	vrev32.8	@X[2],@X[2]
	str		$len,[sp,#72]
	vrev32.8	@X[3],@X[3]
	str		$t2,[sp,#76]		@ save original sp
	vadd.i32	$T0,$T0,@X[0]
	vadd.i32	$T1,$T1,@X[1]
	vst1.32		{$T0},[$Xfer,:128]!
	vadd.i32	$T2,$T2,@X[2]
	vst1.32		{$T1},[$Xfer,:128]!
	vadd.i32	$T3,$T3,@X[3]
	vst1.32		{$T2},[$Xfer,:128]!
	vst1.32		{$T3},[$Xfer,:128]!

	ldmia	$ctx,{$A-$H}
	sub	$Xfer,$Xfer,#64
	ldr	$t1,[sp,#0]
	eor	$t2,$t2,$t2
	eor	$t3,$B,$C
	b	.L_00_48

.align	4
.L_00_48:
___
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
	&Xupdate(\&body_00_15);
$code.=<<___;
	teq	$t1,#0				@ check for K256 terminator
	ldr	$t1,[sp,#0]
	sub	$Xfer,$Xfer,#64
	bne	.L_00_48

	ldr		$inp,[sp,#68]
	ldr		$t0,[sp,#72]
	sub		$Ktbl,$Ktbl,#256	@ rewind $Ktbl
	teq		$inp,$t0
	it		eq
	subeq		$inp,$inp,#64		@ avoid SEGV
	vld1.8		{@X[0]},[$inp]!		@ load next input block
	vld1.8		{@X[1]},[$inp]!
	vld1.8		{@X[2]},[$inp]!
	vld1.8		{@X[3]},[$inp]!
	it		ne
	strne		$inp,[sp,#68]
	mov		$Xfer,sp
___
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
	&Xpreload(\&body_00_15);
$code.=<<___;
	ldr	$t0,[$t1,#0]
	add	$A,$A,$t2			@ h+=Maj(a,b,c) from the past
	ldr	$t2,[$t1,#4]
	ldr	$t3,[$t1,#8]
	ldr	$t4,[$t1,#12]
	add	$A,$A,$t0			@ accumulate
	ldr	$t0,[$t1,#16]
	add	$B,$B,$t2
	ldr	$t2,[$t1,#20]
	add	$C,$C,$t3
	ldr	$t3,[$t1,#24]
	add	$D,$D,$t4
	ldr	$t4,[$t1,#28]
	add	$E,$E,$t0
	str	$A,[$t1],#4
	add	$F,$F,$t2
	str	$B,[$t1],#4
	add	$G,$G,$t3
	str	$C,[$t1],#4
	add	$H,$H,$t4
	str	$D,[$t1],#4
	stmia	$t1,{$E-$H}

	ittte	ne
	movne	$Xfer,sp
	ldrne	$t1,[sp,#0]
	eorne	$t2,$t2,$t2
	ldreq	sp,[sp,#76]			@ restore original sp
	itt	ne
	eorne	$t3,$B,$C
	bne	.L_00_48

	ldmia	sp!,{r4-r12,pc}
.size	sha256_block_data_order_neon,.-sha256_block_data_order_neon
#endif
___
}}}
######################################################################
# ARMv8 stuff
#
{{{
my ($ABCD,$EFGH,$abcd)=map("q$_",(0..2));
my @MSG=map("q$_",(8..11));
my ($W0,$W1,$ABCD_SAVE,$EFGH_SAVE)=map("q$_",(12..15));
my $Ktbl="r3";		# shadows the outer r14 assignment for this path

# ARMv8 Crypto Extension path: the sha256* mnemonics below are rewritten
# into manually-encoded INST() byte sequences by unsha256() at output time.
$code.=<<___;
#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)

# if defined(__thumb2__)
#  define INST(a,b,c,d)	.byte	c,d|0xc,a,b
# else
#  define INST(a,b,c,d)	.byte	a,b,c,d
# endif

.type	sha256_block_data_order_armv8,%function
.align	5
sha256_block_data_order_armv8:
.LARMv8:
	vld1.32	{$ABCD,$EFGH},[$ctx]
	sub	$Ktbl,$Ktbl,#256+32
	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
	b	.Loop_v8

.align	4
.Loop_v8:
	vld1.8		{@MSG[0]-@MSG[1]},[$inp]!
	vld1.8		{@MSG[2]-@MSG[3]},[$inp]!
	vld1.32		{$W0},[$Ktbl]!
	vrev32.8	@MSG[0],@MSG[0]
	vrev32.8	@MSG[1],@MSG[1]
	vrev32.8	@MSG[2],@MSG[2]
	vrev32.8	@MSG[3],@MSG[3]
	vmov		$ABCD_SAVE,$ABCD	@ offload
	vmov		$EFGH_SAVE,$EFGH
	teq		$inp,$len
___
# 12 quad-round groups with message-schedule update, then 4 without.
for($i=0;$i<12;$i++) {
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	sha256su0	@MSG[0],@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0
	sha256su1	@MSG[0],@MSG[2],@MSG[3]
___
	($W0,$W1)=($W1,$W0);	push(@MSG,shift(@MSG));
}
$code.=<<___;
	vld1.32		{$W1},[$Ktbl]!
	vadd.i32	$W0,$W0,@MSG[0]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vld1.32		{$W0},[$Ktbl]!
	vadd.i32	$W1,$W1,@MSG[1]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vld1.32		{$W1},[$Ktbl]
	vadd.i32	$W0,$W0,@MSG[2]
	sub		$Ktbl,$Ktbl,#256-16	@ rewind
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W0
	sha256h2	$EFGH,$abcd,$W0

	vadd.i32	$W1,$W1,@MSG[3]
	vmov		$abcd,$ABCD
	sha256h		$ABCD,$EFGH,$W1
	sha256h2	$EFGH,$abcd,$W1

	vadd.i32	$ABCD,$ABCD,$ABCD_SAVE
	vadd.i32	$EFGH,$EFGH,$EFGH_SAVE
	it		ne
	bne		.Loop_v8

	vst1.32		{$ABCD,$EFGH},[$ctx]

	ret		@ bx lr
.size	sha256_block_data_order_armv8,.-sha256_block_data_order_armv8
#endif
___
}}}
$code.=<<___;
.asciz  "SHA256 block transform for ARMv4/NEON/ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
___

# Copy this script's leading comment header into the assembly output,
# converting '#' comments to the assembler's '@' comment style.
open SELF,$0;
while(<SELF>) {
	next if (/^#!/);
	last if (!s/^#/@/ and !/^$/);
	print;
}
close SELF;

{   my  %opcode = (
	"sha256h"	=> 0xf3000c40,	"sha256h2"	=> 0xf3100c40,
	"sha256su0"	=> 0xf3ba03c0,	"sha256su1"	=> 0xf3200c40	);

    # Hand-encode an ARMv8 sha256* instruction into an INST() byte sequence,
    # packing the q-register numbers into the base opcode.
    sub unsha256 {
	my ($mnemonic,$arg)=@_;

	if ($arg =~ m/q([0-9]+)(?:,\s*q([0-9]+))?,\s*q([0-9]+)/o) {
	    my $word = $opcode{$mnemonic}|(($1&7)<<13)|(($1&8)<<19)
					 |(($2&7)<<17)|(($2&8)<<4)
					 |(($3&7)<<1) |(($3&8)<<2);
	    # since ARMv7 instructions are always encoded little-endian.
	    # correct solution is to use .inst directive, but older
	    # assemblers don't implement it:-(
	    sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
			$word&0xff,($word>>8)&0xff,
			($word>>16)&0xff,($word>>24)&0xff,
			$mnemonic,$arg;
	}
    }
}

# Post-process the accumulated code: evaluate `...` arithmetic, rewrite
# sha256* mnemonics via unsha256(), and lower ret/bx for ARMv4 targets.
foreach (split($/,$code)) {

	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256\w+)\s+(q.*)/unsha256($1,$2)/geo;

	s/\bret\b/bx	lr/go		or
	s/\bbx\s+lr\b/.word\t0xe12fff1e/go;	# make it possible to compile with -march=armv4

	print $_,"\n";
}

close STDOUT or die "error closing STDOUT"; # enforce flush