#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA256 for C64x+.
#
# January 2012
#
# Performance is just below 10 cycles per processed byte, which is
# almost 40% faster than compiler-generated code. Unrolling is unlikely
# to give more than ~8% improvement...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
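#
# A minimal illustrative sketch (not emitted by this module) of what an
# AMR-safe interrupt service routine could look like; the scratch
# registers chosen here are an assumption:
#
#	MVC	AMR,B0		; save caller's AMR
#	MVK	0,B1
#	MVC	B1,AMR		; zero AMR, i.e. force linear addressing
#	...			; service routine body (must preserve B0)
#	MVC	B0,AMR		; restore caller's AMR on exit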

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTXA,$INP,$NUM) = ("A4","B4","A6");		# arguments
 $K256="A3";

($A,$Actx,$B,$Bctx,$C,$Cctx,$D,$Dctx,$T2,$S0,$s1,$t0a,$t1a,$t2a,$X9,$X14)
	=map("A$_",(16..31));
($E,$Ectx,$F,$Fctx,$G,$Gctx,$H,$Hctx,$T1,$S1,$s0,$t0e,$t1e,$t2e,$X1,$X15)
	=map("B$_",(16..31));

($Xia,$Xib)=("A5","B5");			# circular/ring buffer
 $CTXB=$t2e;

($Xn,$X0,$K)=("B7","B8","B9");
($Maj,$Ch)=($T2,"B6");
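
# For reference, the round bodies below implement the standard SHA-256
# (FIPS 180-4) compression step, with X[] being the message schedule
# kept in the 16-word ring buffer:
#
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
#
# and BODY_16_63 additionally updates the schedule, expressed relative
# to the 16-word window as
#
#	X[i+16] = X[i] + sigma0(X[i+1]) + X[i+9] + sigma1(X[i+14])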

$code.=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.nocmp
	.asg	sha256_block_data_order,_sha256_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	SWAP2,MV
	.asg	SWAP4,MV
	.endif

	.global	_sha256_block_data_order
_sha256_block_data_order:
__sha256_block:
	.asmfunc stack_usage(64)
	MV	$NUM,A0				; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA				; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]			; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	ADDKPC	__sha256_block,B2
|| [A0]	AND	B0,SP,SP			; align stack at 64 bytes
	.if	__TI_EABI__
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	\$PCR_OFFSET(K256,__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	\$PCR_OFFSET(K256,__sha256_block),$K256
	.else
   [A0]	MVK	0x00404,B1
|| [A0]	MVKL	(K256-__sha256_block),$K256
   [A0]	MVKH	0x50000,B1
|| [A0]	MVKH	(K256-__sha256_block),$K256
	.endif
   [A0]	MVC	B1,AMR				; setup circular addressing
|| [A0]	MV	SP,$Xia
   [A0]	MV	SP,$Xib
|| [A0]	ADD	B2,$K256,$K256
|| [A0]	MV	$CTXA,$CTXB
|| [A0]	SUBAW	SP,2,SP				; reserve two words above buffer
	LDW	*${CTXA}[0],$A			; load ctx
||	LDW	*${CTXB}[4],$E
	LDW	*${CTXA}[1],$B
||	LDW	*${CTXB}[5],$F
	LDW	*${CTXA}[2],$C
||	LDW	*${CTXB}[6],$G
	LDW	*${CTXA}[3],$D
||	LDW	*${CTXB}[7],$H

	LDNW	*$INP++,$Xn			; pre-fetch input
	LDW	*$K256++,$K			; pre-fetch K256[0]
	MVK	14,B0				; loop counters
	MVK	47,B1
||	ADDAW	$Xia,9,$Xia
outerloop?:
	SUB	A0,1,A0
||	MV	$A,$Actx
||	MV	$E,$Ectx
||	MVD	$B,$Bctx
||	MVD	$F,$Fctx
	MV	$C,$Cctx
||	MV	$G,$Gctx
||	MVD	$D,$Dctx
||	MVD	$H,$Hctx
||	SWAP4	$Xn,$X0

	SPLOOPD	8				; BODY_00_14
||	MVC	B0,ILC
||	SWAP2	$X0,$X0

	LDNW	*$INP++,$Xn
||	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X14
||	SWAP4	$Xn,$X0
	SWAP2	$X0,$X0
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
	MV	$B,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
	SPKERNEL

	ROTL	$A,30,$S0			; BODY_15
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	LDW	*${Xib}[1],$Xn			; modulo-scheduled
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$K,$H,$T1			; T1 = h + K256[i]
	ADD	$X0,$T1,$T1			; T1 += X[i];
||	STW	$X0,*$Xib++
||	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	ROTL	$G,0,$H				; h = g
||	MV	$F,$G				; g = f
||	MV	$X0,$X15
	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
||	ROTL	$B,0,$C				; c = b
||	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2

	SPLOOPD	10				; BODY_16_63
||	MVC	B1,ILC
||	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled

	XOR	$t0e,$s0,$s0
||	XOR	$t0a,$s1,$s1
||	MV	$X15,$X14
||	MV	$X1,$Xn
	XOR	$t1e,$s0,$s0			; sigma0(X[i+1])
||	XOR	$t1a,$s1,$s1			; sigma1(X[i+14])
||	LDW	*${Xib}[2],$X1			; modulo-scheduled
	ROTL	$A,30,$S0
||	OR	$A,$B,$Maj
||	AND	$A,$B,$t2a
||	ROTL	$E,26,$S1
||	AND	$F,$E,$Ch
||	ANDN	$G,$E,$t2e
||	ADD	$X9,$X0,$X0			; X[i] += X[i+9]
	ROTL	$A,19,$t0a
||	AND	$C,$Maj,$Maj
||	ROTL	$E,21,$t0e
||	XOR	$t2e,$Ch,$Ch			; Ch(e,f,g) = (e&f)^(~e&g)
||	ADD	$s0,$X0,$X0			; X[i] += sigma0(X[i+1])
	ROTL	$A,10,$t1a
||	OR	$t2a,$Maj,$Maj			; Maj(a,b,c) = ((a|b)&c)|(a&b)
||	ROTL	$E,7,$t1e
||	ADD	$H,$K,$T1			; T1 = h + K256[i]
||	ADD	$s1,$X0,$X0			; X[i] += sigma1(X[i+14])
	XOR	$t0a,$S0,$S0
||	XOR	$t0e,$S1,$S1
||	ADD	$X0,$T1,$T1			; T1 += X[i]
||	STW	$X0,*$Xib++
	XOR	$t1a,$S0,$S0			; Sigma0(a)
||	XOR	$t1e,$S1,$S1			; Sigma1(e)
||	ADD	$Ch,$T1,$T1			; T1 += Ch(e,f,g)
||	MV	$X0,$X15
||	ROTL	$G,0,$H				; h = g
||	LDW	*$K256++,$K			; pre-fetch K256[i+1]
	ADD	$S1,$T1,$T1			; T1 += Sigma1(e)
||	ADD	$S0,$Maj,$T2			; T2 = Sigma0(a) + Maj(a,b,c)
||	MV	$F,$G				; g = f
||	MV	$Xn,$X0				; modulo-scheduled
||	LDW	*++$Xia,$X9			; modulo-scheduled
||	ROTL	$X1,25,$t0e			; modulo-scheduled
||	ROTL	$X14,15,$t0a			; modulo-scheduled
	ROTL	$X1,14,$t1e			; modulo-scheduled
||	ROTL	$X14,13,$t1a			; modulo-scheduled
||	MV	$E,$F				; f = e
||	ADD	$D,$T1,$E			; e = d + T1
||	MV	$C,$D				; d = c
||	MV	$B,$C				; c = b
	MV	$A,$B				; b = a
||	ADD	$T1,$T2,$A			; a = T1 + T2
||	SHRU	$X1,3,$s0			; modulo-scheduled
||	SHRU	$X14,10,$s1			; modulo-scheduled
	SPKERNEL

   [A0]	B	outerloop?
|| [A0]	LDNW	*$INP++,$Xn			; pre-fetch input
|| [A0]	ADDK	-260,$K256			; rewind K256 by 65*4=260 bytes (64 rounds + 1 look-ahead pre-fetch)
||	ADD	$Actx,$A,$A			; accumulate ctx
||	ADD	$Ectx,$E,$E
||	ADD	$Bctx,$B,$B
	ADD	$Fctx,$F,$F
||	ADD	$Cctx,$C,$C
||	ADD	$Gctx,$G,$G
||	ADD	$Dctx,$D,$D
||	ADD	$Hctx,$H,$H
|| [A0]	LDW	*$K256++,$K			; pre-fetch K256[0]

  [!A0]	BNOP	RA
||[!A0]	MV	$CTXA,$CTXB
  [!A0]	MV	FP,SP				; restore stack pointer
||[!A0]	LDW	*FP[0],FP			; restore frame pointer
  [!A0]	STW	$A,*${CTXA}[0]			; save ctx
||[!A0]	STW	$E,*${CTXB}[4]
||[!A0]	MVK	0,B0
  [!A0]	STW	$B,*${CTXA}[1]
||[!A0]	STW	$F,*${CTXB}[5]
||[!A0]	MVC	B0,AMR				; clear AMR
	STW	$C,*${CTXA}[2]
||	STW	$G,*${CTXB}[6]
	STW	$D,*${CTXA}[3]
||	STW	$H,*${CTXB}[7]
	.endasmfunc

	.if	__TI_EABI__
	.sect	".text:sha_asm.const"
	.else
	.sect	".const:sha_asm"
	.endif
	.align	128
K256:
	.uword	0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5
	.uword	0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5
	.uword	0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3
	.uword	0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174
	.uword	0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc
	.uword	0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da
	.uword	0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7
	.uword	0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967
	.uword	0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13
	.uword	0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85
	.uword	0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3
	.uword	0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070
	.uword	0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5
	.uword	0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3
	.uword	0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208
	.uword	0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
	.cstring "SHA256 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4

___

print $code;
close STDOUT or die "error closing STDOUT: $!";
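
# Usage note: the argument scan at the top skips non-filename arguments
# and writes the generated assembly to the first argument that looks
# like a filename, e.g. (names are illustrative):
#
#	perl sha256-c64xplus.pl sha256-c64xplus.S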