#! /usr/bin/env perl
# Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is (quite impressive) ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and is likely to be
# ~15% faster. It would be free from references to the intermediate
# ring buffer, but would put more pressure on L1P [both because the
# code would be larger and because it wouldn't be using the SPLOOP
# buffer]. There are no plans to implement a fully unrolled variant,
# though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
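#
# For illustration only, a minimal sketch of the AMR handling such an
# ISR could perform (a hypothetical fragment, not part of this module;
# the choice of B1 and the push/pop idiom are assumptions):
#
#	MVC	AMR,B1		; save caller's AMR
#	STW	B1,*SP--[2]	; push it
#	ZERO	B1
#	MVC	B1,AMR		; zero AMR: linear addressing in the ISR
#	;; ... ISR body ...
#	LDW	*++SP[2],B1	; pop saved AMR
#	NOP	4		; LDW has 4 delay slots
#	MVC	B1,AMR		; restore it before returning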

while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6");		# arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");			# X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));	# zaps $NUM

$code=<<___;
	.text

	.if	.ASSEMBLER_VERSION<7000000
	.asg	0,__TI_EABI__
	.endif
	.if	__TI_EABI__
	.asg	sha1_block_data_order,_sha1_block_data_order
	.endif

	.asg	B3,RA
	.asg	A15,FP
	.asg	B15,SP

	.if	.BIG_ENDIAN
	.asg	MV,SWAP2
	.asg	MV,SWAP4
	.endif

	.global	_sha1_block_data_order
_sha1_block_data_order:
	.asmfunc stack_usage(64)
	MV	$NUM,A0			; reassign $NUM
||	MVK	-64,B0
  [!A0]	BNOP	RA			; if ($NUM==0) return;
|| [A0]	STW	FP,*SP--[16]		; save frame pointer and alloca(64)
|| [A0]	MV	SP,FP
   [A0]	LDW	*${CTX}[0],$A		; load A-E...
|| [A0]	AND	B0,SP,SP		; align stack at 64 bytes
   [A0]	LDW	*${CTX}[1],$B
|| [A0]	SUBAW	SP,2,SP			; reserve two words above buffer
   [A0]	LDW	*${CTX}[2],$C
|| [A0]	MVK	0x00404,B0
   [A0]	LDW	*${CTX}[3],$D
|| [A0]	MVKH	0x50000,B0		; 0x050404, 64 bytes for $XP[AB]
   [A0]	LDW	*${CTX}[4],$E
|| [A0]	MVC	B0,AMR			; setup circular addressing
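;; Decoding the AMR value (an explanatory note): 0x00050404 sets mode
;; 01, "circular using BK0", for registers A5 and B5, the two X
;; ring-buffer pointers, and BK0=5 selects a 2^(5+1)=64-byte block.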
	LDNW	*${INP}++,$TX1		; pre-fetch input
	NOP	1

loop?:
	MVK	0x00007999,$K
||	ADDAW	SP,2,$XPA
||	SUB	A0,1,A0
||	MVK	13,B0
	MVKH	0x5a820000,$K		; K_00_19
||	ADDAW	SP,2,$XPB
||	MV	$A,$Actx
||	MV	$B,$Bctx
;;==================================================
	SPLOOPD	5			; BODY_00_13
||	MV	$C,$Cctx
||	MV	$D,$Dctx
||	MV	$E,$Ectx
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX3		; byte swap

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A

	ADD	$TX3,$T,$A		; A=T+Xi
||	STW	$TX3,*${XPB}++
	SPKERNEL
;;==================================================
	ROTL	$A,5,$Arot		; BODY_14
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2
||	LDNW	*${INP}++,$TX1

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	LDW	*${XPA}++,$X0		; fetches from X ring buffer are
||	LDW	*${XPB}[4],$X2		; 2 iterations ahead

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; || LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
;;==================================================
	ROTL	$A,5,$Arot		; BODY_15
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C
||	SWAP2	$TX1,$TX2

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	SWAP4	$TX2,$TX2		; byte swap
||	XOR	$X0,$X2,$TX0		; Xupdate XORs are 1 iteration ahead
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; || LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	MVK	3,B0
;;==================================================
	SPLOOPD	5			; BODY_16_19
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	AND	$C,$B,$F
||	ANDN	$D,$B,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_00_19(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_00_19(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; || LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL

	MVK	0xffffeba1,$K
||	MVK	19,B0
	MVKH	0x6ed90000,$K		; K_20_39
___
sub BODY_20_39 {
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_20_39
||	MVC	B0,ILC

	ROTL	$A,5,$Arot
||	XOR	$B,$C,$F
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$D,$F,$F		; F_20_39(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; || LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++		; last one is redundant
||	XOR	$TX0,$TX1,$TX1
	SPKERNEL
___
$code.=<<___ if (!shift);
	MVK	0xffffbcdc,$K
	MVKH	0x8f1b0000,$K		; K_40_59
___
}	&BODY_20_39();
$code.=<<___;
;;==================================================
	SPLOOPD	5			; BODY_40_59
||	MVC	B0,ILC
||	AND	$B,$C,$F
||	AND	$B,$D,$F0

	ROTL	$A,5,$Arot
||	XOR	$F0,$F,$F
||	AND	$C,$D,$F0
||	ADD	$K,$E,$T		; T=E+K
||	ROTL	$TX1,1,$TX2		; Xupdate output

	XOR	$F0,$F,$F		; F_40_59(B,C,D)
||	MV	$D,$E			; E=D
||	MV	$C,$D			; D=C

	ADD	$F,$T,$T		; T+=F_40_59(B,C,D)
||	ROTL	$B,30,$C		; C=ROL(B,30)
||	XOR	$X0,$X2,$TX0
||	LDW	*${XPA}++,$X0
||	LDW	*${XPB}[4],$X2

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	MV	$A,$B			; B=A
||	XOR	$X8,$X13,$TX1
||	LDW	*${XPA}[7],$X8
||	MV	$TX3,$X13		; || LDW	*${XPB}[15],$X13
||	MV	$TX2,$TX3

	ADD	$TX2,$T,$A		; A=T+Xi
||	STW	$TX2,*${XPB}++
||	XOR	$TX0,$TX1,$TX1
||	AND	$B,$C,$F
||	AND	$B,$D,$F0
	SPKERNEL

	MVK	0xffffc1d6,$K
||	MVK	18,B0
	MVKH	0xca620000,$K		; K_60_79
___
	&BODY_20_39(-1);		# BODY_60_78
$code.=<<___;
;;==================================================
   [A0]	B	loop?
||	ROTL	$A,5,$Arot		; BODY_79
||	XOR	$B,$C,$F
||	ROTL	$TX1,1,$TX2		; Xupdate output

   [A0]	LDNW	*${INP}++,$TX1		; pre-fetch input
||	ADD	$K,$E,$T		; T=E+K
||	XOR	$D,$F,$F		; F_20_39(B,C,D)

	ADD	$F,$T,$T		; T+=F_20_39(B,C,D)
||	ADD	$Ectx,$D,$E		; E=D,E+=Ectx
||	ADD	$Dctx,$C,$D		; D=C,D+=Dctx
||	ROTL	$B,30,$C		; C=ROL(B,30)

	ADD	$Arot,$T,$T		; T+=ROL(A,5)
||	ADD	$Bctx,$A,$B		; B=A,B+=Bctx

	ADD	$TX2,$T,$A		; A=T+Xi

	ADD	$Actx,$A,$A		; A+=Actx
||	ADD	$Cctx,$C,$C		; C+=Cctx
;; end of loop?

	BNOP	RA			; return
||	MV	FP,SP			; restore stack pointer
||	LDW	*FP[0],FP		; restore frame pointer
	STW	$A,*${CTX}[0]		; emit A-E...
||	MVK	0,B0
	STW	$B,*${CTX}[1]
||	MVC	B0,AMR			; clear AMR
	STW	$C,*${CTX}[2]
	STW	$D,*${CTX}[3]
	STW	$E,*${CTX}[4]
	.endasmfunc

	.sect	.const
	.cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
	.align	4
___

print $code;
close STDOUT;
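
# A matching C prototype for the generated routine (an assumption based
# on OpenSSL's usual block-transform hook, not something this file
# declares) would be:
#
#	void sha1_block_data_order(SHA_CTX *c, const void *p, size_t num);
#
# with c, p and num arriving in A4, B4 and A6 respectively, per the
# C6000 calling convention, matching the $CTX/$INP/$NUM assignment above.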