#! /usr/bin/env perl
# Copyright 2012-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# SHA1 for C64x+.
#
# November 2011
#
# Compared to compiler-generated code with similar characteristics,
# i.e. compiled with OPENSSL_SMALL_FOOTPRINT and utilizing SPLOOPs,
# this implementation is 25% smaller and >2x faster. In absolute terms
# performance is a quite impressive ~6.5 cycles per processed byte.
# Fully unrolled assembler would be ~5x larger and would likely be
# ~15% faster. It would be free from references to the intermediate ring
# buffer, but would put more pressure on L1P [both because the code
# would be larger and because it wouldn't use the SPLOOP buffer]. There
# are no plans to realize a fully unrolled variant, though...
#
# !!! Note that this module uses AMR, which means that all interrupt
# service routines are expected to preserve it and, for their own
# well-being, zero it upon entry.
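# For orientation, here is a minimal Perl sketch of the SHA-1 compression
# function that the assembly below implements; it is purely illustrative,
# never called by this generator, and the names _rol and sha1_ref_block
# are this sketch's own, not OpenSSL API. Note that the assembly computes
# F_00_19 as (C&B)^(D&~B) with AND/ANDN/XOR, and F_40_59 as
# (B&C)^(B&D)^(C&D); in both cases the XOR-combined terms never overlap
# (or, for majority, agree), so they coincide with the canonical
# OR-combined forms used below.

sub _rol { my ($x,$n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff }

sub sha1_ref_block {
    my ($H, @W) = @_;           # $H = [A,B,C,D,E], @W = 16 message words
    my ($a,$b,$c,$d,$e) = @$H;
    for my $i (0 .. 79) {
        # Xupdate recurrence: the $X0/$X2/$X8/$X13 taps in the code below
        $W[$i] = _rol($W[$i-3] ^ $W[$i-8] ^ $W[$i-14] ^ $W[$i-16], 1)
            if $i >= 16;
        my ($f,$k) =
            $i < 20 ? ((($b & $c) | (~$b & $d)) & 0xffffffff, 0x5a827999)
          : $i < 40 ? ($b ^ $c ^ $d,                          0x6ed9eba1)
          : $i < 60 ? (($b & $c) | ($b & $d) | ($c & $d),     0x8f1bbcdc)
          :           ($b ^ $c ^ $d,                          0xca62c1d6);
        my $t = (_rol($a,5) + $f + $e + $k + $W[$i]) & 0xffffffff;
        ($e,$d,$c,$b,$a) = ($d,$c,_rol($b,30),$a,$t);   # rotate A..E
    }
    $$H[$_] = ($$H[$_] + ($a,$b,$c,$d,$e)[$_]) & 0xffffffff for (0..4);
}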
$output = pop and open STDOUT,">$output";

($CTX,$INP,$NUM) = ("A4","B4","A6");    # arguments

($A,$B,$C,$D,$E, $Arot,$F,$F0,$T,$K) = map("A$_",(16..20, 21..25));
($X0,$X2,$X8,$X13) = ("A26","B26","A27","B27");
($TX0,$TX1,$TX2,$TX3) = map("B$_",(28..31));
($XPA,$XPB) = ("A5","B5");              # X circular buffer
($Actx,$Bctx,$Cctx,$Dctx,$Ectx) = map("A$_",(3,6..9));  # zaps $NUM

$code=<<___;
        .text

        .if     .ASSEMBLER_VERSION<7000000
        .asg    0,__TI_EABI__
        .endif
        .if     __TI_EABI__
        .asg    sha1_block_data_order,_sha1_block_data_order
        .endif

        .asg    B3,RA
        .asg    A15,FP
        .asg    B15,SP

        .if     .BIG_ENDIAN
        .asg    MV,SWAP2
        .asg    MV,SWAP4
        .endif

        .global _sha1_block_data_order
_sha1_block_data_order:
        .asmfunc stack_usage(64)
        MV      $NUM,A0                 ; reassign $NUM
||      MVK     -64,B0
  [!A0] BNOP    RA                      ; if ($NUM==0) return;
|| [A0] STW     FP,*SP--[16]            ; save frame pointer and alloca(64)
|| [A0] MV      SP,FP
   [A0] LDW     *${CTX}[0],$A           ; load A-E...
|| [A0] AND     B0,SP,SP                ; align stack at 64 bytes
   [A0] LDW     *${CTX}[1],$B
|| [A0] SUBAW   SP,2,SP                 ; reserve two words above buffer
   [A0] LDW     *${CTX}[2],$C
|| [A0] MVK     0x00404,B0
   [A0] LDW     *${CTX}[3],$D
|| [A0] MVKH    0x50000,B0              ; 0x050404, 64 bytes for $XP[AB]
   [A0] LDW     *${CTX}[4],$E
|| [A0] MVC     B0,AMR                  ; setup circular addressing
        LDNW    *${INP}++,$TX1          ; pre-fetch input
        NOP     1

loop?:
        MVK     0x00007999,$K
||      ADDAW   SP,2,$XPA
||      SUB     A0,1,A0
||      MVK     13,B0
        MVKH    0x5a820000,$K           ; K_00_19
||      ADDAW   SP,2,$XPB
||      MV      $A,$Actx
||      MV      $B,$Bctx
;;==================================================
        SPLOOPD 5                       ; BODY_00_13
||      MV      $C,$Cctx
||      MV      $D,$Dctx
||      MV      $E,$Ectx
||      MVC     B0,ILC

        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K

        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1

        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX3               ; byte swap

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A

        ADD     $TX3,$T,$A              ; A=T+Xi
||      STW     $TX3,*${XPB}++
        SPKERNEL
;;==================================================
        ROTL    $A,5,$Arot              ; BODY_14
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K

        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2
||      LDNW    *${INP}++,$TX1

        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      LDW     *${XPA}++,$X0           ; fetches from X ring buffer are
||      LDW     *${XPB}[4],$X2          ; 2 iterations ahead

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3

        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
;;==================================================
        ROTL    $A,5,$Arot              ; BODY_15
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K

        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C
||      SWAP2   $TX1,$TX2

        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      SWAP4   $TX2,$TX2               ; byte swap
||      XOR     $X0,$X2,$TX0            ; Xupdate XORs are 1 iteration ahead
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3

        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      MVK     3,B0
;;==================================================
        SPLOOPD 5                       ; BODY_16_19
||      MVC     B0,ILC

        ROTL    $A,5,$Arot
||      AND     $C,$B,$F
||      ANDN    $D,$B,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output

        XOR     $F0,$F,$F               ; F_00_19(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C

        ADD     $F,$T,$T                ; T+=F_00_19(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3

        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
        SPKERNEL

        MVK     0xffffeba1,$K
||      MVK     19,B0
        MVKH    0x6ed90000,$K           ; K_20_39
___
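# A note on the idioms above: MVK materializes a 16-bit immediate
# sign-extended to 32 bits (hence "MVK 0xffffeba1" for the low half
# 0xeba1), and MVKH then replaces the upper 16 bits, so the pair above
# yields $K = 0x6ed9eba1, i.e. K_20_39 (likewise 0x5a827999 for K_00_19
# earlier). The AMR value 0x050404 configures A5/B5, i.e. $XPA/$XPB, as
# circular with a 64-byte block size, so the X[] ring-buffer pointers
# wrap automatically; $X0, $X2, $X8 and $X13 carry the W[i-16], W[i-14],
# W[i-8] and W[i-3] taps of the Xupdate recurrence sketched above.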
sub BODY_20_39 {
$code.=<<___;
;;==================================================
        SPLOOPD 5                       ; BODY_20_39
||      MVC     B0,ILC

        ROTL    $A,5,$Arot
||      XOR     $B,$C,$F
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output

        XOR     $D,$F,$F                ; F_20_39(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C

        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3

        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++          ; last one is redundant
||      XOR     $TX0,$TX1,$TX1
        SPKERNEL
___
$code.=<<___ if (!shift);
        MVK     0xffffbcdc,$K
        MVKH    0x8f1b0000,$K           ; K_40_59
___
}       &BODY_20_39();
$code.=<<___;
;;==================================================
        SPLOOPD 5                       ; BODY_40_59
||      MVC     B0,ILC
||      AND     $B,$C,$F
||      AND     $B,$D,$F0

        ROTL    $A,5,$Arot
||      XOR     $F0,$F,$F
||      AND     $C,$D,$F0
||      ADD     $K,$E,$T                ; T=E+K
||      ROTL    $TX1,1,$TX2             ; Xupdate output

        XOR     $F0,$F,$F               ; F_40_59(B,C,D)
||      MV      $D,$E                   ; E=D
||      MV      $C,$D                   ; D=C

        ADD     $F,$T,$T                ; T+=F_40_59(B,C,D)
||      ROTL    $B,30,$C                ; C=ROL(B,30)
||      XOR     $X0,$X2,$TX0
||      LDW     *${XPA}++,$X0
||      LDW     *${XPB}[4],$X2

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      MV      $A,$B                   ; B=A
||      XOR     $X8,$X13,$TX1
||      LDW     *${XPA}[7],$X8
||      MV      $TX3,$X13               ; || LDW *${XPB}[15],$X13
||      MV      $TX2,$TX3

        ADD     $TX2,$T,$A              ; A=T+Xi
||      STW     $TX2,*${XPB}++
||      XOR     $TX0,$TX1,$TX1
||      AND     $B,$C,$F
||      AND     $B,$D,$F0
        SPKERNEL

        MVK     0xffffc1d6,$K
||      MVK     18,B0
        MVKH    0xca620000,$K           ; K_60_79
___
        &BODY_20_39(-1);                # BODY_60_78
$code.=<<___;
;;==================================================
   [A0] B       loop?
||      ROTL    $A,5,$Arot              ; BODY_79
||      XOR     $B,$C,$F
||      ROTL    $TX1,1,$TX2             ; Xupdate output

   [A0] LDNW    *${INP}++,$TX1          ; pre-fetch input
||      ADD     $K,$E,$T                ; T=E+K
||      XOR     $D,$F,$F                ; F_20_39(B,C,D)

        ADD     $F,$T,$T                ; T+=F_20_39(B,C,D)
||      ADD     $Ectx,$D,$E             ; E=D,E+=Ectx
||      ADD     $Dctx,$C,$D             ; D=C,D+=Dctx
||      ROTL    $B,30,$C                ; C=ROL(B,30)

        ADD     $Arot,$T,$T             ; T+=ROL(A,5)
||      ADD     $Bctx,$A,$B             ; B=A,B+=Bctx

        ADD     $TX2,$T,$A              ; A=T+Xi

        ADD     $Actx,$A,$A             ; A+=Actx
||      ADD     $Cctx,$C,$C             ; C+=Cctx
;; end of loop?

        BNOP    RA                      ; return
||      MV      FP,SP                   ; restore stack pointer
||      LDW     *FP[0],FP               ; restore frame pointer
        STW     $A,*${CTX}[0]           ; emit A-E...
||      MVK     0,B0
        STW     $B,*${CTX}[1]
||      MVC     B0,AMR                  ; clear AMR
        STW     $C,*${CTX}[2]
        STW     $D,*${CTX}[3]
        STW     $E,*${CTX}[4]
        .endasmfunc

        .sect   .const
        .cstring "SHA1 block transform for C64x+, CRYPTOGAMS by <appro\@openssl.org>"
        .align  4
___

print $code;
close STDOUT or die "error closing STDOUT: $!";