#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# RC4 for PA-RISC.

# June 2009.
#
# Performance is 33% better than gcc 3.2 generated code on PA-7100LC.
# For reference, [4x] unrolled loop is >40% faster than folded one.
# It's possible to unroll loop 8 times on PA-RISC 2.0, but improvement
# is believed to be not sufficient to justify the effort...
#
# Special thanks to polarhome.com for providing HP-UX account.

# Usage: rc4-parisc.pl <flavour> [<output>]
#
# The script builds the assembly text in $code, expands the `...`
# expressions embedded in it, applies a few assembler-dialect fix-ups
# and prints the result to <output> (or to stdout when no output file,
# or "-", is given).

# Directory part of the script path (with trailing separator); empty when
# the script was invoked without any directory component.
$dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";

$flavour = shift;
$output = shift;

# Redirect STDOUT only when an output file was actually named, and fail
# right away if it cannot be created instead of discovering the problem
# at close time.  "-" keeps meaning "standard output", as it did with the
# former two-argument ">$output" form.
if (defined($output) && $output ne "" && $output ne "-") {
    open STDOUT, ">", $output or die "can't open $output: $!";
}

# A flavour containing "64" selects the 2.0W (64-bit) code path; anything
# else gets 1.0 (32-bit) code.  The variables below name the store/load
# instructions and frame offsets used by the prologue and epilogue.
if ($flavour =~ /64/) {
    $LEVEL        ="2.0W";
    $SIZE_T       =8;
    $FRAME_MARKER =80;
    $SAVED_RP     =16;
    $PUSH         ="std";
    $PUSHMA       ="std,ma";
    $POP          ="ldd";
    $POPMB        ="ldd,mb";
} else {
    $LEVEL        ="1.0";
    $SIZE_T       =4;
    $FRAME_MARKER =48;
    $SAVED_RP     =20;
    $PUSH         ="stw";
    $PUSHMA       ="stwm";
    $POP          ="ldw";
    $POPMB        ="ldwm";
}

$FRAME=4*$SIZE_T+$FRAME_MARKER;	# 4 saved regs + frame marker
				#                 [+ argument transfer]
$SZ=1;				# defaults to RC4_CHAR

# Pick the key-table element size from opensslconf.h when that file is
# readable: an RC4_INT definition ending in "char" means 1-byte elements,
# any other definition means 4-byte elements.  If the file cannot be
# opened the RC4_CHAR default above stays in effect.
if (open my $conf, "<", "${dir}../../opensslconf.h") {
    while (<$conf>) {
	if (m/#\s*define\s+RC4_INT\s+(.*)/) {
	    $SZ = ($1=~/char$/) ? 1 : 4;
	    last;
	}
    }
    close $conf;
}

# Element load, indexed load, address-forming and store instructions
# matching the chosen element size.
if ($SZ==1) {	# RC4_CHAR
    $LD="ldb";
    $LDX="ldbx";
    $MKX="addl";
    $ST="stb";
} else {	# RC4_INT (~5% faster than RC4_CHAR on PA-7100LC)
    $LD="ldw";
    $LDX="ldwx,s";
    $MKX="sh2addl";
    $ST="stw";
}

# Registers holding the four RC4() arguments.
$key="%r26";
$len="%r25";
$inp="%r24";
$out="%r23";

# Working registers.  @XX and @TX are two-element sets that
# unrolledloopbody() rotates after every round.
@XX=("%r19","%r20");
@TX=("%r21","%r22");
$YY="%r28";
$TY="%r29";

$acc="%r1";
$ix="%r2";
$iy="%r3";
$dat0="%r4";
$dat1="%r5";
$rem="%r6";
$mask="%r31";

# unrolledloopbody()
#
# Appends four consecutive rounds to $code.  From the second round on, each
# round also loads the element indexed by $TY into $dat1 and deposits it
# into $acc (zdep for the first deposit, dep for the following ones, at bit
# positions 7, 15 and 23); the caller is expected to fetch and merge the
# fourth one itself.  The `sprintf(...) if (...)` fragments are expanded by
# the eval pass at the bottom of the file; by then $i and the register
# names have already been interpolated into them by the heredoc.
#
# Side effect: @XX and @TX are rotated once per round, i.e. four times per
# call, so they end up in their original order.
sub unrolledloopbody {
    foreach my $i (0..3) {
	$code.=<<___;
	ldo	1($XX[0]),$XX[1]
	`sprintf("$LDX	%$TY(%$key),%$dat1") if ($i>0)`
	and	$mask,$XX[1],$XX[1]
	$LDX	$YY($key),$TY
	$MKX	$YY,$key,$ix
	$LDX	$XX[1]($key),$TX[1]
	$MKX	$XX[0],$key,$iy
	$ST	$TX[0],0($ix)
	comclr,<> $XX[1],$YY,%r0	; conditional
	copy	$TX[0],$TX[1]		; move
	`sprintf("%sdep	%$dat1,%d,8,%$acc",$i==1?"z":"",8*($i-1)+7) if ($i>0)`
	$ST	$TY,0($iy)
	addl	$TX[0],$TY,$TY
	addl	$TX[1],$YY,$YY
	and	$mask,$TY,$TY
	and	$mask,$YY,$YY
___
	push(@TX,shift(@TX)); push(@XX,shift(@XX));	# "rotate" registers
    }
}

# foldedloop($label, $count)
#
# Appends a one-byte-per-iteration loop to $code.
#   $label - assembler label placed at the top of the loop and used as the
#            branch target
#   $count - name of the register that addib decrements on every iteration;
#            the loop repeats until it reaches zero
sub foldedloop {
    my ($label,$count)=@_;
    $code.=<<___;
$label
	$MKX	$YY,$key,$iy
	$LDX	$YY($key),$TY
	$MKX	$XX[0],$key,$ix
	$ST	$TX[0],0($iy)
	ldo	1($XX[0]),$XX[0]
	$ST	$TY,0($ix)
	addl	$TX[0],$TY,$TY
	ldbx	$inp($out),$dat1
	and	$mask,$TY,$TY
	and	$mask,$XX[0],$XX[0]
	$LDX	$TY($key),$acc
	$LDX	$XX[0]($key),$TX[0]
	ldo	1($out),$out
	xor	$dat1,$acc,$acc
	addl	$TX[0],$YY,$YY
	stb	$acc,-1($out)
	addib,<> -1,$count,$label	; $count is always small
	and	$mask,$YY,$YY
___
}

# RC4 entry point: prologue, loading of the two index values stored in
# front of the table, and the branch that sends short inputs (6 bytes or
# fewer) straight to the byte-wise loop.
$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	RC4,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
RC4
	.PROC
	.CALLINFO	FRAME=`$FRAME-4*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)

	cmpib,*=	0,$len,L\$abort
	sub		$inp,$out,$inp		; distance between $inp and $out

	$LD	`0*$SZ`($key),$XX[0]
	$LD	`1*$SZ`($key),$YY
	ldo	`2*$SZ`($key),$key

	ldi	0xff,$mask
	ldi	3,$dat0

	ldo	1($XX[0]),$XX[0]	; warm up loop
	and	$mask,$XX[0],$XX[0]
	$LDX	$XX[0]($key),$TX[0]
	addl	$TX[0],$YY,$YY
	cmpib,*>>=	6,$len,L\$oop1		; is $len large enough to bother?
	and	$mask,$YY,$YY

	and,<>	$out,$dat0,$rem		; is $out aligned?
	b	L\$alignedout
	subi	4,$rem,$rem
	sub	$len,$rem,$len
___
foldedloop("L\$alignout",$rem);		# process till $out is aligned

# Word-at-a-time loop for input that is not 4-byte aligned: input words are
# read from aligned addresses and recombined with vshd.
$code.=<<___;
L\$alignedout				; $len is at least 4 here
	and,<>	$inp,$dat0,$acc		; is $inp aligned?
	b	L\$oop4
	sub	$inp,$acc,$rem		; align $inp

	sh3addl	$acc,%r0,$acc
	subi	32,$acc,$acc
	mtctl	$acc,%cr11		; load %sar with vshd align factor
	ldwx	$rem($out),$dat0
	ldo	4($rem),$rem
L\$oop4misalignedinp
___
unrolledloopbody();
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$rem($out),$dat1
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	vshd	$dat0,$dat1,$iy		; align data
	copy	$dat1,$dat0
	xor	$iy,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4misalignedinp
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
	b	L\$oop1
	nop

	.ALIGN	8
L\$oop4
___
unrolledloopbody();
# Word-at-a-time loop for aligned input, followed by the byte-wise tail.
$code.=<<___;
	$LDX	$TY($key),$ix
	ldwx	$inp($out),$dat0
	ldo	-4($len),$len
	or	$ix,$acc,$acc		; last piece, no need to dep
	xor	$dat0,$acc,$acc
	stw	$acc,0($out)
	cmpib,*<<	3,$len,L\$oop4
	ldo	4($out),$out
	cmpib,*=	0,$len,L\$done
	nop
___
foldedloop("L\$oop1",$len);
# Epilogue: undo the "warm up" step, write both index values back in front
# of the table and restore the saved registers.
$code.=<<___;
L\$done
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2
	ldo	-1($XX[0]),$XX[0]	; chill out loop
	sub	$YY,$TX[0],$YY
	and	$mask,$XX[0],$XX[0]
	and	$mask,$YY,$YY
	$ST	$XX[0],`-2*$SZ`($key)
	$ST	$YY,`-1*$SZ`($key)
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
L\$abort
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
___

# RC4_set_key (zeroes both index slots, fills the 256-entry table with
# 0..255, then runs the key-mixing pass) and RC4_options (returns the
# address of the "rc4(4x,...)" string computed relative to L$pic).
$code.=<<___;

	.EXPORT	RC4_set_key,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR
	.ALIGN	8
RC4_set_key
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	$ST	%r0,`0*$SZ`($key)
	$ST	%r0,`1*$SZ`($key)
	ldo	`2*$SZ`($key),$key
	copy	%r0,@XX[0]
L\$1st
	$ST	@XX[0],0($key)
	ldo	1(@XX[0]),@XX[0]
	bb,>=	@XX[0],`31-8`,L\$1st	; @XX[0]<256
	ldo	$SZ($key),$key

	ldo	`-256*$SZ`($key),$key	; rewind $key
	addl	$len,$inp,$inp		; $inp to point at the end
	sub	%r0,$len,%r23		; inverse index
	copy	%r0,@XX[0]
	copy	%r0,@XX[1]
	ldi	0xff,$mask

L\$2nd
	$LDX	@XX[0]($key),@TX[0]
	ldbx	%r23($inp),@TX[1]
	addi,nuv	1,%r23,%r23	; increment and conditional
	sub	%r0,$len,%r23		; inverse index
	addl	@TX[0],@XX[1],@XX[1]
	addl	@TX[1],@XX[1],@XX[1]
	and	$mask,@XX[1],@XX[1]
	$MKX	@XX[0],$key,$TY
	$LDX	@XX[1]($key),@TX[1]
	$MKX	@XX[1],$key,$YY
	ldo	1(@XX[0]),@XX[0]
	$ST	@TX[0],0($YY)
	bb,>=	@XX[0],`31-8`,L\$2nd	; @XX[0]<256
	$ST	@TX[1],0($TY)

	bv,n	(%r2)
	.EXIT
	nop
	.PROCEND

	.EXPORT	RC4_options,ENTRY
	.ALIGN	8
RC4_options
	.PROC
	.CALLINFO	NO_CALLS
	.ENTRY
	blr	%r0,%r28
	ldi	3,%r1
L\$pic
	andcm	%r28,%r1,%r28
	bv	(%r2)
	.EXIT
	ldo	L\$opts-L\$pic(%r28),%r28
	.PROCEND
	.ALIGN	8
L\$opts
	.STRINGZ "rc4(4x,`$SZ==1?"char":"int"`)"
	.STRINGZ "RC4 for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Detect whether $CC drives the GNU assembler.  The probe is skipped when
# CC is not set: there is no compiler to ask, and running the bare option
# list through the shell would only produce an error message.
$gnuas = 0;
if (defined($ENV{CC}) && $ENV{CC} ne "") {
    if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	    =~ /GNU assembler/) {
	$gnuas = 1;
    }
}

# Emit the code line by line.
foreach (split("\n",$code)) {
    # Replace every `expr` with the value of the Perl expression.
    s/\`([^\`]*)\`/eval $1/ge;

    # 64-bit output for the GNU assembler: lower-case the level suffix and
    # use .text in place of the .SPACE/.SUBSPA directives.
    s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
    s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
    s/\.SUBSPA.*//		if ($gnuas && $SIZE_T==8);
    # 32-bit output: "cmpib,*" becomes "comib,".
    s/cmpib,\*/comib,/		if ($SIZE_T==4);
    # 64-bit output: "bv" becomes "bve".
    s/\bbv\b/bve/		if ($SIZE_T==8);

    print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";