#!/usr/bin/env perl

# ====================================================================
# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means
# that compiler utilized xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path, provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths have to be even and vector
# addresses have to be 64-bit aligned. Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.
62 63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 64 65$flavour = shift; 66$output = shift; 67 68open STDOUT,">$output"; 69 70if ($flavour =~ /64/) { 71 $LEVEL ="2.0W"; 72 $SIZE_T =8; 73 $FRAME_MARKER =80; 74 $SAVED_RP =16; 75 $PUSH ="std"; 76 $PUSHMA ="std,ma"; 77 $POP ="ldd"; 78 $POPMB ="ldd,mb"; 79 $BN_SZ =$SIZE_T; 80} else { 81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; 82 $SIZE_T =4; 83 $FRAME_MARKER =48; 84 $SAVED_RP =20; 85 $PUSH ="stw"; 86 $PUSHMA ="stwm"; 87 $POP ="ldw"; 88 $POPMB ="ldwm"; 89 $BN_SZ =$SIZE_T; 90 if (open CONF,"<${dir}../../opensslconf.h") { 91 while(<CONF>) { 92 if (m/#\s*define\s+SIXTY_FOUR_BIT/) { 93 $BN_SZ=8; 94 $LEVEL="2.0"; 95 last; 96 } 97 } 98 close CONF; 99 } 100} 101 102$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker 103 # [+ argument transfer] 104$LOCALS=$FRAME-$FRAME_MARKER; 105$FRAME+=32; # local variables 106 107$tp="%r31"; 108$ti1="%r29"; 109$ti0="%r28"; 110 111$rp="%r26"; 112$ap="%r25"; 113$bp="%r24"; 114$np="%r23"; 115$n0="%r22"; # passed through stack in 32-bit 116$num="%r21"; # passed through stack in 32-bit 117$idx="%r20"; 118$arrsz="%r19"; 119 120$nm1="%r7"; 121$nm0="%r6"; 122$ab1="%r5"; 123$ab0="%r4"; 124 125$fp="%r3"; 126$hi1="%r2"; 127$hi0="%r1"; 128 129$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s 130 131$fm0="%fr4"; $fti=$fm0; 132$fbi="%fr5L"; 133$fn0="%fr5R"; 134$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; 135$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; 136 137$code=<<___; 138 .LEVEL $LEVEL 139 .SPACE \$TEXT\$ 140 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY 141 142 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 143 .ALIGN 64 144bn_mul_mont 145 .PROC 146 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 147 .ENTRY 148 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 149 $PUSHMA %r3,$FRAME(%sp) 150 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 151 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 152 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 153 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 154 $PUSH 
%r8,`-$FRAME+5*$SIZE_T`(%sp) 155 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 156 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 157 ldo -$FRAME(%sp),$fp 158___ 159$code.=<<___ if ($SIZE_T==4); 160 ldw `-$FRAME_MARKER-4`($fp),$n0 161 ldw `-$FRAME_MARKER-8`($fp),$num 162 nop 163 nop ; alignment 164___ 165$code.=<<___ if ($BN_SZ==4); 166 comiclr,<= 6,$num,%r0 ; are vectors long enough? 167 b L\$abort 168 ldi 0,%r28 ; signal "unhandled" 169 add,ev %r0,$num,$num ; is $num even? 170 b L\$abort 171 nop 172 or $ap,$np,$ti1 173 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 174 b L\$abort 175 nop 176 nop ; alignment 177 nop 178 179 fldws 0($n0),${fn0} 180 fldws,ma 4($bp),${fbi} ; bp[0] 181___ 182$code.=<<___ if ($BN_SZ==8); 183 comib,> 3,$num,L\$abort ; are vectors long enough? 184 ldi 0,%r28 ; signal "unhandled" 185 addl $num,$num,$num ; I operate on 32-bit values 186 187 fldws 4($n0),${fn0} ; only low part of n0 188 fldws 4($bp),${fbi} ; bp[0] in flipped word order 189___ 190$code.=<<___; 191 fldds 0($ap),${fai} ; ap[0,1] 192 fldds 0($np),${fni} ; np[0,1] 193 194 sh2addl $num,%r0,$arrsz 195 ldi 31,$hi0 196 ldo 36($arrsz),$hi1 ; space for tp[num+1] 197 andcm $hi1,$hi0,$hi1 ; align 198 addl $hi1,%sp,%sp 199 $PUSH $fp,-$SIZE_T(%sp) 200 201 ldo `$LOCALS+16`($fp),$xfer 202 ldo `$LOCALS+32+4`($fp),$tp 203 204 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 205 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 206 xmpyu ${fn0},${fab0}R,${fm0} 207 208 addl $arrsz,$ap,$ap ; point at the end 209 addl $arrsz,$np,$np 210 subi 0,$arrsz,$idx ; j=0 211 ldo 8($idx),$idx ; j++++ 212 213 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 214 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 215 fstds ${fab0},-16($xfer) 216 fstds ${fnm0},-8($xfer) 217 fstds ${fab1},0($xfer) 218 fstds ${fnm1},8($xfer) 219 flddx $idx($ap),${fai} ; ap[2,3] 220 flddx $idx($np),${fni} ; np[2,3] 221___ 222$code.=<<___ if ($BN_SZ==4); 223 mtctl $hi0,%cr11 ; $hi0 still holds 31 224 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 225 b L\$parisc11 226 nop 
227___ 228$code.=<<___; # PA-RISC 2.0 code-path 229 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 230 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 231 ldd -16($xfer),$ab0 232 fstds ${fab0},-16($xfer) 233 234 extrd,u $ab0,31,32,$hi0 235 extrd,u $ab0,63,32,$ab0 236 ldd -8($xfer),$nm0 237 fstds ${fnm0},-8($xfer) 238 ldo 8($idx),$idx ; j++++ 239 addl $ab0,$nm0,$nm0 ; low part is discarded 240 extrd,u $nm0,31,32,$hi1 241 242L\$1st 243 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 244 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 245 ldd 0($xfer),$ab1 246 fstds ${fab1},0($xfer) 247 addl $hi0,$ab1,$ab1 248 extrd,u $ab1,31,32,$hi0 249 ldd 8($xfer),$nm1 250 fstds ${fnm1},8($xfer) 251 extrd,u $ab1,63,32,$ab1 252 addl $hi1,$nm1,$nm1 253 flddx $idx($ap),${fai} ; ap[j,j+1] 254 flddx $idx($np),${fni} ; np[j,j+1] 255 addl $ab1,$nm1,$nm1 256 extrd,u $nm1,31,32,$hi1 257 258 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 259 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 260 ldd -16($xfer),$ab0 261 fstds ${fab0},-16($xfer) 262 addl $hi0,$ab0,$ab0 263 extrd,u $ab0,31,32,$hi0 264 ldd -8($xfer),$nm0 265 fstds ${fnm0},-8($xfer) 266 extrd,u $ab0,63,32,$ab0 267 addl $hi1,$nm0,$nm0 268 stw $nm1,-4($tp) ; tp[j-1] 269 addl $ab0,$nm0,$nm0 270 stw,ma $nm0,8($tp) ; tp[j-1] 271 addib,<> 8,$idx,L\$1st ; j++++ 272 extrd,u $nm0,31,32,$hi1 273 274 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 275 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 276 ldd 0($xfer),$ab1 277 fstds ${fab1},0($xfer) 278 addl $hi0,$ab1,$ab1 279 extrd,u $ab1,31,32,$hi0 280 ldd 8($xfer),$nm1 281 fstds ${fnm1},8($xfer) 282 extrd,u $ab1,63,32,$ab1 283 addl $hi1,$nm1,$nm1 284 ldd -16($xfer),$ab0 285 addl $ab1,$nm1,$nm1 286 ldd -8($xfer),$nm0 287 extrd,u $nm1,31,32,$hi1 288 289 addl $hi0,$ab0,$ab0 290 extrd,u $ab0,31,32,$hi0 291 stw $nm1,-4($tp) ; tp[j-1] 292 extrd,u $ab0,63,32,$ab0 293 addl $hi1,$nm0,$nm0 294 ldd 0($xfer),$ab1 295 addl $ab0,$nm0,$nm0 296 ldd,mb 8($xfer),$nm1 297 extrd,u $nm0,31,32,$hi1 298 stw,ma $nm0,8($tp) ; tp[j-1] 299 300 ldo -1($num),$num ; 
i-- 301 subi 0,$arrsz,$idx ; j=0 302___ 303$code.=<<___ if ($BN_SZ==4); 304 fldws,ma 4($bp),${fbi} ; bp[1] 305___ 306$code.=<<___ if ($BN_SZ==8); 307 fldws 0($bp),${fbi} ; bp[1] in flipped word order 308___ 309$code.=<<___; 310 flddx $idx($ap),${fai} ; ap[0,1] 311 flddx $idx($np),${fni} ; np[0,1] 312 fldws 8($xfer),${fti}R ; tp[0] 313 addl $hi0,$ab1,$ab1 314 extrd,u $ab1,31,32,$hi0 315 extrd,u $ab1,63,32,$ab1 316 ldo 8($idx),$idx ; j++++ 317 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 318 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 319 addl $hi1,$nm1,$nm1 320 addl $ab1,$nm1,$nm1 321 extrd,u $nm1,31,32,$hi1 322 fstws,mb ${fab0}L,-8($xfer) ; save high part 323 stw $nm1,-4($tp) ; tp[j-1] 324 325 fcpy,sgl %fr0,${fti}L ; zero high part 326 fcpy,sgl %fr0,${fab0}L 327 addl $hi1,$hi0,$hi0 328 extrd,u $hi0,31,32,$hi1 329 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 330 fcnvxf,dbl,dbl ${fab0},${fab0} 331 stw $hi0,0($tp) 332 stw $hi1,4($tp) 333 334 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 335 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 336 xmpyu ${fn0},${fab0}R,${fm0} 337 ldo `$LOCALS+32+4`($fp),$tp 338L\$outer 339 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 340 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 341 fstds ${fab0},-16($xfer) ; 33-bit value 342 fstds ${fnm0},-8($xfer) 343 flddx $idx($ap),${fai} ; ap[2] 344 flddx $idx($np),${fni} ; np[2] 345 ldo 8($idx),$idx ; j++++ 346 ldd -16($xfer),$ab0 ; 33-bit value 347 ldd -8($xfer),$nm0 348 ldw 0($xfer),$hi0 ; high part 349 350 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 351 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 352 extrd,u $ab0,31,32,$ti0 ; carry bit 353 extrd,u $ab0,63,32,$ab0 354 fstds ${fab1},0($xfer) 355 addl $ti0,$hi0,$hi0 ; account carry bit 356 fstds ${fnm1},8($xfer) 357 addl $ab0,$nm0,$nm0 ; low part is discarded 358 ldw 0($tp),$ti1 ; tp[1] 359 extrd,u $nm0,31,32,$hi1 360 fstds ${fab0},-16($xfer) 361 fstds ${fnm0},-8($xfer) 362 363L\$inner 364 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 365 
xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 366 ldd 0($xfer),$ab1 367 fstds ${fab1},0($xfer) 368 addl $hi0,$ti1,$ti1 369 addl $ti1,$ab1,$ab1 370 ldd 8($xfer),$nm1 371 fstds ${fnm1},8($xfer) 372 extrd,u $ab1,31,32,$hi0 373 extrd,u $ab1,63,32,$ab1 374 flddx $idx($ap),${fai} ; ap[j,j+1] 375 flddx $idx($np),${fni} ; np[j,j+1] 376 addl $hi1,$nm1,$nm1 377 addl $ab1,$nm1,$nm1 378 ldw 4($tp),$ti0 ; tp[j] 379 stw $nm1,-4($tp) ; tp[j-1] 380 381 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 382 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 383 ldd -16($xfer),$ab0 384 fstds ${fab0},-16($xfer) 385 addl $hi0,$ti0,$ti0 386 addl $ti0,$ab0,$ab0 387 ldd -8($xfer),$nm0 388 fstds ${fnm0},-8($xfer) 389 extrd,u $ab0,31,32,$hi0 390 extrd,u $nm1,31,32,$hi1 391 ldw 8($tp),$ti1 ; tp[j] 392 extrd,u $ab0,63,32,$ab0 393 addl $hi1,$nm0,$nm0 394 addl $ab0,$nm0,$nm0 395 stw,ma $nm0,8($tp) ; tp[j-1] 396 addib,<> 8,$idx,L\$inner ; j++++ 397 extrd,u $nm0,31,32,$hi1 398 399 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 400 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 401 ldd 0($xfer),$ab1 402 fstds ${fab1},0($xfer) 403 addl $hi0,$ti1,$ti1 404 addl $ti1,$ab1,$ab1 405 ldd 8($xfer),$nm1 406 fstds ${fnm1},8($xfer) 407 extrd,u $ab1,31,32,$hi0 408 extrd,u $ab1,63,32,$ab1 409 ldw 4($tp),$ti0 ; tp[j] 410 addl $hi1,$nm1,$nm1 411 addl $ab1,$nm1,$nm1 412 ldd -16($xfer),$ab0 413 ldd -8($xfer),$nm0 414 extrd,u $nm1,31,32,$hi1 415 416 addl $hi0,$ab0,$ab0 417 addl $ti0,$ab0,$ab0 418 stw $nm1,-4($tp) ; tp[j-1] 419 extrd,u $ab0,31,32,$hi0 420 ldw 8($tp),$ti1 ; tp[j] 421 extrd,u $ab0,63,32,$ab0 422 addl $hi1,$nm0,$nm0 423 ldd 0($xfer),$ab1 424 addl $ab0,$nm0,$nm0 425 ldd,mb 8($xfer),$nm1 426 extrd,u $nm0,31,32,$hi1 427 stw,ma $nm0,8($tp) ; tp[j-1] 428 429 addib,= -1,$num,L\$outerdone ; i-- 430 subi 0,$arrsz,$idx ; j=0 431___ 432$code.=<<___ if ($BN_SZ==4); 433 fldws,ma 4($bp),${fbi} ; bp[i] 434___ 435$code.=<<___ if ($BN_SZ==8); 436 ldi 12,$ti0 ; bp[i] in flipped word order 437 addl,ev %r0,$num,$num 438 ldi -4,$ti0 439 addl $ti0,$bp,$bp 
440 fldws 0($bp),${fbi} 441___ 442$code.=<<___; 443 flddx $idx($ap),${fai} ; ap[0] 444 addl $hi0,$ab1,$ab1 445 flddx $idx($np),${fni} ; np[0] 446 fldws 8($xfer),${fti}R ; tp[0] 447 addl $ti1,$ab1,$ab1 448 extrd,u $ab1,31,32,$hi0 449 extrd,u $ab1,63,32,$ab1 450 451 ldo 8($idx),$idx ; j++++ 452 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 453 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 454 ldw 4($tp),$ti0 ; tp[j] 455 456 addl $hi1,$nm1,$nm1 457 fstws,mb ${fab0}L,-8($xfer) ; save high part 458 addl $ab1,$nm1,$nm1 459 extrd,u $nm1,31,32,$hi1 460 fcpy,sgl %fr0,${fti}L ; zero high part 461 fcpy,sgl %fr0,${fab0}L 462 stw $nm1,-4($tp) ; tp[j-1] 463 464 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 465 fcnvxf,dbl,dbl ${fab0},${fab0} 466 addl $hi1,$hi0,$hi0 467 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 468 addl $ti0,$hi0,$hi0 469 extrd,u $hi0,31,32,$hi1 470 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 471 stw $hi0,0($tp) 472 stw $hi1,4($tp) 473 xmpyu ${fn0},${fab0}R,${fm0} 474 475 b L\$outer 476 ldo `$LOCALS+32+4`($fp),$tp 477 478L\$outerdone 479 addl $hi0,$ab1,$ab1 480 addl $ti1,$ab1,$ab1 481 extrd,u $ab1,31,32,$hi0 482 extrd,u $ab1,63,32,$ab1 483 484 ldw 4($tp),$ti0 ; tp[j] 485 486 addl $hi1,$nm1,$nm1 487 addl $ab1,$nm1,$nm1 488 extrd,u $nm1,31,32,$hi1 489 stw $nm1,-4($tp) ; tp[j-1] 490 491 addl $hi1,$hi0,$hi0 492 addl $ti0,$hi0,$hi0 493 extrd,u $hi0,31,32,$hi1 494 stw $hi0,0($tp) 495 stw $hi1,4($tp) 496 497 ldo `$LOCALS+32`($fp),$tp 498 sub %r0,%r0,%r0 ; clear borrow 499___ 500$code.=<<___ if ($BN_SZ==4); 501 ldws,ma 4($tp),$ti0 502 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 
503 b L\$sub_pa11 504 addl $tp,$arrsz,$tp 505L\$sub 506 ldwx $idx($np),$hi0 507 subb $ti0,$hi0,$hi1 508 ldwx $idx($tp),$ti0 509 addib,<> 4,$idx,L\$sub 510 stws,ma $hi1,4($rp) 511 512 subb $ti0,%r0,$hi1 513 ldo -4($tp),$tp 514___ 515$code.=<<___ if ($BN_SZ==8); 516 ldd,ma 8($tp),$ti0 517L\$sub 518 ldd $idx($np),$hi0 519 shrpd $ti0,$ti0,32,$ti0 ; flip word order 520 std $ti0,-8($tp) ; save flipped value 521 sub,db $ti0,$hi0,$hi1 522 ldd,ma 8($tp),$ti0 523 addib,<> 8,$idx,L\$sub 524 std,ma $hi1,8($rp) 525 526 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 527 sub,db $ti0,%r0,$hi1 528 ldo -8($tp),$tp 529___ 530$code.=<<___; 531 and $tp,$hi1,$ap 532 andcm $rp,$hi1,$bp 533 or $ap,$bp,$np 534 535 sub $rp,$arrsz,$rp ; rewind rp 536 subi 0,$arrsz,$idx 537 ldo `$LOCALS+32`($fp),$tp 538L\$copy 539 ldd $idx($np),$hi0 540 std,ma %r0,8($tp) 541 addib,<> 8,$idx,.-8 ; L\$copy 542 std,ma $hi0,8($rp) 543___ 544 545if ($BN_SZ==4) { # PA-RISC 1.1 code-path 546$ablo=$ab0; 547$abhi=$ab1; 548$nmlo0=$nm0; 549$nmhi0=$nm1; 550$nmlo1="%r9"; 551$nmhi1="%r8"; 552 553$code.=<<___; 554 b L\$done 555 nop 556 557 .ALIGN 8 558L\$parisc11 559 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 560 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 561 ldw -12($xfer),$ablo 562 ldw -16($xfer),$hi0 563 ldw -4($xfer),$nmlo0 564 ldw -8($xfer),$nmhi0 565 fstds ${fab0},-16($xfer) 566 fstds ${fnm0},-8($xfer) 567 568 ldo 8($idx),$idx ; j++++ 569 add $ablo,$nmlo0,$nmlo0 ; discarded 570 addc %r0,$nmhi0,$hi1 571 ldw 4($xfer),$ablo 572 ldw 0($xfer),$abhi 573 nop 574 575L\$1st_pa11 576 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 577 flddx $idx($ap),${fai} ; ap[j,j+1] 578 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 579 flddx $idx($np),${fni} ; np[j,j+1] 580 add $hi0,$ablo,$ablo 581 ldw 12($xfer),$nmlo1 582 addc %r0,$abhi,$hi0 583 ldw 8($xfer),$nmhi1 584 add $ablo,$nmlo1,$nmlo1 585 fstds ${fab1},0($xfer) 586 addc %r0,$nmhi1,$nmhi1 587 fstds ${fnm1},8($xfer) 588 add $hi1,$nmlo1,$nmlo1 589 ldw -12($xfer),$ablo 590 addc 
%r0,$nmhi1,$hi1 591 ldw -16($xfer),$abhi 592 593 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 594 ldw -4($xfer),$nmlo0 595 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 596 ldw -8($xfer),$nmhi0 597 add $hi0,$ablo,$ablo 598 stw $nmlo1,-4($tp) ; tp[j-1] 599 addc %r0,$abhi,$hi0 600 fstds ${fab0},-16($xfer) 601 add $ablo,$nmlo0,$nmlo0 602 fstds ${fnm0},-8($xfer) 603 addc %r0,$nmhi0,$nmhi0 604 ldw 0($xfer),$abhi 605 add $hi1,$nmlo0,$nmlo0 606 ldw 4($xfer),$ablo 607 stws,ma $nmlo0,8($tp) ; tp[j-1] 608 addib,<> 8,$idx,L\$1st_pa11 ; j++++ 609 addc %r0,$nmhi0,$hi1 610 611 ldw 8($xfer),$nmhi1 612 ldw 12($xfer),$nmlo1 613 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 614 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 615 add $hi0,$ablo,$ablo 616 fstds ${fab1},0($xfer) 617 addc %r0,$abhi,$hi0 618 fstds ${fnm1},8($xfer) 619 add $ablo,$nmlo1,$nmlo1 620 ldw -16($xfer),$abhi 621 addc %r0,$nmhi1,$nmhi1 622 ldw -12($xfer),$ablo 623 add $hi1,$nmlo1,$nmlo1 624 ldw -8($xfer),$nmhi0 625 addc %r0,$nmhi1,$hi1 626 ldw -4($xfer),$nmlo0 627 628 add $hi0,$ablo,$ablo 629 stw $nmlo1,-4($tp) ; tp[j-1] 630 addc %r0,$abhi,$hi0 631 ldw 0($xfer),$abhi 632 add $ablo,$nmlo0,$nmlo0 633 ldw 4($xfer),$ablo 634 addc %r0,$nmhi0,$nmhi0 635 ldws,mb 8($xfer),$nmhi1 636 add $hi1,$nmlo0,$nmlo0 637 ldw 4($xfer),$nmlo1 638 addc %r0,$nmhi0,$hi1 639 stws,ma $nmlo0,8($tp) ; tp[j-1] 640 641 ldo -1($num),$num ; i-- 642 subi 0,$arrsz,$idx ; j=0 643 644 fldws,ma 4($bp),${fbi} ; bp[1] 645 flddx $idx($ap),${fai} ; ap[0,1] 646 flddx $idx($np),${fni} ; np[0,1] 647 fldws 8($xfer),${fti}R ; tp[0] 648 add $hi0,$ablo,$ablo 649 addc %r0,$abhi,$hi0 650 ldo 8($idx),$idx ; j++++ 651 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 652 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 653 add $hi1,$nmlo1,$nmlo1 654 addc %r0,$nmhi1,$nmhi1 655 add $ablo,$nmlo1,$nmlo1 656 addc %r0,$nmhi1,$hi1 657 fstws,mb ${fab0}L,-8($xfer) ; save high part 658 stw $nmlo1,-4($tp) ; tp[j-1] 659 660 fcpy,sgl %fr0,${fti}L ; zero high part 661 fcpy,sgl %fr0,${fab0}L 662 add 
$hi1,$hi0,$hi0 663 addc %r0,%r0,$hi1 664 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 665 fcnvxf,dbl,dbl ${fab0},${fab0} 666 stw $hi0,0($tp) 667 stw $hi1,4($tp) 668 669 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 670 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 671 xmpyu ${fn0},${fab0}R,${fm0} 672 ldo `$LOCALS+32+4`($fp),$tp 673L\$outer_pa11 674 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 675 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 676 fstds ${fab0},-16($xfer) ; 33-bit value 677 fstds ${fnm0},-8($xfer) 678 flddx $idx($ap),${fai} ; ap[2,3] 679 flddx $idx($np),${fni} ; np[2,3] 680 ldw -16($xfer),$abhi ; carry bit actually 681 ldo 8($idx),$idx ; j++++ 682 ldw -12($xfer),$ablo 683 ldw -8($xfer),$nmhi0 684 ldw -4($xfer),$nmlo0 685 ldw 0($xfer),$hi0 ; high part 686 687 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 688 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 689 fstds ${fab1},0($xfer) 690 addl $abhi,$hi0,$hi0 ; account carry bit 691 fstds ${fnm1},8($xfer) 692 add $ablo,$nmlo0,$nmlo0 ; discarded 693 ldw 0($tp),$ti1 ; tp[1] 694 addc %r0,$nmhi0,$hi1 695 fstds ${fab0},-16($xfer) 696 fstds ${fnm0},-8($xfer) 697 ldw 4($xfer),$ablo 698 ldw 0($xfer),$abhi 699 700L\$inner_pa11 701 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 702 flddx $idx($ap),${fai} ; ap[j,j+1] 703 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 704 flddx $idx($np),${fni} ; np[j,j+1] 705 add $hi0,$ablo,$ablo 706 ldw 4($tp),$ti0 ; tp[j] 707 addc %r0,$abhi,$abhi 708 ldw 12($xfer),$nmlo1 709 add $ti1,$ablo,$ablo 710 ldw 8($xfer),$nmhi1 711 addc %r0,$abhi,$hi0 712 fstds ${fab1},0($xfer) 713 add $ablo,$nmlo1,$nmlo1 714 fstds ${fnm1},8($xfer) 715 addc %r0,$nmhi1,$nmhi1 716 ldw -12($xfer),$ablo 717 add $hi1,$nmlo1,$nmlo1 718 ldw -16($xfer),$abhi 719 addc %r0,$nmhi1,$hi1 720 721 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 722 ldw 8($tp),$ti1 ; tp[j] 723 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 724 ldw -4($xfer),$nmlo0 725 add $hi0,$ablo,$ablo 726 ldw -8($xfer),$nmhi0 727 addc %r0,$abhi,$abhi 728 stw 
$nmlo1,-4($tp) ; tp[j-1] 729 add $ti0,$ablo,$ablo 730 fstds ${fab0},-16($xfer) 731 addc %r0,$abhi,$hi0 732 fstds ${fnm0},-8($xfer) 733 add $ablo,$nmlo0,$nmlo0 734 ldw 4($xfer),$ablo 735 addc %r0,$nmhi0,$nmhi0 736 ldw 0($xfer),$abhi 737 add $hi1,$nmlo0,$nmlo0 738 stws,ma $nmlo0,8($tp) ; tp[j-1] 739 addib,<> 8,$idx,L\$inner_pa11 ; j++++ 740 addc %r0,$nmhi0,$hi1 741 742 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 743 ldw 12($xfer),$nmlo1 744 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 745 ldw 8($xfer),$nmhi1 746 add $hi0,$ablo,$ablo 747 ldw 4($tp),$ti0 ; tp[j] 748 addc %r0,$abhi,$abhi 749 fstds ${fab1},0($xfer) 750 add $ti1,$ablo,$ablo 751 fstds ${fnm1},8($xfer) 752 addc %r0,$abhi,$hi0 753 ldw -16($xfer),$abhi 754 add $ablo,$nmlo1,$nmlo1 755 ldw -12($xfer),$ablo 756 addc %r0,$nmhi1,$nmhi1 757 ldw -8($xfer),$nmhi0 758 add $hi1,$nmlo1,$nmlo1 759 ldw -4($xfer),$nmlo0 760 addc %r0,$nmhi1,$hi1 761 762 add $hi0,$ablo,$ablo 763 stw $nmlo1,-4($tp) ; tp[j-1] 764 addc %r0,$abhi,$abhi 765 add $ti0,$ablo,$ablo 766 ldw 8($tp),$ti1 ; tp[j] 767 addc %r0,$abhi,$hi0 768 ldw 0($xfer),$abhi 769 add $ablo,$nmlo0,$nmlo0 770 ldw 4($xfer),$ablo 771 addc %r0,$nmhi0,$nmhi0 772 ldws,mb 8($xfer),$nmhi1 773 add $hi1,$nmlo0,$nmlo0 774 ldw 4($xfer),$nmlo1 775 addc %r0,$nmhi0,$hi1 776 stws,ma $nmlo0,8($tp) ; tp[j-1] 777 778 addib,= -1,$num,L\$outerdone_pa11; i-- 779 subi 0,$arrsz,$idx ; j=0 780 781 fldws,ma 4($bp),${fbi} ; bp[i] 782 flddx $idx($ap),${fai} ; ap[0] 783 add $hi0,$ablo,$ablo 784 addc %r0,$abhi,$abhi 785 flddx $idx($np),${fni} ; np[0] 786 fldws 8($xfer),${fti}R ; tp[0] 787 add $ti1,$ablo,$ablo 788 addc %r0,$abhi,$hi0 789 790 ldo 8($idx),$idx ; j++++ 791 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 792 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 793 ldw 4($tp),$ti0 ; tp[j] 794 795 add $hi1,$nmlo1,$nmlo1 796 addc %r0,$nmhi1,$nmhi1 797 fstws,mb ${fab0}L,-8($xfer) ; save high part 798 add $ablo,$nmlo1,$nmlo1 799 addc %r0,$nmhi1,$hi1 800 fcpy,sgl %fr0,${fti}L ; zero high part 801 fcpy,sgl 
%fr0,${fab0}L 802 stw $nmlo1,-4($tp) ; tp[j-1] 803 804 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 805 fcnvxf,dbl,dbl ${fab0},${fab0} 806 add $hi1,$hi0,$hi0 807 addc %r0,%r0,$hi1 808 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 809 add $ti0,$hi0,$hi0 810 addc %r0,$hi1,$hi1 811 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 812 stw $hi0,0($tp) 813 stw $hi1,4($tp) 814 xmpyu ${fn0},${fab0}R,${fm0} 815 816 b L\$outer_pa11 817 ldo `$LOCALS+32+4`($fp),$tp 818 819L\$outerdone_pa11 820 add $hi0,$ablo,$ablo 821 addc %r0,$abhi,$abhi 822 add $ti1,$ablo,$ablo 823 addc %r0,$abhi,$hi0 824 825 ldw 4($tp),$ti0 ; tp[j] 826 827 add $hi1,$nmlo1,$nmlo1 828 addc %r0,$nmhi1,$nmhi1 829 add $ablo,$nmlo1,$nmlo1 830 addc %r0,$nmhi1,$hi1 831 stw $nmlo1,-4($tp) ; tp[j-1] 832 833 add $hi1,$hi0,$hi0 834 addc %r0,%r0,$hi1 835 add $ti0,$hi0,$hi0 836 addc %r0,$hi1,$hi1 837 stw $hi0,0($tp) 838 stw $hi1,4($tp) 839 840 ldo `$LOCALS+32+4`($fp),$tp 841 sub %r0,%r0,%r0 ; clear borrow 842 ldw -4($tp),$ti0 843 addl $tp,$arrsz,$tp 844L\$sub_pa11 845 ldwx $idx($np),$hi0 846 subb $ti0,$hi0,$hi1 847 ldwx $idx($tp),$ti0 848 addib,<> 4,$idx,L\$sub_pa11 849 stws,ma $hi1,4($rp) 850 851 subb $ti0,%r0,$hi1 852 ldo -4($tp),$tp 853 and $tp,$hi1,$ap 854 andcm $rp,$hi1,$bp 855 or $ap,$bp,$np 856 857 sub $rp,$arrsz,$rp ; rewind rp 858 subi 0,$arrsz,$idx 859 ldo `$LOCALS+32`($fp),$tp 860L\$copy_pa11 861 ldwx $idx($np),$hi0 862 stws,ma %r0,4($tp) 863 addib,<> 4,$idx,L\$copy_pa11 864 stws,ma $hi0,4($rp) 865 866 nop ; alignment 867L\$done 868___ 869} 870 871$code.=<<___; 872 ldi 1,%r28 ; signal "handled" 873 ldo $FRAME($fp),%sp ; destroy tp[num+1] 874 875 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 876 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 877 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 878 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 879 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 880 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 881 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 882 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 883L\$abort 884 bv 
(%r2) 885 .EXIT 886 $POPMB -$FRAME(%sp),%r3 887 .PROCEND 888 .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" 889___ 890 891# Explicitly encode PA-RISC 2.0 instructions used in this module, so 892# that it can be compiled with .LEVEL 1.0. It should be noted that I 893# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 894# directive... 895 896my $ldd = sub { 897 my ($mod,$args) = @_; 898 my $orig = "ldd$mod\t$args"; 899 900 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 901 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; 902 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 903 } 904 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 905 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; 906 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset 907 $opcode|=(1<<5) if ($mod =~ /^,m/); 908 $opcode|=(1<<13) if ($mod =~ /^,mb/); 909 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 910 } 911 else { "\t".$orig; } 912}; 913 914my $std = sub { 915 my ($mod,$args) = @_; 916 my $orig = "std$mod\t$args"; 917 918 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 919 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); 920 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset 921 $opcode|=(1<<5) if ($mod =~ /^,m/); 922 $opcode|=(1<<13) if ($mod =~ /^,mb/); 923 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 924 } 925 else { "\t".$orig; } 926}; 927 928my $extrd = sub { 929 my ($mod,$args) = @_; 930 my $orig = "extrd$mod\t$args"; 931 932 # I only have ",u" completer, it's implicitly encoded... 
933 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 934 { my $opcode=(0x36<<26)|($1<<21)|($4<<16); 935 my $len=32-$3; 936 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos 937 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len 938 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 939 } 940 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 941 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); 942 my $len=32-$2; 943 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len 944 $opcode |= (1<<13) if ($mod =~ /,\**=/); 945 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 946 } 947 else { "\t".$orig; } 948}; 949 950my $shrpd = sub { 951 my ($mod,$args) = @_; 952 my $orig = "shrpd$mod\t$args"; 953 954 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 955 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; 956 my $cpos=63-$3; 957 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa 958 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 959 } 960 else { "\t".$orig; } 961}; 962 963my $sub = sub { 964 my ($mod,$args) = @_; 965 my $orig = "sub$mod\t$args"; 966 967 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { 968 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; 969 $opcode|=(1<<10); # e1 970 $opcode|=(1<<8); # e2 971 $opcode|=(1<<5); # d 972 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig 973 } 974 else { "\t".$orig; } 975}; 976 977sub assemble { 978 my ($mnemonic,$mod,$args)=@_; 979 my $opcode = eval("\$$mnemonic"); 980 981 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; 982} 983 984foreach (split("\n",$code)) { 985 s/\`([^\`]*)\`/eval $1/ge; 986 # flip word order in 64-bit mode... 987 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); 988 # assemble 2.0 instructions in 32-bit mode... 989 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); 990 991 print $_,"\n"; 992} 993close STDOUT; 994