1#!/usr/bin/env perl 2 3# ==================================================================== 4# Written by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL 5# project. The module is, however, dual licensed under OpenSSL and 6# CRYPTOGAMS licenses depending on where you obtain it. For further 7# details see http://www.openssl.org/~appro/cryptogams/. 8# ==================================================================== 9 10# On PA-7100LC this module performs ~90-50% better, less for longer 11# keys, than code generated by gcc 3.2 for PA-RISC 1.1. Latter means 12# that compiler utilized xmpyu instruction to perform 32x32=64-bit 13# multiplication, which in turn means that "baseline" performance was 14# optimal in respect to instruction set capabilities. Fair comparison 15# with vendor compiler is problematic, because OpenSSL doesn't define 16# BN_LLONG [presumably] for historical reasons, which drives compiler 17# toward 4 times 16x16=32-bit multiplicatons [plus complementary 18# shifts and additions] instead. This means that you should observe 19# several times improvement over code generated by vendor compiler 20# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual 21# improvement coefficient was never collected on PA-7100LC, or any 22# other 1.1 CPU, because I don't have access to such machine with 23# vendor compiler. But to give you a taste, PA-RISC 1.1 code path 24# reportedly outperformed code generated by cc +DA1.1 +O3 by factor 25# of ~5x on PA-8600. 26# 27# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is 28# reportedly ~2x faster than vendor compiler generated code [according 29# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of 30# this implementation is actually 32-bit one, in the sense that it 31# operates on 32-bit values. But pa-risc2[W].s operates on arrays of 32# 64-bit BN_LONGs... How do they interoperate then? No problem. This 33# module picks halves of 64-bit values in reverse order and pretends 34# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure" 35# 64-bit code such as pa-risc2[W].s then? Well, the thing is that 36# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do, 37# i.e. there is no "wider" multiplication like on most other 64-bit 38# platforms. This means that even being effectively 32-bit, this 39# implementation performs "64-bit" computational task in same amount 40# of arithmetic operations, most notably multiplications. It requires 41# more memory references, most notably to tp[num], but this doesn't 42# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC 43# 2.0 code path provides virtually same performance as pa-risc2[W].s: 44# it's ~10% better for shortest key length and ~10% worse for longest 45# one. 46# 47# In case it wasn't clear. The module has two distinct code paths: 48# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit 49# additions and 64-bit integer loads, not to mention specific 50# instruction scheduling. In 64-bit build naturally only 2.0 code path 51# is assembled. In 32-bit application context both code paths are 52# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path 53# is taken automatically. Also, in 32-bit build the module imposes 54# couple of limitations: vector lengths has to be even and vector 55# addresses has to be 64-bit aligned. Normally neither is a problem: 56# most common key lengths are even and vectors are commonly malloc-ed, 57# which ensures alignment. 58# 59# Special thanks to polarhome.com for providing HP-UX account on 60# PA-RISC 1.1 machine, and to correspondent who chose to remain 61# anonymous for testing the code on PA-RISC 2.0 machine. 62 63$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; 64 65$flavour = shift; 66$output = shift; 67 68open STDOUT,">$output"; 69 70if ($flavour =~ /64/) { 71 $LEVEL ="2.0W"; 72 $SIZE_T =8; 73 $FRAME_MARKER =80; 74 $SAVED_RP =16; 75 $PUSH ="std"; 76 $PUSHMA ="std,ma"; 77 $POP ="ldd"; 78 $POPMB ="ldd,mb"; 79 $BN_SZ =$SIZE_T; 80} else { 81 $LEVEL ="1.1"; #$LEVEL.="\n\t.ALLOW\t2.0"; 82 $SIZE_T =4; 83 $FRAME_MARKER =48; 84 $SAVED_RP =20; 85 $PUSH ="stw"; 86 $PUSHMA ="stwm"; 87 $POP ="ldw"; 88 $POPMB ="ldwm"; 89 $BN_SZ =$SIZE_T; 90} 91 92$FRAME=8*$SIZE_T+$FRAME_MARKER; # 8 saved regs + frame marker 93 # [+ argument transfer] 94$LOCALS=$FRAME-$FRAME_MARKER; 95$FRAME+=32; # local variables 96 97$tp="%r31"; 98$ti1="%r29"; 99$ti0="%r28"; 100 101$rp="%r26"; 102$ap="%r25"; 103$bp="%r24"; 104$np="%r23"; 105$n0="%r22"; # passed through stack in 32-bit 106$num="%r21"; # passed through stack in 32-bit 107$idx="%r20"; 108$arrsz="%r19"; 109 110$nm1="%r7"; 111$nm0="%r6"; 112$ab1="%r5"; 113$ab0="%r4"; 114 115$fp="%r3"; 116$hi1="%r2"; 117$hi0="%r1"; 118 119$xfer=$n0; # accomodates [-16..15] offset in fld[dw]s 120 121$fm0="%fr4"; $fti=$fm0; 122$fbi="%fr5L"; 123$fn0="%fr5R"; 124$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8"; 125$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11"; 126 127$code=<<___; 128 .LEVEL $LEVEL 129#if 0 130 .SPACE \$TEXT\$ 131 .SUBSPA \$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY 132#else 133 .text 134#endif 135 136 .EXPORT bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 137 .ALIGN 64 138bn_mul_mont 139 .PROC 140 .CALLINFO FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6 141 .ENTRY 142 $PUSH %r2,-$SAVED_RP(%sp) ; standard prologue 143 $PUSHMA %r3,$FRAME(%sp) 144 $PUSH %r4,`-$FRAME+1*$SIZE_T`(%sp) 145 $PUSH %r5,`-$FRAME+2*$SIZE_T`(%sp) 146 $PUSH %r6,`-$FRAME+3*$SIZE_T`(%sp) 147 $PUSH %r7,`-$FRAME+4*$SIZE_T`(%sp) 148 $PUSH %r8,`-$FRAME+5*$SIZE_T`(%sp) 149 $PUSH %r9,`-$FRAME+6*$SIZE_T`(%sp) 150 $PUSH %r10,`-$FRAME+7*$SIZE_T`(%sp) 151 ldo -$FRAME(%sp),$fp 152___ 153$code.=<<___ if ($SIZE_T==4); 154 ldw `-$FRAME_MARKER-4`($fp),$n0 155 ldw `-$FRAME_MARKER-8`($fp),$num 156 nop 157 nop ; alignment 158___ 159$code.=<<___ if ($BN_SZ==4); 160 comiclr,<= 6,$num,%r0 ; are vectors long enough? 161 b L\$abort 162 ldi 0,%r28 ; signal "unhandled" 163 add,ev %r0,$num,$num ; is $num even? 164 b L\$abort 165 nop 166 or $ap,$np,$ti1 167 extru,= $ti1,31,3,%r0 ; are ap and np 64-bit aligned? 168 b L\$abort 169 nop 170 nop ; alignment 171 nop 172 173 fldws 0($n0),${fn0} 174 fldws,ma 4($bp),${fbi} ; bp[0] 175___ 176$code.=<<___ if ($BN_SZ==8); 177 comib,> 3,$num,L\$abort ; are vectors long enough? 178 ldi 0,%r28 ; signal "unhandled" 179 addl $num,$num,$num ; I operate on 32-bit values 180 181 fldws 4($n0),${fn0} ; only low part of n0 182 fldws 4($bp),${fbi} ; bp[0] in flipped word order 183___ 184$code.=<<___; 185 fldds 0($ap),${fai} ; ap[0,1] 186 fldds 0($np),${fni} ; np[0,1] 187 188 sh2addl $num,%r0,$arrsz 189 ldi 31,$hi0 190 ldo 36($arrsz),$hi1 ; space for tp[num+1] 191 andcm $hi1,$hi0,$hi1 ; align 192 addl $hi1,%sp,%sp 193 $PUSH $fp,-$SIZE_T(%sp) 194 195 ldo `$LOCALS+16`($fp),$xfer 196 ldo `$LOCALS+32+4`($fp),$tp 197 198 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[0] 199 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[0] 200 xmpyu ${fn0},${fab0}R,${fm0} 201 202 addl $arrsz,$ap,$ap ; point at the end 203 addl $arrsz,$np,$np 204 subi 0,$arrsz,$idx ; j=0 205 ldo 8($idx),$idx ; j++++ 206 207 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 208 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 209 fstds ${fab0},-16($xfer) 210 fstds ${fnm0},-8($xfer) 211 fstds ${fab1},0($xfer) 212 fstds ${fnm1},8($xfer) 213 flddx $idx($ap),${fai} ; ap[2,3] 214 flddx $idx($np),${fni} ; np[2,3] 215___ 216$code.=<<___ if ($BN_SZ==4); 217#ifndef __OpenBSD__ 218 mtctl $hi0,%cr11 ; $hi0 still holds 31 219 extrd,u,*= $hi0,%sar,1,$hi0 ; executes on PA-RISC 1.0 220 b L\$parisc11 221 nop 222___ 223$code.=<<___; # PA-RISC 2.0 code-path 224 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 225 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 226 ldd -16($xfer),$ab0 227 fstds ${fab0},-16($xfer) 228 229 extrd,u $ab0,31,32,$hi0 230 extrd,u $ab0,63,32,$ab0 231 ldd -8($xfer),$nm0 232 fstds ${fnm0},-8($xfer) 233 ldo 8($idx),$idx ; j++++ 234 addl $ab0,$nm0,$nm0 ; low part is discarded 235 extrd,u $nm0,31,32,$hi1 236 237L\$1st 238 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 239 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 240 ldd 0($xfer),$ab1 241 fstds ${fab1},0($xfer) 242 addl $hi0,$ab1,$ab1 243 extrd,u $ab1,31,32,$hi0 244 ldd 8($xfer),$nm1 245 fstds ${fnm1},8($xfer) 246 extrd,u $ab1,63,32,$ab1 247 addl $hi1,$nm1,$nm1 248 flddx $idx($ap),${fai} ; ap[j,j+1] 249 flddx $idx($np),${fni} ; np[j,j+1] 250 addl $ab1,$nm1,$nm1 251 extrd,u $nm1,31,32,$hi1 252 253 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 254 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 255 ldd -16($xfer),$ab0 256 fstds ${fab0},-16($xfer) 257 addl $hi0,$ab0,$ab0 258 extrd,u $ab0,31,32,$hi0 259 ldd -8($xfer),$nm0 260 fstds ${fnm0},-8($xfer) 261 extrd,u $ab0,63,32,$ab0 262 addl $hi1,$nm0,$nm0 263 stw $nm1,-4($tp) ; tp[j-1] 264 addl $ab0,$nm0,$nm0 265 stw,ma $nm0,8($tp) ; tp[j-1] 266 addib,<> 8,$idx,L\$1st ; j++++ 267 extrd,u $nm0,31,32,$hi1 268 269 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 270 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 271 ldd 0($xfer),$ab1 272 fstds ${fab1},0($xfer) 273 addl $hi0,$ab1,$ab1 274 extrd,u $ab1,31,32,$hi0 275 ldd 8($xfer),$nm1 276 fstds ${fnm1},8($xfer) 277 extrd,u $ab1,63,32,$ab1 278 addl $hi1,$nm1,$nm1 279 ldd -16($xfer),$ab0 280 addl $ab1,$nm1,$nm1 281 ldd -8($xfer),$nm0 282 extrd,u $nm1,31,32,$hi1 283 284 addl $hi0,$ab0,$ab0 285 extrd,u $ab0,31,32,$hi0 286 stw $nm1,-4($tp) ; tp[j-1] 287 extrd,u $ab0,63,32,$ab0 288 addl $hi1,$nm0,$nm0 289 ldd 0($xfer),$ab1 290 addl $ab0,$nm0,$nm0 291 ldd,mb 8($xfer),$nm1 292 extrd,u $nm0,31,32,$hi1 293 stw,ma $nm0,8($tp) ; tp[j-1] 294 295 ldo -1($num),$num ; i-- 296 subi 0,$arrsz,$idx ; j=0 297___ 298$code.=<<___ if ($BN_SZ==4); 299 fldws,ma 4($bp),${fbi} ; bp[1] 300___ 301$code.=<<___ if ($BN_SZ==8); 302 fldws 0($bp),${fbi} ; bp[1] in flipped word order 303___ 304$code.=<<___; 305 flddx $idx($ap),${fai} ; ap[0,1] 306 flddx $idx($np),${fni} ; np[0,1] 307 fldws 8($xfer),${fti}R ; tp[0] 308 addl $hi0,$ab1,$ab1 309 extrd,u $ab1,31,32,$hi0 310 extrd,u $ab1,63,32,$ab1 311 ldo 8($idx),$idx ; j++++ 312 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 313 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 314 addl $hi1,$nm1,$nm1 315 addl $ab1,$nm1,$nm1 316 extrd,u $nm1,31,32,$hi1 317 fstws,mb ${fab0}L,-8($xfer) ; save high part 318 stw $nm1,-4($tp) ; tp[j-1] 319 320 fcpy,sgl %fr0,${fti}L ; zero high part 321 fcpy,sgl %fr0,${fab0}L 322 addl $hi1,$hi0,$hi0 323 extrd,u $hi0,31,32,$hi1 324 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 325 fcnvxf,dbl,dbl ${fab0},${fab0} 326 stw $hi0,0($tp) 327 stw $hi1,4($tp) 328 329 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 330 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 331 xmpyu ${fn0},${fab0}R,${fm0} 332 ldo `$LOCALS+32+4`($fp),$tp 333L\$outer 334 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 335 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 336 fstds ${fab0},-16($xfer) ; 33-bit value 337 fstds ${fnm0},-8($xfer) 338 flddx $idx($ap),${fai} ; ap[2] 339 flddx $idx($np),${fni} ; np[2] 340 ldo 8($idx),$idx ; j++++ 341 ldd -16($xfer),$ab0 ; 33-bit value 342 ldd -8($xfer),$nm0 343 ldw 0($xfer),$hi0 ; high part 344 345 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 346 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 347 extrd,u $ab0,31,32,$ti0 ; carry bit 348 extrd,u $ab0,63,32,$ab0 349 fstds ${fab1},0($xfer) 350 addl $ti0,$hi0,$hi0 ; account carry bit 351 fstds ${fnm1},8($xfer) 352 addl $ab0,$nm0,$nm0 ; low part is discarded 353 ldw 0($tp),$ti1 ; tp[1] 354 extrd,u $nm0,31,32,$hi1 355 fstds ${fab0},-16($xfer) 356 fstds ${fnm0},-8($xfer) 357 358L\$inner 359 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 360 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 361 ldd 0($xfer),$ab1 362 fstds ${fab1},0($xfer) 363 addl $hi0,$ti1,$ti1 364 addl $ti1,$ab1,$ab1 365 ldd 8($xfer),$nm1 366 fstds ${fnm1},8($xfer) 367 extrd,u $ab1,31,32,$hi0 368 extrd,u $ab1,63,32,$ab1 369 flddx $idx($ap),${fai} ; ap[j,j+1] 370 flddx $idx($np),${fni} ; np[j,j+1] 371 addl $hi1,$nm1,$nm1 372 addl $ab1,$nm1,$nm1 373 ldw 4($tp),$ti0 ; tp[j] 374 stw $nm1,-4($tp) ; tp[j-1] 375 376 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 377 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 378 ldd -16($xfer),$ab0 379 fstds ${fab0},-16($xfer) 380 addl $hi0,$ti0,$ti0 381 addl $ti0,$ab0,$ab0 382 ldd -8($xfer),$nm0 383 fstds ${fnm0},-8($xfer) 384 extrd,u $ab0,31,32,$hi0 385 extrd,u $nm1,31,32,$hi1 386 ldw 8($tp),$ti1 ; tp[j] 387 extrd,u $ab0,63,32,$ab0 388 addl $hi1,$nm0,$nm0 389 addl $ab0,$nm0,$nm0 390 stw,ma $nm0,8($tp) ; tp[j-1] 391 addib,<> 8,$idx,L\$inner ; j++++ 392 extrd,u $nm0,31,32,$hi1 393 394 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 395 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 396 ldd 0($xfer),$ab1 397 fstds ${fab1},0($xfer) 398 addl $hi0,$ti1,$ti1 399 addl $ti1,$ab1,$ab1 400 ldd 8($xfer),$nm1 401 fstds ${fnm1},8($xfer) 402 extrd,u $ab1,31,32,$hi0 403 extrd,u $ab1,63,32,$ab1 404 ldw 4($tp),$ti0 ; tp[j] 405 addl $hi1,$nm1,$nm1 406 addl $ab1,$nm1,$nm1 407 ldd -16($xfer),$ab0 408 ldd -8($xfer),$nm0 409 extrd,u $nm1,31,32,$hi1 410 411 addl $hi0,$ab0,$ab0 412 addl $ti0,$ab0,$ab0 413 stw $nm1,-4($tp) ; tp[j-1] 414 extrd,u $ab0,31,32,$hi0 415 ldw 8($tp),$ti1 ; tp[j] 416 extrd,u $ab0,63,32,$ab0 417 addl $hi1,$nm0,$nm0 418 ldd 0($xfer),$ab1 419 addl $ab0,$nm0,$nm0 420 ldd,mb 8($xfer),$nm1 421 extrd,u $nm0,31,32,$hi1 422 stw,ma $nm0,8($tp) ; tp[j-1] 423 424 addib,= -1,$num,L\$outerdone ; i-- 425 subi 0,$arrsz,$idx ; j=0 426___ 427$code.=<<___ if ($BN_SZ==4); 428 fldws,ma 4($bp),${fbi} ; bp[i] 429___ 430$code.=<<___ if ($BN_SZ==8); 431 ldi 12,$ti0 ; bp[i] in flipped word order 432 addl,ev %r0,$num,$num 433 ldi -4,$ti0 434 addl $ti0,$bp,$bp 435 fldws 0($bp),${fbi} 436___ 437$code.=<<___; 438 flddx $idx($ap),${fai} ; ap[0] 439 addl $hi0,$ab1,$ab1 440 flddx $idx($np),${fni} ; np[0] 441 fldws 8($xfer),${fti}R ; tp[0] 442 addl $ti1,$ab1,$ab1 443 extrd,u $ab1,31,32,$hi0 444 extrd,u $ab1,63,32,$ab1 445 446 ldo 8($idx),$idx ; j++++ 447 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 448 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 449 ldw 4($tp),$ti0 ; tp[j] 450 451 addl $hi1,$nm1,$nm1 452 fstws,mb ${fab0}L,-8($xfer) ; save high part 453 addl $ab1,$nm1,$nm1 454 extrd,u $nm1,31,32,$hi1 455 fcpy,sgl %fr0,${fti}L ; zero high part 456 fcpy,sgl %fr0,${fab0}L 457 stw $nm1,-4($tp) ; tp[j-1] 458 459 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 460 fcnvxf,dbl,dbl ${fab0},${fab0} 461 addl $hi1,$hi0,$hi0 462 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 463 addl $ti0,$hi0,$hi0 464 extrd,u $hi0,31,32,$hi1 465 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 466 stw $hi0,0($tp) 467 stw $hi1,4($tp) 468 xmpyu ${fn0},${fab0}R,${fm0} 469 470 b L\$outer 471 ldo `$LOCALS+32+4`($fp),$tp 472 473L\$outerdone 474 addl $hi0,$ab1,$ab1 475 addl $ti1,$ab1,$ab1 476 extrd,u $ab1,31,32,$hi0 477 extrd,u $ab1,63,32,$ab1 478 479 ldw 4($tp),$ti0 ; tp[j] 480 481 addl $hi1,$nm1,$nm1 482 addl $ab1,$nm1,$nm1 483 extrd,u $nm1,31,32,$hi1 484 stw $nm1,-4($tp) ; tp[j-1] 485 486 addl $hi1,$hi0,$hi0 487 addl $ti0,$hi0,$hi0 488 extrd,u $hi0,31,32,$hi1 489 stw $hi0,0($tp) 490 stw $hi1,4($tp) 491 492 ldo `$LOCALS+32`($fp),$tp 493 sub %r0,%r0,%r0 ; clear borrow 494___ 495$code.=<<___ if ($BN_SZ==4); 496 ldws,ma 4($tp),$ti0 497 extru,= $rp,31,3,%r0 ; is rp 64-bit aligned? 498 b L\$sub_pa11 499 addl $tp,$arrsz,$tp 500L\$sub 501 ldwx $idx($np),$hi0 502 subb $ti0,$hi0,$hi1 503 ldwx $idx($tp),$ti0 504 addib,<> 4,$idx,L\$sub 505 stws,ma $hi1,4($rp) 506 507 subb $ti0,%r0,$hi1 508 ldo -4($tp),$tp 509___ 510$code.=<<___ if ($BN_SZ==8); 511 ldd,ma 8($tp),$ti0 512L\$sub 513 ldd $idx($np),$hi0 514 shrpd $ti0,$ti0,32,$ti0 ; flip word order 515 std $ti0,-8($tp) ; save flipped value 516 sub,db $ti0,$hi0,$hi1 517 ldd,ma 8($tp),$ti0 518 addib,<> 8,$idx,L\$sub 519 std,ma $hi1,8($rp) 520 521 extrd,u $ti0,31,32,$ti0 ; carry in flipped word order 522 sub,db $ti0,%r0,$hi1 523 ldo -8($tp),$tp 524___ 525$code.=<<___; 526 and $tp,$hi1,$ap 527 andcm $rp,$hi1,$bp 528 or $ap,$bp,$np 529 530 sub $rp,$arrsz,$rp ; rewind rp 531 subi 0,$arrsz,$idx 532 ldo `$LOCALS+32`($fp),$tp 533L\$copy 534 ldd $idx($np),$hi0 535 std,ma %r0,8($tp) 536 addib,<> 8,$idx,.-8 ; L\$copy 537 std,ma $hi0,8($rp) 538___ 539 540if ($BN_SZ==4) { # PA-RISC 1.1 code-path 541$ablo=$ab0; 542$abhi=$ab1; 543$nmlo0=$nm0; 544$nmhi0=$nm1; 545$nmlo1="%r9"; 546$nmhi1="%r8"; 547 548$code.=<<___; 549 b L\$done 550 nop 551 552 .ALIGN 8 553L\$parisc11 554#endif 555 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 556 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 557 ldw -12($xfer),$ablo 558 ldw -16($xfer),$hi0 559 ldw -4($xfer),$nmlo0 560 ldw -8($xfer),$nmhi0 561 fstds ${fab0},-16($xfer) 562 fstds ${fnm0},-8($xfer) 563 564 ldo 8($idx),$idx ; j++++ 565 add $ablo,$nmlo0,$nmlo0 ; discarded 566 addc %r0,$nmhi0,$hi1 567 ldw 4($xfer),$ablo 568 ldw 0($xfer),$abhi 569 nop 570 571L\$1st_pa11 572 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[0] 573 flddx $idx($ap),${fai} ; ap[j,j+1] 574 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 575 flddx $idx($np),${fni} ; np[j,j+1] 576 add $hi0,$ablo,$ablo 577 ldw 12($xfer),$nmlo1 578 addc %r0,$abhi,$hi0 579 ldw 8($xfer),$nmhi1 580 add $ablo,$nmlo1,$nmlo1 581 fstds ${fab1},0($xfer) 582 addc %r0,$nmhi1,$nmhi1 583 fstds ${fnm1},8($xfer) 584 add $hi1,$nmlo1,$nmlo1 585 ldw -12($xfer),$ablo 586 addc %r0,$nmhi1,$hi1 587 ldw -16($xfer),$abhi 588 589 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[0] 590 ldw -4($xfer),$nmlo0 591 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 592 ldw -8($xfer),$nmhi0 593 add $hi0,$ablo,$ablo 594 stw $nmlo1,-4($tp) ; tp[j-1] 595 addc %r0,$abhi,$hi0 596 fstds ${fab0},-16($xfer) 597 add $ablo,$nmlo0,$nmlo0 598 fstds ${fnm0},-8($xfer) 599 addc %r0,$nmhi0,$nmhi0 600 ldw 0($xfer),$abhi 601 add $hi1,$nmlo0,$nmlo0 602 ldw 4($xfer),$ablo 603 stws,ma $nmlo0,8($tp) ; tp[j-1] 604 addib,<> 8,$idx,L\$1st_pa11 ; j++++ 605 addc %r0,$nmhi0,$hi1 606 607 ldw 8($xfer),$nmhi1 608 ldw 12($xfer),$nmlo1 609 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[0] 610 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 611 add $hi0,$ablo,$ablo 612 fstds ${fab1},0($xfer) 613 addc %r0,$abhi,$hi0 614 fstds ${fnm1},8($xfer) 615 add $ablo,$nmlo1,$nmlo1 616 ldw -16($xfer),$abhi 617 addc %r0,$nmhi1,$nmhi1 618 ldw -12($xfer),$ablo 619 add $hi1,$nmlo1,$nmlo1 620 ldw -8($xfer),$nmhi0 621 addc %r0,$nmhi1,$hi1 622 ldw -4($xfer),$nmlo0 623 624 add $hi0,$ablo,$ablo 625 stw $nmlo1,-4($tp) ; tp[j-1] 626 addc %r0,$abhi,$hi0 627 ldw 0($xfer),$abhi 628 add $ablo,$nmlo0,$nmlo0 629 ldw 4($xfer),$ablo 630 addc %r0,$nmhi0,$nmhi0 631 ldws,mb 8($xfer),$nmhi1 632 add $hi1,$nmlo0,$nmlo0 633 ldw 4($xfer),$nmlo1 634 addc %r0,$nmhi0,$hi1 635 stws,ma $nmlo0,8($tp) ; tp[j-1] 636 637 ldo -1($num),$num ; i-- 638 subi 0,$arrsz,$idx ; j=0 639 640 fldws,ma 4($bp),${fbi} ; bp[1] 641 flddx $idx($ap),${fai} ; ap[0,1] 642 flddx $idx($np),${fni} ; np[0,1] 643 fldws 8($xfer),${fti}R ; tp[0] 644 add $hi0,$ablo,$ablo 645 addc %r0,$abhi,$hi0 646 ldo 8($idx),$idx ; j++++ 647 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[1] 648 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[1] 649 add $hi1,$nmlo1,$nmlo1 650 addc %r0,$nmhi1,$nmhi1 651 add $ablo,$nmlo1,$nmlo1 652 addc %r0,$nmhi1,$hi1 653 fstws,mb ${fab0}L,-8($xfer) ; save high part 654 stw $nmlo1,-4($tp) ; tp[j-1] 655 656 fcpy,sgl %fr0,${fti}L ; zero high part 657 fcpy,sgl %fr0,${fab0}L 658 add $hi1,$hi0,$hi0 659 addc %r0,%r0,$hi1 660 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 661 fcnvxf,dbl,dbl ${fab0},${fab0} 662 stw $hi0,0($tp) 663 stw $hi1,4($tp) 664 665 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 666 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 667 xmpyu ${fn0},${fab0}R,${fm0} 668 ldo `$LOCALS+32+4`($fp),$tp 669L\$outer_pa11 670 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[0]*m 671 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[1]*m 672 fstds ${fab0},-16($xfer) ; 33-bit value 673 fstds ${fnm0},-8($xfer) 674 flddx $idx($ap),${fai} ; ap[2,3] 675 flddx $idx($np),${fni} ; np[2,3] 676 ldw -16($xfer),$abhi ; carry bit actually 677 ldo 8($idx),$idx ; j++++ 678 ldw -12($xfer),$ablo 679 ldw -8($xfer),$nmhi0 680 ldw -4($xfer),$nmlo0 681 ldw 0($xfer),$hi0 ; high part 682 683 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 684 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 685 fstds ${fab1},0($xfer) 686 addl $abhi,$hi0,$hi0 ; account carry bit 687 fstds ${fnm1},8($xfer) 688 add $ablo,$nmlo0,$nmlo0 ; discarded 689 ldw 0($tp),$ti1 ; tp[1] 690 addc %r0,$nmhi0,$hi1 691 fstds ${fab0},-16($xfer) 692 fstds ${fnm0},-8($xfer) 693 ldw 4($xfer),$ablo 694 ldw 0($xfer),$abhi 695 696L\$inner_pa11 697 xmpyu ${fai}R,${fbi},${fab1} ; ap[j+1]*bp[i] 698 flddx $idx($ap),${fai} ; ap[j,j+1] 699 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j+1]*m 700 flddx $idx($np),${fni} ; np[j,j+1] 701 add $hi0,$ablo,$ablo 702 ldw 4($tp),$ti0 ; tp[j] 703 addc %r0,$abhi,$abhi 704 ldw 12($xfer),$nmlo1 705 add $ti1,$ablo,$ablo 706 ldw 8($xfer),$nmhi1 707 addc %r0,$abhi,$hi0 708 fstds ${fab1},0($xfer) 709 add $ablo,$nmlo1,$nmlo1 710 fstds ${fnm1},8($xfer) 711 addc %r0,$nmhi1,$nmhi1 712 ldw -12($xfer),$ablo 713 add $hi1,$nmlo1,$nmlo1 714 ldw -16($xfer),$abhi 715 addc %r0,$nmhi1,$hi1 716 717 xmpyu ${fai}L,${fbi},${fab0} ; ap[j]*bp[i] 718 ldw 8($tp),$ti1 ; tp[j] 719 xmpyu ${fni}L,${fm0}R,${fnm0} ; np[j]*m 720 ldw -4($xfer),$nmlo0 721 add $hi0,$ablo,$ablo 722 ldw -8($xfer),$nmhi0 723 addc %r0,$abhi,$abhi 724 stw $nmlo1,-4($tp) ; tp[j-1] 725 add $ti0,$ablo,$ablo 726 fstds ${fab0},-16($xfer) 727 addc %r0,$abhi,$hi0 728 fstds ${fnm0},-8($xfer) 729 add $ablo,$nmlo0,$nmlo0 730 ldw 4($xfer),$ablo 731 addc %r0,$nmhi0,$nmhi0 732 ldw 0($xfer),$abhi 733 add $hi1,$nmlo0,$nmlo0 734 stws,ma $nmlo0,8($tp) ; tp[j-1] 735 addib,<> 8,$idx,L\$inner_pa11 ; j++++ 736 addc %r0,$nmhi0,$hi1 737 738 xmpyu ${fai}R,${fbi},${fab1} ; ap[j]*bp[i] 739 ldw 12($xfer),$nmlo1 740 xmpyu ${fni}R,${fm0}R,${fnm1} ; np[j]*m 741 ldw 8($xfer),$nmhi1 742 add $hi0,$ablo,$ablo 743 ldw 4($tp),$ti0 ; tp[j] 744 addc %r0,$abhi,$abhi 745 fstds ${fab1},0($xfer) 746 add $ti1,$ablo,$ablo 747 fstds ${fnm1},8($xfer) 748 addc %r0,$abhi,$hi0 749 ldw -16($xfer),$abhi 750 add $ablo,$nmlo1,$nmlo1 751 ldw -12($xfer),$ablo 752 addc %r0,$nmhi1,$nmhi1 753 ldw -8($xfer),$nmhi0 754 add $hi1,$nmlo1,$nmlo1 755 ldw -4($xfer),$nmlo0 756 addc %r0,$nmhi1,$hi1 757 758 add $hi0,$ablo,$ablo 759 stw $nmlo1,-4($tp) ; tp[j-1] 760 addc %r0,$abhi,$abhi 761 add $ti0,$ablo,$ablo 762 ldw 8($tp),$ti1 ; tp[j] 763 addc %r0,$abhi,$hi0 764 ldw 0($xfer),$abhi 765 add $ablo,$nmlo0,$nmlo0 766 ldw 4($xfer),$ablo 767 addc %r0,$nmhi0,$nmhi0 768 ldws,mb 8($xfer),$nmhi1 769 add $hi1,$nmlo0,$nmlo0 770 ldw 4($xfer),$nmlo1 771 addc %r0,$nmhi0,$hi1 772 stws,ma $nmlo0,8($tp) ; tp[j-1] 773 774 addib,= -1,$num,L\$outerdone_pa11; i-- 775 subi 0,$arrsz,$idx ; j=0 776 777 fldws,ma 4($bp),${fbi} ; bp[i] 778 flddx $idx($ap),${fai} ; ap[0] 779 add $hi0,$ablo,$ablo 780 addc %r0,$abhi,$abhi 781 flddx $idx($np),${fni} ; np[0] 782 fldws 8($xfer),${fti}R ; tp[0] 783 add $ti1,$ablo,$ablo 784 addc %r0,$abhi,$hi0 785 786 ldo 8($idx),$idx ; j++++ 787 xmpyu ${fai}L,${fbi},${fab0} ; ap[0]*bp[i] 788 xmpyu ${fai}R,${fbi},${fab1} ; ap[1]*bp[i] 789 ldw 4($tp),$ti0 ; tp[j] 790 791 add $hi1,$nmlo1,$nmlo1 792 addc %r0,$nmhi1,$nmhi1 793 fstws,mb ${fab0}L,-8($xfer) ; save high part 794 add $ablo,$nmlo1,$nmlo1 795 addc %r0,$nmhi1,$hi1 796 fcpy,sgl %fr0,${fti}L ; zero high part 797 fcpy,sgl %fr0,${fab0}L 798 stw $nmlo1,-4($tp) ; tp[j-1] 799 800 fcnvxf,dbl,dbl ${fti},${fti} ; 32-bit unsigned int -> double 801 fcnvxf,dbl,dbl ${fab0},${fab0} 802 add $hi1,$hi0,$hi0 803 addc %r0,%r0,$hi1 804 fadd,dbl ${fti},${fab0},${fab0} ; add tp[0] 805 add $ti0,$hi0,$hi0 806 addc %r0,$hi1,$hi1 807 fcnvfx,dbl,dbl ${fab0},${fab0} ; double -> 33-bit unsigned int 808 stw $hi0,0($tp) 809 stw $hi1,4($tp) 810 xmpyu ${fn0},${fab0}R,${fm0} 811 812 b L\$outer_pa11 813 ldo `$LOCALS+32+4`($fp),$tp 814 815L\$outerdone_pa11 816 add $hi0,$ablo,$ablo 817 addc %r0,$abhi,$abhi 818 add $ti1,$ablo,$ablo 819 addc %r0,$abhi,$hi0 820 821 ldw 4($tp),$ti0 ; tp[j] 822 823 add $hi1,$nmlo1,$nmlo1 824 addc %r0,$nmhi1,$nmhi1 825 add $ablo,$nmlo1,$nmlo1 826 addc %r0,$nmhi1,$hi1 827 stw $nmlo1,-4($tp) ; tp[j-1] 828 829 add $hi1,$hi0,$hi0 830 addc %r0,%r0,$hi1 831 add $ti0,$hi0,$hi0 832 addc %r0,$hi1,$hi1 833 stw $hi0,0($tp) 834 stw $hi1,4($tp) 835 836 ldo `$LOCALS+32+4`($fp),$tp 837 sub %r0,%r0,%r0 ; clear borrow 838 ldw -4($tp),$ti0 839 addl $tp,$arrsz,$tp 840L\$sub_pa11 841 ldwx $idx($np),$hi0 842 subb $ti0,$hi0,$hi1 843 ldwx $idx($tp),$ti0 844 addib,<> 4,$idx,L\$sub_pa11 845 stws,ma $hi1,4($rp) 846 847 subb $ti0,%r0,$hi1 848 ldo -4($tp),$tp 849 and $tp,$hi1,$ap 850 andcm $rp,$hi1,$bp 851 or $ap,$bp,$np 852 853 sub $rp,$arrsz,$rp ; rewind rp 854 subi 0,$arrsz,$idx 855 ldo `$LOCALS+32`($fp),$tp 856L\$copy_pa11 857 ldwx $idx($np),$hi0 858 stws,ma %r0,4($tp) 859 addib,<> 4,$idx,L\$copy_pa11 860 stws,ma $hi0,4($rp) 861 862 nop ; alignment 863L\$done 864___ 865} 866 867$code.=<<___; 868 ldi 1,%r28 ; signal "handled" 869 ldo $FRAME($fp),%sp ; destroy tp[num+1] 870 871 $POP `-$FRAME-$SAVED_RP`(%sp),%r2 ; standard epilogue 872 $POP `-$FRAME+1*$SIZE_T`(%sp),%r4 873 $POP `-$FRAME+2*$SIZE_T`(%sp),%r5 874 $POP `-$FRAME+3*$SIZE_T`(%sp),%r6 875 $POP `-$FRAME+4*$SIZE_T`(%sp),%r7 876 $POP `-$FRAME+5*$SIZE_T`(%sp),%r8 877 $POP `-$FRAME+6*$SIZE_T`(%sp),%r9 878 $POP `-$FRAME+7*$SIZE_T`(%sp),%r10 879L\$abort 880 bv (%r2) 881 .EXIT 882 $POPMB -$FRAME(%sp),%r3 883 .PROCEND 884 885 .data 886 .STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>" 887___ 888 889# Explicitly encode PA-RISC 2.0 instructions used in this module, so 890# that it can be compiled with .LEVEL 1.0. It should be noted that I 891# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0 892# directive... 893 894my $ldd = sub { 895 my ($mod,$args) = @_; 896 my $orig = "ldd$mod\t$args"; 897 898 if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 4 899 { my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3; 900 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 901 } 902 elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) # format 5 903 { my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3; 904 $opcode|=(($1&0xF)<<17)|(($1&0x10)<<12); # encode offset 905 $opcode|=(1<<5) if ($mod =~ /^,m/); 906 $opcode|=(1<<13) if ($mod =~ /^,mb/); 907 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 908 } 909 else { "\t".$orig; } 910}; 911 912my $std = sub { 913 my ($mod,$args) = @_; 914 my $orig = "std$mod\t$args"; 915 916 if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) # format 6 917 { my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6); 918 $opcode|=(($2&0xF)<<1)|(($2&0x10)>>4); # encode offset 919 $opcode|=(1<<5) if ($mod =~ /^,m/); 920 $opcode|=(1<<13) if ($mod =~ /^,mb/); 921 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 922 } 923 else { "\t".$orig; } 924}; 925 926my $extrd = sub { 927 my ($mod,$args) = @_; 928 my $orig = "extrd$mod\t$args"; 929 930 # I only have ",u" completer, it's implicitly encoded... 931 if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) # format 15 932 { my $opcode=(0x36<<26)|($1<<21)|($4<<16); 933 my $len=32-$3; 934 $opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5); # encode pos 935 $opcode |= (($len&0x20)<<7)|($len&0x1f); # encode len 936 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 937 } 938 elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) # format 12 939 { my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9); 940 my $len=32-$2; 941 $opcode |= (($len&0x20)<<3)|($len&0x1f); # encode len 942 $opcode |= (1<<13) if ($mod =~ /,\**=/); 943 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 944 } 945 else { "\t".$orig; } 946}; 947 948my $shrpd = sub { 949 my ($mod,$args) = @_; 950 my $orig = "shrpd$mod\t$args"; 951 952 if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) # format 14 953 { my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4; 954 my $cpos=63-$3; 955 $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5); # encode sa 956 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig; 957 } 958 else { "\t".$orig; } 959}; 960 961my $sub = sub { 962 my ($mod,$args) = @_; 963 my $orig = "sub$mod\t$args"; 964 965 if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) { 966 my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3; 967 $opcode|=(1<<10); # e1 968 $opcode|=(1<<8); # e2 969 $opcode|=(1<<5); # d 970 sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig 971 } 972 else { "\t".$orig; } 973}; 974 975sub assemble { 976 my ($mnemonic,$mod,$args)=@_; 977 my $opcode = eval("\$$mnemonic"); 978 979 ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args"; 980} 981 982foreach (split("\n",$code)) { 983 s/\`([^\`]*)\`/eval $1/ge; 984 # flip word order in 64-bit mode... 985 s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8); 986 # assemble 2.0 instructions in 32-bit mode... 987 s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4); 988 989 s/\bbv\b/bve/gm if ($SIZE_T==8); 990 991 print $_,"\n"; 992} 993close STDOUT; 994