#! /usr/bin/env perl
# Copyright 2009-2020 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the OpenSSL license (the "License").  You may not use
# this file except in compliance with the License.  You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html


# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================

# On PA-7100LC this module performs ~90-50% better, less for longer
# keys, than code generated by gcc 3.2 for PA-RISC 1.1. The latter means
# that the compiler utilized the xmpyu instruction to perform 32x32=64-bit
# multiplication, which in turn means that "baseline" performance was
# optimal in respect to instruction set capabilities. Fair comparison
# with vendor compiler is problematic, because OpenSSL doesn't define
# BN_LLONG [presumably] for historical reasons, which drives compiler
# toward 4 times 16x16=32-bit multiplications [plus complementary
# shifts and additions] instead. This means that you should observe
# several times improvement over code generated by vendor compiler
# for PA-RISC 1.1, but the "baseline" is far from optimal. The actual
# improvement coefficient was never collected on PA-7100LC, or any
# other 1.1 CPU, because I don't have access to such machine with
# vendor compiler. But to give you a taste, PA-RISC 1.1 code path
# reportedly outperformed code generated by cc +DA1.1 +O3 by factor
# of ~5x on PA-8600.
#
# On PA-RISC 2.0 it has to compete with pa-risc2[W].s, which is
# reportedly ~2x faster than vendor compiler generated code [according
# to comment in pa-risc2[W].s]. Here comes a catch. Execution core of
# this implementation is actually 32-bit one, in the sense that it
# operates on 32-bit values. But pa-risc2[W].s operates on arrays of
# 64-bit BN_LONGs... How do they interoperate then? No problem. This
# module picks halves of 64-bit values in reverse order and pretends
# they were 32-bit BN_LONGs. But can 32-bit core compete with "pure"
# 64-bit code such as pa-risc2[W].s then? Well, the thing is that
# 32x32=64-bit multiplication is the best even PA-RISC 2.0 can do,
# i.e. there is no "wider" multiplication like on most other 64-bit
# platforms. This means that even being effectively 32-bit, this
# implementation performs "64-bit" computational task in same amount
# of arithmetic operations, most notably multiplications. It requires
# more memory references, most notably to tp[num], but this doesn't
# seem to exhaust memory port capacity. And indeed, dedicated PA-RISC
# 2.0 code path provides virtually same performance as pa-risc2[W].s:
# it's ~10% better for shortest key length and ~10% worse for longest
# one.
#
# In case it wasn't clear. The module has two distinct code paths:
# PA-RISC 1.1 and PA-RISC 2.0 ones. Latter features carry-free 64-bit
# additions and 64-bit integer loads, not to mention specific
# instruction scheduling. In 64-bit build naturally only 2.0 code path
# is assembled. In 32-bit application context both code paths are
# assembled, PA-RISC 2.0 CPU is detected at run-time and proper path
# is taken automatically. Also, in 32-bit build the module imposes
# couple of limitations: vector lengths has to be even and vector
# addresses has to be 64-bit aligned.
# Normally neither is a problem:
# most common key lengths are even and vectors are commonly malloc-ed,
# which ensures alignment.
#
# Special thanks to polarhome.com for providing HP-UX account on
# PA-RISC 1.1 machine, and to correspondent who chose to remain
# anonymous for testing the code on PA-RISC 2.0 machine.

# Directory of this script, used to locate opensslconf.h below.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;

$flavour = shift;
$output  = shift;

# Redirect generated assembly to the requested output file.  The
# original 2-arg unchecked open could fail silently and leave the
# assembly going to the wrong place; fail loudly instead.
if (defined $output) {
	open STDOUT,'>',$output or die "can't open $output: $!";
}

if ($flavour =~ /64/) {
	# 64-bit (2.0W) ABI: 8-byte words, all six arguments in registers.
	$LEVEL		="2.0W";
	$SIZE_T		=8;
	$FRAME_MARKER	=80;
	$SAVED_RP	=16;
	$PUSH		="std";
	$PUSHMA		="std,ma";
	$POP		="ldd";
	$POPMB		="ldd,mb";
	$BN_SZ		=$SIZE_T;
} else {
	# 32-bit ABI: 4-byte words; last two arguments arrive on the stack.
	$LEVEL		="1.1";	#$LEVEL.="\n\t.ALLOW\t2.0";
	$SIZE_T		=4;
	$FRAME_MARKER	=48;
	$SAVED_RP	=20;
	$PUSH		="stw";
	$PUSHMA		="stwm";
	$POP		="ldw";
	$POPMB		="ldwm";
	$BN_SZ		=$SIZE_T;
	# If the build configuration uses 64-bit BN_ULONG (SIXTY_FOUR_BIT)
	# even in a 32-bit build, assemble the 2.0 code path.
	if (open CONF,"<${dir}../../opensslconf.h") {
		while(<CONF>) {
			if (m/#\s*define\s+SIXTY_FOUR_BIT/) {
				$BN_SZ=8;
				$LEVEL="2.0";
				last;
			}
		}
		close CONF;
	}
}

$FRAME=8*$SIZE_T+$FRAME_MARKER;	# 8 saved regs + frame marker
				#   [+ argument transfer]
$LOCALS=$FRAME-$FRAME_MARKER;
$FRAME+=32;			# local variables

# Integer register assignment.
$tp="%r31";	# caller's sp
$ti1="%r29";
$ti0="%r28";

$rp="%r26";
$ap="%r25";
$bp="%r24";
$np="%r23";
$n0="%r22";	# passed through stack in 32-bit
$num="%r21";	# passed through stack in 32-bit
$idx="%r20";
$arrsz="%r19";

$nm1="%r7";
$nm0="%r6";
$ab1="%r5";
$ab0="%r4";

$fp="%r3";
$hi1="%r2";
$hi0="%r1";

$xfer=$n0;	# accommodates [-16..15] offset in fld[dw]s

# Floating-point register assignment; xmpyu operates on FP registers.
$fm0="%fr4"; $fti=$fm0;
$fbi="%fr5L";
$fn0="%fr5R";
$fai="%fr6"; $fab0="%fr7"; $fab1="%fr8";
$fni="%fr9"; $fnm0="%fr10"; $fnm1="%fr11";

$code=<<___;
	.LEVEL	$LEVEL
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY

	.EXPORT	bn_mul_mont,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
bn_mul_mont
	.PROC
	.CALLINFO	FRAME=`$FRAME-8*$SIZE_T`,NO_CALLS,SAVE_RP,SAVE_SP,ENTRY_GR=6
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	ldo	-$FRAME(%sp),$fp
___
# In the 32-bit ABI the 5th and 6th arguments (n0, num) are passed on
# the stack rather than in registers; fetch them relative to $fp.
$code.=<<___ if ($SIZE_T==4);
	ldw	`-$FRAME_MARKER-4`($fp),$n0
	ldw	`-$FRAME_MARKER-8`($fp),$num
	nop
	nop				; alignment
___
# 32-bit build limitations (see header comment): return "unhandled" in
# %r28 so the C fallback is used when num is too short, odd, or the
# vectors are not 64-bit aligned.
$code.=<<___ if ($BN_SZ==4);
	comiclr,<=	6,$num,%r0	; are vectors long enough?
	b		L\$abort
	ldi		0,%r28		; signal "unhandled"
	add,ev		%r0,$num,$num	; is $num even?
	b		L\$abort
	nop
	or		$ap,$np,$ti1
	extru,=		$ti1,31,3,%r0	; are ap and np 64-bit aligned?
	b		L\$abort
	nop
	nop				; alignment
	nop

	fldws		0($n0),${fn0}
	fldws,ma	4($bp),${fbi}	; bp[0]
___
$code.=<<___ if ($BN_SZ==8);
	comib,>		3,$num,L\$abort	; are vectors long enough?
	ldi		0,%r28		; signal "unhandled"
	addl		$num,$num,$num	; I operate on 32-bit values

	fldws		4($n0),${fn0}	; only low part of n0
	fldws		4($bp),${fbi}	; bp[0] in flipped word order
___
$code.=<<___;
	fldds		0($ap),${fai}	; ap[0,1]
	fldds		0($np),${fni}	; np[0,1]

	sh2addl		$num,%r0,$arrsz
	ldi		31,$hi0
	ldo		36($arrsz),$hi1	; space for tp[num+1]
	andcm		$hi1,$hi0,$hi1	; align
	addl		$hi1,%sp,%sp
	$PUSH		$fp,-$SIZE_T(%sp)

	ldo		`$LOCALS+16`($fp),$xfer
	ldo		`$LOCALS+32+4`($fp),$tp

	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[0]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[0]
	xmpyu		${fn0},${fab0}R,${fm0}

	addl		$arrsz,$ap,$ap	; point at the end
	addl		$arrsz,$np,$np
	subi		0,$arrsz,$idx	; j=0
	ldo		8($idx),$idx	; j++++

	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	fstds		${fab1},0($xfer)
	fstds		${fnm1},8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
___
# Run-time CPU dispatch in a 32-bit build: extrd,u is a 2.0-only
# instruction encoded as a plain .WORD below, which a 1.x CPU executes
# as a no-op-like instruction, so the branch to L\$parisc11 is taken
# only on pre-2.0 hardware.
$code.=<<___ if ($BN_SZ==4);
	mtctl		$hi0,%cr11	; $hi0 still holds 31
	extrd,u,*=	$hi0,%sar,1,$hi0	; executes on PA-RISC 1.0
	b		L\$parisc11
	nop
___
$code.=<<___;				# PA-RISC 2.0 code-path
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)

	extrd,u		$ab0,31,32,$hi0
	extrd,u		$ab0,63,32,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	ldo		8($idx),$idx	; j++++
	addl		$ab0,$nm0,$nm0	; low part is discarded
	extrd,u		$nm0,31,32,$hi1

L\$1st
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	stw		$nm1,-4($tp)	; tp[j-1]
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,63,32,$ab1
	addl		$hi1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	addl		$ab1,$nm1,$nm1
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	extrd,u		$ab0,31,32,$hi0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[1]
___
$code.=<<___ if ($BN_SZ==8);
	fldws		0($bp),${fbi}	; bp[1] in flipped word order
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$hi0,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nm1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	addl		$hi1,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2]
	flddx		$idx($np),${fni}	; np[2]
	ldo		8($idx),$idx	; j++++
	ldd		-16($xfer),$ab0	; 33-bit value
	ldd		-8($xfer),$nm0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	extrd,u		$ab0,31,32,$ti0	; carry bit
	extrd,u		$ab0,63,32,$ab0
	fstds		${fab1},0($xfer)
	addl		$ti0,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	addl		$ab0,$nm0,$nm0	; low part is discarded
	ldw		0($tp),$ti1	; tp[1]
	extrd,u		$nm0,31,32,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

L\$inner
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	flddx		$idx($np),${fni}	; np[j,j+1]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldw		4($tp),$ti0	; tp[j]
	stw		$nm1,-4($tp)	; tp[j-1]

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldd		-16($xfer),$ab0
	fstds		${fab0},-16($xfer)
	addl		$hi0,$ti0,$ti0
	addl		$ti0,$ab0,$ab0
	ldd		-8($xfer),$nm0
	fstds		${fnm0},-8($xfer)
	extrd,u		$ab0,31,32,$hi0
	extrd,u		$nm1,31,32,$hi1
	ldw		8($tp),$ti1
	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	addl		$ab0,$nm0,$nm0
	stw,ma		$nm0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner	; j++++
	extrd,u		$nm0,31,32,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldd		0($xfer),$ab1
	fstds		${fab1},0($xfer)
	addl		$hi0,$ti1,$ti1
	addl		$ti1,$ab1,$ab1
	ldd		8($xfer),$nm1
	fstds		${fnm1},8($xfer)
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1
	ldw		4($tp),$ti0	; tp[j]
	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	ldd		-16($xfer),$ab0
	ldd		-8($xfer),$nm0
	extrd,u		$nm1,31,32,$hi1

	addl		$hi0,$ab0,$ab0
	addl		$ti0,$ab0,$ab0
	stw		$nm1,-4($tp)	; tp[j-1]
	extrd,u		$ab0,31,32,$hi0
	ldw		8($tp),$ti1	; tp[j]
	extrd,u		$ab0,63,32,$ab0
	addl		$hi1,$nm0,$nm0
	ldd		0($xfer),$ab1
	addl		$ab0,$nm0,$nm0
	ldd,mb		8($xfer),$nm1
	extrd,u		$nm0,31,32,$hi1
	stw,ma		$nm0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone	; i--
	subi		0,$arrsz,$idx	; j=0
___
$code.=<<___ if ($BN_SZ==4);
	fldws,ma	4($bp),${fbi}	; bp[i]
___
# 64-bit build: bp[] words are consumed in flipped order; the addl,ev
# nullifies the following ldi depending on parity of $num.
$code.=<<___ if ($BN_SZ==8);
	ldi		12,$ti0		; bp[i] in flipped word order
	addl,ev		%r0,$num,$num
	ldi		-4,$ti0
	addl		$ti0,$bp,$bp
	fldws		0($bp),${fbi}
___
$code.=<<___;
	flddx		$idx($ap),${fai}	; ap[0]
	addl		$hi0,$ab1,$ab1
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nm1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	addl		$hi1,$hi0,$hi0
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone
	addl		$hi0,$ab1,$ab1
	addl		$ti1,$ab1,$ab1
	extrd,u		$ab1,31,32,$hi0
	extrd,u		$ab1,63,32,$ab1

	ldw		4($tp),$ti0	; tp[j]

	addl		$hi1,$nm1,$nm1
	addl		$ab1,$nm1,$nm1
	extrd,u		$nm1,31,32,$hi1
	stw		$nm1,-4($tp)	; tp[j-1]

	addl		$hi1,$hi0,$hi0
	addl		$ti0,$hi0,$hi0
	extrd,u		$hi0,31,32,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
___
# Conditional subtraction tp - np and copy-back.  In a 32-bit build,
# an unaligned rp forces the word-at-a-time L\$sub_pa11 path.
$code.=<<___ if ($BN_SZ==4);
	ldws,ma		4($tp),$ti0
	extru,=		$rp,31,3,%r0	; is rp 64-bit aligned?
	b		L\$sub_pa11
	addl		$tp,$arrsz,$tp
L\$sub
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1
___
$code.=<<___ if ($BN_SZ==8);
	ldd,ma		8($tp),$ti0
L\$sub
	ldd		$idx($np),$hi0
	shrpd		$ti0,$ti0,32,$ti0	; flip word order
	std		$ti0,-8($tp)	; save flipped value
	sub,db		$ti0,$hi0,$hi1
	ldd,ma		8($tp),$ti0
	addib,<>	8,$idx,L\$sub
	std,ma		$hi1,8($rp)

	extrd,u		$ti0,31,32,$ti0	; carry in flipped word order
	sub,db		$ti0,%r0,$hi1
___
$code.=<<___;
	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
L\$copy
	ldd		0($tp),$ti0
	ldd		0($rp),$hi0
	std,ma		%r0,8($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	8,$idx,L\$copy
	std,ma		$hi0,8($rp)
___

if ($BN_SZ==4) {				# PA-RISC 1.1 code-path
# Re-use the 2.0 path's integer registers as hi/lo halves; the 1.1 path
# has no 64-bit integer loads, so it works in 32-bit halves throughout.
$ablo=$ab0;
$abhi=$ab1;
$nmlo0=$nm0;
$nmhi0=$nm1;
$nmlo1="%r9";
$nmhi1="%r8";

$code.=<<___;
	b		L\$done
	nop

	.ALIGN	8
L\$parisc11
	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-12($xfer),$ablo
	ldw		-16($xfer),$hi0
	ldw		-4($xfer),$nmlo0
	ldw		-8($xfer),$nmhi0
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)

	ldo		8($idx),$idx	; j++++
	add		$ablo,$nmlo0,$nmlo0	; discarded
	addc		%r0,$nmhi0,$hi1
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi
	nop

L\$1st_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[0]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		12($xfer),$nmlo1
	addc		%r0,$abhi,$hi0
	ldw		8($xfer),$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fab1},0($xfer)
	addc		%r0,$nmhi1,$nmhi1
	fstds		${fnm1},8($xfer)
	add		$hi1,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$hi1
	ldw		-16($xfer),$abhi

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[0]
	ldw		-4($xfer),$nmlo0
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-8($xfer),$nmhi0
	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	fstds		${fab0},-16($xfer)
	add		$ablo,$nmlo0,$nmlo0
	fstds		${fnm0},-8($xfer)
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$1st_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	ldw		8($xfer),$nmhi1
	ldw		12($xfer),$nmlo1
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[0]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	add		$hi0,$ablo,$ablo
	fstds		${fab1},0($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-8($xfer),$nmhi0
	addc		%r0,$nmhi1,$hi1
	ldw		-4($xfer),$nmlo0

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	ldo		-1($num),$num	; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[1]
	flddx		$idx($ap),${fai}	; ap[0,1]
	flddx		$idx($np),${fni}	; np[0,1]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$hi0
	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[1]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[1]
	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	xmpyu		${fn0},${fab0}R,${fm0}
	ldo		`$LOCALS+32+4`($fp),$tp
L\$outer_pa11
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[0]*m
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[1]*m
	fstds		${fab0},-16($xfer)	; 33-bit value
	fstds		${fnm0},-8($xfer)
	flddx		$idx($ap),${fai}	; ap[2,3]
	flddx		$idx($np),${fni}	; np[2,3]
	ldw		-16($xfer),$abhi	; carry bit actually
	ldo		8($idx),$idx	; j++++
	ldw		-12($xfer),$ablo
	ldw		-8($xfer),$nmhi0
	ldw		-4($xfer),$nmlo0
	ldw		0($xfer),$hi0	; high part

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	fstds		${fab1},0($xfer)
	addl		$abhi,$hi0,$hi0	; account carry bit
	fstds		${fnm1},8($xfer)
	add		$ablo,$nmlo0,$nmlo0	; discarded
	ldw		0($tp),$ti1	; tp[1]
	addc		%r0,$nmhi0,$hi1
	fstds		${fab0},-16($xfer)
	fstds		${fnm0},-8($xfer)
	ldw		4($xfer),$ablo
	ldw		0($xfer),$abhi

L\$inner_pa11
	xmpyu		${fai}R,${fbi},${fab1}	; ap[j+1]*bp[i]
	flddx		$idx($ap),${fai}	; ap[j,j+1]
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j+1]*m
	flddx		$idx($np),${fni}	; np[j,j+1]
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	ldw		12($xfer),$nmlo1
	add		$ti1,$ablo,$ablo
	ldw		8($xfer),$nmhi1
	addc		%r0,$abhi,$hi0
	fstds		${fab1},0($xfer)
	add		$ablo,$nmlo1,$nmlo1
	fstds		${fnm1},8($xfer)
	addc		%r0,$nmhi1,$nmhi1
	ldw		-12($xfer),$ablo
	add		$hi1,$nmlo1,$nmlo1
	ldw		-16($xfer),$abhi
	addc		%r0,$nmhi1,$hi1

	xmpyu		${fai}L,${fbi},${fab0}	; ap[j]*bp[i]
	ldw		8($tp),$ti1	; tp[j]
	xmpyu		${fni}L,${fm0}R,${fnm0}	; np[j]*m
	ldw		-4($xfer),$nmlo0
	add		$hi0,$ablo,$ablo
	ldw		-8($xfer),$nmhi0
	addc		%r0,$abhi,$abhi
	stw		$nmlo1,-4($tp)	; tp[j-1]
	add		$ti0,$ablo,$ablo
	fstds		${fab0},-16($xfer)
	addc		%r0,$abhi,$hi0
	fstds		${fnm0},-8($xfer)
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldw		0($xfer),$abhi
	add		$hi1,$nmlo0,$nmlo0
	stws,ma		$nmlo0,8($tp)	; tp[j-1]
	addib,<>	8,$idx,L\$inner_pa11	; j++++
	addc		%r0,$nmhi0,$hi1

	xmpyu		${fai}R,${fbi},${fab1}	; ap[j]*bp[i]
	ldw		12($xfer),$nmlo1
	xmpyu		${fni}R,${fm0}R,${fnm1}	; np[j]*m
	ldw		8($xfer),$nmhi1
	add		$hi0,$ablo,$ablo
	ldw		4($tp),$ti0	; tp[j]
	addc		%r0,$abhi,$abhi
	fstds		${fab1},0($xfer)
	add		$ti1,$ablo,$ablo
	fstds		${fnm1},8($xfer)
	addc		%r0,$abhi,$hi0
	ldw		-16($xfer),$abhi
	add		$ablo,$nmlo1,$nmlo1
	ldw		-12($xfer),$ablo
	addc		%r0,$nmhi1,$nmhi1
	ldw		-8($xfer),$nmhi0
	add		$hi1,$nmlo1,$nmlo1
	ldw		-4($xfer),$nmlo0
	addc		%r0,$nmhi1,$hi1

	add		$hi0,$ablo,$ablo
	stw		$nmlo1,-4($tp)	; tp[j-1]
	addc		%r0,$abhi,$abhi
	add		$ti0,$ablo,$ablo
	ldw		8($tp),$ti1	; tp[j]
	addc		%r0,$abhi,$hi0
	ldw		0($xfer),$abhi
	add		$ablo,$nmlo0,$nmlo0
	ldw		4($xfer),$ablo
	addc		%r0,$nmhi0,$nmhi0
	ldws,mb		8($xfer),$nmhi1
	add		$hi1,$nmlo0,$nmlo0
	ldw		4($xfer),$nmlo1
	addc		%r0,$nmhi0,$hi1
	stws,ma		$nmlo0,8($tp)	; tp[j-1]

	addib,=		-1,$num,L\$outerdone_pa11; i--
	subi		0,$arrsz,$idx	; j=0

	fldws,ma	4($bp),${fbi}	; bp[i]
	flddx		$idx($ap),${fai}	; ap[0]
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	flddx		$idx($np),${fni}	; np[0]
	fldws		8($xfer),${fti}R	; tp[0]
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldo		8($idx),$idx	; j++++
	xmpyu		${fai}L,${fbi},${fab0}	; ap[0]*bp[i]
	xmpyu		${fai}R,${fbi},${fab1}	; ap[1]*bp[i]
	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	fstws,mb	${fab0}L,-8($xfer)	; save high part
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	fcpy,sgl	%fr0,${fti}L	; zero high part
	fcpy,sgl	%fr0,${fab0}L
	stw		$nmlo1,-4($tp)	; tp[j-1]

	fcnvxf,dbl,dbl	${fti},${fti}	; 32-bit unsigned int -> double
	fcnvxf,dbl,dbl	${fab0},${fab0}
	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	fadd,dbl	${fti},${fab0},${fab0}	; add tp[0]
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	fcnvfx,dbl,dbl	${fab0},${fab0}	; double -> 33-bit unsigned int
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)
	xmpyu		${fn0},${fab0}R,${fm0}

	b		L\$outer_pa11
	ldo		`$LOCALS+32+4`($fp),$tp

L\$outerdone_pa11
	add		$hi0,$ablo,$ablo
	addc		%r0,$abhi,$abhi
	add		$ti1,$ablo,$ablo
	addc		%r0,$abhi,$hi0

	ldw		4($tp),$ti0	; tp[j]

	add		$hi1,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$nmhi1
	add		$ablo,$nmlo1,$nmlo1
	addc		%r0,$nmhi1,$hi1
	stw		$nmlo1,-4($tp)	; tp[j-1]

	add		$hi1,$hi0,$hi0
	addc		%r0,%r0,$hi1
	add		$ti0,$hi0,$hi0
	addc		%r0,$hi1,$hi1
	stw		$hi0,0($tp)
	stw		$hi1,4($tp)

	ldo		`$LOCALS+32+4`($fp),$tp
	sub		%r0,%r0,%r0	; clear borrow
	ldw		-4($tp),$ti0
	addl		$tp,$arrsz,$tp
L\$sub_pa11
	ldwx		$idx($np),$hi0
	subb		$ti0,$hi0,$hi1
	ldwx		$idx($tp),$ti0
	addib,<>	4,$idx,L\$sub_pa11
	stws,ma		$hi1,4($rp)

	subb		$ti0,%r0,$hi1

	ldo		`$LOCALS+32`($fp),$tp
	sub		$rp,$arrsz,$rp	; rewind rp
	subi		0,$arrsz,$idx
L\$copy_pa11
	ldw		0($tp),$ti0
	ldw		0($rp),$hi0
	stws,ma		%r0,4($tp)
	comiclr,=	0,$hi1,%r0
	copy		$ti0,$hi0
	addib,<>	4,$idx,L\$copy_pa11
	stws,ma		$hi0,4($rp)

	nop				; alignment
L\$done
___
}

# Shared epilogue: restore callee-saved registers and return; L\$abort
# lands here with %r28 still holding 0 ("unhandled").
$code.=<<___;
	ldi		1,%r28		; signal "handled"
	ldo		$FRAME($fp),%sp	; destroy tp[num+1]

	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2	; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
L\$abort
	bv		(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND
	.STRINGZ "Montgomery Multiplication for PA-RISC, CRYPTOGAMS by <appro\@openssl.org>"
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0. It should be noted that I
# wouldn't have to do this, if GNU assembler understood .ALLOW 2.0
# directive...

# Hand-encode 64-bit "ldd" as a raw .WORD so a .LEVEL 1.1 assembly
# still contains the 2.0 instruction (executed only on 2.0 CPUs).
my $ldd = sub {
  my ($mod,$args) = @_;
  my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)		# format 4
    {	my $opcode=(0x03<<26)|($2<<21)|($1<<16)|(3<<6)|$3;
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/)	# format 5
    {	my $opcode=(0x03<<26)|($2<<21)|(1<<12)|(3<<6)|$3;
	$opcode|=(($1&0xF)<<17)|(($1&0x10)<<12);		# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# Hand-encode 64-bit "std" (format 6) the same way.
my $std = sub {
  my ($mod,$args) = @_;
  my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/)	# format 6
    {	my $opcode=(0x03<<26)|($3<<21)|($1<<16)|(1<<12)|(0xB<<6);
	$opcode|=(($2&0xF)<<1)|(($2&0x10)>>4);			# encode offset
	$opcode|=(1<<5)  if ($mod =~ /^,m/);
	$opcode|=(1<<13) if ($mod =~ /^,mb/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# Hand-encode "extrd".
my $extrd = sub {
  my ($mod,$args) = @_;
  my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/)	# format 15
    {	my $opcode=(0x36<<26)|($1<<21)|($4<<16);
	my $len=32-$3;
	$opcode |= (($2&0x20)<<6)|(($2&0x1f)<<5);		# encode pos
	$opcode |= (($len&0x20)<<7)|($len&0x1f);		# encode len
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    elsif ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/)	# format 12
    {	my $opcode=(0x34<<26)|($1<<21)|($3<<16)|(2<<11)|(1<<9);
	my $len=32-$2;
	$opcode |= (($len&0x20)<<3)|($len&0x1f);		# encode len
	$opcode |= (1<<13) if ($mod =~ /,\**=/);
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# Hand-encode "shrpd" (format 14).
my $shrpd = sub {
  my ($mod,$args) = @_;
  my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/)	# format 14
    {	my $opcode=(0x34<<26)|($2<<21)|($1<<16)|(1<<10)|$4;
	my $cpos=63-$3;
	$opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);		# encode sa
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig;
    }
    else { "\t".$orig; }
};

# Hand-encode "sub,db" (64-bit subtract with borrow).
my $sub = sub {
  my ($mod,$args) = @_;
  my $orig = "sub$mod\t$args";

    if ($mod eq ",db" && $args =~ /%r([0-9]+),%r([0-9]+),%r([0-9]+)/) {
	my $opcode=(0x02<<26)|($2<<21)|($1<<16)|$3;
	$opcode|=(1<<10);	# e1
	$opcode|=(1<<8);	# e2
	$opcode|=(1<<5);	# d
	sprintf "\t.WORD\t0x%08x\t; %s",$opcode,$orig
    }
    else { "\t".$orig; }
};

# Dispatch an assembly line to the matching encoder closure above; if no
# closure exists for the mnemonic, emit the line unchanged.
sub assemble {
  my ($mnemonic,$mod,$args)=@_;
  my $opcode = eval("\$$mnemonic");

    ref($opcode) eq 'CODE' ? &$opcode($mod,$args) : "\t$mnemonic$mod\t$args";
}

# Detect GNU assembler; $gnuas is a file-scope global (script does not
# use strict) consulted by the substitutions below.
if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler/) {
    $gnuas = 1;
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/ge;
	# flip word order in 64-bit mode...
	s/(xmpyu\s+)($fai|$fni)([LR])/$1.$2.($3 eq "L"?"R":"L")/e if ($BN_SZ==8);
	# assemble 2.0 instructions in 32-bit mode...
	s/^\s+([a-z]+)([\S]*)\s+([\S]*)/&assemble($1,$2,$3)/e if ($BN_SZ==4);

	s/(\.LEVEL\s+2\.0)W/$1w/	if ($gnuas && $SIZE_T==8);
	s/\.SPACE\s+\$TEXT\$/.text/	if ($gnuas && $SIZE_T==8);
	s/\.SUBSPA.*//			if ($gnuas && $SIZE_T==8);
	s/\bbv\b/bve/			if ($SIZE_T==8);

	print $_,"\n";
}
close STDOUT or die "error closing STDOUT: $!";