#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.
# Command line: <flavour> <output>.  A flavour matching /64/ selects the
# 64-bit parameter set below; anything else selects the 32-bit one.
$flavour = shift;
$output  = shift;

# Redirect STDOUT only when an output file was actually named, and stop
# right away if it cannot be created rather than carrying on with an
# unchecked handle.
if (defined($output) && length($output)) {
    open(STDOUT, '>', $output) or die "can't open $output: $!";
}

# Per-flavour parameters: assembler .LEVEL, pointer size, frame layout
# constants, the store/load mnemonics used by prologue and epilogue, and
# the ENTRY_GR value advertised in .CALLINFO.
if ($flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
    $NREGS        = 6;
}
else {
    $LEVEL        = "1.0";    #"\n\t.ALLOW\t2.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
    $NREGS        = 11;
}

$FRAME = 10*$SIZE_T + $FRAME_MARKER;    # NREGS saved regs + frame marker
                                        #                 [+ argument transfer]

################# volatile registers
$Xi       = "%r26";    # argument block
$Htbl     = "%r25";
$inp      = "%r24";
$len      = "%r23";
$Hhh      = $Htbl;     # variables
$Hll      = "%r22";
$Zhh      = "%r21";
$Zll      = "%r20";
$cnt      = "%r19";
$rem_4bit = "%r28";
$rem      = "%r29";
$mask0xf0 = "%r31";

################# preserved registers
$Thh  = "%r1";
$Tll  = "%r2";
$nlo  = "%r3";
$nhi  = "%r4";
$byte = "%r5";
if ($SIZE_T == 4) {
    # The 32-bit code path keeps every 128-bit quantity in four 32-bit
    # registers and therefore needs the extra names below.
    $Zhl = "%r6";
    $Zlh = "%r7";
    $Hhl = "%r8";
    $Hlh = "%r9";
    $Thl = "%r10";
    $Tlh = "%r11";
}
$rem2 = "%r6";    # used in PA-RISC 2.0 code

# ---------------------------------------------------------------------
# Everything below appends assembly text to $code.  Text between
# backticks is arithmetic that gets evaluated when $code is printed.
# ---------------------------------------------------------------------

# gcm_gmult_4bit: directives and the part of the prologue common to both
# flavours (saves %r2 and %r3-%r6).
$code.=<<___;
	.LEVEL	$LEVEL
#if 0
	.SPACE	\$TEXT\$
	.SUBSPA	\$CODE\$,QUAD=0,ALIGN=8,ACCESS=0x2C,CODE_ONLY
#else
	.text
#endif

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# 32-bit flavour additionally saves %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Form the address of the L$rem_4bit table relative to L$pic_gmult and
# load the 0xf0 nibble mask.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_gmult
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_gmult($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
# 32-bit flavour: run-time choice between the PA-RISC 2.0 code that
# follows and the L$parisc1_gmult path.
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# gcm_gmult_4bit, PA-RISC 2.0 path (64-bit loads/shifts).
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv	-1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# gcm_gmult_4bit, 32-bit-only alternative path built from 32-bit
# loads/shifts (ldwx/shrpw/extru/zdep).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit epilogue: restore what the prologue saved.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_gmult_4bit, then open gcm_ghash_4bit.  Its prologue
# saves exactly the registers gcm_gmult_4bit saves, so .CALLINFO uses the
# same ENTRY_GR=$NREGS (this used to be a hard-coded 11, which did not
# match the 64-bit prologue).
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Same table-address and mask setup as in gcm_gmult_4bit; $len becomes
# the end-of-input pointer ($inp+$len) that the outer loops compare with.
$code.=<<___;
	blr	%r0,$rem_4bit
	ldi	3,$rem
L\$pic_ghash
	andcm	$rem_4bit,$rem,$rem_4bit
	addl	$inp,$len,$len
	ldo	L\$rem_4bit-L\$pic_ghash($rem_4bit),$rem_4bit
	ldi	0xf0,$mask0xf0
___
$code.=<<___ if ($SIZE_T==4);
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# gcm_ghash_4bit, PA-RISC 2.0 path: one outer iteration per 16 input
# bytes.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv	-1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<>	$inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# gcm_ghash_4bit, 32-bit-only alternative path.
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit epilogue.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return from gcm_ghash_4bit, followed by the shared 16-entry rem_4bit
# table (each entry is two words: constant<<16, then 0) and the
# identification string.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0

	.data
	.STRINGZ "GHASH for PA-RISC, GRYPTOGAMS by <appro\@openssl.org>"
	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0.
# It should be noted that I wouldn't have to do this, if GNU assembler
# understood .ALLOW 2.0 directive...
#
# Each encoder below takes ($mod, $args): the completer text that follows
# the mnemonic (possibly empty) and the operand string.  When the
# operands match a form the encoder knows, it returns a ".WORD 0x..."
# line carrying the original instruction as a trailing comment; otherwise
# it returns the instruction text unchanged.

# ldd: register-indexed form (format 4) and short-displacement form
# (format 5).
my $ldd = sub {
    my ($mod, $args) = @_;
    my $orig = "ldd$mod\t$args";

    if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {         # format 4
        my ($index, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|($index<<16)|(3<<6)|$target;
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {        # format 5
        my ($disp, $base, $target) = ($1, $2, $3);
        my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$target;
        $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);          # encode offset
        $opcode |= (1<<5)  if ($mod =~ /^,m/);
        $opcode |= (1<<13) if ($mod =~ /^,mb/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# std: displacement form only.
my $std = sub {
    my ($mod, $args) = @_;
    my $orig = "std$mod\t$args";

    if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {        # format 3 suffices
        my ($source, $disp, $base) = ($1, $2, $3);
        my $opcode = (0x1c<<26)|($base<<21)|($source<<16)
                   | (($disp&0x1FF8)<<1)|(($disp>>13)&1);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# extrd: fixed-position form (format 15) and %sar-position form
# (format 12).
my $extrd = sub {
    my ($mod, $args) = @_;
    my $orig = "extrd$mod\t$args";

    # I only have ",u" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {     # format 15
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x36<<26)|($source<<21)|($target<<16);
        my $len = 32-$width;
        $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);             # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);                  # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {         # format 12
        my ($source, $width, $target) = ($1, $2, $3);
        my $opcode = (0x34<<26)|($source<<21)|($target<<16)|(2<<11)|(1<<9);
        my $len = 32-$width;
        $opcode |= (($len&0x20)<<3)|($len&0x1f);                  # encode len
        $opcode |= (1<<13) if ($mod =~ /,\**=/);
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# shrpd: fixed shift amount (format 14) and %sar shift amount (format 11).
my $shrpd = sub {
    my ($mod, $args) = @_;
    my $orig = "shrpd$mod\t$args";

    if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 14
        my ($high, $low, $shift, $target) = ($1, $2, $3, $4);
        my $opcode = (0x34<<26)|($low<<21)|($high<<16)|(1<<10)|$target;
        my $cpos = 63-$shift;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);           # encode sa
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {       # format 11
        my ($high, $low, $target) = ($1, $2, $3);
        return sprintf "\t.WORD\t0x%08x\t; %s",
                       (0x34<<26)|($low<<21)|($high<<16)|(1<<9)|$target, $orig;
    }
    return "\t".$orig;
};

# depd: fixed-position form only (format 16).
my $depd = sub {
    my ($mod, $args) = @_;
    my $orig = "depd$mod\t$args";

    # I only have ",z" completer, it's implicitly encoded...
    if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {     # format 16
        my ($source, $pos, $width, $target) = ($1, $2, $3, $4);
        my $opcode = (0x3c<<26)|($target<<21)|($source<<16);
        my $cpos = 63-$pos;
        my $len  = 32-$width;
        $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);           # encode pos
        $opcode |= (($len&0x20)<<7)|($len&0x1f);                  # encode len
        return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
    }
    return "\t".$orig;
};

# Mnemonic => encoder.  A plain hash lookup replaces the former string
# eval of "\$$mnemonic", so no Perl code is compiled per instruction.
my %encoder_for = (
    ldd   => $ldd,
    std   => $std,
    extrd => $extrd,
    shrpd => $shrpd,
    depd  => $depd,
);

# assemble($mnemonic, $mod, $args)
#   Returns the output text for one instruction: the hand-encoded form
#   if an encoder is registered for $mnemonic, otherwise the instruction
#   re-joined as "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return ref($encoder) eq 'CODE'
        ? $encoder->($mod, $args)
        : "\t$mnemonic$mod\t$args";
}

# Post-process the accumulated template line by line and print it.
foreach my $line (split("\n", $code)) {
    # Evaluate the `...` arithmetic embedded in the template.  The text
    # comes from this script's own heredocs, not from external input.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    if ($SIZE_T == 4) {
        # 32-bit build: hand-encode 2.0 instructions, then rewrite the
        # remaining spellings (cmpb,* -> comb, and drop ",*").
        $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
        $line =~ s/cmpb,\*/comb,/;
        $line =~ s/,\*/,/;
    }
    $line =~ s/\bbv\b/bve/ if ($SIZE_T == 8);

    print $line, "\n";
}

# Buffered write errors only show up at close time, so check it.
close STDOUT or die "error closing STDOUT: $!";