#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# April 2010
#
# The module implements "4-bit" GCM GHASH function and underlying
# single multiplication operation in GF(2^128). "4-bit" means that it
# uses 256 bytes per-key table [+128 bytes shared table]. On PA-7100LC
# it processes one byte in 19.6 cycles, which is more than twice as
# fast as code generated by gcc 3.2. PA-RISC 2.0 loop is scheduled for
# 8 cycles, but measured performance on PA-8600 system is ~9 cycles per
# processed byte. This is ~2.2x faster than 64-bit code generated by
# vendor compiler (which used to be very hard to beat:-).
#
# Special thanks to polarhome.com for providing HP-UX account.

# --------------------------------------------------------------------
# Command line: <flavour> [<output file>]
#
# A flavour containing "64" selects the 64-bit (PA-RISC 2.0W) ABI
# parameters; anything else selects the 32-bit ones.  When an output
# file is named, STDOUT is redirected to it; otherwise the generated
# assembly goes to the inherited STDOUT.
# --------------------------------------------------------------------
$flavour = shift;
$output  = shift;

# Three-argument, checked open.  A failed open would otherwise leave
# STDOUT closed and the script would silently produce no output.
if (defined $output) {
    open STDOUT, '>', $output or die "can't open $output: $!";
}

# ABI-dependent parameters.  $PUSH/$POP (and the modifying $PUSHMA/$POPMB
# forms) are the store/load mnemonics used by the prologue and epilogue;
# $NREGS is the value handed to .CALLINFO ENTRY_GR.
if ($flavour =~ /64/) {
    $LEVEL        = "2.0W";
    $SIZE_T       = 8;
    $FRAME_MARKER = 80;
    $SAVED_RP     = 16;
    $PUSH         = "std";
    $PUSHMA       = "std,ma";
    $POP          = "ldd";
    $POPMB        = "ldd,mb";
    $NREGS        = 6;
} else {
    $LEVEL        = "1.0";      #"\n\t.ALLOW\t2.0";
    $SIZE_T       = 4;
    $FRAME_MARKER = 48;
    $SAVED_RP     = 20;
    $PUSH         = "stw";
    $PUSHMA       = "stwm";
    $POP          = "ldw";
    $POPMB        = "ldwm";
    $NREGS        = 11;
}

$FRAME = 10*$SIZE_T + $FRAME_MARKER;    # NREGS saved regs + frame marker
                                        #                 [+ argument transfer]

################## volatile registers
$Xi       = "%r26";     # argument block
$Htbl     = "%r25";
$inp      = "%r24";
$len      = "%r23";
$Hhh      = $Htbl;      # variables
$Hll      = "%r22";
$Zhh      = "%r21";
$Zll      = "%r20";
$cnt      = "%r19";
$rem_4bit = "%r28";
$rem      = "%r29";
$mask0xf0 = "%r31";

################## preserved registers
$Thh  = "%r1";
$Tll  = "%r2";
$nlo  = "%r3";
$nhi  = "%r4";
$byte = "%r5";
if ($SIZE_T == 4) {
    # The 32-bit code keeps the 128-bit values in four words each and
    # therefore needs six more registers.
    $Zhl = "%r6";
    $Zlh = "%r7";
    $Hhl = "%r8";
    $Hlh = "%r9";
    $Thl = "%r10";
    $Tlh = "%r11";
}
$rem2 = "%r6";          # used in PA-RISC 2.0 code

# --------------------------------------------------------------------
# Assembly template.  Everything below is appended to $code and is
# post-processed line by line at the end of the script: `...` spans
# are evaluated as Perl expressions, and for the 32-bit flavour every
# instruction line (leading whitespace + lower-case mnemonic) is run
# through assemble().  Labels and cpp directives must therefore stay
# in column 0 and instructions must stay indented.
#
# Heredocs guarded by "if ($SIZE_T==4)" are emitted for the 32-bit
# flavour only.  Note that the "#ifndef __OpenBSD__" opened in one
# guarded heredoc is closed by the "#endif" in a later guarded one.
# --------------------------------------------------------------------

# gcm_gmult_4bit: prologue, saving %r2 (return pointer) and %r3-%r6.
$code.=<<___;
	.LEVEL	$LEVEL
	.text

	.EXPORT	gcm_gmult_4bit,ENTRY,ARGW0=GR,ARGW1=GR
	.ALIGN	64
gcm_gmult_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
# 32-bit only: additionally save %r7-%r11.
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# Load the address of the shared L$rem_4bit table and the 0xf0 mask.
$code.=<<___;
	addl	$inp,$len,$len
#ifdef __PIC__
	addil	LT'L\$rem_4bit, %r19
	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
#else
	ldil	L'L\$rem_4bit, %t1
	ldo	R'L\$rem_4bit(%t1), $rem_4bit
#endif
	ldi	0xf0,$mask0xf0
___
# 32-bit only: run-time selection between the PA-RISC 2.0 and 1.0 code
# paths.  NOTE(review): presumably the hand-encoded "extrd,u,*=" skips
# the branch to the 1.0 path when running on a 2.0 CPU -- confirm
# against the architecture manual.
$code.=<<___ if ($SIZE_T==4);
#ifndef __OpenBSD__
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_gmult
	nop
___

# gcm_gmult_4bit, PA-RISC 2.0 path (64-bit loads/shifts).
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_gmult_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem
	ldbx	$cnt($Xi),$nlo

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo
	ldd	$rem($rem_4bit),$rem

	xor	$Tll,$Zll,$Zll
	addib,uv	-1,$cnt,L\$oop_gmult_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	std	$Zhh,0($Xi)
___

# gcm_gmult_4bit, PA-RISC 1.0 path (32-bit flavour only, word-sized ops).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_gmult
	nop

L\$parisc1_gmult
#endif
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_gmult_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_gmult_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_gmult_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	stw	$Zhh,0($Xi)
___
# gcm_gmult_4bit: epilogue, restoring what the prologue saved.
$code.=<<___;
L\$done_gmult
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# gcm_ghash_4bit: prologue.  ENTRY_GR uses $NREGS, exactly as in
# gcm_gmult_4bit above: the two prologues store the same registers, and
# the 64-bit flavour stores only %r3-%r6 (the %r7-%r11 stores are in the
# 32-bit-only heredoc).  $NREGS is 11 for the 32-bit flavour, so that
# output is unchanged.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.EXPORT	gcm_ghash_4bit,ENTRY,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR
	.ALIGN	64
gcm_ghash_4bit
	.PROC
	.CALLINFO	FRAME=`$FRAME-10*$SIZE_T`,NO_CALLS,SAVE_RP,ENTRY_GR=$NREGS
	.ENTRY
	$PUSH	%r2,-$SAVED_RP(%sp)	; standard prologue
	$PUSHMA	%r3,$FRAME(%sp)
	$PUSH	%r4,`-$FRAME+1*$SIZE_T`(%sp)
	$PUSH	%r5,`-$FRAME+2*$SIZE_T`(%sp)
	$PUSH	%r6,`-$FRAME+3*$SIZE_T`(%sp)
___
$code.=<<___ if ($SIZE_T==4);
	$PUSH	%r7,`-$FRAME+4*$SIZE_T`(%sp)
	$PUSH	%r8,`-$FRAME+5*$SIZE_T`(%sp)
	$PUSH	%r9,`-$FRAME+6*$SIZE_T`(%sp)
	$PUSH	%r10,`-$FRAME+7*$SIZE_T`(%sp)
	$PUSH	%r11,`-$FRAME+8*$SIZE_T`(%sp)
___
# $len becomes the end-of-input pointer ($inp + $len); the outer loops
# below compare $inp against it after every 16-byte block.
$code.=<<___;
	addl	$inp,$len,$len
#ifdef __PIC__
	addil	LT'L\$rem_4bit, %r19
	ldw	RT'L\$rem_4bit(%r1), $rem_4bit
#else
	ldil	L'L\$rem_4bit, %t1
	ldo	R'L\$rem_4bit(%t1), $rem_4bit
#endif
	ldi	0xf0,$mask0xf0
___
# 32-bit only: same run-time CPU check as in gcm_gmult_4bit.
$code.=<<___ if ($SIZE_T==4);
#ifndef __OpenBSD__
	ldi	31,$rem
	mtctl	$rem,%cr11
	extrd,u,*=	$rem,%sar,1,$rem	; executes on PA-RISC 1.0
	b	L\$parisc1_ghash
	nop
___

# gcm_ghash_4bit, PA-RISC 2.0 path.
$code.=<<___;
	ldb	15($Xi),$nlo
	ldo	8($Htbl),$Hll

L\$outer_ghash_pa2
	ldb	15($inp),$nhi
	xor	$nhi,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	ldd	$nlo($Hll),$Zll
	ldd	$nlo($Hhh),$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte

	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem
	b	L\$oop_ghash_pa2
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa2
	xor	$rem,$Zhh,$Zhh		; moved here to work around gas bug
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldbx	$cnt($Xi),$nlo
	ldbx	$cnt($inp),$byte

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	xor	$byte,$nlo,$nlo
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	and	$mask0xf0,$nlo,$nhi
	depd,z	$nlo,59,4,$nlo

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll

	ldd	$rem($rem_4bit),$rem
	addib,uv	-1,$cnt,L\$oop_ghash_pa2
	xor	$Thh,$Zhh,$Zhh

	xor	$rem,$Zhh,$Zhh
	depd,z	$Zll,60,4,$rem2

	shrpd	$Zhh,$Zll,4,$Zll
	extrd,u	$Zhh,59,60,$Zhh
	ldd	$nlo($Hll),$Tll
	ldd	$nlo($Hhh),$Thh

	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh

	depd,z	$Zll,60,4,$rem
	shrpd	$Zhh,$Zll,4,$Zll
	ldd	$rem2($rem_4bit),$rem2

	xor	$rem2,$Zhh,$Zhh
	ldd	$nhi($Hll),$Tll
	ldd	$nhi($Hhh),$Thh

	extrd,u	$Zhh,59,60,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Thh,$Zhh,$Zhh
	ldd	$rem($rem_4bit),$rem

	xor	$rem,$Zhh,$Zhh
	std	$Zll,8($Xi)
	ldo	16($inp),$inp
	std	$Zhh,0($Xi)
	cmpb,*<>	$inp,$len,L\$outer_ghash_pa2
	copy	$Zll,$nlo
___

# gcm_ghash_4bit, PA-RISC 1.0 path (32-bit flavour only).
$code.=<<___ if ($SIZE_T==4);
	b	L\$done_ghash
	nop

L\$parisc1_ghash
#endif
	ldb	15($Xi),$nlo
	ldo	12($Htbl),$Hll
	ldo	8($Htbl),$Hlh
	ldo	4($Htbl),$Hhl

L\$outer_ghash_pa1
	ldb	15($inp),$byte
	xor	$byte,$nlo,$nlo
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	ldwx	$nlo($Hll),$Zll
	ldwx	$nlo($Hlh),$Zlh
	ldwx	$nlo($Hhl),$Zhl
	ldwx	$nlo($Hhh),$Zhh
	zdep	$Zll,28,4,$rem
	ldb	14($Xi),$nlo
	ldb	14($inp),$byte
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldwx	$nhi($Hhl),$Thl
	extru	$Zhh,27,28,$Zhh
	ldwx	$nhi($Hhh),$Thh
	xor	$byte,$nlo,$nlo
	xor	$rem,$Zhh,$Zhh
	and	$mask0xf0,$nlo,$nhi
	zdep	$nlo,27,4,$nlo

	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$Thl,$Zhl,$Zhl
	b	L\$oop_ghash_pa1
	ldi	13,$cnt

	.ALIGN	8
L\$oop_ghash_pa1
	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	ldbx	$cnt($Xi),$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	ldbx	$cnt($inp),$byte
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$rem,$Zhh,$Zhh
	zdep	$Zll,28,4,$rem
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$byte,$nlo,$nlo
	shrpw	$Zhh,$Zhl,4,$Zhl
	and	$mask0xf0,$nlo,$nhi
	extru	$Zhh,27,28,$Zhh
	zdep	$nlo,27,4,$nlo
	xor	$Tll,$Zll,$Zll
	ldwx	$nlo($Hll),$Tll
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nlo($Hlh),$Tlh
	xor	$rem,$Zhh,$Zhh
	addib,uv	-1,$cnt,L\$oop_ghash_pa1
	xor	$Thl,$Zhl,$Zhl

	zdep	$Zll,28,4,$rem
	ldwx	$nlo($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	ldwx	$nlo($Hhh),$Thh
	shrpw	$Zhl,$Zlh,4,$Zlh
	xor	$Tll,$Zll,$Zll
	ldwx	$nhi($Hll),$Tll
	shrpw	$Zhh,$Zhl,4,$Zhl
	xor	$Tlh,$Zlh,$Zlh
	ldwx	$nhi($Hlh),$Tlh
	extru	$Zhh,27,28,$Zhh
	xor	$rem,$Zhh,$Zhh
	xor	$Thl,$Zhl,$Zhl
	ldwx	$nhi($Hhl),$Thl
	xor	$Thh,$Zhh,$Zhh
	ldwx	$nhi($Hhh),$Thh
	zdep	$Zll,28,4,$rem
	ldwx	$rem($rem_4bit),$rem
	shrpw	$Zlh,$Zll,4,$Zll
	shrpw	$Zhl,$Zlh,4,$Zlh
	shrpw	$Zhh,$Zhl,4,$Zhl
	extru	$Zhh,27,28,$Zhh
	xor	$Tll,$Zll,$Zll
	xor	$Tlh,$Zlh,$Zlh
	xor	$rem,$Zhh,$Zhh
	stw	$Zll,12($Xi)
	xor	$Thl,$Zhl,$Zhl
	stw	$Zlh,8($Xi)
	xor	$Thh,$Zhh,$Zhh
	stw	$Zhl,4($Xi)
	ldo	16($inp),$inp
	stw	$Zhh,0($Xi)
	comb,<>	$inp,$len,L\$outer_ghash_pa1
	copy	$Zll,$nlo
___
# gcm_ghash_4bit: epilogue.
$code.=<<___;
L\$done_ghash
	$POP	`-$FRAME-$SAVED_RP`(%sp),%r2		; standard epilogue
	$POP	`-$FRAME+1*$SIZE_T`(%sp),%r4
	$POP	`-$FRAME+2*$SIZE_T`(%sp),%r5
	$POP	`-$FRAME+3*$SIZE_T`(%sp),%r6
___
$code.=<<___ if ($SIZE_T==4);
	$POP	`-$FRAME+4*$SIZE_T`(%sp),%r7
	$POP	`-$FRAME+5*$SIZE_T`(%sp),%r8
	$POP	`-$FRAME+6*$SIZE_T`(%sp),%r9
	$POP	`-$FRAME+7*$SIZE_T`(%sp),%r10
	$POP	`-$FRAME+8*$SIZE_T`(%sp),%r11
___
# Return, then the shared reduction table: 16 entries of two words each,
# the first word holding a 16-bit constant shifted into the upper half.
$code.=<<___;
	bv	(%r2)
	.EXIT
	$POPMB	-$FRAME(%sp),%r3
	.PROCEND

	.section	.rodata
	.ALIGN	64
L\$rem_4bit
	.WORD	`0x0000<<16`,0,`0x1C20<<16`,0,`0x3840<<16`,0,`0x2460<<16`,0
	.WORD	`0x7080<<16`,0,`0x6CA0<<16`,0,`0x48C0<<16`,0,`0x54E0<<16`,0
	.WORD	`0xE100<<16`,0,`0xFD20<<16`,0,`0xD940<<16`,0,`0xC560<<16`,0
	.WORD	`0x9180<<16`,0,`0x8DA0<<16`,0,`0xA9C0<<16`,0,`0xB5E0<<16`,0
	.previous

	.ALIGN	64
___

# Explicitly encode PA-RISC 2.0 instructions used in this module, so
# that it can be compiled with .LEVEL 1.0.
# It should be noted that I wouldn't have to do this, if GNU assembler
# understood .ALLOW 2.0 directive...
#
# Hand-assemblers for the PA-RISC 2.0 instructions that appear in the
# generated code.  Each encoder receives ($mod, $args): the completer
# text that directly followed the mnemonic (for example ",u" or ",mb",
# possibly empty) and the operand string.  When the operands match an
# instruction format the encoder knows, it returns a ".WORD 0x........"
# line carrying the original instruction text as a trailing comment;
# otherwise it returns the instruction unchanged (tab-indented).

# Render an already encoded opcode as a .WORD line.
sub _encoded_word {
    my ($opcode, $orig) = @_;
    return sprintf "\t.WORD\t0x%08x\t; %s", $opcode, $orig;
}

# Dispatch table: mnemonic => encoder.  Captures are copied into named
# lexicals right after each match so later matches cannot clobber them.
my %encoder_for = (
    ldd => sub {
        my ($mod, $args) = @_;
        my $orig = "ldd$mod\t$args";

        if ($args =~ /%r([0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {       # format 4
            my ($idx, $base, $dst) = ($1, $2, $3);
            my $opcode = (0x03<<26)|($base<<21)|($idx<<16)|(3<<6)|$dst;
            return _encoded_word($opcode, $orig);
        }
        if ($args =~ /(\-?[0-9]+)\(%r([0-9]+)\),%r([0-9]+)/) {      # format 5
            my ($disp, $base, $dst) = ($1, $2, $3);
            my $opcode = (0x03<<26)|($base<<21)|(1<<12)|(3<<6)|$dst;
            $opcode |= (($disp&0xF)<<17)|(($disp&0x10)<<12);        # encode offset
            $opcode |= (1<<5)  if ($mod =~ /^,m/);
            $opcode |= (1<<13) if ($mod =~ /^,mb/);
            return _encoded_word($opcode, $orig);
        }
        return "\t".$orig;
    },

    std => sub {
        my ($mod, $args) = @_;
        my $orig = "std$mod\t$args";

        if ($args =~ /%r([0-9]+),(\-?[0-9]+)\(%r([0-9]+)\)/) {      # format 3 suffices
            my ($src, $disp, $base) = ($1, $2, $3);
            my $opcode = (0x1c<<26)|($base<<21)|($src<<16)
                       | (($disp&0x1FF8)<<1)|(($disp>>13)&1);
            return _encoded_word($opcode, $orig);
        }
        return "\t".$orig;
    },

    extrd => sub {
        my ($mod, $args) = @_;
        my $orig = "extrd$mod\t$args";

        # I only have ",u" completer, it's implicitly encoded...
        if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 15
            my ($src, $pos, $width, $dst) = ($1, $2, $3, $4);
            my $opcode = (0x36<<26)|($src<<21)|($dst<<16);
            my $clen   = 32-$width;
            $opcode |= (($pos&0x20)<<6)|(($pos&0x1f)<<5);           # encode pos
            $opcode |= (($clen&0x20)<<7)|($clen&0x1f);              # encode len
            return _encoded_word($opcode, $orig);
        }
        if ($args =~ /%r([0-9]+),%sar,([0-9]+),%r([0-9]+)/) {       # format 12
            my ($src, $width, $dst) = ($1, $2, $3);
            my $opcode = (0x34<<26)|($src<<21)|($dst<<16)|(2<<11)|(1<<9);
            my $clen   = 32-$width;
            $opcode |= (($clen&0x20)<<3)|($clen&0x1f);              # encode len
            $opcode |= (1<<13) if ($mod =~ /,\**=/);
            return _encoded_word($opcode, $orig);
        }
        return "\t".$orig;
    },

    shrpd => sub {
        my ($mod, $args) = @_;
        my $orig = "shrpd$mod\t$args";

        if ($args =~ /%r([0-9]+),%r([0-9]+),([0-9]+),%r([0-9]+)/) { # format 14
            my ($hi, $lo, $shift, $dst) = ($1, $2, $3, $4);
            my $opcode = (0x34<<26)|($lo<<21)|($hi<<16)|(1<<10)|$dst;
            my $cpos   = 63-$shift;
            $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode sa
            return _encoded_word($opcode, $orig);
        }
        if ($args =~ /%r([0-9]+),%r([0-9]+),%sar,%r([0-9]+)/) {     # format 11
            my ($hi, $lo, $dst) = ($1, $2, $3);
            return _encoded_word(
                (0x34<<26)|($lo<<21)|($hi<<16)|(1<<9)|$dst, $orig);
        }
        return "\t".$orig;
    },

    depd => sub {
        my ($mod, $args) = @_;
        my $orig = "depd$mod\t$args";

        # I only have ",z" completer, it's implicitly encoded...
        if ($args =~ /%r([0-9]+),([0-9]+),([0-9]+),%r([0-9]+)/) {   # format 16
            my ($src, $pos, $width, $dst) = ($1, $2, $3, $4);
            my $opcode = (0x3c<<26)|($dst<<21)|($src<<16);
            my $cpos   = 63-$pos;
            my $clen   = 32-$width;
            $opcode |= (($cpos&0x20)<<6)|(($cpos&0x1f)<<5);         # encode pos
            $opcode |= (($clen&0x20)<<7)|($clen&0x1f);              # encode len
            return _encoded_word($opcode, $orig);
        }
        return "\t".$orig;
    },
);

# assemble($mnemonic, $mod, $args)
#
# Returns the output line for one instruction: the hand-encoded form if
# an encoder is registered for $mnemonic, otherwise the instruction
# re-joined as "\t<mnemonic><mod>\t<args>".
sub assemble {
    my ($mnemonic, $mod, $args) = @_;
    my $encoder = $encoder_for{$mnemonic};

    return $encoder ? $encoder->($mod, $args) : "\t$mnemonic$mod\t$args";
}

# Post-process the template in $code and print it.
foreach my $line (split("\n", $code)) {
    # `...` spans hold Perl arithmetic over this script's own variables
    # (frame offsets, table constants); evaluate them in place.
    $line =~ s/\`([^\`]*)\`/eval $1/ge;

    if ($SIZE_T == 4) {
        # 32-bit flavour is built with .LEVEL 1.0: encode 2.0 instructions
        # by hand and rewrite 64-bit completers to their 32-bit spelling.
        $line =~ s/^\s+([a-z]+)([\S]*)\s+([\S]*)/assemble($1,$2,$3)/e;
        $line =~ s/cmpb,\*/comb,/;
        $line =~ s/,\*/,/;
    }
    $line =~ s/\bbv\b/bve/ if ($SIZE_T == 8);

    print $line, "\n";
}

# Buffered write errors only surface here; do not let them pass silently.
close STDOUT or die "error closing STDOUT: $!";