1; This Source Code Form is subject to the terms of the Mozilla Public 2; License, v. 2.0. If a copy of the MPL was not distributed with this 3; file, You can obtain one at http://mozilla.org/MPL/2.0/. 4 5#ifdef __LP64__ 6 .LEVEL 2.0W 7#else 8; .LEVEL 1.1 9; .ALLOW 2.0N 10 .LEVEL 2.0 11#endif 12 .SPACE $TEXT$,SORT=8 13 .SUBSPA $CODE$,QUAD=0,ALIGN=4,ACCESS=0x2c,CODE_ONLY,SORT=24 14 15; *************************************************************** 16; 17; maxpy_[little/big] 18; 19; *************************************************************** 20 21; There is no default -- you must specify one or the other. 22#define LITTLE_WORDIAN 1 23 24#ifdef LITTLE_WORDIAN 25#define EIGHT 8 26#define SIXTEEN 16 27#define THIRTY_TWO 32 28#define UN_EIGHT -8 29#define UN_SIXTEEN -16 30#define UN_TWENTY_FOUR -24 31#endif 32 33#ifdef BIG_WORDIAN 34#define EIGHT -8 35#define SIXTEEN -16 36#define THIRTY_TWO -32 37#define UN_EIGHT 8 38#define UN_SIXTEEN 16 39#define UN_TWENTY_FOUR 24 40#endif 41 42; This performs a multiple-precision integer version of "daxpy", 43; Using the selected addressing direction. "Little-wordian" means that 44; the least significant word of a number is stored at the lowest address. 45; "Big-wordian" means that the most significant word is at the lowest 46; address. Either way, the incoming address of the vector is that 47; of the least significant word. That means that, for little-wordian 48; addressing, we move the address upward as we propagate carries 49; from the least significant word to the most significant. For 50; big-wordian we move the address downward. 51 52; We use the following registers: 53; 54; r2 return PC, of course 55; r26 = arg1 = length 56; r25 = arg2 = address of scalar 57; r24 = arg3 = multiplicand vector 58; r23 = arg4 = result vector 59; 60; fr9 = scalar loaded once only from r25 61 62; The cycle counts shown in the bodies below are simply the result of a 63; scheduling by hand. The actual PCX-U hardware does it differently. 64; The intention is that the overall speed is the same. 65 66; The pipeline startup and shutdown code is constructed in the usual way, 67; by taking the loop bodies and removing unnecessary instructions. 68; We have left the comments describing cycle numbers in the code. 69; These are intended for reference when comparing with the main loop, 70; and have no particular relationship to actual cycle numbers. 71 72#ifdef LITTLE_WORDIAN 73maxpy_little 74#else 75maxpy_big 76#endif 77 .PROC 78 .CALLINFO FRAME=120,ENTRY_GR=4 79 .ENTRY 80 STW,MA %r3,128(%sp) 81 STW %r4,-124(%sp) 82 83 ADDIB,< -1,%r26,$L0 ; If N = 0, exit immediately. 84 FLDD 0(%r25),%fr9 ; fr9 = scalar 85 86; First startup 87 88 FLDD 0(%r24),%fr24 ; Cycle 1 89 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 90 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 91 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 92 CMPIB,> 3,%r26,$N_IS_SMALL ; Pick out cases N = 1, 2, or 3 93 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 94 FLDD EIGHT(%r24),%fr28 ; Cycle 8 95 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 96 FSTD %fr24,-96(%sp) 97 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 98 FSTD %fr25,-80(%sp) 99 LDO SIXTEEN(%r24),%r24 ; Cycle 12 100 FSTD %fr31,-64(%sp) 101 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 102 FSTD %fr27,-48(%sp) 103 104; Second startup 105 106 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 107 FSTD %fr30,-56(%sp) 108 FLDD 0(%r24),%fr24 109 110 FSTD %fr26,-88(%sp) ; Cycle 2 111 112 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 113 FSTD %fr28,-104(%sp) 114 115 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 116 LDD -96(%sp),%r3 117 FSTD %fr29,-72(%sp) 118 119 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 120 LDD -64(%sp),%r19 121 LDD -80(%sp),%r21 122 123 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 124 LDD -56(%sp),%r20 125 ADD %r21,%r3,%r3 126 127 ADD,DC %r20,%r19,%r19 ; Cycle 7 128 LDD -88(%sp),%r4 129 SHRPD %r3,%r0,32,%r21 130 LDD -48(%sp),%r1 131 132 FLDD EIGHT(%r24),%fr28 ; Cycle 8 133 LDD -104(%sp),%r31 134 ADD,DC %r0,%r0,%r20 135 SHRPD %r19,%r3,32,%r3 136 137 LDD -72(%sp),%r29 ; Cycle 9 138 SHRPD %r20,%r19,32,%r20 139 ADD %r21,%r1,%r1 140 141 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 142 ADD,DC %r3,%r4,%r4 143 FSTD %fr24,-96(%sp) 144 145 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 146 ADD,DC %r0,%r20,%r20 147 LDD 0(%r23),%r3 148 FSTD %fr25,-80(%sp) 149 150 LDO SIXTEEN(%r24),%r24 ; Cycle 12 151 FSTD %fr31,-64(%sp) 152 153 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 154 ADD %r0,%r0,%r0 ; clear the carry bit 155 ADDIB,<= -4,%r26,$ENDLOOP ; actually happens in cycle 12 156 FSTD %fr27,-48(%sp) 157; MFCTL %cr16,%r21 ; for timing 158; STD %r21,-112(%sp) 159 160; Here is the loop. 161 162$LOOP XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 163 ADD,DC %r29,%r4,%r4 164 FSTD %fr30,-56(%sp) 165 FLDD 0(%r24),%fr24 166 167 LDO SIXTEEN(%r23),%r23 ; Cycle 2 168 ADD,DC %r0,%r20,%r20 169 FSTD %fr26,-88(%sp) 170 171 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 172 ADD %r3,%r1,%r1 173 FSTD %fr28,-104(%sp) 174 LDD UN_EIGHT(%r23),%r21 175 176 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 177 ADD,DC %r21,%r4,%r28 178 FSTD %fr29,-72(%sp) 179 LDD -96(%sp),%r3 180 181 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 182 ADD,DC %r20,%r31,%r22 183 LDD -64(%sp),%r19 184 LDD -80(%sp),%r21 185 186 XMPYU %fr9L,%fr24R,%fr24 ; Cycle 6 187 ADD %r21,%r3,%r3 188 LDD -56(%sp),%r20 189 STD %r1,UN_SIXTEEN(%r23) 190 191 ADD,DC %r20,%r19,%r19 ; Cycle 7 192 SHRPD %r3,%r0,32,%r21 193 LDD -88(%sp),%r4 194 LDD -48(%sp),%r1 195 196 ADD,DC %r0,%r0,%r20 ; Cycle 8 197 SHRPD %r19,%r3,32,%r3 198 FLDD EIGHT(%r24),%fr28 199 LDD -104(%sp),%r31 200 201 SHRPD %r20,%r19,32,%r20 ; Cycle 9 202 ADD %r21,%r1,%r1 203 STD %r28,UN_EIGHT(%r23) 204 LDD -72(%sp),%r29 205 206 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 207 ADD,DC %r3,%r4,%r4 208 FSTD %fr24,-96(%sp) 209 210 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 211 ADD,DC %r0,%r20,%r20 212 FSTD %fr25,-80(%sp) 213 LDD 0(%r23),%r3 214 215 LDO SIXTEEN(%r24),%r24 ; Cycle 12 216 FSTD %fr31,-64(%sp) 217 218 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 219 ADD %r22,%r1,%r1 220 ADDIB,> -2,%r26,$LOOP ; actually happens in cycle 12 221 FSTD %fr27,-48(%sp) 222 223$ENDLOOP 224 225; Shutdown code, first stage. 226 227; MFCTL %cr16,%r21 ; for timing 228; STD %r21,UN_SIXTEEN(%r23) 229; LDD -112(%sp),%r21 230; STD %r21,UN_EIGHT(%r23) 231 232 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 233 ADD,DC %r29,%r4,%r4 234 CMPIB,= 0,%r26,$ONEMORE 235 FSTD %fr30,-56(%sp) 236 237 LDO SIXTEEN(%r23),%r23 ; Cycle 2 238 ADD,DC %r0,%r20,%r20 239 FSTD %fr26,-88(%sp) 240 241 ADD %r3,%r1,%r1 ; Cycle 3 242 FSTD %fr28,-104(%sp) 243 LDD UN_EIGHT(%r23),%r21 244 245 ADD,DC %r21,%r4,%r28 ; Cycle 4 246 FSTD %fr29,-72(%sp) 247 STD %r28,UN_EIGHT(%r23) ; moved up from cycle 9 248 LDD -96(%sp),%r3 249 250 ADD,DC %r20,%r31,%r22 ; Cycle 5 251 STD %r1,UN_SIXTEEN(%r23) 252$JOIN4 253 LDD -64(%sp),%r19 254 LDD -80(%sp),%r21 255 256 ADD %r21,%r3,%r3 ; Cycle 6 257 LDD -56(%sp),%r20 258 259 ADD,DC %r20,%r19,%r19 ; Cycle 7 260 SHRPD %r3,%r0,32,%r21 261 LDD -88(%sp),%r4 262 LDD -48(%sp),%r1 263 264 ADD,DC %r0,%r0,%r20 ; Cycle 8 265 SHRPD %r19,%r3,32,%r3 266 LDD -104(%sp),%r31 267 268 SHRPD %r20,%r19,32,%r20 ; Cycle 9 269 ADD %r21,%r1,%r1 270 LDD -72(%sp),%r29 271 272 ADD,DC %r3,%r4,%r4 ; Cycle 10 273 274 ADD,DC %r0,%r20,%r20 ; Cycle 11 275 LDD 0(%r23),%r3 276 277 ADD %r22,%r1,%r1 ; Cycle 13 278 279; Shutdown code, second stage. 280 281 ADD,DC %r29,%r4,%r4 ; Cycle 1 282 283 LDO SIXTEEN(%r23),%r23 ; Cycle 2 284 ADD,DC %r0,%r20,%r20 285 286 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 287 ADD %r3,%r1,%r1 288 289 ADD,DC %r21,%r4,%r28 ; Cycle 4 290 291 ADD,DC %r20,%r31,%r22 ; Cycle 5 292 293 STD %r1,UN_SIXTEEN(%r23); Cycle 6 294 295 STD %r28,UN_EIGHT(%r23) ; Cycle 9 296 297 LDD 0(%r23),%r3 ; Cycle 11 298 299; Shutdown code, third stage. 300 301 LDO SIXTEEN(%r23),%r23 302 ADD %r3,%r22,%r1 303$JOIN1 ADD,DC %r0,%r0,%r21 304 CMPIB,*= 0,%r21,$L0 ; if no overflow, exit 305 STD %r1,UN_SIXTEEN(%r23) 306 307; Final carry propagation 308 309$FINAL1 LDO EIGHT(%r23),%r23 310 LDD UN_SIXTEEN(%r23),%r21 311 ADDI 1,%r21,%r21 312 CMPIB,*= 0,%r21,$FINAL1 ; Keep looping if there is a carry. 313 STD %r21,UN_SIXTEEN(%r23) 314 B $L0 315 NOP 316 317; Here is the code that handles the difficult cases N=1, N=2, and N=3. 318; We do the usual trick -- branch out of the startup code at appropriate 319; points, and branch into the shutdown code. 320 321$N_IS_SMALL 322 CMPIB,= 0,%r26,$N_IS_ONE 323 FSTD %fr24,-96(%sp) ; Cycle 10 324 FLDD EIGHT(%r24),%fr28 ; Cycle 8 325 XMPYU %fr9L,%fr28R,%fr31 ; Cycle 10 326 XMPYU %fr9R,%fr28L,%fr30 ; Cycle 11 327 FSTD %fr25,-80(%sp) 328 FSTD %fr31,-64(%sp) ; Cycle 12 329 XMPYU %fr9R,%fr28R,%fr29 ; Cycle 13 330 FSTD %fr27,-48(%sp) 331 XMPYU %fr9L,%fr28L,%fr28 ; Cycle 1 332 CMPIB,= 2,%r26,$N_IS_THREE 333 FSTD %fr30,-56(%sp) 334 335; N = 2 336 FSTD %fr26,-88(%sp) ; Cycle 2 337 FSTD %fr28,-104(%sp) ; Cycle 3 338 LDD -96(%sp),%r3 ; Cycle 4 339 FSTD %fr29,-72(%sp) 340 B $JOIN4 341 ADD %r0,%r0,%r22 342 343$N_IS_THREE 344 FLDD SIXTEEN(%r24),%fr24 345 FSTD %fr26,-88(%sp) ; Cycle 2 346 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 347 FSTD %fr28,-104(%sp) 348 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 349 LDD -96(%sp),%r3 350 FSTD %fr29,-72(%sp) 351 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 352 LDD -64(%sp),%r19 353 LDD -80(%sp),%r21 354 B $JOIN3 355 ADD %r0,%r0,%r22 356 357$N_IS_ONE 358 FSTD %fr25,-80(%sp) 359 FSTD %fr27,-48(%sp) 360 FSTD %fr26,-88(%sp) ; Cycle 2 361 B $JOIN5 362 ADD %r0,%r0,%r22 363 364; We came out of the unrolled loop with wrong parity. Do one more 365; single cycle. This is quite tricky, because of the way the 366; carry chains and SHRPD chains have been chopped up. 367 368$ONEMORE 369 370 FLDD 0(%r24),%fr24 371 372 LDO SIXTEEN(%r23),%r23 ; Cycle 2 373 ADD,DC %r0,%r20,%r20 374 FSTD %fr26,-88(%sp) 375 376 XMPYU %fr9R,%fr24R,%fr27 ; Cycle 3 377 FSTD %fr28,-104(%sp) 378 LDD UN_EIGHT(%r23),%r21 379 ADD %r3,%r1,%r1 380 381 XMPYU %fr9R,%fr24L,%fr25 ; Cycle 4 382 ADD,DC %r21,%r4,%r28 383 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 384 LDD -96(%sp),%r3 385 FSTD %fr29,-72(%sp) 386 387 XMPYU %fr9L,%fr24L,%fr26 ; Cycle 5 388 ADD,DC %r20,%r31,%r22 389 LDD -64(%sp),%r19 390 LDD -80(%sp),%r21 391 392 STD %r1,UN_SIXTEEN(%r23); Cycle 6 393$JOIN3 394 XMPYU %fr9L,%fr24R,%fr24 395 LDD -56(%sp),%r20 396 ADD %r21,%r3,%r3 397 398 ADD,DC %r20,%r19,%r19 ; Cycle 7 399 LDD -88(%sp),%r4 400 SHRPD %r3,%r0,32,%r21 401 LDD -48(%sp),%r1 402 403 LDD -104(%sp),%r31 ; Cycle 8 404 ADD,DC %r0,%r0,%r20 405 SHRPD %r19,%r3,32,%r3 406 407 LDD -72(%sp),%r29 ; Cycle 9 408 SHRPD %r20,%r19,32,%r20 409 ADD %r21,%r1,%r1 410 411 ADD,DC %r3,%r4,%r4 ; Cycle 10 412 FSTD %fr24,-96(%sp) 413 414 ADD,DC %r0,%r20,%r20 ; Cycle 11 415 LDD 0(%r23),%r3 416 FSTD %fr25,-80(%sp) 417 418 ADD %r22,%r1,%r1 ; Cycle 13 419 FSTD %fr27,-48(%sp) 420 421; Shutdown code, stage 1-1/2. 422 423 ADD,DC %r29,%r4,%r4 ; Cycle 1 424 425 LDO SIXTEEN(%r23),%r23 ; Cycle 2 426 ADD,DC %r0,%r20,%r20 427 FSTD %fr26,-88(%sp) 428 429 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 430 ADD %r3,%r1,%r1 431 432 ADD,DC %r21,%r4,%r28 ; Cycle 4 433 STD %r28,UN_EIGHT(%r23) ; moved from cycle 9 434 435 ADD,DC %r20,%r31,%r22 ; Cycle 5 436 STD %r1,UN_SIXTEEN(%r23) 437$JOIN5 438 LDD -96(%sp),%r3 ; moved from cycle 4 439 LDD -80(%sp),%r21 440 ADD %r21,%r3,%r3 ; Cycle 6 441 ADD,DC %r0,%r0,%r19 ; Cycle 7 442 LDD -88(%sp),%r4 443 SHRPD %r3,%r0,32,%r21 444 LDD -48(%sp),%r1 445 SHRPD %r19,%r3,32,%r3 ; Cycle 8 446 ADD %r21,%r1,%r1 ; Cycle 9 447 ADD,DC %r3,%r4,%r4 ; Cycle 10 448 LDD 0(%r23),%r3 ; Cycle 11 449 ADD %r22,%r1,%r1 ; Cycle 13 450 451; Shutdown code, stage 2-1/2. 452 453 ADD,DC %r0,%r4,%r4 ; Cycle 1 454 LDO SIXTEEN(%r23),%r23 ; Cycle 2 455 LDD UN_EIGHT(%r23),%r21 ; Cycle 3 456 ADD %r3,%r1,%r1 457 STD %r1,UN_SIXTEEN(%r23) 458 ADD,DC %r21,%r4,%r1 459 B $JOIN1 460 LDO EIGHT(%r23),%r23 461 462; exit 463 464$L0 465 LDW -124(%sp),%r4 466 BVE (%r2) 467 .EXIT 468 LDW,MB -128(%sp),%r3 469 470 .PROCEND 471 472; *************************************************************** 473; 474; add_diag_[little/big] 475; 476; *************************************************************** 477 478; The arguments are as follows: 479; r2 return PC, of course 480; r26 = arg1 = length 481; r25 = arg2 = vector to square 482; r24 = arg3 = result vector 483 484#ifdef LITTLE_WORDIAN 485add_diag_little 486#else 487add_diag_big 488#endif 489 .PROC 490 .CALLINFO FRAME=120,ENTRY_GR=4 491 .ENTRY 492 STW,MA %r3,128(%sp) 493 STW %r4,-124(%sp) 494 495 ADDIB,< -1,%r26,$Z0 ; If N=0, exit immediately. 496 NOP 497 498; Startup code 499 500 FLDD 0(%r25),%fr7 ; Cycle 2 (alternate body) 501 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 502 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 503 XMPYU %fr7L,%fr7L,%fr30 504 LDO SIXTEEN(%r25),%r25 ; Cycle 6 505 FSTD %fr29,-88(%sp) 506 FSTD %fr27,-72(%sp) ; Cycle 7 507 CMPIB,= 0,%r26,$DIAG_N_IS_ONE ; Cycle 1 (main body) 508 FSTD %fr30,-96(%sp) 509 FLDD UN_EIGHT(%r25),%fr7 ; Cycle 2 510 LDD -88(%sp),%r22 ; Cycle 3 511 LDD -72(%sp),%r31 ; Cycle 4 512 XMPYU %fr7R,%fr7R,%fr28 513 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 514 XMPYU %fr7L,%fr7L,%fr31 515 LDD -96(%sp),%r20 ; Cycle 6 516 FSTD %fr28,-80(%sp) 517 ADD %r0,%r0,%r0 ; clear the carry bit 518 ADDIB,<= -2,%r26,$ENDDIAGLOOP ; Cycle 7 519 FSTD %fr24,-64(%sp) 520 521; Here is the loop. It is unrolled twice, modelled after the "alternate body" and then the "main body". 522 523$DIAGLOOP 524 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) 525 LDO SIXTEEN(%r25),%r25 526 LDD 0(%r24),%r1 527 FSTD %fr31,-104(%sp) 528 SHRPD %r0,%r31,31,%r4 ; Cycle 2 529 ADD,DC %r22,%r3,%r3 530 FLDD UN_SIXTEEN(%r25),%fr7 531 ADD,DC %r0,%r20,%r20 ; Cycle 3 532 ADD %r1,%r3,%r3 533 XMPYU %fr7R,%fr7R,%fr29 ; Cycle 4 534 LDD -80(%sp),%r21 535 STD %r3,0(%r24) 536 XMPYU %fr7L,%fr7R,%fr27 ; Cycle 5 537 XMPYU %fr7L,%fr7L,%fr30 538 LDD -64(%sp),%r29 539 LDD EIGHT(%r24),%r1 540 ADD,DC %r4,%r20,%r20 ; Cycle 6 541 LDD -104(%sp),%r19 542 FSTD %fr29,-88(%sp) 543 ADD %r20,%r1,%r1 ; Cycle 7 544 FSTD %fr27,-72(%sp) 545 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 546 LDO THIRTY_TWO(%r24),%r24 547 LDD UN_SIXTEEN(%r24),%r28 548 FSTD %fr30,-96(%sp) 549 SHRPD %r0,%r29,31,%r3 ; Cycle 2 550 ADD,DC %r21,%r4,%r4 551 FLDD UN_EIGHT(%r25),%fr7 552 STD %r1,UN_TWENTY_FOUR(%r24) 553 ADD,DC %r0,%r19,%r19 ; Cycle 3 554 ADD %r28,%r4,%r4 555 XMPYU %fr7R,%fr7R,%fr28 ; Cycle 4 556 LDD -88(%sp),%r22 557 STD %r4,UN_SIXTEEN(%r24) 558 XMPYU %fr7L,%fr7R,%fr24 ; Cycle 5 559 XMPYU %fr7L,%fr7L,%fr31 560 LDD -72(%sp),%r31 561 LDD UN_EIGHT(%r24),%r28 562 ADD,DC %r3,%r19,%r19 ; Cycle 6 563 LDD -96(%sp),%r20 564 FSTD %fr28,-80(%sp) 565 ADD %r19,%r28,%r28 ; Cycle 7 566 FSTD %fr24,-64(%sp) 567 ADDIB,> -2,%r26,$DIAGLOOP ; Cycle 8 568 STD %r28,UN_EIGHT(%r24) 569 570$ENDDIAGLOOP 571 572 ADD,DC %r0,%r22,%r22 573 CMPIB,= 0,%r26,$ONEMOREDIAG 574 SHRPD %r31,%r0,31,%r3 575 576; Shutdown code, first stage. 577 578 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) 579 LDD 0(%r24),%r28 580 SHRPD %r0,%r31,31,%r4 ; Cycle 2 581 ADD %r3,%r22,%r3 582 ADD,DC %r0,%r20,%r20 ; Cycle 3 583 LDD -80(%sp),%r21 584 ADD %r3,%r28,%r3 585 LDD -64(%sp),%r29 ; Cycle 4 586 STD %r3,0(%r24) 587 LDD EIGHT(%r24),%r1 ; Cycle 5 588 LDO SIXTEEN(%r25),%r25 ; Cycle 6 589 LDD -104(%sp),%r19 590 ADD,DC %r4,%r20,%r20 591 ADD %r20,%r1,%r1 ; Cycle 7 592 ADD,DC %r0,%r21,%r21 ; Cycle 8 593 STD %r1,EIGHT(%r24) 594 595; Shutdown code, second stage. 596 597 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 598 LDO THIRTY_TWO(%r24),%r24 599 LDD UN_SIXTEEN(%r24),%r1 600 SHRPD %r0,%r29,31,%r3 ; Cycle 2 601 ADD %r4,%r21,%r4 602 ADD,DC %r0,%r19,%r19 ; Cycle 3 603 ADD %r4,%r1,%r4 604 STD %r4,UN_SIXTEEN(%r24); Cycle 4 605 LDD UN_EIGHT(%r24),%r28 ; Cycle 5 606 ADD,DC %r3,%r19,%r19 ; Cycle 6 607 ADD %r19,%r28,%r28 ; Cycle 7 608 ADD,DC %r0,%r0,%r22 ; Cycle 8 609 CMPIB,*= 0,%r22,$Z0 ; if no overflow, exit 610 STD %r28,UN_EIGHT(%r24) 611 612; Final carry propagation 613 614$FDIAG2 615 LDO EIGHT(%r24),%r24 616 LDD UN_EIGHT(%r24),%r26 617 ADDI 1,%r26,%r26 618 CMPIB,*= 0,%r26,$FDIAG2 ; Keep looping if there is a carry. 619 STD %r26,UN_EIGHT(%r24) 620 621 B $Z0 622 NOP 623 624; Here is the code that handles the difficult case N=1. 625; We do the usual trick -- branch out of the startup code at appropriate 626; points, and branch into the shutdown code. 627 628$DIAG_N_IS_ONE 629 630 LDD -88(%sp),%r22 631 LDD -72(%sp),%r31 632 B $JOINDIAG 633 LDD -96(%sp),%r20 634 635; We came out of the unrolled loop with wrong parity. Do one more 636; single cycle. This is the "alternate body". It will, of course, 637; give us opposite registers from the other case, so we need 638; completely different shutdown code. 639 640$ONEMOREDIAG 641 FSTD %fr31,-104(%sp) ; Cycle 1 (alternate body) 642 LDD 0(%r24),%r28 643 FLDD 0(%r25),%fr7 ; Cycle 2 644 SHRPD %r0,%r31,31,%r4 645 ADD %r3,%r22,%r3 646 ADD,DC %r0,%r20,%r20 ; Cycle 3 647 LDD -80(%sp),%r21 648 ADD %r3,%r28,%r3 649 LDD -64(%sp),%r29 ; Cycle 4 650 STD %r3,0(%r24) 651 XMPYU %fr7R,%fr7R,%fr29 652 LDD EIGHT(%r24),%r1 ; Cycle 5 653 XMPYU %fr7L,%fr7R,%fr27 654 XMPYU %fr7L,%fr7L,%fr30 655 LDD -104(%sp),%r19 ; Cycle 6 656 FSTD %fr29,-88(%sp) 657 ADD,DC %r4,%r20,%r20 658 FSTD %fr27,-72(%sp) ; Cycle 7 659 ADD %r20,%r1,%r1 660 ADD,DC %r0,%r21,%r21 ; Cycle 8 661 STD %r1,EIGHT(%r24) 662 663; Shutdown code, first stage. 664 665 SHRPD %r29,%r0,31,%r4 ; Cycle 1 (main body) 666 LDO THIRTY_TWO(%r24),%r24 667 FSTD %fr30,-96(%sp) 668 LDD UN_SIXTEEN(%r24),%r1 669 SHRPD %r0,%r29,31,%r3 ; Cycle 2 670 ADD %r4,%r21,%r4 671 ADD,DC %r0,%r19,%r19 ; Cycle 3 672 LDD -88(%sp),%r22 673 ADD %r4,%r1,%r4 674 LDD -72(%sp),%r31 ; Cycle 4 675 STD %r4,UN_SIXTEEN(%r24) 676 LDD UN_EIGHT(%r24),%r28 ; Cycle 5 677 LDD -96(%sp),%r20 ; Cycle 6 678 ADD,DC %r3,%r19,%r19 679 ADD %r19,%r28,%r28 ; Cycle 7 680 ADD,DC %r0,%r22,%r22 ; Cycle 8 681 STD %r28,UN_EIGHT(%r24) 682 683; Shutdown code, second stage. 684 685$JOINDIAG 686 SHRPD %r31,%r0,31,%r3 ; Cycle 1 (alternate body) 687 LDD 0(%r24),%r28 688 SHRPD %r0,%r31,31,%r4 ; Cycle 2 689 ADD %r3,%r22,%r3 690 ADD,DC %r0,%r20,%r20 ; Cycle 3 691 ADD %r3,%r28,%r3 692 STD %r3,0(%r24) ; Cycle 4 693 LDD EIGHT(%r24),%r1 ; Cycle 5 694 ADD,DC %r4,%r20,%r20 695 ADD %r20,%r1,%r1 ; Cycle 7 696 ADD,DC %r0,%r0,%r21 ; Cycle 8 697 CMPIB,*= 0,%r21,$Z0 ; if no overflow, exit 698 STD %r1,EIGHT(%r24) 699 700; Final carry propagation 701 702$FDIAG1 703 LDO EIGHT(%r24),%r24 704 LDD EIGHT(%r24),%r26 705 ADDI 1,%r26,%r26 706 CMPIB,*= 0,%r26,$FDIAG1 ; Keep looping if there is a carry. 707 STD %r26,EIGHT(%r24) 708 709$Z0 710 LDW -124(%sp),%r4 711 BVE (%r2) 712 .EXIT 713 LDW,MB -128(%sp),%r3 714 .PROCEND 715; .ALLOW 716 717 .SPACE $TEXT$ 718 .SUBSPA $CODE$ 719#ifdef LITTLE_WORDIAN 720#ifdef __GNUC__ 721; GNU-as (as of 2.19) does not support LONG_RETURN 722 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR 723 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR 724#else 725 .EXPORT maxpy_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN 726 .EXPORT add_diag_little,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN 727#endif 728#else 729 .EXPORT maxpy_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,ARGW3=GR,LONG_RETURN 730 .EXPORT add_diag_big,ENTRY,PRIV_LEV=3,ARGW0=GR,ARGW1=GR,ARGW2=GR,LONG_RETURN 731#endif 732 .END 733 734 735; How to use "maxpy_PA20_little" and "maxpy_PA20_big" 736; 737; The routine "maxpy_PA20_little" or "maxpy_PA20_big" 738; performs a 64-bit x any-size multiply, and adds the 739; result to an area of memory. That is, it performs 740; something like 741; 742; A B C D 743; * Z 744; __________ 745; P Q R S T 746; 747; and then adds the "PQRST" vector into an area of memory, 748; handling all carries. 749; 750; Digression on nomenclature and endian-ness: 751; 752; Each of the capital letters in the above represents a 64-bit 753; quantity. That is, you could think of the discussion as 754; being in terms of radix-16-quintillion arithmetic. The data 755; type being manipulated is "unsigned long long int". This 756; requires the 64-bit extension of the HP-UX C compiler, 757; available at release 10. You need these compiler flags to 758; enable these extensions: 759; 760; -Aa +e +DA2.0 +DS2.0 761; 762; (The first specifies ANSI C, the second enables the 763; extensions, which are beyond ANSI C, and the third and 764; fourth tell the compiler to use whatever features of the 765; PA2.0 architecture it wishes, in order to made the code more 766; efficient. Since the presence of the assembly code will 767; make the program unable to run on anything less than PA2.0, 768; you might as well gain the performance enhancements in the C 769; code as well.) 770; 771; Questions of "endian-ness" often come up, usually in the 772; context of byte ordering in a word. These routines have a 773; similar issue, that could be called "wordian-ness". 774; Independent of byte ordering (PA is always big-endian), one 775; can make two choices when representing extremely large 776; numbers as arrays of 64-bit doublewords in memory. 777; 778; "Little-wordian" layout means that the least significant 779; word of a number is stored at the lowest address. 780; 781; MSW LSW 782; | | 783; V V 784; 785; A B C D E 786; 787; ^ ^ ^ 788; | | |____ address 0 789; | | 790; | |_______address 8 791; | 792; address 32 793; 794; "Big-wordian" means that the most significant word is at the 795; lowest address. 796; 797; MSW LSW 798; | | 799; V V 800; 801; A B C D E 802; 803; ^ ^ ^ 804; | | |____ address 32 805; | | 806; | |_______address 24 807; | 808; address 0 809; 810; When you compile the file, you must specify one or the other, with 811; a switch "-DLITTLE_WORDIAN" or "-DBIG_WORDIAN". 812; 813; Incidentally, you assemble this file as part of your 814; project with the same C compiler as the rest of the program. 815; My "makefile" for a superprecision arithmetic package has 816; the following stuff: 817; 818; # definitions: 819; CC = cc -Aa +e -z +DA2.0 +DS2.0 +w1 820; CFLAGS = +O3 821; LDFLAGS = -L /usr/lib -Wl,-aarchive 822; 823; # general build rule for ".s" files: 824; .s.o: 825; $(CC) $(CFLAGS) -c $< -DBIG_WORDIAN 826; 827; # Now any bind step that calls for pa20.o will assemble pa20.s 828; 829; End of digression, back to arithmetic: 830; 831; The way we multiply two huge numbers is, of course, to multiply 832; the "ABCD" vector by each of the "WXYZ" doublewords, adding 833; the result vectors with increasing offsets, the way we learned 834; in school, back before we all used calculators: 835; 836; A B C D 837; * W X Y Z 838; __________ 839; P Q R S T 840; E F G H I 841; M N O P Q 842; + R S T U V 843; _______________ 844; F I N A L S U M 845; 846; So we call maxpy_PA20_big (in my case; my package is 847; big-wordian) repeatedly, giving the W, X, Y, and Z arguments 848; in turn as the "scalar", and giving the "ABCD" vector each 849; time. We direct it to add its result into an area of memory 850; that we have cleared at the start. We skew the exact 851; location into that area with each call. 852; 853; The prototype for the function is 854; 855; extern void maxpy_PA20_big( 856; int length, /* Number of doublewords in the multiplicand vector. */ 857; const long long int *scalaraddr, /* Address to fetch the scalar. */ 858; const long long int *multiplicand, /* The multiplicand vector. */ 859; long long int *result); /* Where to accumulate the result. */ 860; 861; (You should place a copy of this prototype in an include file 862; or in your C file.) 863; 864; Now, IN ALL CASES, the given address for the multiplicand or 865; the result is that of the LEAST SIGNIFICANT DOUBLEWORD. 866; That word is, of course, the word at which the routine 867; starts processing. "maxpy_PA20_little" then increases the 868; addresses as it computes. "maxpy_PA20_big" decreases them. 869; 870; In our example above, "length" would be 4 in each case. 871; "multiplicand" would be the "ABCD" vector. Specifically, 872; the address of the element "D". "scalaraddr" would be the 873; address of "W", "X", "Y", or "Z" on the four calls that we 874; would make. (The order doesn't matter, of course.) 875; "result" would be the appropriate address in the result 876; area. When multiplying by "Z", that would be the least 877; significant word. When multiplying by "Y", it would be the 878; next higher word (8 bytes higher if little-wordian; 8 bytes 879; lower if big-wordian), and so on. The size of the result 880; area must be the the sum of the sizes of the multiplicand 881; and multiplier vectors, and must be initialized to zero 882; before we start. 883; 884; Whenever the routine adds its partial product into the result 885; vector, it follows carry chains as far as they need to go. 886; 887; Here is the super-precision multiply routine that I use for 888; my package. The package is big-wordian. I have taken out 889; handling of exponents (it's a floating point package): 890; 891; static void mul_PA20( 892; int size, 893; const long long int *arg1, 894; const long long int *arg2, 895; long long int *result) 896; { 897; int i; 898; 899; for (i=0 ; i<2*size ; i++) result[i] = 0ULL; 900; 901; for (i=0 ; i<size ; i++) { 902; maxpy_PA20_big(size, &arg2[i], &arg1[size-1], &result[size+i]); 903; } 904; } 905