/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Interleaving copy of a strided operand A into a contiguous buffer B.
 *
 * Inputs (integer argument registers, see the #defines below):
 *   M   (r3)  number of elements walked along each stream of A
 *   N   (r4)  number of LDA-strided streams of A
 *   A   (r5)  source pointer
 *   LDA (r6)  distance between streams, in elements; it is shifted left
 *             by BASE_SHIFT on entry, so from then on it is a byte stride
 *   B   (r7)  destination pointer, written strictly sequentially
 *
 * Streams are handled in groups of four (N >> 2 times), then one group
 * of two (N & 2), then a single stream (N & 1).  Within a group, M is
 * consumed in chunks of 8, then 4, 2 and 1 elements.  For each element
 * index the values of all streams in the group end up adjacent in B.
 *
 * Two code paths exist:
 *   - .L11 ... .L99   : taken when both A and the byte LDA are multiples
 *                       of 2 * SIZE; uses paired loads plus fpsel.
 *   - .L100 ... .L999 : otherwise; uses element-wise loads.
 *
 * NOTE(review): SIZE, BASE_SHIFT, SP, CTR, PROLOGUE, PROFCODE, EPILOGUE
 * and the upper-case LF* / STF* mnemonics come from common.h.  They are
 * presumed to map to the corresponding Double Hummer paired / crossed /
 * secondary load-store instructions - confirm against common.h.
 *
 * Stack frame (byte offsets relative to SP on entry):
 *    -16 .. -96   f14 .. f19, one 16-byte slot each (stfpdux)
 *   -100          r31
 *   -104          r30
 *   -108 .. -120  four 32-bit words 0x3f800000, 0xbf800000, 0xbf800000,
 *                 0x3f800000 (single-precision +1, -1, -1, +1) that are
 *                 later loaded into sel_p / sel_s
 * After the pushes SP = entry - 120.  The aligned path then moves SP to
 * entry - 112 while loading the selectors, which is why there are two
 * epilogues: .L99 expects entry - 112, .L999 expects entry - 120.
 */

#define ASSEMBLER
#include "common.h"

/* Arguments */
#define	M	r3
#define	N	r4
#define	A	r5
#define	LDA	r6
#define B	r7

/* Read cursors, one per stream of the current group */
#define AO1	r8
#define AO2	r9
#define AO3	r10
#define AO4	r11

/* Outer (group) loop counter */
#define J	r12

/* Byte increments: one element and two elements */
#define INC	r30
#define INC2	r31

/* Data registers */
#define c01	f0
#define c02	f1
#define c03	f2
#define c04	f3
#define c05	f4
#define c06	f5
#define c07	f6
#define c08	f7
#define c09	f8
#define c10	f9
#define c11	f10
#define c12	f11
#define c13	f12
#define c14	f13
#define c15	f14
#define c16	f15

/* fpsel selectors, built from the +1/-1 words pushed in the prologue */
#define sel_p	f16
#define sel_s	f17

/* Temporaries for fpsel results */
#define c17	f18
#define c18	f19


	PROLOGUE
	PROFCODE

	/* Save f14-f19 (used as c15, c16, sel_p, sel_s, c17, c18). */
	li	r0, -16

	stfpdux	f14, SP, r0
	stfpdux	f15, SP, r0

	stfpdux	f16, SP, r0
	stfpdux	f17, SP, r0
	stfpdux	f18, SP, r0
	stfpdux	f19, SP, r0

	/* Save r31 / r30 (used as INC2 / INC). */
	stwu	r31,  -4(SP)
	stwu	r30,  -4(SP)

	/* r9 = 0x3f800000 (+1.0f), r10 = 0xbf800000 (-1.0f). */
	lis	r9,  0x3f80
	lis	r10, 0xbf80

	/* Memory, ascending from the final SP: +1, -1, -1, +1. */
	stwu	r9,   -4(SP)
	stwu	r10,  -4(SP)
	stwu	r10,  -4(SP)
	stwu	r9,   -4(SP)

	slwi	LDA, LDA, BASE_SHIFT

	/*
	 * Nothing to do for an empty operand.  SP is at entry - 120 here,
	 * which is the frame state .L999 unwinds; .L99 expects the state
	 * left behind by the selector loads below (entry - 112) and must
	 * not be used from this point.
	 */
	cmpwi	cr0, M, 0
	ble-	.L999
	cmpwi	cr0, N, 0
	ble-	.L999

	/* Paired path needs A and the byte stride to be 2*SIZE aligned. */
	andi.	r0, A,   2 * SIZE - 1
	bne	.L100
	andi.	r0, LDA, 2 * SIZE - 1
	bne	.L100

	/*
	 * Load the selectors from the four words pushed above:
	 * sel_p <- {+1, -1}, sel_s <- {-1, +1}.  SP moves from
	 * entry - 120 to entry - 128, then to entry - 120 and finally
	 * to entry - 112 through the two update-form loads.
	 */
	li	r0, 8
	addi	SP, SP, -8

	lfpsux	sel_p, SP, r0
	lfpsux	sel_s, SP, r0

	li	INC,  1 * SIZE
	li	INC2, 2 * SIZE

	/* Pre-bias the pointers because every access is update-form. */
	subi	A, A, 2 * SIZE
	subi	B, B, 2 * SIZE

	/* ---- aligned path, groups of four streams ---- */
	srawi.	J,  N,  2
	ble	.L20
	.align 4
.L11:
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L15
	.align 4

	/* 8 elements from each of 4 streams per iteration. */
.L12:
	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2
	LFPDUX	c09, AO3, INC2
	LFXDUX	c13, AO4, INC2

	LFPDUX	c02, AO1, INC2
	LFXDUX	c06, AO2, INC2
	LFPDUX	c10, AO3, INC2
	LFXDUX	c14, AO4, INC2

	LFPDUX	c03, AO1, INC2
	LFXDUX	c07, AO2, INC2
	LFPDUX	c11, AO3, INC2
	LFXDUX	c15, AO4, INC2

	LFPDUX	c04, AO1, INC2
	LFXDUX	c08, AO2, INC2
	LFPDUX	c12, AO3, INC2
	LFXDUX	c16, AO4, INC2

	/*
	 * Merge each (AO1, AO2) and (AO3, AO4) register pair with fpsel.
	 * The sel_p results are stored with STFPDUX, the sel_s results
	 * with STFXDUX.  Stores are interleaved with the remaining
	 * fpsels; an input register is only overwritten after its last
	 * read.
	 */
	fpsel	c17, sel_p, c01, c05
	fpsel	c18, sel_p, c09, c13
	fpsel	c01, sel_s, c01, c05
	fpsel	c05, sel_s, c09, c13

	fpsel	c09, sel_p, c02, c06
	fpsel	c13, sel_p, c10, c14
	STFPDUX	c17, B, INC2
	fpsel	c02, sel_s, c02, c06
	STFPDUX	c18, B, INC2
	fpsel	c06, sel_s, c10, c14
	STFXDUX	c01, B, INC2

	fpsel	c10, sel_p, c03, c07
	STFXDUX	c05, B, INC2
	fpsel	c14, sel_p, c11, c15
	STFPDUX	c09, B, INC2
	fpsel	c03, sel_s, c03, c07
	STFPDUX	c13, B, INC2
	fpsel	c07, sel_s, c11, c15
	STFXDUX	c02, B, INC2

	fpsel	c11, sel_p, c04, c08
	STFXDUX	c06, B, INC2
	fpsel	c15, sel_p, c12, c16
	STFPDUX	c10, B, INC2
	fpsel	c04, sel_s, c04, c08
	STFPDUX	c14, B, INC2
	fpsel	c08, sel_s, c12, c16
	STFXDUX	c03, B, INC2

	STFXDUX	c07, B, INC2
	STFPDUX	c11, B, INC2
	STFPDUX	c15, B, INC2
	STFXDUX	c04, B, INC2
	STFXDUX	c08, B, INC2
	bdnz	.L12
	.align 4

	/* Remainder of M for the 4-stream group: 4, then 2, then 1. */
.L15:
	andi.	r0,  M,  7
	ble	.L19

	andi.	r0,  M,  4
	beq	.L16

	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2
	LFPDUX	c09, AO3, INC2
	LFXDUX	c13, AO4, INC2

	LFPDUX	c02, AO1, INC2
	LFXDUX	c06, AO2, INC2
	LFPDUX	c10, AO3, INC2
	LFXDUX	c14, AO4, INC2

	fpsel	c17, sel_p, c01, c05
	fpsel	c18, sel_p, c09, c13
	fpsel	c01, sel_s, c01, c05
	fpsel	c05, sel_s, c09, c13

	fpsel	c09, sel_p, c02, c06
	fpsel	c13, sel_p, c10, c14
	STFPDUX	c17, B, INC2
	fpsel	c02, sel_s, c02, c06
	STFPDUX	c18, B, INC2
	fpsel	c06, sel_s, c10, c14
	STFXDUX	c01, B, INC2
	STFXDUX	c05, B, INC2
	STFPDUX	c09, B, INC2
	STFPDUX	c13, B, INC2
	STFXDUX	c02, B, INC2
	STFXDUX	c06, B, INC2
	.align 4

.L16:
	andi.	r0,  M,  2
	beq	.L17

	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2
	LFPDUX	c09, AO3, INC2
	LFXDUX	c13, AO4, INC2

	fpsel	c17, sel_p, c01, c05
	fpsel	c18, sel_p, c09, c13
	fpsel	c01, sel_s, c01, c05
	fpsel	c05, sel_s, c09, c13

	STFPDUX	c17, B, INC2
	STFPDUX	c18, B, INC2
	STFXDUX	c01, B, INC2
	STFXDUX	c05, B, INC2
	.align 4

.L17:
	andi.	r0,  M,  1
	beq	.L19

	/* One element per stream; fsmfp merges two of them per register. */
	LFDUX	c01, AO1, INC2
	LFDUX	c02, AO2, INC2
	LFDUX	c03, AO3, INC2
	LFDUX	c04, AO4, INC2

	fsmfp	c01, c02
	fsmfp	c03, c04

	STFPDUX	c01, B, INC2
	STFPDUX	c03, B, INC2
	.align 4

.L19:
	addic.	J, J, -1
	bgt	.L11
	.align 4

	/* ---- aligned path, group of two streams ---- */
.L20:
	andi.	J,  N,  2
	ble	.L30

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L25
	.align 4

.L22:
	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2
	LFPDUX	c02, AO1, INC2
	LFXDUX	c06, AO2, INC2

	LFPDUX	c03, AO1, INC2
	LFXDUX	c07, AO2, INC2
	LFPDUX	c04, AO1, INC2
	LFXDUX	c08, AO2, INC2

	fpsel	c17, sel_p, c01, c05
	fpsel	c01, sel_s, c01, c05
	fpsel	c09, sel_p, c02, c06
	fpsel	c02, sel_s, c02, c06

	fpsel	c10, sel_p, c03, c07
	fpsel	c03, sel_s, c03, c07
	STFPDUX	c17, B, INC2
	fpsel	c11, sel_p, c04, c08
	STFXDUX	c01, B, INC2
	fpsel	c04, sel_s, c04, c08
	STFPDUX	c09, B, INC2

	STFXDUX	c02, B, INC2
	STFPDUX	c10, B, INC2
	STFXDUX	c03, B, INC2
	STFPDUX	c11, B, INC2
	STFXDUX	c04, B, INC2
	bdnz	.L22
	.align 4

.L25:
	andi.	r0,  M,  7
	ble	.L30

	andi.	r0,  M,  4
	beq	.L26

	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2
	LFPDUX	c02, AO1, INC2
	LFXDUX	c06, AO2, INC2

	fpsel	c17, sel_p, c01, c05
	fpsel	c01, sel_s, c01, c05
	fpsel	c09, sel_p, c02, c06
	fpsel	c02, sel_s, c02, c06

	STFPDUX	c17, B, INC2
	STFXDUX	c01, B, INC2
	STFPDUX	c09, B, INC2
	STFXDUX	c02, B, INC2
	.align 4

.L26:
	andi.	r0,  M,  2
	beq	.L27

	LFPDUX	c01, AO1, INC2
	LFXDUX	c05, AO2, INC2

	fpsel	c17, sel_p, c01, c05
	fpsel	c01, sel_s, c01, c05

	STFPDUX	c17, B, INC2
	STFXDUX	c01, B, INC2
	.align 4

.L27:
	andi.	r0,  M,  1
	beq	.L30

	LFDUX	c01, AO1, INC2
	LFDUX	c02, AO2, INC2

	fsmfp	c01, c02
	STFPDUX	c01, B, INC2
	.align 4

	/* ---- aligned path, single remaining stream: straight copy ---- */
.L30:
	andi.	J,  N,  1
	ble	.L99

	mr	AO1, A

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L35
	.align 4

.L32:
	LFPDUX	c01, AO1, INC2
	LFPDUX	c02, AO1, INC2
	LFPDUX	c03, AO1, INC2
	LFPDUX	c04, AO1, INC2

	STFPDUX	c01, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c04, B, INC2
	bdnz	.L32
	.align 4

.L35:
	andi.	r0,  M,  7
	ble	.L99

	andi.	r0,  M,  4
	beq	.L36

	LFPDUX	c01, AO1, INC2
	LFPDUX	c02, AO1, INC2

	STFPDUX	c01, B, INC2
	STFPDUX	c02, B, INC2
	.align 4

.L36:
	andi.	r0,  M,  2
	beq	.L37

	LFPDUX	c01, AO1, INC2

	STFPDUX	c01, B, INC2
	.align 4

.L37:
	andi.	r0,  M,  1
	beq	.L99

	/* Last element: non-update forms, the pointers are dead after this. */
	LFDX	c01, AO1, INC2
	STFDX	c01, B,   INC2
	.align 4

	/*
	 * Epilogue for the aligned path only.  Requires SP = entry - 112,
	 * i.e. the state left by the two lfpsux selector loads.
	 */
.L99:
	addi	SP, SP, 4

	lwzu	r30,   4(SP)
	lwzu	r31,   4(SP)

	subi	SP, SP, 12
	li	r0, 16

	lfpdux	f19, SP, r0
	lfpdux	f18, SP, r0
	lfpdux	f17, SP, r0
	lfpdux	f16, SP, r0

	lfpdux	f15, SP, r0
	lfpdux	f14, SP, r0
	addi	SP, SP, 16
	blr
	.align 4

	/* ---- unaligned path: element-wise loads, paired stores ---- */
.L100:
	li	INC,  1 * SIZE
	li	INC2, 2 * SIZE

	/* Loads step by one element, stores by two; bias accordingly. */
	subi	A, A, 1 * SIZE
	subi	B, B, 2 * SIZE

	srawi.	J,  N,  2
	ble	.L120
	.align 4
.L111:
	mr	AO1, A
	add	AO2, A,   LDA
	add	AO3, AO2, LDA
	add	AO4, AO3, LDA
	add	A,   AO4, LDA

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L115
	.align 4

	/*
	 * LFDUX fills a register from AO1 / AO3 and LFSDUX then fills the
	 * same register from AO2 / AO4, so c01-c04, c09-c12 combine
	 * AO1 with AO2 and c05-c08, c13-c16 combine AO3 with AO4.
	 */
.L112:
	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	LFDUX	c09, AO1, INC
	LFDUX	c10, AO1, INC
	LFDUX	c11, AO1, INC
	LFDUX	c12, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC
	LFSDUX	c03, AO2, INC
	LFSDUX	c04, AO2, INC

	LFSDUX	c09, AO2, INC
	LFSDUX	c10, AO2, INC
	LFSDUX	c11, AO2, INC
	LFSDUX	c12, AO2, INC

	LFDUX	c05, AO3, INC
	LFDUX	c06, AO3, INC
	LFDUX	c07, AO3, INC
	LFDUX	c08, AO3, INC

	LFDUX	c13, AO3, INC
	LFDUX	c14, AO3, INC
	LFDUX	c15, AO3, INC
	LFDUX	c16, AO3, INC

	LFSDUX	c05, AO4, INC
	LFSDUX	c06, AO4, INC
	LFSDUX	c07, AO4, INC
	LFSDUX	c08, AO4, INC

	LFSDUX	c13, AO4, INC
	LFSDUX	c14, AO4, INC
	LFSDUX	c15, AO4, INC
	LFSDUX	c16, AO4, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c05, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c06, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c07, B, INC2
	STFPDUX	c04, B, INC2
	STFPDUX	c08, B, INC2

	STFPDUX	c09, B, INC2
	STFPDUX	c13, B, INC2
	STFPDUX	c10, B, INC2
	STFPDUX	c14, B, INC2
	STFPDUX	c11, B, INC2
	STFPDUX	c15, B, INC2
	STFPDUX	c12, B, INC2
	STFPDUX	c16, B, INC2
	bdnz	.L112
	.align 4

.L115:
	andi.	r0,  M,  7
	ble	.L119

	andi.	r0,  M,  4
	beq	.L116

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC
	LFSDUX	c03, AO2, INC
	LFSDUX	c04, AO2, INC

	LFDUX	c05, AO3, INC
	LFDUX	c06, AO3, INC
	LFDUX	c07, AO3, INC
	LFDUX	c08, AO3, INC

	LFSDUX	c05, AO4, INC
	LFSDUX	c06, AO4, INC
	LFSDUX	c07, AO4, INC
	LFSDUX	c08, AO4, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c05, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c06, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c07, B, INC2
	STFPDUX	c04, B, INC2
	STFPDUX	c08, B, INC2
	.align 4

.L116:
	andi.	r0,  M,  2
	beq	.L117

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC

	LFDUX	c05, AO3, INC
	LFDUX	c06, AO3, INC

	LFSDUX	c05, AO4, INC
	LFSDUX	c06, AO4, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c05, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c06, B, INC2
	.align 4

.L117:
	andi.	r0,  M,  1
	beq	.L119

	LFDUX	c01, AO1, INC
	LFDUX	c05, AO3, INC

	/* NOTE(review): padding nops kept from the original; purpose not
	   visible here, presumably instruction scheduling. */
	nop
	nop

	LFSDUX	c01, AO2, INC
	LFSDUX	c05, AO4, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c05, B, INC2
	.align 4

.L119:
	addic.	J, J, -1
	bgt	.L111
	.align 4

	/* ---- unaligned path, group of two streams ---- */
.L120:
	andi.	J,  N,  2
	ble	.L130

	mr	AO1, A
	add	AO2, A,   LDA
	add	A,   AO2, LDA

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L125
	.align 4

.L122:
	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	LFDUX	c09, AO1, INC
	LFDUX	c10, AO1, INC
	LFDUX	c11, AO1, INC
	LFDUX	c12, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC
	LFSDUX	c03, AO2, INC
	LFSDUX	c04, AO2, INC

	LFSDUX	c09, AO2, INC
	LFSDUX	c10, AO2, INC
	LFSDUX	c11, AO2, INC
	LFSDUX	c12, AO2, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c04, B, INC2

	STFPDUX	c09, B, INC2
	STFPDUX	c10, B, INC2
	STFPDUX	c11, B, INC2
	STFPDUX	c12, B, INC2
	bdnz	.L122
	.align 4

.L125:
	andi.	r0,  M,  7
	ble	.L130

	andi.	r0,  M,  4
	beq	.L126

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC
	LFSDUX	c03, AO2, INC
	LFSDUX	c04, AO2, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c02, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c04, B, INC2
	.align 4

.L126:
	andi.	r0,  M,  2
	beq	.L127

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC

	LFSDUX	c01, AO2, INC
	LFSDUX	c02, AO2, INC

	STFPDUX	c01, B, INC2
	STFPDUX	c02, B, INC2
	.align 4

.L127:
	andi.	r0,  M,  1
	beq	.L130

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO2, INC

	fsmfp	c01, c02
	STFPDUX	c01, B, INC2
	.align 4

	/* ---- unaligned path, single remaining stream ---- */
.L130:
	andi.	J,  N,  1
	ble	.L999

	mr	AO1, A

	srawi.	r0,  M,  3
	mtspr	CTR, r0
	ble	.L135
	.align 4

.L132:
	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	LFDUX	c05, AO1, INC
	LFDUX	c06, AO1, INC
	LFDUX	c07, AO1, INC
	LFDUX	c08, AO1, INC

	/* Merge consecutive elements pairwise so they can be stored paired. */
	fsmfp	c01, c02
	fsmfp	c03, c04
	fsmfp	c05, c06
	fsmfp	c07, c08

	STFPDUX	c01, B, INC2
	STFPDUX	c03, B, INC2
	STFPDUX	c05, B, INC2
	STFPDUX	c07, B, INC2
	bdnz	.L132
	.align 4

.L135:
	andi.	r0,  M,  7
	ble	.L999

	andi.	r0,  M,  4
	beq	.L136

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC
	LFDUX	c03, AO1, INC
	LFDUX	c04, AO1, INC

	fsmfp	c01, c02
	fsmfp	c03, c04

	STFPDUX	c01, B, INC2
	STFPDUX	c03, B, INC2
	.align 4

.L136:
	andi.	r0,  M,  2
	beq	.L137

	LFDUX	c01, AO1, INC
	LFDUX	c02, AO1, INC

	fsmfp	c01, c02
	STFPDUX	c01, B, INC2
	.align 4

.L137:
	andi.	r0,  M,  1
	beq	.L999

	LFDX	c01, AO1, INC
	STFDX	c01, B,   INC2
	.align 4

	/*
	 * Epilogue for the unaligned path and for the empty-operand early
	 * exits.  Requires SP = entry - 120, i.e. the state right after
	 * the prologue pushes.
	 */
.L999:
	addi	SP, SP, 12

	lwzu	r30,   4(SP)
	lwzu	r31,   4(SP)

	subi	SP, SP, 12
	li	r0, 16

	lfpdux	f19, SP, r0
	lfpdux	f18, SP, r0
	lfpdux	f17, SP, r0
	lfpdux	f16, SP, r0

	lfpdux	f15, SP, r0
	lfpdux	f14, SP, r0
	addi	SP, SP, 16
	blr
	EPILOGUE