/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*	 LAPACK-TEST		: OK
**************************************************************************************/

/**************************************************************************************
* Driver logic for the ZGEMM micro-kernels.
*
* This fragment has no prologue/epilogue of its own.  The register aliases
* (J, I, L, T1, T2, A, B, C, AO, BO, BBO, CO, BBUFFER, LDC, M, N, K, PRE,
* o0/o8/o16, vs4/vs5), the ZCOPYB_*, LOADmxn_1, KERNELmxn_* and SAVEmxn macros
* and the L999 label are all defined outside this file.
*
* Structure visible here:
*   - L2 part: J = N >> 1 passes; each pass copies from B into BBUFFER and
*     then walks M in tiles of 8, 4, 2 and 1.
*   - L1 part: one further pass when N is odd, with the same tile walk.
*
* Every tile runs the same K schedule, with L = K >> 3 groups of eight
* kernel steps:
*   L == 0 : SUB0  - one SUBI1 step, then (K & 7) - 1 SUB1 steps
*   L == 1 : SUB4  - SUBI1 + 7 x SUB1, then K & 7 SUB1 steps
*   L >= 2 : LOOP_START (LOAD, I1, ...), L - 2 passes of LOOP, LOOP_END
*            (finishing with the E2 step), then K & 7 SUB1 steps
*
* NOTE(review): 'andi.' never produces a negative result, so every
* 'ble' that follows one acts as "branch if zero".
**************************************************************************************/


	srawi.		J,	N,	1			// J = N >> 1
	ble		ZGEMM_L2_END				// no full pair of columns

/*------------------------------------------------------------------------------------
* L2: copy B into BBUFFER
*-----------------------------------------------------------------------------------*/

ZGEMM_L2_BEGIN:

	mr		BO,	B
	mr		BBO,	BBUFFER
	srawi.		T1,	K,	2			// T1 = K >> 2
	ble		ZGEMM_L2_COPYB1

ZGEMM_L2_COPYB8:

	addi		T2,	PRE,	128
	dcbt		BO,	PRE				// prefetch source for load
	dcbtst		BBO,	PRE				// prefetch destination for store
	dcbtst		BBO,	T2
	ZCOPYB_8x1
	addic.		T1,	T1,	-1

	bgt		ZGEMM_L2_COPYB8

ZGEMM_L2_COPYB1:

	andi.		T1,	K,	3			// leftover K steps
	ble		ZGEMM_L2_COPYB_END

ZGEMM_L2_COPYB_LOOP:

	ZCOPYB_1x1
	ZCOPYB_1x1
	addic.		T1,	T1,	-1

	bgt		ZGEMM_L2_COPYB_LOOP

ZGEMM_L2_COPYB_END:

	mr		CO,	C
	mr		AO,	A
	slwi		T1,	LDC	,	1		// T1 = 2 * LDC
	add		C,	C,	T1			// C moves on for the next pass
	srawi.		I,	M,	3			// I = M >> 3
	ble		ZGEMM_L2x8_END

/*------------------------------------------------------------------------------------
* L2, tiles of 8 (repeated I times)
*-----------------------------------------------------------------------------------*/

ZGEMM_L2x8_BEGIN:

	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L2x8_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L2x8_SUB4

ZGEMM_L2x8_LOOP_START:

	dcbt		AO,	PRE
	dcbt		BO,	PRE
	LOAD2x8_1
	dcbt		AO,	PRE
	KERNEL2x8_I1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2

	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2

	addic.		L,	L,	-2			// START and END each take one group
	ble		ZGEMM_L2x8_LOOP_END

	.align 5

ZGEMM_L2x8_LOOP:

	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2

	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x8_LOOP

ZGEMM_L2x8_LOOP_END:

	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	dcbt		BO,	PRE
	KERNEL2x8_2

	dcbt		AO,	PRE
	KERNEL2x8_1
	dcbt		AO,	PRE
	KERNEL2x8_2
	dcbt		AO,	PRE
	KERNEL2x8_1
	KERNEL2x8_E2

	b		ZGEMM_L2x8_SUB1

ZGEMM_L2x8_SUB4:

	dcbt		AO,	PRE
	KERNEL2x8_SUBI1
	dcbt		AO,	PRE
	KERNEL2x8_SUB1
	dcbt		AO,	PRE
	KERNEL2x8_SUB1
	dcbt		AO,	PRE
	KERNEL2x8_SUB1

	KERNEL2x8_SUB1
	KERNEL2x8_SUB1
	KERNEL2x8_SUB1
	KERNEL2x8_SUB1

	b		ZGEMM_L2x8_SUB1

ZGEMM_L2x8_SUB0:

	andi.		L,	K,	7

	KERNEL2x8_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L2x8_SAVE
	b		ZGEMM_L2x8_SUB2

ZGEMM_L2x8_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L2x8_SAVE

ZGEMM_L2x8_SUB2:

	KERNEL2x8_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x8_SUB2

ZGEMM_L2x8_SAVE:

	SAVE2x8

	addic.		I,	I,	-1
	bgt		ZGEMM_L2x8_BEGIN

ZGEMM_L2x8_END:

/*------------------------------------------------------------------------------------
* L2, tile of 4 (taken when M & 4 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L2x4_BEGIN:

	andi.		T2,	M,	7
	ble		ZGEMM_L2x1_END				// M is a multiple of 8: no tail tiles

	andi.		T1,	M,	4
	ble		ZGEMM_L2x4_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L2x4_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L2x4_SUB4

ZGEMM_L2x4_LOOP_START:

	LOAD2x4_1
	KERNEL2x4_I1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	addic.		L,	L,	-2
	ble		ZGEMM_L2x4_LOOP_END

	.align 5

ZGEMM_L2x4_LOOP:

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x4_LOOP

ZGEMM_L2x4_LOOP_END:

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_2

	KERNEL2x4_1
	KERNEL2x4_2
	KERNEL2x4_1
	KERNEL2x4_E2

	b		ZGEMM_L2x4_SUB1

ZGEMM_L2x4_SUB4:

	KERNEL2x4_SUBI1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1

	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1
	KERNEL2x4_SUB1

	b		ZGEMM_L2x4_SUB1

ZGEMM_L2x4_SUB0:

	andi.		L,	K,	7

	KERNEL2x4_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L2x4_SAVE
	b		ZGEMM_L2x4_SUB2

ZGEMM_L2x4_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L2x4_SAVE

ZGEMM_L2x4_SUB2:

	KERNEL2x4_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x4_SUB2

ZGEMM_L2x4_SAVE:

	SAVE2x4

ZGEMM_L2x4_END:

/*------------------------------------------------------------------------------------
* L2, tile of 2 (taken when M & 2 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L2x2_BEGIN:

	andi.		T1,	M,	2
	ble		ZGEMM_L2x2_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L2x2_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L2x2_SUB4

ZGEMM_L2x2_LOOP_START:

	LOAD2x2_1
	KERNEL2x2_I1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	addic.		L,	L,	-2
	ble		ZGEMM_L2x2_LOOP_END

	.align 5

ZGEMM_L2x2_LOOP:

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x2_LOOP

ZGEMM_L2x2_LOOP_END:

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_2

	KERNEL2x2_1
	KERNEL2x2_2
	KERNEL2x2_1
	KERNEL2x2_E2

	b		ZGEMM_L2x2_SUB1

ZGEMM_L2x2_SUB4:

	KERNEL2x2_SUBI1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1

	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1
	KERNEL2x2_SUB1

	b		ZGEMM_L2x2_SUB1

ZGEMM_L2x2_SUB0:

	andi.		L,	K,	7

	KERNEL2x2_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L2x2_SAVE
	b		ZGEMM_L2x2_SUB2

ZGEMM_L2x2_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L2x2_SAVE

ZGEMM_L2x2_SUB2:

	KERNEL2x2_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x2_SUB2

ZGEMM_L2x2_SAVE:

	SAVE2x2

ZGEMM_L2x2_END:

/*------------------------------------------------------------------------------------
* L2, tile of 1 (taken when M & 1 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L2x1_BEGIN:

	andi.		T1,	M,	1
	ble		ZGEMM_L2x1_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L2x1_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L2x1_SUB4

ZGEMM_L2x1_LOOP_START:

	LOAD2x1_1
	KERNEL2x1_I1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	addic.		L,	L,	-2
	ble		ZGEMM_L2x1_LOOP_END

	.align 5

ZGEMM_L2x1_LOOP:

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x1_LOOP

ZGEMM_L2x1_LOOP_END:

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_2

	KERNEL2x1_1
	KERNEL2x1_2
	KERNEL2x1_1
	KERNEL2x1_E2

	b		ZGEMM_L2x1_SUB1

ZGEMM_L2x1_SUB4:

	KERNEL2x1_SUBI1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1

	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1
	KERNEL2x1_SUB1

	b		ZGEMM_L2x1_SUB1

ZGEMM_L2x1_SUB0:

	andi.		L,	K,	7

	KERNEL2x1_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L2x1_SAVE
	b		ZGEMM_L2x1_SUB2

ZGEMM_L2x1_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L2x1_SAVE

ZGEMM_L2x1_SUB2:

	KERNEL2x1_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L2x1_SUB2

ZGEMM_L2x1_SAVE:

	SAVE2x1

ZGEMM_L2x1_END:

	slwi		T1,	K,	5			// T1 = K << 5
	add		B,	B,	T1			// B moves on for the next pass

	addic.		J,	J,	-1
	bgt		ZGEMM_L2_BEGIN

	andi.		T2,	N,	1
	ble		L999					// N even: nothing left to do

ZGEMM_L2_END:

	b		ZGEMM_L1_BEGIN

L999_H1:

	b		L999

/*------------------------------------------------------------------------------------
* L1: the single remaining column (N odd)
*-----------------------------------------------------------------------------------*/

ZGEMM_L1_BEGIN:

	// The N & 1 test comes before the copy, so that nothing is read from B
	// or written to BBUFFER when there is no odd column to process.
	andi.		T1,	N,	1
	ble		ZGEMM_L1_END

	mr		BO,	B
	mr		BBO,	BBUFFER
	slwi		T1,	K,	0			// T1 = K (copy counter)

ZGEMM_L1_COPYB:
	dcbtst		BBO,	PRE

	lxvdsx		vs4,	o0,	BO		// b0_r
	lxvdsx		vs5,	o8,	BO		// b0_i
	addi		BO,	BO,	16
	stxvd2x		vs4,	o0,	BBO
	stxvd2x		vs5,	o16,	BBO
	addic.		T1,	T1,	-1
	addi		BBO,	BBO,	32			// addi leaves CR0 from addic. intact

	// bgt, not bge: copy exactly K entries.  bge ran the body K+1 times,
	// reading 16 bytes past the end of the B column and storing one extra
	// 32-byte entry into BBUFFER.
	bgt		ZGEMM_L1_COPYB

	mr		CO,	C
	mr		AO,	A
	srawi.		I,	M,	3			// I = M >> 3
	ble		ZGEMM_L1x8_END

/*------------------------------------------------------------------------------------
* L1, tiles of 8 (repeated I times)
*-----------------------------------------------------------------------------------*/

ZGEMM_L1x8_BEGIN:

	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L1x8_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L1x8_SUB4

ZGEMM_L1x8_LOOP_START:

	dcbt		AO,	PRE
	LOAD1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_I1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2

	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2

	addic.		L,	L,	-2
	ble		ZGEMM_L1x8_LOOP_END

	.align 5

ZGEMM_L1x8_LOOP:

	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2

	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x8_LOOP

ZGEMM_L1x8_LOOP_END:

	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2

	dcbt		AO,	PRE
	KERNEL1x8_1
	dcbt		AO,	PRE
	KERNEL1x8_2
	dcbt		AO,	PRE
	KERNEL1x8_1
	KERNEL1x8_E2

	b		ZGEMM_L1x8_SUB1

ZGEMM_L1x8_SUB4:

	dcbt		AO,	PRE
	KERNEL1x8_SUBI1
	dcbt		AO,	PRE
	KERNEL1x8_SUB1
	dcbt		AO,	PRE
	KERNEL1x8_SUB1
	dcbt		AO,	PRE
	KERNEL1x8_SUB1

	KERNEL1x8_SUB1
	KERNEL1x8_SUB1
	KERNEL1x8_SUB1
	KERNEL1x8_SUB1

	b		ZGEMM_L1x8_SUB1

ZGEMM_L1x8_SUB0:

	andi.		L,	K,	7

	KERNEL1x8_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L1x8_SAVE
	b		ZGEMM_L1x8_SUB2

ZGEMM_L1x8_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L1x8_SAVE

ZGEMM_L1x8_SUB2:

	KERNEL1x8_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x8_SUB2

ZGEMM_L1x8_SAVE:

	SAVE1x8

	addic.		I,	I,	-1
	bgt		ZGEMM_L1x8_BEGIN

ZGEMM_L1x8_END:

/*------------------------------------------------------------------------------------
* L1, tile of 4 (taken when M & 4 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L1x4_BEGIN:

	andi.		T2,	M,	7
	ble		ZGEMM_L1x1_END				// M is a multiple of 8: no tail tiles

	andi.		T1,	M,	4
	ble		ZGEMM_L1x4_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L1x4_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L1x4_SUB4

ZGEMM_L1x4_LOOP_START:

	LOAD1x4_1
	KERNEL1x4_I1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	addic.		L,	L,	-2
	ble		ZGEMM_L1x4_LOOP_END

	.align 5

ZGEMM_L1x4_LOOP:

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x4_LOOP

ZGEMM_L1x4_LOOP_END:

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_2

	KERNEL1x4_1
	KERNEL1x4_2
	KERNEL1x4_1
	KERNEL1x4_E2

	b		ZGEMM_L1x4_SUB1

ZGEMM_L1x4_SUB4:

	KERNEL1x4_SUBI1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1

	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1
	KERNEL1x4_SUB1

	b		ZGEMM_L1x4_SUB1

ZGEMM_L1x4_SUB0:

	andi.		L,	K,	7

	KERNEL1x4_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L1x4_SAVE
	b		ZGEMM_L1x4_SUB2

ZGEMM_L1x4_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L1x4_SAVE

ZGEMM_L1x4_SUB2:

	KERNEL1x4_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x4_SUB2

ZGEMM_L1x4_SAVE:

	SAVE1x4

ZGEMM_L1x4_END:

/*------------------------------------------------------------------------------------
* L1, tile of 2 (taken when M & 2 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L1x2_BEGIN:

	andi.		T1,	M,	2
	ble		ZGEMM_L1x2_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L1x2_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L1x2_SUB4

ZGEMM_L1x2_LOOP_START:

	LOAD1x2_1
	KERNEL1x2_I1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	addic.		L,	L,	-2
	ble		ZGEMM_L1x2_LOOP_END

	.align 5

ZGEMM_L1x2_LOOP:

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x2_LOOP

ZGEMM_L1x2_LOOP_END:

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_2

	KERNEL1x2_1
	KERNEL1x2_2
	KERNEL1x2_1
	KERNEL1x2_E2

	b		ZGEMM_L1x2_SUB1

ZGEMM_L1x2_SUB4:

	KERNEL1x2_SUBI1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1

	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1
	KERNEL1x2_SUB1

	b		ZGEMM_L1x2_SUB1

ZGEMM_L1x2_SUB0:

	andi.		L,	K,	7

	KERNEL1x2_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L1x2_SAVE
	b		ZGEMM_L1x2_SUB2

ZGEMM_L1x2_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L1x2_SAVE

ZGEMM_L1x2_SUB2:

	KERNEL1x2_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x2_SUB2

ZGEMM_L1x2_SAVE:

	SAVE1x2

ZGEMM_L1x2_END:

/*------------------------------------------------------------------------------------
* L1, tile of 1 (taken when M & 1 is set)
*-----------------------------------------------------------------------------------*/

ZGEMM_L1x1_BEGIN:

	andi.		T1,	M,	1
	ble		ZGEMM_L1x1_END
	mr		BO,	BBUFFER
	srawi.		L,	K,	3
	ble		ZGEMM_L1x1_SUB0
	cmpwi		cr0,	L,	1
	ble		ZGEMM_L1x1_SUB4

ZGEMM_L1x1_LOOP_START:

	LOAD1x1_1
	KERNEL1x1_I1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	addic.		L,	L,	-2
	ble		ZGEMM_L1x1_LOOP_END

	.align 5

ZGEMM_L1x1_LOOP:

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x1_LOOP

ZGEMM_L1x1_LOOP_END:

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_2

	KERNEL1x1_1
	KERNEL1x1_2
	KERNEL1x1_1
	KERNEL1x1_E2

	b		ZGEMM_L1x1_SUB1

ZGEMM_L1x1_SUB4:

	KERNEL1x1_SUBI1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1

	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1
	KERNEL1x1_SUB1

	b		ZGEMM_L1x1_SUB1

ZGEMM_L1x1_SUB0:

	andi.		L,	K,	7

	KERNEL1x1_SUBI1

	addic.		L,	L,	-1
	ble		ZGEMM_L1x1_SAVE
	b		ZGEMM_L1x1_SUB2

ZGEMM_L1x1_SUB1:

	andi.		L,	K,	7
	ble		ZGEMM_L1x1_SAVE

ZGEMM_L1x1_SUB2:

	KERNEL1x1_SUB1

	addic.		L,	L,	-1
	bgt		ZGEMM_L1x1_SUB2

ZGEMM_L1x1_SAVE:

	SAVE1x1

ZGEMM_L1x1_END:

ZGEMM_L1_END: