1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M %rdi 43#define N %rsi 44#define K %rdx 45 46#define A %rcx 47#define B %r8 48#define C %r9 49#define LDC %r10 50 51#define I %r11 52#define AO %r13 53#define BO %r14 54#define CO1 %r15 55#define CO2 %rbx 56#define KK %rbp 57#define BB %r12 58 59#ifndef WINDOWS_ABI 60 61#define STACKSIZE 128 62 63#define OLD_LDC 8 + STACKSIZE(%rsp) 64#define OLD_OFFSET 16 + STACKSIZE(%rsp) 65 66#define OFFSET 48(%rsp) 67#define J 56(%rsp) 68#define KKK 64(%rsp) 69#define AORIG 72(%rsp) 70 71#else 72 73#define STACKSIZE 256 74 75#define OLD_A 40 + STACKSIZE(%rsp) 76#define OLD_B 48 + STACKSIZE(%rsp) 77#define OLD_C 56 + STACKSIZE(%rsp) 78#define OLD_LDC 64 + STACKSIZE(%rsp) 79#define OLD_OFFSET 72 + STACKSIZE(%rsp) 80 81#define OFFSET 224(%rsp) 82#define J 232(%rsp) 83#define KKK 240(%rsp) 84#define AORIG 248(%rsp) 85 86#endif 87 88#define PREFETCH prefetcht0 89#define PREFETCHSIZE (8 * 8 + 3) 90 91 PROLOGUE 92 PROFCODE 93 94 subq $STACKSIZE, %rsp 95 movq %rbx, 0(%rsp) 96 movq %rbp, 8(%rsp) 97 movq %r12, 16(%rsp) 98 movq %r13, 24(%rsp) 99 movq %r14, 32(%rsp) 100 movq %r15, 40(%rsp) 101 102#ifdef WINDOWS_ABI 103 movq %rdi, 48(%rsp) 104 movq %rsi, 56(%rsp) 105 movups %xmm6, 64(%rsp) 106 movups %xmm7, 80(%rsp) 107 movups %xmm8, 96(%rsp) 108 movups %xmm9, 112(%rsp) 109 movups %xmm10, 128(%rsp) 110 movups %xmm11, 144(%rsp) 111 movups %xmm12, 160(%rsp) 112 movups %xmm13, 176(%rsp) 113 movups %xmm14, 192(%rsp) 114 movups %xmm15, 208(%rsp) 115 116 movq ARG1, M 117 movq ARG2, N 118 movq ARG3, K 119 movq OLD_A, A 120 movq OLD_B, B 121 movq OLD_C, C 122#endif 123 124 movq OLD_LDC, LDC 125 movq OLD_OFFSET, KK 126 127 movq KK, OFFSET 128 129 leaq (, LDC, SIZE), LDC 130 131#ifdef LN 132 leaq (, M, SIZE), %rax 133 addq %rax, C 134 imulq K, %rax 135 addq %rax, A 136#endif 137 138#ifdef RT 139 leaq (, N, SIZE), %rax 140 imulq K, %rax 141 addq %rax, B 142 movq N, 
%rax 143 imulq LDC, %rax 144 addq %rax, C 145#endif 146 147#ifdef RN 148 negq KK 149#endif 150 151#ifdef RT 152 movq N, %rax 153 subq OFFSET, %rax 154 movq %rax, KK 155#endif 156 157 testq $1, N 158 je .L40 159 ALIGN_4 160 161#if defined(LT) || defined(RN) 162 movq A, AO 163#else 164 movq A, AORIG 165#endif 166 167#ifdef RT 168 movq K, %rax 169 salq $0 + BASE_SHIFT, %rax 170 subq %rax, B 171 172 subq LDC, C 173#endif 174 175 movq C, CO1 176#ifndef RT 177 addq LDC, C 178#endif 179 180#ifdef LN 181 movq OFFSET, %rax 182 addq M, %rax 183 movq %rax, KK 184#endif 185 186#ifdef LT 187 movq OFFSET, %rax 188 movq %rax, KK 189#endif 190 191 movq M, I 192 sarq $2, I 193 jle .L50 194 ALIGN_4 195 196.L41: 197#ifdef LN 198 movq K, %rax 199 salq $2 + BASE_SHIFT, %rax 200 subq %rax, AORIG 201#endif 202 203#if defined(LN) || defined(RT) 204 movq KK, %rax 205 leaq (, %rax, SIZE), %rax 206 movq AORIG, AO 207 leaq (AO, %rax, 4), AO 208 leaq (B, %rax, 1), BO 209#else 210 movq B, BO 211#endif 212 213 movsd 0 * SIZE(AO), %xmm0 214 xorps %xmm9, %xmm9 215 movsd 1 * SIZE(AO), %xmm1 216 xorps %xmm11, %xmm11 217 movsd 2 * SIZE(AO), %xmm2 218 xorps %xmm13, %xmm13 219 movsd 3 * SIZE(AO), %xmm3 220 xorps %xmm15, %xmm15 221 222 movsd 0 * SIZE(BO), %xmm4 223 xorps %xmm8, %xmm8 224 movsd 1 * SIZE(BO), %xmm5 225 xorps %xmm10, %xmm10 226 prefetcht0 3 * SIZE(CO1) 227 xorps %xmm12, %xmm12 228 xorps %xmm14, %xmm14 229 230#if defined(LT) || defined(RN) 231 movq KK, %rax 232#else 233 movq K, %rax 234 subq KK, %rax 235#endif 236 sarq $2, %rax 237 je .L45 238 ALIGN_4 239 240.L42: 241 addsd %xmm9, %xmm8 242 movsd 4 * SIZE(AO), %xmm9 243 mulsd %xmm4, %xmm0 244 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 245 246 addsd %xmm11, %xmm10 247 movsd 5 * SIZE(AO), %xmm11 248 mulsd %xmm4, %xmm1 249 250 addsd %xmm13, %xmm12 251 movsd 6 * SIZE(AO), %xmm13 252 mulsd %xmm4, %xmm2 253 254 addsd %xmm15, %xmm14 255 movsd 7 * SIZE(AO), %xmm15 256 mulsd %xmm4, %xmm3 257 movsd 2 * SIZE(BO), %xmm4 258 259 addsd %xmm0, %xmm8 260 movsd 
8 * SIZE(AO), %xmm0 261 mulsd %xmm5, %xmm9 262 263 addsd %xmm1, %xmm10 264 movsd 9 * SIZE(AO), %xmm1 265 mulsd %xmm5, %xmm11 266 267 addsd %xmm2, %xmm12 268 movsd 10 * SIZE(AO), %xmm2 269 mulsd %xmm5, %xmm13 270 271 addsd %xmm3, %xmm14 272 movsd 11 * SIZE(AO), %xmm3 273 mulsd %xmm5, %xmm15 274 movsd 3 * SIZE(BO), %xmm5 275 276 addsd %xmm9, %xmm8 277 movsd 12 * SIZE(AO), %xmm9 278 mulsd %xmm4, %xmm0 279 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 280 281 addsd %xmm11, %xmm10 282 movsd 13 * SIZE(AO), %xmm11 283 mulsd %xmm4, %xmm1 284 285 addsd %xmm13, %xmm12 286 movsd 14 * SIZE(AO), %xmm13 287 mulsd %xmm4, %xmm2 288 289 addsd %xmm15, %xmm14 290 movsd 15 * SIZE(AO), %xmm15 291 mulsd %xmm4, %xmm3 292 movsd 4 * SIZE(BO), %xmm4 293 subq $-16 * SIZE, AO 294 295 addsd %xmm0, %xmm8 296 movsd 0 * SIZE(AO), %xmm0 297 mulsd %xmm5, %xmm9 298 299 addsd %xmm1, %xmm10 300 movsd 1 * SIZE(AO), %xmm1 301 mulsd %xmm5, %xmm11 302 addq $ 4 * SIZE, BO 303 304 addsd %xmm2, %xmm12 305 movsd 2 * SIZE(AO), %xmm2 306 mulsd %xmm5, %xmm13 307 decq %rax 308 309 addsd %xmm3, %xmm14 310 movsd 3 * SIZE(AO), %xmm3 311 mulsd %xmm5, %xmm15 312 movsd 1 * SIZE(BO), %xmm5 313 314 jne .L42 315 ALIGN_4 316 317.L45: 318#if defined(LT) || defined(RN) 319 movq KK, %rax 320#else 321 movq K, %rax 322 subq KK, %rax 323#endif 324 325 addsd %xmm9, %xmm8 326 addsd %xmm11, %xmm10 327 addsd %xmm13, %xmm12 328 addsd %xmm15, %xmm14 329 330 andq $3, %rax 331 BRANCH 332 BRANCH 333 je .L49 334 ALIGN_4 335 336.L46: 337 mulsd %xmm4, %xmm0 338 mulsd %xmm4, %xmm1 339 mulsd %xmm4, %xmm2 340 mulsd %xmm4, %xmm3 341 movsd 1 * SIZE(BO), %xmm4 342 343 addsd %xmm0, %xmm8 344 movsd 4 * SIZE(AO), %xmm0 345 addsd %xmm1, %xmm10 346 movsd 5 * SIZE(AO), %xmm1 347 addsd %xmm2, %xmm12 348 movsd 6 * SIZE(AO), %xmm2 349 addsd %xmm3, %xmm14 350 movsd 7 * SIZE(AO), %xmm3 351 352 addq $4 * SIZE, AO 353 addq $1 * SIZE, BO 354 decq %rax 355 BRANCH 356 jg .L46 357 ALIGN_4 358 359.L49: 360#if defined(LN) || defined(RT) 361 movq KK, %rax 362#ifdef LN 363 
subq $4, %rax 364#else 365 subq $1, %rax 366#endif 367 leaq (, %rax, SIZE), %rax 368 369 movq AORIG, AO 370 leaq (AO, %rax, 4), AO 371 leaq (B, %rax, 1), BO 372#endif 373 374#if defined(LN) || defined(LT) 375 movsd 0 * SIZE(BO), %xmm0 376 movsd 1 * SIZE(BO), %xmm2 377 movsd 2 * SIZE(BO), %xmm4 378 movsd 3 * SIZE(BO), %xmm6 379 380 subsd %xmm8, %xmm0 381 subsd %xmm10, %xmm2 382 subsd %xmm12, %xmm4 383 subsd %xmm14, %xmm6 384#else 385 movsd 0 * SIZE(AO), %xmm0 386 movsd 1 * SIZE(AO), %xmm2 387 movsd 2 * SIZE(AO), %xmm4 388 movsd 3 * SIZE(AO), %xmm6 389 390 subsd %xmm8, %xmm0 391 subsd %xmm10, %xmm2 392 subsd %xmm12, %xmm4 393 subsd %xmm14, %xmm6 394#endif 395 396#ifdef LN 397 movsd 15 * SIZE(AO), %xmm8 398 mulsd %xmm8, %xmm6 399 movsd 14 * SIZE(AO), %xmm9 400 mulsd %xmm6, %xmm9 401 movsd 13 * SIZE(AO), %xmm11 402 subsd %xmm9, %xmm4 403 movsd 12 * SIZE(AO), %xmm13 404 mulsd %xmm6, %xmm11 405 movsd 10 * SIZE(AO), %xmm8 406 subsd %xmm11, %xmm2 407 movsd 9 * SIZE(AO), %xmm9 408 mulsd %xmm6, %xmm13 409 movsd 8 * SIZE(AO), %xmm11 410 subsd %xmm13, %xmm0 411 412 mulsd %xmm8, %xmm4 413 movsd 5 * SIZE(AO), %xmm8 414 mulsd %xmm4, %xmm9 415 subsd %xmm9, %xmm2 416 movsd 4 * SIZE(AO), %xmm9 417 mulsd %xmm4, %xmm11 418 subsd %xmm11, %xmm0 419 movsd 0 * SIZE(AO), %xmm11 420 mulsd %xmm8, %xmm2 421 mulsd %xmm2, %xmm9 422 subsd %xmm9, %xmm0 423 mulsd %xmm11, %xmm0 424#endif 425 426#ifdef LT 427 movsd 0 * SIZE(AO), %xmm8 428 mulsd %xmm8, %xmm0 429 movsd 1 * SIZE(AO), %xmm9 430 mulsd %xmm0, %xmm9 431 movsd 2 * SIZE(AO), %xmm11 432 subsd %xmm9, %xmm2 433 movsd 3 * SIZE(AO), %xmm13 434 mulsd %xmm0, %xmm11 435 movsd 5 * SIZE(AO), %xmm8 436 subsd %xmm11, %xmm4 437 movsd 6 * SIZE(AO), %xmm9 438 mulsd %xmm0, %xmm13 439 movsd 7 * SIZE(AO), %xmm11 440 subsd %xmm13, %xmm6 441 442 mulsd %xmm8, %xmm2 443 movsd 10 * SIZE(AO), %xmm8 444 mulsd %xmm2, %xmm9 445 subsd %xmm9, %xmm4 446 movsd 11 * SIZE(AO), %xmm9 447 mulsd %xmm2, %xmm11 448 subsd %xmm11, %xmm6 449 mulsd %xmm8, %xmm4 450 movsd 15 * 
SIZE(AO), %xmm8 451 mulsd %xmm4, %xmm9 452 subsd %xmm9, %xmm6 453 mulsd %xmm8, %xmm6 454#endif 455 456#if defined(RN) || defined(RT) 457 movsd 0 * SIZE(BO), %xmm8 458 mulsd %xmm8, %xmm0 459 mulsd %xmm8, %xmm2 460 mulsd %xmm8, %xmm4 461 mulsd %xmm8, %xmm6 462#endif 463 464#ifdef LN 465 subq $4 * SIZE, CO1 466#endif 467 468 movsd %xmm0, 0 * SIZE(CO1) 469 movsd %xmm2, 1 * SIZE(CO1) 470 movsd %xmm4, 2 * SIZE(CO1) 471 movsd %xmm6, 3 * SIZE(CO1) 472 473#if defined(LN) || defined(LT) 474 movsd %xmm0, 0 * SIZE(BO) 475 movsd %xmm2, 1 * SIZE(BO) 476 movsd %xmm4, 2 * SIZE(BO) 477 movsd %xmm6, 3 * SIZE(BO) 478#else 479 movsd %xmm0, 0 * SIZE(AO) 480 movsd %xmm2, 1 * SIZE(AO) 481 movsd %xmm4, 2 * SIZE(AO) 482 movsd %xmm6, 3 * SIZE(AO) 483#endif 484 485#ifndef LN 486 addq $4 * SIZE, CO1 487#endif 488 489#if defined(LT) || defined(RN) 490 movq K, %rax 491 subq KK, %rax 492 leaq (,%rax, SIZE), %rax 493 leaq (AO, %rax, 4), AO 494 leaq (BO, %rax, 1), BO 495#endif 496 497#ifdef LN 498 subq $4, KK 499#endif 500 501#ifdef LT 502 addq $4, KK 503#endif 504 505#ifdef RT 506 movq K, %rax 507 salq $2 + BASE_SHIFT, %rax 508 addq %rax, AORIG 509#endif 510 511 decq I # i -- 512 jg .L41 513 ALIGN_4 514 515.L50: 516 testq $2, M 517 je .L60 518 519#ifdef LN 520 movq K, %rax 521 salq $1 + BASE_SHIFT, %rax 522 subq %rax, AORIG 523#endif 524 525#if defined(LN) || defined(RT) 526 movq KK, %rax 527 leaq (, %rax, SIZE), %rax 528 movq AORIG, AO 529 leaq (AO, %rax, 2), AO 530 leaq (B, %rax, 1), BO 531#else 532 movq B, BO 533#endif 534 535 movsd 0 * SIZE(AO), %xmm0 536 xorps %xmm2, %xmm2 537 movsd 1 * SIZE(AO), %xmm1 538 xorps %xmm3, %xmm3 539 540 movsd 0 * SIZE(BO), %xmm4 541 xorps %xmm8, %xmm8 542 movsd 1 * SIZE(BO), %xmm5 543 xorps %xmm10, %xmm10 544 545#if defined(LT) || defined(RN) 546 movq KK, %rax 547#else 548 movq K, %rax 549 subq KK, %rax 550#endif 551 sarq $2, %rax 552 je .L55 553 ALIGN_4 554 555.L52: 556 addsd %xmm2, %xmm8 557 movsd 2 * SIZE(AO), %xmm2 558 mulsd %xmm4, %xmm0 559 PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) 560 561 addsd %xmm3, %xmm10 562 movsd 3 * SIZE(AO), %xmm3 563 mulsd %xmm4, %xmm1 564 movsd 2 * SIZE(BO), %xmm4 565 566 addsd %xmm0, %xmm8 567 movsd 4 * SIZE(AO), %xmm0 568 mulsd %xmm5, %xmm2 569 addq $8 * SIZE, AO 570 571 addsd %xmm1, %xmm10 572 movsd -3 * SIZE(AO), %xmm1 573 mulsd %xmm5, %xmm3 574 movsd 3 * SIZE(BO), %xmm5 575 576 addsd %xmm2, %xmm8 577 movsd -2 * SIZE(AO), %xmm2 578 mulsd %xmm4, %xmm0 579 addq $4 * SIZE, BO 580 581 addsd %xmm3, %xmm10 582 movsd -1 * SIZE(AO), %xmm3 583 mulsd %xmm4, %xmm1 584 movsd 0 * SIZE(BO), %xmm4 585 586 addsd %xmm0, %xmm8 587 movsd 0 * SIZE(AO), %xmm0 588 mulsd %xmm5, %xmm2 589 decq %rax 590 591 addsd %xmm1, %xmm10 592 movsd 1 * SIZE(AO), %xmm1 593 mulsd %xmm5, %xmm3 594 movsd 1 * SIZE(BO), %xmm5 595 596 jne .L52 597 ALIGN_4 598 599.L55: 600#if defined(LT) || defined(RN) 601 movq KK, %rax 602#else 603 movq K, %rax 604 subq KK, %rax 605#endif 606 addsd %xmm2, %xmm8 607 addsd %xmm3, %xmm10 608 609 andq $3, %rax 610 BRANCH 611 je .L59 612 ALIGN_4 613 614.L56: 615 mulsd %xmm4, %xmm0 616 mulsd %xmm4, %xmm1 617 movsd 1 * SIZE(BO), %xmm4 618 619 addsd %xmm0, %xmm8 620 movsd 2 * SIZE(AO), %xmm0 621 addsd %xmm1, %xmm10 622 movsd 3 * SIZE(AO), %xmm1 623 624 addq $2 * SIZE, AO 625 addq $1 * SIZE, BO 626 decq %rax 627 BRANCH 628 jg .L56 629 ALIGN_4 630 631.L59: 632#if defined(LN) || defined(RT) 633 movq KK, %rax 634#ifdef LN 635 subq $2, %rax 636#else 637 subq $1, %rax 638#endif 639 leaq (, %rax, SIZE), %rax 640 641 movq AORIG, AO 642 leaq (AO, %rax, 2), AO 643 leaq (B, %rax, 1), BO 644#endif 645 646#if defined(LN) || defined(LT) 647 movsd 0 * SIZE(BO), %xmm0 648 movsd 1 * SIZE(BO), %xmm2 649 650 subsd %xmm8, %xmm0 651 subsd %xmm10, %xmm2 652#else 653 movsd 0 * SIZE(AO), %xmm0 654 movsd 1 * SIZE(AO), %xmm2 655 656 subsd %xmm8, %xmm0 657 subsd %xmm10, %xmm2 658#endif 659 660#ifdef LN 661 movsd 3 * SIZE(AO), %xmm8 662 movsd 2 * SIZE(AO), %xmm9 663 movsd 0 * SIZE(AO), %xmm11 664 mulsd %xmm8, %xmm2 665 mulsd 
%xmm2, %xmm9 666 subsd %xmm9, %xmm0 667 mulsd %xmm11,%xmm0 668#endif 669 670#ifdef LT 671 movsd 0 * SIZE(AO), %xmm8 672 movsd 1 * SIZE(AO), %xmm9 673 movsd 3 * SIZE(AO), %xmm11 674 mulsd %xmm8, %xmm0 675 mulsd %xmm0, %xmm9 676 subsd %xmm9, %xmm2 677 mulsd %xmm11,%xmm2 678#endif 679 680#if defined(RN) || defined(RT) 681 movsd 0 * SIZE(BO), %xmm8 682 mulsd %xmm8, %xmm0 683 mulsd %xmm8, %xmm2 684#endif 685 686#ifdef LN 687 subq $2 * SIZE, CO1 688#endif 689 690 movsd %xmm0, 0 * SIZE(CO1) 691 movsd %xmm2, 1 * SIZE(CO1) 692 693#if defined(LN) || defined(LT) 694 movsd %xmm0, 0 * SIZE(BO) 695 movsd %xmm2, 1 * SIZE(BO) 696#else 697 movsd %xmm0, 0 * SIZE(AO) 698 movsd %xmm2, 1 * SIZE(AO) 699#endif 700 701#ifndef LN 702 addq $2 * SIZE, CO1 703#endif 704 705#if defined(LT) || defined(RN) 706 movq K, %rax 707 subq KK, %rax 708 leaq (,%rax, SIZE), %rax 709 leaq (AO, %rax, 2), AO 710 leaq (BO, %rax, 1), BO 711#endif 712 713#ifdef LN 714 subq $2, KK 715#endif 716 717#ifdef LT 718 addq $2, KK 719#endif 720 721#ifdef RT 722 movq K, %rax 723 salq $1 + BASE_SHIFT, %rax 724 addq %rax, AORIG 725#endif 726 ALIGN_4 727 728.L60: 729 testq $1, M 730 je .L69 731 732#ifdef LN 733 movq K, %rax 734 salq $0 + BASE_SHIFT, %rax 735 subq %rax, AORIG 736#endif 737 738#if defined(LN) || defined(RT) 739 movq KK, %rax 740 leaq (, %rax, SIZE), %rax 741 movq AORIG, AO 742 leaq (AO, %rax, 1), AO 743 leaq (B, %rax, 1), BO 744#else 745 movq B, BO 746#endif 747 748 movsd 0 * SIZE(AO), %xmm0 749 xorps %xmm5, %xmm5 750 movsd 1 * SIZE(AO), %xmm2 751 xorps %xmm7, %xmm7 752 753 movsd 0 * SIZE(BO), %xmm1 754 xorps %xmm8, %xmm8 755 movsd 1 * SIZE(BO), %xmm3 756 xorps %xmm9, %xmm9 757 movsd 2 * SIZE(AO), %xmm4 758 movsd 3 * SIZE(AO), %xmm6 759 760#if defined(LT) || defined(RN) 761 movq KK, %rax 762#else 763 movq K, %rax 764 subq KK, %rax 765#endif 766 sarq $2, %rax 767 je .L65 768 ALIGN_4 769 770.L62: 771 addsd %xmm5, %xmm8 772 movsd 2 * SIZE(BO), %xmm5 773 mulsd %xmm0, %xmm1 774 movsd 4 * SIZE(AO), %xmm0 775 776 
addsd %xmm7, %xmm9 777 movsd 3 * SIZE(BO), %xmm7 778 mulsd %xmm2, %xmm3 779 movsd 5 * SIZE(AO), %xmm2 780 781 addsd %xmm1, %xmm8 782 movsd 4 * SIZE(BO), %xmm1 783 mulsd %xmm4, %xmm5 784 movsd 6 * SIZE(AO), %xmm4 785 786 addsd %xmm3, %xmm9 787 movsd 5 * SIZE(BO), %xmm3 788 mulsd %xmm6, %xmm7 789 movsd 7 * SIZE(AO), %xmm6 790 791 addq $4 * SIZE, AO 792 addq $4 * SIZE, BO 793 794 decq %rax 795 jne .L62 796 797 addsd %xmm5, %xmm8 798 addsd %xmm7, %xmm9 799 ALIGN_4 800 801.L65: 802#if defined(LT) || defined(RN) 803 movq KK, %rax 804#else 805 movq K, %rax 806 subq KK, %rax 807#endif 808 andq $3, %rax 809 BRANCH 810 je .L68 811 ALIGN_4 812 813.L66: 814 movsd 0 * SIZE(AO), %xmm0 815 movsd 0 * SIZE(BO), %xmm1 816 817 mulsd %xmm0, %xmm1 818 addsd %xmm1, %xmm8 819 820 addq $1 * SIZE, AO 821 addq $1 * SIZE, BO 822 823 decq %rax 824 BRANCH 825 jg .L66 826 ALIGN_4 827 828.L68: 829 addsd %xmm9, %xmm8 830 831#if defined(LN) || defined(RT) 832 movq KK, %rax 833#ifdef LN 834 subq $1, %rax 835#else 836 subq $1, %rax 837#endif 838 leaq (, %rax, SIZE), %rax 839 840 movq AORIG, AO 841 leaq (AO, %rax, 1), AO 842 leaq (B, %rax, 1), BO 843#endif 844 845#if defined(LN) || defined(LT) 846 movsd 0 * SIZE(BO), %xmm0 847 subsd %xmm8, %xmm0 848#else 849 movsd 0 * SIZE(AO), %xmm0 850 subsd %xmm8, %xmm0 851#endif 852 853#if defined(LN) || defined(LT) 854 movsd 0 * SIZE(AO), %xmm8 855 mulsd %xmm8, %xmm0 856#endif 857 858#if defined(RN) || defined(RT) 859 movsd 0 * SIZE(BO), %xmm8 860 mulsd %xmm8, %xmm0 861#endif 862 863#ifdef LN 864 subq $1 * SIZE, CO1 865#endif 866 867 movsd %xmm0, 0 * SIZE(CO1) 868 869#if defined(LN) || defined(LT) 870 movsd %xmm0, 0 * SIZE(BO) 871#else 872 movsd %xmm0, 0 * SIZE(AO) 873#endif 874 875#ifndef LN 876 addq $1 * SIZE, CO1 877#endif 878 879#if defined(LT) || defined(RN) 880 movq K, %rax 881 subq KK, %rax 882 leaq (,%rax, SIZE), %rax 883 leaq (AO, %rax, 1), AO 884 leaq (BO, %rax, 1), BO 885#endif 886 887#ifdef LN 888 subq $1, KK 889#endif 890 891#ifdef LT 892 addq $1, 
KK 893#endif 894 895#ifdef RT 896 movq K, %rax 897 salq $0 + BASE_SHIFT, %rax 898 addq %rax, AORIG 899#endif 900 ALIGN_4 901 902.L69: 903#ifdef LN 904 leaq (, K, SIZE), %rax 905 leaq (B, %rax, 1), B 906#endif 907 908#if defined(LT) || defined(RN) 909 movq BO, B 910#endif 911 912#ifdef RN 913 addq $1, KK 914#endif 915 916#ifdef RT 917 subq $1, KK 918#endif 919 ALIGN_2 920 921.L40: 922 movq N, J 923 sarq $1, J 924 jle .L999 925 ALIGN_4 926 927.L10: 928#if defined(LT) || defined(RN) 929 movq A, AO 930#else 931 movq A, AORIG 932#endif 933 934#ifdef RT 935 movq K, %rax 936 salq $1 + BASE_SHIFT, %rax 937 subq %rax, B 938 939 leaq (, LDC, 2), %rax 940 subq %rax, C 941#endif 942 943 movq C, CO1 944 leaq (C, LDC, 1), CO2 945#ifndef RT 946 leaq (C, LDC, 2), C 947#endif 948 949#ifdef LN 950 movq OFFSET, %rax 951 addq M, %rax 952 movq %rax, KK 953#endif 954 955 movq K, %rax 956 salq $BASE_SHIFT + 1, %rax 957 leaq (B, %rax), BB 958 959#ifdef LT 960 movq OFFSET, %rax 961 movq %rax, KK 962#endif 963 964 movq M, I 965 sarq $2, I 966 jle .L20 967 ALIGN_4 968 969.L11: 970#ifdef LN 971 movq K, %rax 972 salq $2 + BASE_SHIFT, %rax 973 subq %rax, AORIG 974#endif 975 976#if defined(LN) || defined(RT) 977 movq KK, %rax 978 leaq (, %rax, SIZE), %rax 979 movq AORIG, AO 980 leaq (AO, %rax, 4), AO 981 leaq (B, %rax, 2), BO 982#else 983 movq B, BO 984#endif 985 986 prefetcht0 0 * SIZE(BB) 987 subq $-8 * SIZE, BB 988 989 movsd 0 * SIZE(AO), %xmm0 990 xorps %xmm2, %xmm2 991 movsd 1 * SIZE(AO), %xmm4 992 xorps %xmm5, %xmm5 993 movsd 2 * SIZE(AO), %xmm5 994 xorps %xmm6, %xmm6 995 xorps %xmm7, %xmm7 996 997 movsd 0 * SIZE(BO), %xmm1 998 xorps %xmm8, %xmm8 999 xorps %xmm9, %xmm9 1000 movsd 1 * SIZE(BO), %xmm3 1001 xorps %xmm10, %xmm10 1002 xorps %xmm11, %xmm11 1003 1004 prefetcht0 3 * SIZE(CO1) 1005 xorps %xmm12, %xmm12 1006 xorps %xmm13, %xmm13 1007 prefetcht0 3 * SIZE(CO2) 1008 xorps %xmm14, %xmm14 1009 xorps %xmm15, %xmm15 1010 1011#if defined(LT) || defined(RN) 1012 movq KK, %rax 1013#else 1014 
movq K, %rax 1015 subq KK, %rax 1016#endif 1017 sarq $2, %rax 1018 je .L15 1019 ALIGN_4 1020 1021.L12: 1022 addsd %xmm2, %xmm13 1023 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1024 movaps %xmm0, %xmm2 1025 mulsd %xmm1, %xmm0 1026 1027 addsd %xmm7, %xmm14 1028 movsd 3 * SIZE(AO), %xmm7 1029 mulsd %xmm3, %xmm2 1030 1031 addsd %xmm6, %xmm15 1032 PREFETCH (PREFETCHSIZE + 0) * SIZE(BO) 1033 movaps %xmm4, %xmm6 1034 mulsd %xmm1, %xmm4 1035 1036 addsd %xmm0, %xmm8 1037 movsd 4 * SIZE(AO), %xmm0 1038 mulsd %xmm3, %xmm6 1039 1040 addsd %xmm2, %xmm9 1041 movaps %xmm5, %xmm2 1042 mulsd %xmm1, %xmm5 1043 1044 addsd %xmm4, %xmm10 1045 movsd 5 * SIZE(AO), %xmm4 1046 mulsd %xmm3, %xmm2 1047 1048 addsd %xmm6, %xmm11 1049 movaps %xmm7, %xmm6 1050 mulsd %xmm1, %xmm7 1051 movsd 2 * SIZE(BO), %xmm1 1052 1053 addsd %xmm5, %xmm12 1054 movsd 6 * SIZE(AO), %xmm5 1055 mulsd %xmm3, %xmm6 1056 movsd 3 * SIZE(BO), %xmm3 1057 1058 addsd %xmm2, %xmm13 1059 movaps %xmm0, %xmm2 1060 mulsd %xmm1, %xmm0 1061 1062 addsd %xmm7, %xmm14 1063 movsd 7 * SIZE(AO), %xmm7 1064 mulsd %xmm3, %xmm2 1065 1066 addsd %xmm6, %xmm15 1067 movaps %xmm4, %xmm6 1068 mulsd %xmm1, %xmm4 1069 1070 addsd %xmm0, %xmm8 1071 movsd 8 * SIZE(AO), %xmm0 1072 mulsd %xmm3, %xmm6 1073 1074 addsd %xmm2, %xmm9 1075 movaps %xmm5, %xmm2 1076 mulsd %xmm1, %xmm5 1077 1078 addsd %xmm4, %xmm10 1079 movsd 9 * SIZE(AO), %xmm4 1080 mulsd %xmm3, %xmm2 1081 1082 addsd %xmm6, %xmm11 1083 movaps %xmm7, %xmm6 1084 mulsd %xmm1, %xmm7 1085 movsd 4 * SIZE(BO), %xmm1 1086 1087 addsd %xmm5, %xmm12 1088 movsd 10 * SIZE(AO), %xmm5 1089 mulsd %xmm3, %xmm6 1090 movsd 5 * SIZE(BO), %xmm3 1091 1092 addsd %xmm2, %xmm13 1093 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1094 movaps %xmm0, %xmm2 1095 mulsd %xmm1, %xmm0 1096 1097 addsd %xmm7, %xmm14 1098 movsd 11 * SIZE(AO), %xmm7 1099 mulsd %xmm3, %xmm2 1100 1101 addsd %xmm6, %xmm15 1102 movaps %xmm4, %xmm6 1103 mulsd %xmm1, %xmm4 1104 1105 addsd %xmm0, %xmm8 1106 movsd 12 * SIZE(AO), %xmm0 1107 mulsd %xmm3, %xmm6 1108 
1109 addsd %xmm2, %xmm9 1110 movaps %xmm5, %xmm2 1111 mulsd %xmm1, %xmm5 1112 1113 addsd %xmm4, %xmm10 1114 movsd 13 * SIZE(AO), %xmm4 1115 mulsd %xmm3, %xmm2 1116 1117 addsd %xmm6, %xmm11 1118 movaps %xmm7, %xmm6 1119 mulsd %xmm1, %xmm7 1120 movsd 6 * SIZE(BO), %xmm1 1121 1122 addsd %xmm5, %xmm12 1123 movsd 14 * SIZE(AO), %xmm5 1124 mulsd %xmm3, %xmm6 1125 movsd 7 * SIZE(BO), %xmm3 1126 1127 addsd %xmm2, %xmm13 1128 movaps %xmm0, %xmm2 1129 mulsd %xmm1, %xmm0 1130 1131 addsd %xmm7, %xmm14 1132 movsd 15 * SIZE(AO), %xmm7 1133 mulsd %xmm3, %xmm2 1134 subq $-16 * SIZE, AO 1135 1136 addsd %xmm6, %xmm15 1137 movaps %xmm4, %xmm6 1138 mulsd %xmm1, %xmm4 1139 1140 addsd %xmm0, %xmm8 1141 movsd 0 * SIZE(AO), %xmm0 1142 mulsd %xmm3, %xmm6 1143 1144 addsd %xmm2, %xmm9 1145 movaps %xmm5, %xmm2 1146 mulsd %xmm1, %xmm5 1147 addq $ 8 * SIZE, BO 1148 1149 addsd %xmm4, %xmm10 1150 movsd 1 * SIZE(AO), %xmm4 1151 mulsd %xmm3, %xmm2 1152 decq %rax 1153 1154 addsd %xmm6, %xmm11 1155 movaps %xmm7, %xmm6 1156 mulsd %xmm1, %xmm7 1157 movsd 0 * SIZE(BO), %xmm1 1158 1159 addsd %xmm5, %xmm12 1160 movsd 2 * SIZE(AO), %xmm5 1161 mulsd %xmm3, %xmm6 1162 movsd 1 * SIZE(BO), %xmm3 1163 1164 jne .L12 1165 ALIGN_4 1166 1167.L15: 1168#if defined(LT) || defined(RN) 1169 movq KK, %rax 1170#else 1171 movq K, %rax 1172 subq KK, %rax 1173#endif 1174 andq $3, %rax 1175 BRANCH 1176 je .L19 1177 ALIGN_4 1178 1179.L16: 1180 addsd %xmm2, %xmm13 1181 movaps %xmm0, %xmm2 1182 mulsd %xmm1, %xmm0 1183 1184 addsd %xmm7, %xmm14 1185 movsd 3 * SIZE(AO), %xmm7 1186 mulsd %xmm3, %xmm2 1187 1188 addsd %xmm6, %xmm15 1189 movaps %xmm4, %xmm6 1190 mulsd %xmm1, %xmm4 1191 1192 addsd %xmm0, %xmm8 1193 movsd 4 * SIZE(AO), %xmm0 1194 mulsd %xmm3, %xmm6 1195 1196 addsd %xmm2, %xmm9 1197 movaps %xmm5, %xmm2 1198 mulsd %xmm1, %xmm5 1199 1200 addsd %xmm4, %xmm10 1201 movsd 5 * SIZE(AO), %xmm4 1202 mulsd %xmm3, %xmm2 1203 1204 addsd %xmm6, %xmm11 1205 movaps %xmm7, %xmm6 1206 mulsd %xmm1, %xmm7 1207 movsd 2 * SIZE(BO), %xmm1 1208 
1209 addsd %xmm5, %xmm12 1210 movsd 6 * SIZE(AO), %xmm5 1211 mulsd %xmm3, %xmm6 1212 movsd 3 * SIZE(BO), %xmm3 1213 1214 addq $4 * SIZE, AO 1215 addq $2 * SIZE, BO 1216 decq %rax 1217 BRANCH 1218 jg .L16 1219 ALIGN_4 1220 1221.L19: 1222 addsd %xmm2, %xmm13 1223 addsd %xmm7, %xmm14 1224 addsd %xmm6, %xmm15 1225 1226#if defined(LN) || defined(RT) 1227 movq KK, %rax 1228#ifdef LN 1229 subq $4, %rax 1230#else 1231 subq $2, %rax 1232#endif 1233 1234 leaq (, %rax, SIZE), %rax 1235 1236 movq AORIG, AO 1237 leaq (AO, %rax, 4), AO 1238 leaq (B, %rax, 2), BO 1239#endif 1240 1241#if defined(LN) || defined(LT) 1242 movsd 0 * SIZE(BO), %xmm0 1243 movsd 1 * SIZE(BO), %xmm1 1244 movsd 2 * SIZE(BO), %xmm2 1245 movsd 3 * SIZE(BO), %xmm3 1246 movsd 4 * SIZE(BO), %xmm4 1247 movsd 5 * SIZE(BO), %xmm5 1248 movsd 6 * SIZE(BO), %xmm6 1249 movsd 7 * SIZE(BO), %xmm7 1250 1251 subsd %xmm8, %xmm0 1252 subsd %xmm9, %xmm1 1253 subsd %xmm10, %xmm2 1254 subsd %xmm11, %xmm3 1255 subsd %xmm12, %xmm4 1256 subsd %xmm13, %xmm5 1257 subsd %xmm14, %xmm6 1258 subsd %xmm15, %xmm7 1259#else 1260 movsd 0 * SIZE(AO), %xmm0 1261 movsd 1 * SIZE(AO), %xmm2 1262 movsd 2 * SIZE(AO), %xmm4 1263 movsd 3 * SIZE(AO), %xmm6 1264 1265 movsd 4 * SIZE(AO), %xmm1 1266 movsd 5 * SIZE(AO), %xmm3 1267 movsd 6 * SIZE(AO), %xmm5 1268 movsd 7 * SIZE(AO), %xmm7 1269 1270 subsd %xmm8, %xmm0 1271 subsd %xmm10, %xmm2 1272 subsd %xmm12, %xmm4 1273 subsd %xmm14, %xmm6 1274 subsd %xmm9, %xmm1 1275 subsd %xmm11, %xmm3 1276 subsd %xmm13, %xmm5 1277 subsd %xmm15, %xmm7 1278#endif 1279 1280#ifdef LN 1281 movsd 15 * SIZE(AO), %xmm8 1282 mulsd %xmm8, %xmm6 1283 movsd 14 * SIZE(AO), %xmm9 1284 mulsd %xmm8, %xmm7 1285 movsd 13 * SIZE(AO), %xmm11 1286 1287 movaps %xmm9, %xmm10 1288 movsd 12 * SIZE(AO), %xmm13 1289 mulsd %xmm6, %xmm9 1290 movsd 10 * SIZE(AO), %xmm8 1291 mulsd %xmm7, %xmm10 1292 subsd %xmm9, %xmm4 1293 movsd 9 * SIZE(AO), %xmm9 1294 subsd %xmm10, %xmm5 1295 1296 movaps %xmm11, %xmm12 1297 mulsd %xmm6, %xmm11 1298 mulsd %xmm7, 
%xmm12 1299 subsd %xmm11, %xmm2 1300 movsd 8 * SIZE(AO), %xmm11 1301 subsd %xmm12, %xmm3 1302 1303 movaps %xmm13, %xmm14 1304 mulsd %xmm6, %xmm13 1305 mulsd %xmm7, %xmm14 1306 subsd %xmm13, %xmm0 1307 subsd %xmm14, %xmm1 1308 1309 mulsd %xmm8, %xmm4 1310 mulsd %xmm8, %xmm5 1311 movsd 5 * SIZE(AO), %xmm8 1312 1313 movaps %xmm9, %xmm10 1314 mulsd %xmm4, %xmm9 1315 mulsd %xmm5, %xmm10 1316 subsd %xmm9, %xmm2 1317 movsd 4 * SIZE(AO), %xmm9 1318 subsd %xmm10, %xmm3 1319 1320 movaps %xmm11, %xmm12 1321 mulsd %xmm4, %xmm11 1322 mulsd %xmm5, %xmm12 1323 subsd %xmm11, %xmm0 1324 movsd 0 * SIZE(AO), %xmm11 1325 subsd %xmm12, %xmm1 1326 1327 mulsd %xmm8, %xmm2 1328 mulsd %xmm8, %xmm3 1329 1330 movaps %xmm9, %xmm10 1331 mulsd %xmm2, %xmm9 1332 mulsd %xmm3, %xmm10 1333 subsd %xmm9, %xmm0 1334 subsd %xmm10, %xmm1 1335 1336 mulsd %xmm11, %xmm0 1337 mulsd %xmm11, %xmm1 1338#endif 1339 1340#ifdef LT 1341 movsd 0 * SIZE(AO), %xmm8 1342 mulsd %xmm8, %xmm0 1343 movsd 1 * SIZE(AO), %xmm9 1344 mulsd %xmm8, %xmm1 1345 1346 movsd 2 * SIZE(AO), %xmm11 1347 movaps %xmm9, %xmm10 1348 movsd 3 * SIZE(AO), %xmm13 1349 mulsd %xmm0, %xmm9 1350 movsd 5 * SIZE(AO), %xmm8 1351 mulsd %xmm1, %xmm10 1352 subsd %xmm9, %xmm2 1353 movsd 6 * SIZE(AO), %xmm9 1354 subsd %xmm10, %xmm3 1355 1356 movaps %xmm11, %xmm12 1357 mulsd %xmm0, %xmm11 1358 mulsd %xmm1, %xmm12 1359 subsd %xmm11, %xmm4 1360 movsd 7 * SIZE(AO), %xmm11 1361 subsd %xmm12, %xmm5 1362 1363 movaps %xmm13, %xmm14 1364 mulsd %xmm0, %xmm13 1365 mulsd %xmm1, %xmm14 1366 subsd %xmm13, %xmm6 1367 subsd %xmm14, %xmm7 1368 1369 mulsd %xmm8, %xmm2 1370 mulsd %xmm8, %xmm3 1371 movsd 10 * SIZE(AO), %xmm8 1372 1373 movaps %xmm9, %xmm10 1374 mulsd %xmm2, %xmm9 1375 mulsd %xmm3, %xmm10 1376 subsd %xmm9, %xmm4 1377 movsd 11 * SIZE(AO), %xmm9 1378 subsd %xmm10, %xmm5 1379 1380 movaps %xmm11, %xmm12 1381 mulsd %xmm2, %xmm11 1382 mulsd %xmm3, %xmm12 1383 subsd %xmm11, %xmm6 1384 subsd %xmm12, %xmm7 1385 1386 mulsd %xmm8, %xmm4 1387 mulsd %xmm8, %xmm5 1388 movsd 
15 * SIZE(AO), %xmm8 1389 1390 movaps %xmm9, %xmm10 1391 mulsd %xmm4, %xmm9 1392 mulsd %xmm5, %xmm10 1393 subsd %xmm9, %xmm6 1394 subsd %xmm10, %xmm7 1395 1396 mulsd %xmm8, %xmm6 1397 mulsd %xmm8, %xmm7 1398#endif 1399 1400#ifdef RN 1401 movsd 0 * SIZE(BO), %xmm8 1402 mulsd %xmm8, %xmm0 1403 movsd 1 * SIZE(BO), %xmm9 1404 mulsd %xmm8, %xmm2 1405 movsd 3 * SIZE(BO), %xmm13 1406 mulsd %xmm8, %xmm4 1407 mulsd %xmm8, %xmm6 1408 1409 movaps %xmm9, %xmm10 1410 movaps %xmm9, %xmm11 1411 movaps %xmm9, %xmm12 1412 1413 mulsd %xmm0, %xmm9 1414 mulsd %xmm2, %xmm10 1415 mulsd %xmm4, %xmm11 1416 mulsd %xmm6, %xmm12 1417 1418 subsd %xmm9, %xmm1 1419 subsd %xmm10, %xmm3 1420 subsd %xmm11, %xmm5 1421 subsd %xmm12, %xmm7 1422 1423 mulsd %xmm13, %xmm1 1424 mulsd %xmm13, %xmm3 1425 mulsd %xmm13, %xmm5 1426 mulsd %xmm13, %xmm7 1427#endif 1428 1429#ifdef RT 1430 movsd 3 * SIZE(BO), %xmm8 1431 mulsd %xmm8, %xmm1 1432 movsd 2 * SIZE(BO), %xmm9 1433 mulsd %xmm8, %xmm3 1434 movsd 0 * SIZE(BO), %xmm13 1435 mulsd %xmm8, %xmm5 1436 mulsd %xmm8, %xmm7 1437 1438 movaps %xmm9, %xmm10 1439 movaps %xmm9, %xmm11 1440 movaps %xmm9, %xmm12 1441 1442 mulsd %xmm1, %xmm9 1443 mulsd %xmm3, %xmm10 1444 mulsd %xmm5, %xmm11 1445 mulsd %xmm7, %xmm12 1446 1447 subsd %xmm9, %xmm0 1448 subsd %xmm10, %xmm2 1449 subsd %xmm11, %xmm4 1450 subsd %xmm12, %xmm6 1451 1452 mulsd %xmm13, %xmm0 1453 mulsd %xmm13, %xmm2 1454 mulsd %xmm13, %xmm4 1455 mulsd %xmm13, %xmm6 1456#endif 1457 1458#ifdef LN 1459 subq $4 * SIZE, CO1 1460 subq $4 * SIZE, CO2 1461#endif 1462 1463 movsd %xmm0, 0 * SIZE(CO1) 1464 movsd %xmm2, 1 * SIZE(CO1) 1465 movsd %xmm4, 2 * SIZE(CO1) 1466 movsd %xmm6, 3 * SIZE(CO1) 1467 1468 movsd %xmm1, 0 * SIZE(CO2) 1469 movsd %xmm3, 1 * SIZE(CO2) 1470 movsd %xmm5, 2 * SIZE(CO2) 1471 movsd %xmm7, 3 * SIZE(CO2) 1472 1473#if defined(LN) || defined(LT) 1474 movsd %xmm0, 0 * SIZE(BO) 1475 movsd %xmm1, 1 * SIZE(BO) 1476 movsd %xmm2, 2 * SIZE(BO) 1477 movsd %xmm3, 3 * SIZE(BO) 1478 movsd %xmm4, 4 * SIZE(BO) 1479 movsd 
%xmm5, 5 * SIZE(BO) 1480 movsd %xmm6, 6 * SIZE(BO) 1481 movsd %xmm7, 7 * SIZE(BO) 1482#else 1483 movsd %xmm0, 0 * SIZE(AO) 1484 movsd %xmm2, 1 * SIZE(AO) 1485 movsd %xmm4, 2 * SIZE(AO) 1486 movsd %xmm6, 3 * SIZE(AO) 1487 movsd %xmm1, 4 * SIZE(AO) 1488 movsd %xmm3, 5 * SIZE(AO) 1489 movsd %xmm5, 6 * SIZE(AO) 1490 movsd %xmm7, 7 * SIZE(AO) 1491#endif 1492 1493#ifndef LN 1494 addq $4 * SIZE, CO1 1495 addq $4 * SIZE, CO2 1496#endif 1497 1498#if defined(LT) || defined(RN) 1499 movq K, %rax 1500 subq KK, %rax 1501 leaq (,%rax, SIZE), %rax 1502 leaq (AO, %rax, 4), AO 1503 leaq (BO, %rax, 2), BO 1504#endif 1505 1506#ifdef LN 1507 subq $4, KK 1508#endif 1509 1510#ifdef LT 1511 addq $4, KK 1512#endif 1513 1514#ifdef RT 1515 movq K, %rax 1516 salq $2 + BASE_SHIFT, %rax 1517 addq %rax, AORIG 1518#endif 1519 1520 decq I # i -- 1521 jg .L11 1522 ALIGN_4 1523 1524.L20: 1525 testq $2, M 1526 BRANCH 1527 je .L30 1528 1529#ifdef LN 1530 movq K, %rax 1531 salq $1 + BASE_SHIFT, %rax 1532 subq %rax, AORIG 1533#endif 1534 1535#if defined(LN) || defined(RT) 1536 movq KK, %rax 1537 leaq (, %rax, SIZE), %rax 1538 movq AORIG, AO 1539 leaq (AO, %rax, 2), AO 1540 leaq (B, %rax, 2), BO 1541#else 1542 movq B, BO 1543#endif 1544 1545 movsd 0 * SIZE(AO), %xmm0 1546 xorps %xmm2, %xmm2 1547 movsd 1 * SIZE(AO), %xmm4 1548 xorps %xmm5, %xmm5 1549 movsd 2 * SIZE(AO), %xmm5 1550 xorps %xmm6, %xmm6 1551 movsd 3 * SIZE(AO), %xmm7 1552 1553 movsd 0 * SIZE(BO), %xmm1 1554 xorps %xmm8, %xmm8 1555 xorps %xmm9, %xmm9 1556 movsd 1 * SIZE(BO), %xmm3 1557 xorps %xmm10, %xmm10 1558 xorps %xmm11, %xmm11 1559 1560#if defined(LT) || defined(RN) 1561 movq KK, %rax 1562#else 1563 movq K, %rax 1564 subq KK, %rax 1565#endif 1566 sarq $2, %rax 1567 je .L25 1568 ALIGN_4 1569 1570.L22: 1571 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1572 addsd %xmm2, %xmm9 1573 movaps %xmm0, %xmm2 1574 mulsd %xmm1, %xmm0 1575 1576 addsd %xmm6, %xmm11 1577 movaps %xmm4, %xmm6 1578 mulsd %xmm1, %xmm4 1579 movsd 2 * SIZE(BO), %xmm1 1580 1581 
/* Tail of the software-pipelined 2x2 inner-product loop (.L22).
   Accumulators: xmm8/xmm9 = row 0 vs B cols 0/1, xmm10/xmm11 = row 1.
   Each pass consumes 4 k-iterations (8 doubles from AO and from BO);
   loads for the next iteration are interleaved with the multiplies. */
	addsd	%xmm0, %xmm8
	movsd	4 * SIZE(AO), %xmm0
	mulsd	%xmm3, %xmm2

	addsd	%xmm4, %xmm10
	movsd	5 * SIZE(AO), %xmm4
	mulsd	%xmm3, %xmm6
	movsd	3 * SIZE(BO), %xmm3

	addsd	%xmm2, %xmm9
	movaps	%xmm5, %xmm2
	mulsd	%xmm1, %xmm5

	addsd	%xmm6, %xmm11
	movaps	%xmm7, %xmm6
	mulsd	%xmm1, %xmm7
	movsd	4 * SIZE(BO), %xmm1

	addsd	%xmm5, %xmm8
	movsd	6 * SIZE(AO), %xmm5
	mulsd	%xmm3, %xmm2

	addsd	%xmm7, %xmm10
	movsd	7 * SIZE(AO), %xmm7
	mulsd	%xmm3, %xmm6
	movsd	5 * SIZE(BO), %xmm3

	addsd	%xmm2, %xmm9
	movaps	%xmm0, %xmm2
	mulsd	%xmm1, %xmm0

	addsd	%xmm6, %xmm11
	movaps	%xmm4, %xmm6
	mulsd	%xmm1, %xmm4
	movsd	6 * SIZE(BO), %xmm1

	addsd	%xmm0, %xmm8
	movsd	8 * SIZE(AO), %xmm0
	mulsd	%xmm3, %xmm2

	addsd	%xmm4, %xmm10
	movsd	9 * SIZE(AO), %xmm4
	mulsd	%xmm3, %xmm6
	movsd	7 * SIZE(BO), %xmm3

	addsd	%xmm2, %xmm9
	movaps	%xmm5, %xmm2
	mulsd	%xmm1, %xmm5

	addsd	%xmm6, %xmm11
	movaps	%xmm7, %xmm6
	mulsd	%xmm1, %xmm7
	movsd	8 * SIZE(BO), %xmm1

	addsd	%xmm5, %xmm8
	movsd	10 * SIZE(AO), %xmm5
	mulsd	%xmm3, %xmm2

	addsd	%xmm7, %xmm10
	movsd	11 * SIZE(AO), %xmm7
	mulsd	%xmm3, %xmm6
	movsd	9 * SIZE(BO), %xmm3

	/* advance packed-panel pointers by 4 k-steps * 2 elements */
	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
	/* Recompute the effective depth for this tile and keep only
	   the K mod 4 remainder iterations. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax
	BRANCH
	je	.L29
	ALIGN_4

.L26:
	/* Remainder loop: one k-step (2 A elements x 2 B elements) per pass,
	   same pipelined register rotation as .L22. */
	addsd	%xmm2, %xmm9
	movaps	%xmm0, %xmm2
	mulsd	%xmm1, %xmm0

	addsd	%xmm6, %xmm11
	movaps	%xmm4, %xmm6
	mulsd	%xmm1, %xmm4
	movsd	2 * SIZE(BO), %xmm1

	mulsd	%xmm3, %xmm2
	addsd	%xmm0, %xmm8
	movsd	2 * SIZE(AO), %xmm0

	mulsd	%xmm3, %xmm6
	movsd	3 * SIZE(BO), %xmm3
	addsd	%xmm4, %xmm10
	movsd	3 * SIZE(AO), %xmm4

	addq	$2 * SIZE, AO
	addq	$2 * SIZE, BO
	decq	%rax
	BRANCH
	jg	.L26
	ALIGN_4

.L29:
	/* Drain the pipeline: fold the two in-flight products into xmm9/xmm11. */
	addsd	%xmm2, %xmm9
	addsd	%xmm6, %xmm11

#if defined(LN) || defined(RT)
	/* Bottom-up variants: point AO/BO at the 2x2 diagonal block for
	   back-substitution.  NOTE(review): both #ifdef branches subtract 2 —
	   vestigial conditional; the offset equals the 2-wide tile in either case. */
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 2), BO
#endif

	/* Load the packed right-hand side (from B for left-side solves, from A
	   for right-side solves) and subtract the accumulated GEMM partial
	   products: xmm0..xmm3 = rhs - acc. */
#if defined(LN) || defined(LT)
	movsd	0 * SIZE(BO), %xmm0
	movsd	1 * SIZE(BO), %xmm1
	movsd	2 * SIZE(BO), %xmm2
	movsd	3 * SIZE(BO), %xmm3

	subsd	%xmm8, %xmm0
	subsd	%xmm9, %xmm1
	subsd	%xmm10, %xmm2
	subsd	%xmm11, %xmm3
#else
	movsd	0 * SIZE(AO), %xmm0
	movsd	1 * SIZE(AO), %xmm2
	movsd	2 * SIZE(AO), %xmm1
	movsd	3 * SIZE(AO), %xmm3

	subsd	%xmm8, %xmm0
	subsd	%xmm10, %xmm2
	subsd	%xmm9, %xmm1
	subsd	%xmm11, %xmm3
#endif

	/* 2x2 triangular solves.  Diagonal entries are multiplied, not divided:
	   presumes the packing stage stored reciprocals of the diagonal —
	   TODO(review): confirm against the corresponding pack routine. */
#ifdef LN
	/* lower-triangular, solve from the bottom row up */
	movsd	3 * SIZE(AO), %xmm8
	mulsd	%xmm8, %xmm2
	movsd	2 * SIZE(AO), %xmm9
	mulsd	%xmm8, %xmm3
	movsd	0 * SIZE(AO), %xmm13

	movaps	%xmm9, %xmm10
	mulsd	%xmm2, %xmm9
	mulsd	%xmm3, %xmm10

	subsd	%xmm9, %xmm0
	subsd	%xmm10, %xmm1

	mulsd	%xmm13, %xmm0
	mulsd	%xmm13, %xmm1
#endif

#ifdef LT
	/* transposed lower (upper) triangular, solve from the top row down */
	movsd	0 * SIZE(AO), %xmm8
	mulsd	%xmm8, %xmm0
	movsd	1 * SIZE(AO), %xmm9
	mulsd	%xmm8, %xmm1
	movsd	3 * SIZE(AO), %xmm13

	movaps	%xmm9, %xmm10
	mulsd	%xmm0, %xmm9
	mulsd	%xmm1, %xmm10

	subsd	%xmm9, %xmm2
	subsd	%xmm10, %xmm3

	mulsd	%xmm13, %xmm2
	mulsd	%xmm13, %xmm3
#endif

#ifdef RN
	/* right-side, solve column 0 then eliminate into column 1 */
	movsd	0 * SIZE(BO), %xmm8
	mulsd	%xmm8, %xmm0
	movsd	1 * SIZE(BO), %xmm9
	mulsd	%xmm8, %xmm2
	movsd	3 * SIZE(BO), %xmm13

	movaps	%xmm9, %xmm10
	mulsd	%xmm0, %xmm9
	mulsd	%xmm2, %xmm10

	subsd	%xmm9, %xmm1
	subsd	%xmm10, %xmm3
1779 mulsd %xmm13, %xmm1 1780 mulsd %xmm13, %xmm3 1781#endif 1782 1783#ifdef RT 1784 movsd 3 * SIZE(BO), %xmm8 1785 mulsd %xmm8, %xmm1 1786 movsd 2 * SIZE(BO), %xmm9 1787 mulsd %xmm8, %xmm3 1788 movsd 0 * SIZE(BO), %xmm13 1789 1790 movaps %xmm9, %xmm10 1791 mulsd %xmm1, %xmm9 1792 mulsd %xmm3, %xmm10 1793 1794 subsd %xmm9, %xmm0 1795 subsd %xmm10, %xmm2 1796 1797 mulsd %xmm13, %xmm0 1798 mulsd %xmm13, %xmm2 1799#endif 1800 1801#ifdef LN 1802 subq $2 * SIZE, CO1 1803 subq $2 * SIZE, CO2 1804#endif 1805 1806 movsd %xmm0, 0 * SIZE(CO1) 1807 movsd %xmm2, 1 * SIZE(CO1) 1808 movsd %xmm1, 0 * SIZE(CO2) 1809 movsd %xmm3, 1 * SIZE(CO2) 1810 1811#if defined(LN) || defined(LT) 1812 movsd %xmm0, 0 * SIZE(BO) 1813 movsd %xmm1, 1 * SIZE(BO) 1814 movsd %xmm2, 2 * SIZE(BO) 1815 movsd %xmm3, 3 * SIZE(BO) 1816#else 1817 movsd %xmm0, 0 * SIZE(AO) 1818 movsd %xmm2, 1 * SIZE(AO) 1819 movsd %xmm1, 2 * SIZE(AO) 1820 movsd %xmm3, 3 * SIZE(AO) 1821#endif 1822 1823#ifndef LN 1824 addq $2 * SIZE, CO1 1825 addq $2 * SIZE, CO2 1826#endif 1827 1828#if defined(LT) || defined(RN) 1829 movq K, %rax 1830 subq KK, %rax 1831 leaq (,%rax, SIZE), %rax 1832 leaq (AO, %rax, 2), AO 1833 leaq (BO, %rax, 2), BO 1834#endif 1835 1836#ifdef LN 1837 subq $2, KK 1838#endif 1839 1840#ifdef LT 1841 addq $2, KK 1842#endif 1843 1844#ifdef RT 1845 movq K, %rax 1846 salq $1 + BASE_SHIFT, %rax 1847 addq %rax, AORIG 1848#endif 1849 ALIGN_4 1850 1851.L30: 1852 testq $1, M 1853 je .L39 1854 1855#ifdef LN 1856 movq K, %rax 1857 salq $0 + BASE_SHIFT, %rax 1858 subq %rax, AORIG 1859#endif 1860 1861 1862#if defined(LN) || defined(RT) 1863 movq KK, %rax 1864 leaq (, %rax, SIZE), %rax 1865 movq AORIG, AO 1866 leaq (AO, %rax, 1), AO 1867 leaq (B, %rax, 2), BO 1868#else 1869 movq B, BO 1870#endif 1871 1872 movsd 0 * SIZE(AO), %xmm0 1873 xorps %xmm7, %xmm7 1874 movsd 1 * SIZE(AO), %xmm2 1875 xorps %xmm5, %xmm5 1876 1877 movsd 0 * SIZE(BO), %xmm1 1878 xorps %xmm8, %xmm8 1879 xorps %xmm9, %xmm9 1880 movsd 1 * SIZE(BO), %xmm3 1881 
	/* Effective depth for the 1x2 tile; unrolled by 4 k-steps. */
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L35
	ALIGN_4

.L32:
	/* 1x2 inner loop: xmm8/xmm9 accumulate the single A row against the
	   two B columns; xmm5/xmm7 hold the in-flight (pipelined) products. */
	addsd	%xmm5, %xmm8
	movsd	2 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm1
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	addsd	%xmm7, %xmm9
	movsd	3 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm3
	movsd	2 * SIZE(AO), %xmm0

	addsd	%xmm1, %xmm8
	movsd	4 * SIZE(BO), %xmm1
	mulsd	%xmm2, %xmm5

	addsd	%xmm3, %xmm9
	movsd	5 * SIZE(BO), %xmm3
	mulsd	%xmm2, %xmm7
	movsd	3 * SIZE(AO), %xmm2

	addsd	%xmm5, %xmm8
	movsd	6 * SIZE(BO), %xmm5
	mulsd	%xmm0, %xmm1

	addsd	%xmm7, %xmm9
	movsd	7 * SIZE(BO), %xmm7
	mulsd	%xmm0, %xmm3
	movsd	4 * SIZE(AO), %xmm0

	addsd	%xmm1, %xmm8
	movsd	8 * SIZE(BO), %xmm1
	mulsd	%xmm2, %xmm5

	addsd	%xmm3, %xmm9
	movsd	9 * SIZE(BO), %xmm3
	mulsd	%xmm2, %xmm7
	movsd	5 * SIZE(AO), %xmm2

	/* 4 k-steps: 4 doubles from A (one row), 8 from B (two columns) */
	addq	$4 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	/* Drain the pipelined products before the K mod 4 remainder. */
	addsd	%xmm5, %xmm8
	addsd	%xmm7, %xmm9

	andq	$3, %rax
	BRANCH
	BRANCH
	je	.L38
	ALIGN_4

.L36:
	/* Remainder: one k-step per pass (1 A element x 2 B elements). */
	mulsd	%xmm0, %xmm1
	addq	$2 * SIZE, BO
	mulsd	%xmm0, %xmm3
	movsd	1 * SIZE(AO), %xmm0

	addsd	%xmm1, %xmm8
	movsd	0 * SIZE(BO), %xmm1
	addsd	%xmm3, %xmm9
	movsd	1 * SIZE(BO), %xmm3

	addq	$1 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L36
	ALIGN_4

.L38:
#if defined(LN) || defined(RT)
	/* Point AO/BO at the diagonal data: back up 1 row (LN) or 2 cols (RT). */
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax
#else
	subq	$2, %rax
#endif

	leaq	(, %rax, SIZE), %rax
	movq	AORIG, AO
	leaq	(AO, %rax, 1), AO
	leaq	(B, %rax, 2), BO
#endif

	/* rhs - accumulated product, for the 1x2 tile */
#if defined(LN) || defined(LT)
	movsd	0 * SIZE(BO), %xmm0
	movsd	1 * SIZE(BO), %xmm1

	subsd	%xmm8, %xmm0
	subsd	%xmm9, %xmm1
#else
	movsd	0 * SIZE(AO), %xmm0
	movsd	1 * SIZE(AO), %xmm1

	subsd	%xmm8, %xmm0
	subsd	%xmm9, %xmm1
#endif

	/* 1x1 left solve: just scale by the (presumed pre-inverted) diagonal. */
#if defined(LN) || defined(LT)
	movsd	0 * SIZE(AO), %xmm8
	mulsd	%xmm8, %xmm0
	mulsd	%xmm8, %xmm1
#endif

#ifdef RN
	/* 2x2 right solve on a single row: col 0 first, eliminate into col 1 */
	movsd	0 * SIZE(BO), %xmm8
	mulsd	%xmm8, %xmm0
	movsd	1 * SIZE(BO), %xmm9
	mulsd	%xmm0, %xmm9
	movsd	3 * SIZE(BO), %xmm13
	subsd	%xmm9, %xmm1
	mulsd	%xmm13, %xmm1
#endif

#ifdef RT
	/* transposed right solve: col 1 first, eliminate into col 0 */
	movsd	3 * SIZE(BO), %xmm8
	mulsd	%xmm8, %xmm1
	movsd	2 * SIZE(BO), %xmm9
	mulsd	%xmm1, %xmm9
	movsd	0 * SIZE(BO), %xmm13
	subsd	%xmm9, %xmm0
	mulsd	%xmm13, %xmm0
#endif

#ifdef LN
	subq	$1 * SIZE, CO1
	subq	$1 * SIZE, CO2
#endif

	/* Store the solved 1x2 tile into C and back into the packed buffer. */
	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO2)

#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(BO)
	movsd	%xmm1, 1 * SIZE(BO)
#else
	movsd	%xmm0, 0 * SIZE(AO)
	movsd	%xmm1, 1 * SIZE(AO)
#endif

#ifndef LN
	addq	$1 * SIZE, CO1
	addq	$1 * SIZE, CO2
#endif

	/* Per-variant pointer / KK bookkeeping after the 1-row tile. */
#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

#ifdef LN
	subq	$1, KK
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	salq	$0 + BASE_SHIFT, %rax		/* K * 1 * sizeof(double) */
	addq	%rax, AORIG
#endif
	ALIGN_4

.L39:
	/* End of one 2-column block of B/C: advance B and the KK offset,
	   then loop back over the next pair of columns. */
#ifdef LN
	leaq	(, K, SIZE), %rax
	leaq	(B, %rax, 2), B
#endif
#if defined(LT) || defined(RN)
	movq	BO, B
#endif

#ifdef RN
	addq	$2, KK
#endif

#ifdef RT
	subq	$2, KK
#endif

	decq	J			# j --
	jg	.L10
	ALIGN_4

.L999:
	/* Epilogue: restore callee-saved GPRs saved in the prologue. */
	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* Win64 additionally treats rdi/rsi and xmm6-xmm15 as callee-saved. */
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE