1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. 
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define M %r13 45#define N %r14 46#define K %rdx 47 48#define A %rcx 49#define B %r8 50#define C %r9 51#define LDC %r10 52 53#define I %r11 54#define AO %rdi 55#define BO %rsi 56#define CO1 %rbx 57#define CO2 %rbp 58#define BB %r12 59 60#ifndef WINDOWS_ABI 61 62#define STACKSIZE 64 63 64#define OLD_LDC 8 + STACKSIZE(%rsp) 65#define OLD_OFFSET 16 + STACKSIZE(%rsp) 66 67#else 68 69#define STACKSIZE 256 70 71#define OLD_A 48 + STACKSIZE(%rsp) 72#define OLD_B 56 + STACKSIZE(%rsp) 73#define OLD_C 64 + STACKSIZE(%rsp) 74#define OLD_LDC 72 + STACKSIZE(%rsp) 75#define OLD_OFFSET 80 + STACKSIZE(%rsp) 76 77#endif 78 79#define POSINV 0(%rsp) 80#define J 16(%rsp) 81#define OFFSET 24(%rsp) 82#define KK 32(%rsp) 83#define KKK 40(%rsp) 84#define AORIG 48(%rsp) 85#define BORIG 56(%rsp) 86#define BUFFER 128(%rsp) 87 88#define PREFETCH_R (8 * 4 + 0) 89#define PREFETCH_W (PREFETCH_R) 90 91#define PREFETCHSIZE (8 * 17 + 2) 92#define PREFETCH prefetcht0 93 94#ifndef CONJ 95#define NN 96#else 97#if defined(LN) || defined(LT) 98#define CN 99#else 100#define NC 101#endif 102#endif 103 104#define ADD1 addpd 105#define ADD2 addpd 106 107 PROLOGUE 108 PROFCODE 109 110 subq $STACKSIZE, %rsp 111 112 movq %rbx, 0(%rsp) 113 movq %rbp, 8(%rsp) 114 movq %r12, 16(%rsp) 115 movq %r13, 24(%rsp) 116 movq %r14, 32(%rsp) 117 movq %r15, 40(%rsp) 118 119#ifdef WINDOWS_ABI 120 movq %rdi, 48(%rsp) 121 movq %rsi, 56(%rsp) 122 movups %xmm6, 64(%rsp) 123 movups %xmm7, 80(%rsp) 124 movups %xmm8, 96(%rsp) 125 movups %xmm9, 112(%rsp) 126 movups %xmm10, 128(%rsp) 127 movups %xmm11, 144(%rsp) 128 movups %xmm12, 160(%rsp) 129 movups %xmm13, 176(%rsp) 130 movups %xmm14, 192(%rsp) 131 movups %xmm15, 208(%rsp) 132 133 movq ARG1, OLD_M 134 movq ARG2, OLD_N 135 movq ARG3, K 136 movq OLD_A, A 137 movq OLD_B, B 138 movq OLD_C, C 139#endif 140 141 movq 
OLD_LDC, LDC 142 movq OLD_OFFSET, %rax 143 144 movq %rsp, %r15 # save old stack 145 subq $128 + LOCAL_BUFFER_SIZE, %rsp 146 andq $-4096, %rsp # align stack 147 148 STACK_TOUCHING 149 150 movq %rax, KK 151 movq %rax, OFFSET 152 153 movq OLD_M, M 154 movq OLD_N, N 155 156 subq $-16 * SIZE, A 157 subq $-16 * SIZE, B 158 159 pcmpeqb %xmm15, %xmm15 160 psllq $63, %xmm15 # Generate mask 161 pxor %xmm2, %xmm2 162 163 movlpd %xmm2, 0 + POSINV 164 movlpd %xmm15, 8 + POSINV 165 166 salq $ZBASE_SHIFT, LDC 167 168#ifdef LN 169 movq M, %rax 170 salq $ZBASE_SHIFT, %rax 171 addq %rax, C 172 imulq K, %rax 173 addq %rax, A 174#endif 175 176#ifdef RT 177 movq N, %rax 178 salq $ZBASE_SHIFT, %rax 179 imulq K, %rax 180 addq %rax, B 181 182 movq N, %rax 183 imulq LDC, %rax 184 addq %rax, C 185#endif 186 187#ifdef RN 188 negq KK 189#endif 190 191#ifdef RT 192 movq N, %rax 193 subq OFFSET, %rax 194 movq %rax, KK 195#endif 196 197 movq N, J 198 sarq $1, J # j = (n >> 2) 199 jle .L100 200 ALIGN_4 201 202.L01: 203#ifdef LN 204 movq OFFSET, %rax 205 addq M, %rax 206 movq %rax, KK 207#endif 208 209 leaq 16 * SIZE + BUFFER, BO 210 211#ifdef RT 212 movq K, %rax 213 salq $1 + ZBASE_SHIFT, %rax 214 subq %rax, B 215#endif 216 217#if defined(LN) || defined(RT) 218 movq KK, %rax 219 movq B, BORIG 220 salq $ZBASE_SHIFT, %rax 221 leaq (B, %rax, 2), B 222 leaq (BO, %rax, 4), BO 223#endif 224 225#if defined(LT) 226 movq OFFSET, %rax 227 movq %rax, KK 228#endif 229 230#if defined(LT) || defined(RN) 231 movq KK, %rax 232#else 233 movq K, %rax 234 subq KK, %rax 235#endif 236 sarq $2, %rax 237 jle .L03 238 239 addq %rax, %rax 240 ALIGN_4 241 242.L02: 243 prefetcht0 (PREFETCH_R + 0) * SIZE(B) 244 245 movddup -16 * SIZE(B), %xmm8 246 movddup -15 * SIZE(B), %xmm9 247 movddup -14 * SIZE(B), %xmm10 248 movddup -13 * SIZE(B), %xmm11 249 movddup -12 * SIZE(B), %xmm12 250 movddup -11 * SIZE(B), %xmm13 251 movddup -10 * SIZE(B), %xmm14 252 movddup -9 * SIZE(B), %xmm15 253 254 prefetcht0 (PREFETCH_W + 0) * SIZE(BO) 
255 256 movapd %xmm8, -16 * SIZE(BO) 257 movapd %xmm9, -14 * SIZE(BO) 258 movapd %xmm10, -12 * SIZE(BO) 259 movapd %xmm11, -10 * SIZE(BO) 260 261 prefetcht0 (PREFETCH_W + 8) * SIZE(BO) 262 263 movapd %xmm12, -8 * SIZE(BO) 264 movapd %xmm13, -6 * SIZE(BO) 265 movapd %xmm14, -4 * SIZE(BO) 266 movapd %xmm15, -2 * SIZE(BO) 267 268 addq $ 8 * SIZE, B 269 subq $-16 * SIZE, BO 270 decq %rax 271 jne .L02 272 ALIGN_4 273 274.L03: 275#if defined(LT) || defined(RN) 276 movq KK, %rax 277#else 278 movq K, %rax 279 subq KK, %rax 280#endif 281 andq $3, %rax 282 BRANCH 283 jle .L05 284 ALIGN_4 285 286.L04: 287 movddup -16 * SIZE(B), %xmm8 288 movddup -15 * SIZE(B), %xmm9 289 movddup -14 * SIZE(B), %xmm10 290 movddup -13 * SIZE(B), %xmm11 291 292 movapd %xmm8, -16 * SIZE(BO) 293 movapd %xmm9, -14 * SIZE(BO) 294 movapd %xmm10, -12 * SIZE(BO) 295 movapd %xmm11, -10 * SIZE(BO) 296 297 addq $ 4 * SIZE, B 298 addq $ 8 * SIZE, BO 299 300 decq %rax 301 jne .L04 302 ALIGN_4 303 304.L05: 305#if defined(LT) || defined(RN) 306 movq A, AO 307#else 308 movq A, AORIG 309#endif 310 311#ifdef RT 312 leaq (, LDC, 2), %rax 313 subq %rax, C 314#endif 315 316 movq C, CO1 317 leaq (C, LDC, 1), CO2 318 319#ifndef RT 320 leaq (C, LDC, 2), C 321#endif 322 323 testq $1, M 324 jle .L30 325 326#ifdef LN 327 movq K, %rax 328 salq $0 + ZBASE_SHIFT, %rax 329 subq %rax, AORIG 330#endif 331 332#if defined(LN) || defined(RT) 333 movq KK, %rax 334 movq AORIG, AO 335 salq $ZBASE_SHIFT, %rax 336 addq %rax, AO 337#endif 338 339 leaq 16 * SIZE + BUFFER, BO 340 341#if defined(LN) || defined(RT) 342 movq KK, %rax 343 salq $1 + ZBASE_SHIFT, %rax 344 leaq (BO, %rax, 2), BO 345#endif 346 347 pxor %xmm8, %xmm8 348 pxor %xmm9, %xmm9 349 pxor %xmm10, %xmm10 350 pxor %xmm11, %xmm11 351 352#if defined(LT) || defined(RN) 353 movq KK, %rax 354#else 355 movq K, %rax 356 subq KK, %rax 357#endif 358 sarq $2, %rax 359 je .L42 360 361.L41: 362 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 363 364 movapd -16 * SIZE(AO), %xmm0 365 movapd -16 * 
SIZE(BO), %xmm2 366 movapd -14 * SIZE(BO), %xmm3 367 movapd -12 * SIZE(BO), %xmm4 368 movapd -10 * SIZE(BO), %xmm5 369 370 mulpd %xmm0, %xmm2 371 mulpd %xmm0, %xmm3 372 mulpd %xmm0, %xmm4 373 mulpd %xmm0, %xmm5 374 375 ADD1 %xmm2, %xmm8 376 ADD2 %xmm3, %xmm9 377 ADD1 %xmm4, %xmm10 378 ADD2 %xmm5, %xmm11 379 380 movapd -14 * SIZE(AO), %xmm0 381 movapd -8 * SIZE(BO), %xmm2 382 movapd -6 * SIZE(BO), %xmm3 383 movapd -4 * SIZE(BO), %xmm4 384 movapd -2 * SIZE(BO), %xmm5 385 386 mulpd %xmm0, %xmm2 387 mulpd %xmm0, %xmm3 388 mulpd %xmm0, %xmm4 389 mulpd %xmm0, %xmm5 390 391 ADD1 %xmm2, %xmm8 392 ADD2 %xmm3, %xmm9 393 ADD1 %xmm4, %xmm10 394 ADD2 %xmm5, %xmm11 395 396 movapd -12 * SIZE(AO), %xmm0 397 movapd 0 * SIZE(BO), %xmm2 398 movapd 2 * SIZE(BO), %xmm3 399 movapd 4 * SIZE(BO), %xmm4 400 movapd 6 * SIZE(BO), %xmm5 401 402 mulpd %xmm0, %xmm2 403 mulpd %xmm0, %xmm3 404 mulpd %xmm0, %xmm4 405 mulpd %xmm0, %xmm5 406 407 ADD1 %xmm2, %xmm8 408 ADD2 %xmm3, %xmm9 409 ADD1 %xmm4, %xmm10 410 ADD2 %xmm5, %xmm11 411 412 movapd -10 * SIZE(AO), %xmm0 413 movapd 8 * SIZE(BO), %xmm2 414 movapd 10 * SIZE(BO), %xmm3 415 movapd 12 * SIZE(BO), %xmm4 416 movapd 14 * SIZE(BO), %xmm5 417 418 mulpd %xmm0, %xmm2 419 mulpd %xmm0, %xmm3 420 mulpd %xmm0, %xmm4 421 mulpd %xmm0, %xmm5 422 423 ADD1 %xmm2, %xmm8 424 ADD2 %xmm3, %xmm9 425 ADD1 %xmm4, %xmm10 426 ADD2 %xmm5, %xmm11 427 428 subq $ -8 * SIZE, AO 429 subq $-32 * SIZE, BO 430 subq $1, %rax 431 jne .L41 432 433.L42: 434#if defined(LT) || defined(RN) 435 movq KK, %rax 436#else 437 movq K, %rax 438 subq KK, %rax 439#endif 440 movapd POSINV, %xmm7 441 442 andq $3, %rax # if (k & 1) 443 BRANCH 444 jle .L44 445 446.L43: 447 movapd -16 * SIZE(AO), %xmm0 448 movapd -16 * SIZE(BO), %xmm2 449 movapd -14 * SIZE(BO), %xmm3 450 movapd -12 * SIZE(BO), %xmm4 451 movapd -10 * SIZE(BO), %xmm5 452 453 mulpd %xmm0, %xmm2 454 mulpd %xmm0, %xmm3 455 mulpd %xmm0, %xmm4 456 mulpd %xmm0, %xmm5 457 458 ADD1 %xmm2, %xmm8 459 ADD2 %xmm3, %xmm9 460 ADD1 %xmm4, %xmm10 
461 ADD2 %xmm5, %xmm11 462 463 addq $2 * SIZE, AO 464 addq $8 * SIZE, BO 465 subq $1, %rax 466 jg .L43 467 ALIGN_4 468 469.L44: 470#if defined(LN) || defined(RT) 471 movq KK, %rax 472#ifdef LN 473 subq $1, %rax 474#else 475 subq $2, %rax 476#endif 477 478 movq AORIG, AO 479 movq BORIG, B 480 leaq 16 * SIZE + BUFFER, BO 481 482 salq $ZBASE_SHIFT, %rax 483 leaq (AO, %rax, 1), AO 484 leaq (B, %rax, 2), B 485 leaq (BO, %rax, 4), BO 486#endif 487 488 SHUFPD_1 %xmm9, %xmm9 489 SHUFPD_1 %xmm11, %xmm11 490 491#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 492 defined(NR) || defined(NC) || defined(TR) || defined(TC) 493 xorpd %xmm7, %xmm9 494 xorpd %xmm7, %xmm11 495#else 496 xorpd %xmm7, %xmm8 497 xorpd %xmm7, %xmm10 498#endif 499 500#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 501 defined(RR) || defined(RC) || defined(CR) || defined(CC) 502 subpd %xmm9, %xmm8 503 subpd %xmm11, %xmm10 504#else 505 addpd %xmm9, %xmm8 506 addpd %xmm11, %xmm10 507#endif 508 509#if defined(LN) || defined(LT) 510 movapd -16 * SIZE(B), %xmm9 511 movapd -14 * SIZE(B), %xmm11 512 513 subpd %xmm8, %xmm9 514 subpd %xmm10, %xmm11 515#else 516 movapd -16 * SIZE(AO), %xmm9 517 movapd -14 * SIZE(AO), %xmm11 518 519 subpd %xmm8, %xmm9 520 subpd %xmm10, %xmm11 521#endif 522 523#ifndef CONJ 524 SHUFPD_1 %xmm7, %xmm7 525#endif 526 527#if defined(LN) || defined(LT) 528 movddup -16 * SIZE(AO), %xmm0 529 movddup -15 * SIZE(AO), %xmm1 530 531 pshufd $0x4e, %xmm9, %xmm8 532 pshufd $0x4e, %xmm11, %xmm10 533 534 xorpd %xmm7, %xmm8 535 xorpd %xmm7, %xmm10 536 537 mulpd %xmm0, %xmm9 538 mulpd %xmm1, %xmm8 539 mulpd %xmm0, %xmm11 540 mulpd %xmm1, %xmm10 541 542 addpd %xmm8, %xmm9 543 addpd %xmm10, %xmm11 544#endif 545 546#ifdef RN 547 movddup -16 * SIZE(B), %xmm0 548 movddup -15 * SIZE(B), %xmm1 549 movddup -14 * SIZE(B), %xmm2 550 movddup -13 * SIZE(B), %xmm3 551 movddup -10 * SIZE(B), %xmm4 552 movddup -9 * SIZE(B), %xmm5 553 554 pshufd $0x4e, %xmm9, %xmm8 555 556 xorpd %xmm7, 
%xmm8 557 558 mulpd %xmm0, %xmm9 559 mulpd %xmm1, %xmm8 560 561 addpd %xmm8, %xmm9 562 563 movapd %xmm9, %xmm8 564 pshufd $0x4e, %xmm9, %xmm12 565 566 xorpd %xmm7, %xmm12 567 568 mulpd %xmm2, %xmm8 569 mulpd %xmm3, %xmm12 570 571 subpd %xmm8, %xmm11 572 subpd %xmm12, %xmm11 573 574 pshufd $0x4e, %xmm11, %xmm10 575 576 xorpd %xmm7, %xmm10 577 578 mulpd %xmm4, %xmm11 579 mulpd %xmm5, %xmm10 580 581 addpd %xmm10, %xmm11 582#endif 583 584#ifdef RT 585 movddup -10 * SIZE(B), %xmm0 586 movddup -9 * SIZE(B), %xmm1 587 movddup -12 * SIZE(B), %xmm2 588 movddup -11 * SIZE(B), %xmm3 589 movddup -16 * SIZE(B), %xmm4 590 movddup -15 * SIZE(B), %xmm5 591 592 pshufd $0x4e, %xmm11, %xmm10 593 594 xorpd %xmm7, %xmm10 595 596 mulpd %xmm0, %xmm11 597 mulpd %xmm1, %xmm10 598 599 addpd %xmm10, %xmm11 600 601 movapd %xmm11, %xmm8 602 pshufd $0x4e, %xmm11, %xmm12 603 604 xorpd %xmm7, %xmm12 605 606 mulpd %xmm2, %xmm8 607 mulpd %xmm3, %xmm12 608 609 subpd %xmm8, %xmm9 610 subpd %xmm12, %xmm9 611 612 pshufd $0x4e, %xmm9, %xmm8 613 614 xorpd %xmm7, %xmm8 615 616 mulpd %xmm4, %xmm9 617 mulpd %xmm5, %xmm8 618 619 addpd %xmm8, %xmm9 620#endif 621 622#ifdef LN 623 subq $2 * SIZE, CO1 624 subq $2 * SIZE, CO2 625#endif 626 627 movsd %xmm9, 0 * SIZE(CO1) 628 movhpd %xmm9, 1 * SIZE(CO1) 629 630 movsd %xmm11, 0 * SIZE(CO2) 631 movhpd %xmm11, 1 * SIZE(CO2) 632 633#if defined(LN) || defined(LT) 634 movapd %xmm9, -16 * SIZE(B) 635 movapd %xmm11, -14 * SIZE(B) 636 637 movddup %xmm9, %xmm8 638 unpckhpd %xmm9, %xmm9 639 movddup %xmm11, %xmm10 640 unpckhpd %xmm11, %xmm11 641 642 movapd %xmm8, -16 * SIZE(BO) 643 movapd %xmm9, -14 * SIZE(BO) 644 movapd %xmm10, -12 * SIZE(BO) 645 movapd %xmm11, -10 * SIZE(BO) 646#else 647 movapd %xmm9, -16 * SIZE(AO) 648 movapd %xmm11, -14 * SIZE(AO) 649 650#endif 651 652#ifndef LN 653 addq $2 * SIZE, CO1 654 addq $2 * SIZE, CO2 655#endif 656 657#if defined(LT) || defined(RN) 658 movq K, %rax 659 subq KK, %rax 660 salq $ZBASE_SHIFT, %rax 661 leaq (AO, %rax, 1), AO 662#ifdef 
LT 663 addq $4 * SIZE, B 664#endif 665#endif 666 667#ifdef LN 668 subq $1, KK 669 movq BORIG, B 670#endif 671 672#ifdef LT 673 addq $1, KK 674#endif 675 676#ifdef RT 677 movq K, %rax 678 movq BORIG, B 679 salq $0 + ZBASE_SHIFT, %rax 680 addq %rax, AORIG 681#endif 682 ALIGN_4 683 684.L30: 685 movq M, I 686 sarq $1, I # i = (m >> 2) 687 jle .L99 688 ALIGN_4 689 690.L10: 691 leaq (PREFETCH_R + 0) * SIZE(B), BB 692 693#ifdef LN 694 movq K, %rax 695 salq $1 + ZBASE_SHIFT, %rax 696 subq %rax, AORIG 697#endif 698 699#if defined(LN) || defined(RT) 700 movq KK, %rax 701 movq AORIG, AO 702 salq $ZBASE_SHIFT, %rax 703 leaq (AO, %rax, 2), AO 704#endif 705 706 leaq 16 * SIZE + BUFFER, BO 707 708#if defined(LN) || defined(RT) 709 movq KK, %rax 710 salq $1 + ZBASE_SHIFT, %rax 711 leaq (BO, %rax, 2), BO 712#endif 713 714 prefetcht2 0 * SIZE(BB) 715 716#ifdef LN 717 pxor %xmm8, %xmm8 718 prefetcht1 -3 * SIZE(CO1) 719 pxor %xmm9, %xmm9 720 pxor %xmm10, %xmm10 721 prefetcht1 -3 * SIZE(CO2) 722 pxor %xmm11, %xmm11 723#else 724 pxor %xmm8, %xmm8 725 prefetcht1 3 * SIZE(CO1) 726 pxor %xmm9, %xmm9 727 pxor %xmm10, %xmm10 728 prefetcht1 3 * SIZE(CO2) 729 pxor %xmm11, %xmm11 730#endif 731 732 pxor %xmm12, %xmm12 733 pxor %xmm13, %xmm13 734 pxor %xmm14, %xmm14 735 pxor %xmm15, %xmm15 736 737 pxor %xmm2, %xmm2 738 pxor %xmm3, %xmm3 739 pxor %xmm4, %xmm4 740 pxor %xmm5, %xmm5 741 742 subq $-8 * SIZE, BB 743 744#if defined(LT) || defined(RN) 745 movq KK, %rax 746#else 747 movq K, %rax 748 subq KK, %rax 749#endif 750 sarq $2, %rax 751 NOBRANCH 752 jle .L15 753 ALIGN_4 754 755.L12: 756 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 757 758 movapd -16 * SIZE(AO), %xmm0 759 ADD1 %xmm2, %xmm10 760 movapd -16 * SIZE(BO), %xmm2 761 ADD1 %xmm3, %xmm14 762 movapd %xmm2, %xmm3 763 movapd -14 * SIZE(AO), %xmm1 764 mulpd %xmm0, %xmm2 765 mulpd %xmm1, %xmm3 766 ADD2 %xmm4, %xmm11 767 movapd -14 * SIZE(BO), %xmm4 768 ADD2 %xmm5, %xmm15 769 movapd %xmm4, %xmm5 770 mulpd %xmm0, %xmm4 771 mulpd %xmm1, %xmm5 772 773 
ADD1 %xmm2, %xmm8 774 movapd -12 * SIZE(BO), %xmm2 775 ADD1 %xmm3, %xmm12 776 movapd %xmm2, %xmm3 777 mulpd %xmm0, %xmm2 778 mulpd %xmm1, %xmm3 779 ADD2 %xmm4, %xmm9 780 movapd -10 * SIZE(BO), %xmm4 781 ADD2 %xmm5, %xmm13 782 movapd %xmm4, %xmm5 783 mulpd %xmm0, %xmm4 784 mulpd %xmm1, %xmm5 785 786 movapd -12 * SIZE(AO), %xmm0 787 ADD1 %xmm2, %xmm10 788 movapd -8 * SIZE(BO), %xmm2 789 ADD1 %xmm3, %xmm14 790 movapd %xmm2, %xmm3 791 movapd -10 * SIZE(AO), %xmm1 792 mulpd %xmm0, %xmm2 793 mulpd %xmm1, %xmm3 794 ADD2 %xmm4, %xmm11 795 ADD2 %xmm5, %xmm15 796 movapd -6 * SIZE(BO), %xmm4 797 movapd %xmm4, %xmm5 798 mulpd %xmm0, %xmm4 799 mulpd %xmm1, %xmm5 800 801 ADD1 %xmm2, %xmm8 802 ADD1 %xmm3, %xmm12 803 movapd -4 * SIZE(BO), %xmm2 804 movapd %xmm2, %xmm3 805 mulpd %xmm0, %xmm2 806 mulpd %xmm1, %xmm3 807 ADD2 %xmm4, %xmm9 808 ADD2 %xmm5, %xmm13 809 movapd -2 * SIZE(BO), %xmm4 810 movapd %xmm4, %xmm5 811 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 812 mulpd %xmm0, %xmm4 813 mulpd %xmm1, %xmm5 814 815 movapd -8 * SIZE(AO), %xmm0 816 ADD1 %xmm2, %xmm10 817 movapd 0 * SIZE(BO), %xmm2 818 ADD1 %xmm3, %xmm14 819 movapd %xmm2, %xmm3 820 movapd -6 * SIZE(AO), %xmm1 821 mulpd %xmm0, %xmm2 822 mulpd %xmm1, %xmm3 823 ADD2 %xmm4, %xmm11 824 movapd 2 * SIZE(BO), %xmm4 825 ADD2 %xmm5, %xmm15 826 movapd %xmm4, %xmm5 827 mulpd %xmm0, %xmm4 828 mulpd %xmm1, %xmm5 829 830 ADD1 %xmm2, %xmm8 831 movapd 4 * SIZE(BO), %xmm2 832 ADD1 %xmm3, %xmm12 833 movapd %xmm2, %xmm3 834 mulpd %xmm0, %xmm2 835 mulpd %xmm1, %xmm3 836 ADD2 %xmm4, %xmm9 837 movapd 6 * SIZE(BO), %xmm4 838 ADD2 %xmm5, %xmm13 839 movapd %xmm4, %xmm5 840 mulpd %xmm0, %xmm4 841 mulpd %xmm1, %xmm5 842 843 movapd -4 * SIZE(AO), %xmm0 844 ADD1 %xmm2, %xmm10 845 ADD1 %xmm3, %xmm14 846 movapd 8 * SIZE(BO), %xmm2 847 movapd %xmm2, %xmm3 848 mulpd %xmm0, %xmm2 849 movapd -2 * SIZE(AO), %xmm1 850 mulpd %xmm1, %xmm3 851 ADD2 %xmm4, %xmm11 852 movapd 10 * SIZE(BO), %xmm4 853 ADD2 %xmm5, %xmm15 854 subq $-32 * SIZE, BO 855 movapd %xmm4, %xmm5 
856 mulpd %xmm0, %xmm4 857 mulpd %xmm1, %xmm5 858 859 ADD1 %xmm2, %xmm8 860 ADD1 %xmm3, %xmm12 861 movapd -20 * SIZE(BO), %xmm2 862 movapd %xmm2, %xmm3 863 mulpd %xmm0, %xmm2 864 subq $-16 * SIZE, AO 865 mulpd %xmm1, %xmm3 866 ADD2 %xmm4, %xmm9 867 ADD2 %xmm5, %xmm13 868 movapd -18 * SIZE(BO), %xmm4 869 movapd %xmm4, %xmm5 870 mulpd %xmm0, %xmm4 871 mulpd %xmm1, %xmm5 872 873 subq $1, %rax 874 BRANCH 875 BRANCH 876 jg .L12 877 ALIGN_4 878 879.L15: 880#if defined(LT) || defined(RN) 881 movq KK, %rax 882#else 883 movq K, %rax 884 subq KK, %rax 885#endif 886 movapd POSINV, %xmm7 887 888 andq $3, %rax 889 BRANCH 890 BRANCH 891 je .L19 892 ALIGN_4 893 894.L16: 895 ADD1 %xmm2, %xmm10 896 ADD1 %xmm3, %xmm14 897 ADD2 %xmm4, %xmm11 898 ADD2 %xmm5, %xmm15 899 900 movapd -16 * SIZE(BO), %xmm2 901 movapd %xmm2, %xmm3 902 movapd -14 * SIZE(BO), %xmm4 903 movapd %xmm4, %xmm5 904 905 movapd -16 * SIZE(AO), %xmm0 906 mulpd %xmm0, %xmm2 907 movapd -14 * SIZE(AO), %xmm1 908 mulpd %xmm1, %xmm3 909 mulpd %xmm0, %xmm4 910 mulpd %xmm1, %xmm5 911 912 ADD1 %xmm2, %xmm8 913 ADD1 %xmm3, %xmm12 914 ADD2 %xmm4, %xmm9 915 ADD2 %xmm5, %xmm13 916 917 movapd -12 * SIZE(BO), %xmm2 918 movapd %xmm2, %xmm3 919 movapd -10 * SIZE(BO), %xmm4 920 movapd %xmm4, %xmm5 921 922 mulpd %xmm0, %xmm2 923 mulpd %xmm1, %xmm3 924 mulpd %xmm0, %xmm4 925 mulpd %xmm1, %xmm5 926 927 addq $4 * SIZE, AO 928 addq $8 * SIZE, BO 929 subq $1, %rax 930 BRANCH 931 jg .L16 932 ALIGN_4 933 934.L19: 935 ADD1 %xmm2, %xmm10 936 ADD1 %xmm3, %xmm14 937 ADD2 %xmm4, %xmm11 938 ADD2 %xmm5, %xmm15 939 940#if defined(LN) || defined(RT) 941 movq KK, %rax 942#ifdef LN 943 subq $2, %rax 944#else 945 subq $2, %rax 946#endif 947 948 movq AORIG, AO 949 movq BORIG, B 950 leaq 16 * SIZE + BUFFER, BO 951 952 salq $ZBASE_SHIFT, %rax 953 leaq (AO, %rax, 2), AO 954 leaq (B, %rax, 2), B 955 leaq (BO, %rax, 4), BO 956#endif 957 958 SHUFPD_1 %xmm9, %xmm9 959 SHUFPD_1 %xmm11, %xmm11 960 SHUFPD_1 %xmm13, %xmm13 961 SHUFPD_1 %xmm15, %xmm15 962 963#if 
defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 964 defined(NR) || defined(NC) || defined(TR) || defined(TC) 965 xorpd %xmm7, %xmm9 966 xorpd %xmm7, %xmm11 967 xorpd %xmm7, %xmm13 968 xorpd %xmm7, %xmm15 969#else 970 xorpd %xmm7, %xmm8 971 xorpd %xmm7, %xmm10 972 xorpd %xmm7, %xmm12 973 xorpd %xmm7, %xmm14 974#endif 975 976#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 977 defined(RR) || defined(RC) || defined(CR) || defined(CC) 978 subpd %xmm9, %xmm8 979 subpd %xmm11, %xmm10 980 subpd %xmm13, %xmm12 981 subpd %xmm15, %xmm14 982#else 983 addpd %xmm9, %xmm8 984 addpd %xmm11, %xmm10 985 addpd %xmm13, %xmm12 986 addpd %xmm15, %xmm14 987#endif 988 989#if defined(LN) || defined(LT) 990 movapd -16 * SIZE(B), %xmm9 991 movapd -14 * SIZE(B), %xmm11 992 movapd -12 * SIZE(B), %xmm13 993 movapd -10 * SIZE(B), %xmm15 994 995 subpd %xmm8, %xmm9 996 subpd %xmm10, %xmm11 997 subpd %xmm12, %xmm13 998 subpd %xmm14, %xmm15 999#else 1000 movapd -16 * SIZE(AO), %xmm9 1001 movapd -14 * SIZE(AO), %xmm13 1002 movapd -12 * SIZE(AO), %xmm11 1003 movapd -10 * SIZE(AO), %xmm15 1004 1005 subpd %xmm8, %xmm9 1006 subpd %xmm10, %xmm11 1007 subpd %xmm12, %xmm13 1008 subpd %xmm14, %xmm15 1009#endif 1010 1011#ifndef CONJ 1012 SHUFPD_1 %xmm7, %xmm7 1013#endif 1014 1015#ifdef LN 1016 movddup -10 * SIZE(AO), %xmm0 1017 movddup -9 * SIZE(AO), %xmm1 1018 movddup -12 * SIZE(AO), %xmm2 1019 movddup -11 * SIZE(AO), %xmm3 1020 movddup -16 * SIZE(AO), %xmm4 1021 movddup -15 * SIZE(AO), %xmm5 1022 1023 pshufd $0x4e, %xmm13, %xmm12 1024 pshufd $0x4e, %xmm15, %xmm14 1025 1026 xorpd %xmm7, %xmm12 1027 xorpd %xmm7, %xmm14 1028 1029 mulpd %xmm0, %xmm13 1030 mulpd %xmm1, %xmm12 1031 mulpd %xmm0, %xmm15 1032 mulpd %xmm1, %xmm14 1033 1034 addpd %xmm12, %xmm13 1035 addpd %xmm14, %xmm15 1036 1037 movapd %xmm13, %xmm8 1038 movapd %xmm15, %xmm10 1039 pshufd $0x4e, %xmm13, %xmm12 1040 pshufd $0x4e, %xmm15, %xmm14 1041 1042 xorpd %xmm7, %xmm12 1043 xorpd %xmm7, %xmm14 1044 1045 mulpd %xmm2, 
%xmm8 1046 mulpd %xmm2, %xmm10 1047 mulpd %xmm3, %xmm12 1048 mulpd %xmm3, %xmm14 1049 1050 subpd %xmm8, %xmm9 1051 subpd %xmm10, %xmm11 1052 subpd %xmm12, %xmm9 1053 subpd %xmm14, %xmm11 1054 1055 pshufd $0x4e, %xmm9, %xmm8 1056 pshufd $0x4e, %xmm11, %xmm10 1057 1058 xorpd %xmm7, %xmm8 1059 xorpd %xmm7, %xmm10 1060 1061 mulpd %xmm4, %xmm9 1062 mulpd %xmm5, %xmm8 1063 mulpd %xmm4, %xmm11 1064 mulpd %xmm5, %xmm10 1065 1066 addpd %xmm8, %xmm9 1067 addpd %xmm10, %xmm11 1068#endif 1069 1070#ifdef LT 1071 movddup -16 * SIZE(AO), %xmm0 1072 movddup -15 * SIZE(AO), %xmm1 1073 movddup -14 * SIZE(AO), %xmm2 1074 movddup -13 * SIZE(AO), %xmm3 1075 movddup -10 * SIZE(AO), %xmm4 1076 movddup -9 * SIZE(AO), %xmm5 1077 1078 pshufd $0x4e, %xmm9, %xmm8 1079 pshufd $0x4e, %xmm11, %xmm10 1080 1081 xorpd %xmm7, %xmm8 1082 xorpd %xmm7, %xmm10 1083 1084 mulpd %xmm0, %xmm9 1085 mulpd %xmm1, %xmm8 1086 mulpd %xmm0, %xmm11 1087 mulpd %xmm1, %xmm10 1088 1089 addpd %xmm8, %xmm9 1090 addpd %xmm10, %xmm11 1091 1092 movapd %xmm9, %xmm8 1093 movapd %xmm11, %xmm10 1094 pshufd $0x4e, %xmm9, %xmm12 1095 pshufd $0x4e, %xmm11, %xmm14 1096 1097 xorpd %xmm7, %xmm12 1098 xorpd %xmm7, %xmm14 1099 1100 mulpd %xmm2, %xmm8 1101 mulpd %xmm2, %xmm10 1102 mulpd %xmm3, %xmm12 1103 mulpd %xmm3, %xmm14 1104 1105 subpd %xmm8, %xmm13 1106 subpd %xmm10, %xmm15 1107 subpd %xmm12, %xmm13 1108 subpd %xmm14, %xmm15 1109 1110 pshufd $0x4e, %xmm13, %xmm12 1111 pshufd $0x4e, %xmm15, %xmm14 1112 1113 xorpd %xmm7, %xmm12 1114 xorpd %xmm7, %xmm14 1115 1116 mulpd %xmm4, %xmm13 1117 mulpd %xmm5, %xmm12 1118 mulpd %xmm4, %xmm15 1119 mulpd %xmm5, %xmm14 1120 1121 addpd %xmm12, %xmm13 1122 addpd %xmm14, %xmm15 1123#endif 1124 1125#ifdef RN 1126 movddup -16 * SIZE(B), %xmm0 1127 movddup -15 * SIZE(B), %xmm1 1128 movddup -14 * SIZE(B), %xmm2 1129 movddup -13 * SIZE(B), %xmm3 1130 movddup -10 * SIZE(B), %xmm4 1131 movddup -9 * SIZE(B), %xmm5 1132 1133 pshufd $0x4e, %xmm9, %xmm8 1134 pshufd $0x4e, %xmm13, %xmm12 1135 1136 xorpd %xmm7, 
%xmm8 1137 xorpd %xmm7, %xmm12 1138 1139 mulpd %xmm0, %xmm9 1140 mulpd %xmm1, %xmm8 1141 mulpd %xmm0, %xmm13 1142 mulpd %xmm1, %xmm12 1143 1144 addpd %xmm8, %xmm9 1145 addpd %xmm12, %xmm13 1146 1147 movapd %xmm9, %xmm8 1148 movapd %xmm13, %xmm10 1149 pshufd $0x4e, %xmm9, %xmm12 1150 pshufd $0x4e, %xmm13, %xmm14 1151 1152 xorpd %xmm7, %xmm12 1153 xorpd %xmm7, %xmm14 1154 1155 mulpd %xmm2, %xmm8 1156 mulpd %xmm2, %xmm10 1157 mulpd %xmm3, %xmm12 1158 mulpd %xmm3, %xmm14 1159 1160 subpd %xmm8, %xmm11 1161 subpd %xmm10, %xmm15 1162 subpd %xmm12, %xmm11 1163 subpd %xmm14, %xmm15 1164 1165 pshufd $0x4e, %xmm11, %xmm10 1166 pshufd $0x4e, %xmm15, %xmm14 1167 1168 xorpd %xmm7, %xmm10 1169 xorpd %xmm7, %xmm14 1170 1171 mulpd %xmm4, %xmm11 1172 mulpd %xmm5, %xmm10 1173 mulpd %xmm4, %xmm15 1174 mulpd %xmm5, %xmm14 1175 1176 addpd %xmm10, %xmm11 1177 addpd %xmm14, %xmm15 1178#endif 1179 1180#ifdef RT 1181 movddup -10 * SIZE(B), %xmm0 1182 movddup -9 * SIZE(B), %xmm1 1183 movddup -12 * SIZE(B), %xmm2 1184 movddup -11 * SIZE(B), %xmm3 1185 movddup -16 * SIZE(B), %xmm4 1186 movddup -15 * SIZE(B), %xmm5 1187 1188 pshufd $0x4e, %xmm11, %xmm10 1189 pshufd $0x4e, %xmm15, %xmm14 1190 1191 xorpd %xmm7, %xmm10 1192 xorpd %xmm7, %xmm14 1193 1194 mulpd %xmm0, %xmm11 1195 mulpd %xmm1, %xmm10 1196 mulpd %xmm0, %xmm15 1197 mulpd %xmm1, %xmm14 1198 1199 addpd %xmm10, %xmm11 1200 addpd %xmm14, %xmm15 1201 1202 movapd %xmm11, %xmm8 1203 movapd %xmm15, %xmm10 1204 pshufd $0x4e, %xmm11, %xmm12 1205 pshufd $0x4e, %xmm15, %xmm14 1206 1207 xorpd %xmm7, %xmm12 1208 xorpd %xmm7, %xmm14 1209 1210 mulpd %xmm2, %xmm8 1211 mulpd %xmm2, %xmm10 1212 mulpd %xmm3, %xmm12 1213 mulpd %xmm3, %xmm14 1214 1215 subpd %xmm8, %xmm9 1216 subpd %xmm10, %xmm13 1217 subpd %xmm12, %xmm9 1218 subpd %xmm14, %xmm13 1219 1220 pshufd $0x4e, %xmm9, %xmm8 1221 pshufd $0x4e, %xmm13, %xmm12 1222 1223 xorpd %xmm7, %xmm8 1224 xorpd %xmm7, %xmm12 1225 1226 mulpd %xmm4, %xmm9 1227 mulpd %xmm5, %xmm8 1228 mulpd %xmm4, %xmm13 1229 mulpd 
%xmm5, %xmm12 1230 1231 addpd %xmm8, %xmm9 1232 addpd %xmm12, %xmm13 1233#endif 1234 1235#ifdef LN 1236 subq $4 * SIZE, CO1 1237 subq $4 * SIZE, CO2 1238#endif 1239 1240 movsd %xmm9, 0 * SIZE(CO1) 1241 movhpd %xmm9, 1 * SIZE(CO1) 1242 movsd %xmm13, 2 * SIZE(CO1) 1243 movhpd %xmm13, 3 * SIZE(CO1) 1244 1245 movsd %xmm11, 0 * SIZE(CO2) 1246 movhpd %xmm11, 1 * SIZE(CO2) 1247 movsd %xmm15, 2 * SIZE(CO2) 1248 movhpd %xmm15, 3 * SIZE(CO2) 1249 1250#if defined(LN) || defined(LT) 1251 movapd %xmm9, -16 * SIZE(B) 1252 movapd %xmm11, -14 * SIZE(B) 1253 movapd %xmm13, -12 * SIZE(B) 1254 movapd %xmm15, -10 * SIZE(B) 1255 1256 movddup %xmm9, %xmm8 1257 unpckhpd %xmm9, %xmm9 1258 movddup %xmm11, %xmm10 1259 unpckhpd %xmm11, %xmm11 1260 movddup %xmm13, %xmm12 1261 unpckhpd %xmm13, %xmm13 1262 movddup %xmm15, %xmm14 1263 unpckhpd %xmm15, %xmm15 1264 1265 movapd %xmm8, -16 * SIZE(BO) 1266 movapd %xmm9, -14 * SIZE(BO) 1267 movapd %xmm10, -12 * SIZE(BO) 1268 movapd %xmm11, -10 * SIZE(BO) 1269 movapd %xmm12, -8 * SIZE(BO) 1270 movapd %xmm13, -6 * SIZE(BO) 1271 movapd %xmm14, -4 * SIZE(BO) 1272 movapd %xmm15, -2 * SIZE(BO) 1273#else 1274 movapd %xmm9, -16 * SIZE(AO) 1275 movapd %xmm13, -14 * SIZE(AO) 1276 movapd %xmm11, -12 * SIZE(AO) 1277 movapd %xmm15, -10 * SIZE(AO) 1278#endif 1279 1280#ifndef LN 1281 addq $4 * SIZE, CO1 1282 addq $4 * SIZE, CO2 1283#endif 1284 1285#if defined(LT) || defined(RN) 1286 movq K, %rax 1287 subq KK, %rax 1288 salq $ZBASE_SHIFT, %rax 1289 leaq (AO, %rax, 2), AO 1290#ifdef LT 1291 addq $8 * SIZE, B 1292#endif 1293#endif 1294 1295#ifdef LN 1296 subq $2, KK 1297 movq BORIG, B 1298#endif 1299 1300#ifdef LT 1301 addq $2, KK 1302#endif 1303 1304#ifdef RT 1305 movq K, %rax 1306 movq BORIG, B 1307 salq $1 + ZBASE_SHIFT, %rax 1308 addq %rax, AORIG 1309#endif 1310 1311 decq I # i -- 1312 jg .L10 1313 ALIGN_4 1314 1315.L99: 1316#ifdef LN 1317 leaq (, K, SIZE), %rax 1318 leaq (B, %rax, 4), B 1319#endif 1320 1321#if defined(LT) || defined(RN) 1322 movq K, %rax 1323 subq 
KK, %rax 1324 leaq (,%rax, SIZE), %rax 1325 leaq (B, %rax, 2 * COMPSIZE), B 1326#endif 1327 1328#ifdef RN 1329 addq $2, KK 1330#endif 1331 1332#ifdef RT 1333 subq $2, KK 1334#endif 1335 1336 decq J # j -- 1337 jg .L01 1338 1339.L100: 1340 testq $1, N 1341 jle .L999 1342 1343.L101: 1344#ifdef LN 1345 movq OFFSET, %rax 1346 addq M, %rax 1347 movq %rax, KK 1348#endif 1349 1350 leaq BUFFER, BO 1351 1352#ifdef RT 1353 movq K, %rax 1354 salq $0 + ZBASE_SHIFT, %rax 1355 subq %rax, B 1356#endif 1357 1358#if defined(LN) || defined(RT) 1359 movq KK, %rax 1360 movq B, BORIG 1361 salq $ZBASE_SHIFT, %rax 1362 leaq (B, %rax, 1), B 1363 leaq (BO, %rax, 2), BO 1364#endif 1365 1366#if defined(LT) 1367 movq OFFSET, %rax 1368 movq %rax, KK 1369#endif 1370 1371#if defined(LT) || defined(RN) 1372 movq KK, %rax 1373#else 1374 movq K, %rax 1375 subq KK, %rax 1376#endif 1377 sarq $2, %rax 1378 jle .L103 1379 ALIGN_4 1380 1381.L102: 1382 movddup -16 * SIZE(B), %xmm8 1383 movddup -15 * SIZE(B), %xmm9 1384 movddup -14 * SIZE(B), %xmm10 1385 movddup -13 * SIZE(B), %xmm11 1386 movddup -12 * SIZE(B), %xmm12 1387 movddup -11 * SIZE(B), %xmm13 1388 movddup -10 * SIZE(B), %xmm14 1389 movddup -9 * SIZE(B), %xmm15 1390 1391 movapd %xmm8, 0 * SIZE(BO) 1392 movapd %xmm9, 2 * SIZE(BO) 1393 movapd %xmm10, 4 * SIZE(BO) 1394 movapd %xmm11, 6 * SIZE(BO) 1395 movapd %xmm12, 8 * SIZE(BO) 1396 movapd %xmm13, 10 * SIZE(BO) 1397 movapd %xmm14, 12 * SIZE(BO) 1398 movapd %xmm15, 14 * SIZE(BO) 1399 1400 addq $ 8 * SIZE, B 1401 subq $-16 * SIZE, BO 1402 decq %rax 1403 jne .L102 1404 ALIGN_4 1405 1406.L103: 1407#if defined(LT) || defined(RN) 1408 movq KK, %rax 1409#else 1410 movq K, %rax 1411 subq KK, %rax 1412#endif 1413 andq $3, %rax 1414 BRANCH 1415 jle .L105 1416 ALIGN_4 1417 1418.L104: 1419 movddup -16 * SIZE(B), %xmm8 1420 movddup -15 * SIZE(B), %xmm9 1421 1422 movapd %xmm8, 0 * SIZE(BO) 1423 movapd %xmm9, 2 * SIZE(BO) 1424 1425 addq $4 * SIZE, BO 1426 addq $2 * SIZE, B 1427 decq %rax 1428 jne .L104 1429 
ALIGN_4 1430 1431.L105: 1432#if defined(LT) || defined(RN) 1433 movq A, AO 1434#else 1435 movq A, AORIG 1436#endif 1437 1438#ifdef RT 1439 subq LDC, C 1440#endif 1441 1442 movq C, CO1 1443#ifndef RT 1444 addq LDC, C 1445#endif 1446 1447 testq $1, M 1448 jle .L130 1449 ALIGN_4 1450 1451.L140: 1452#ifdef LN 1453 movq K, %rax 1454 salq $0 + ZBASE_SHIFT, %rax 1455 subq %rax, AORIG 1456#endif 1457 1458#if defined(LN) || defined(RT) 1459 movq KK, %rax 1460 movq AORIG, AO 1461 salq $ZBASE_SHIFT, %rax 1462 leaq (AO, %rax, 1), AO 1463#endif 1464 1465 leaq 16 * SIZE + BUFFER, BO 1466 1467#if defined(LN) || defined(RT) 1468 movq KK, %rax 1469 salq $0 + ZBASE_SHIFT, %rax 1470 leaq (BO, %rax, 2), BO 1471#endif 1472 1473 pxor %xmm8, %xmm8 1474 pxor %xmm9, %xmm9 1475 pxor %xmm10, %xmm10 1476 pxor %xmm11, %xmm11 1477 1478#if defined(LT) || defined(RN) 1479 movq KK, %rax 1480#else 1481 movq K, %rax 1482 subq KK, %rax 1483#endif 1484 sarq $2, %rax 1485 je .L142 1486 1487.L141: 1488 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1489 1490 movapd -16 * SIZE(AO), %xmm0 1491 movapd -14 * SIZE(AO), %xmm1 1492 movapd -16 * SIZE(BO), %xmm2 1493 movapd -14 * SIZE(BO), %xmm3 1494 movapd -12 * SIZE(BO), %xmm4 1495 movapd -10 * SIZE(BO), %xmm5 1496 1497 mulpd %xmm0, %xmm2 1498 mulpd %xmm0, %xmm3 1499 mulpd %xmm1, %xmm4 1500 mulpd %xmm1, %xmm5 1501 1502 ADD1 %xmm2, %xmm8 1503 ADD2 %xmm3, %xmm9 1504 ADD1 %xmm4, %xmm10 1505 ADD2 %xmm5, %xmm11 1506 1507 movapd -12 * SIZE(AO), %xmm0 1508 movapd -10 * SIZE(AO), %xmm1 1509 movapd -8 * SIZE(BO), %xmm2 1510 movapd -6 * SIZE(BO), %xmm3 1511 movapd -4 * SIZE(BO), %xmm4 1512 movapd -2 * SIZE(BO), %xmm5 1513 1514 mulpd %xmm0, %xmm2 1515 mulpd %xmm0, %xmm3 1516 mulpd %xmm1, %xmm4 1517 mulpd %xmm1, %xmm5 1518 1519 ADD1 %xmm2, %xmm8 1520 ADD2 %xmm3, %xmm9 1521 ADD1 %xmm4, %xmm10 1522 ADD2 %xmm5, %xmm11 1523 1524 subq $ -8 * SIZE, AO 1525 subq $-16 * SIZE, BO 1526 subq $1, %rax 1527 jne .L141 1528 1529.L142: 1530#if defined(LT) || defined(RN) 1531 movq KK, %rax 
1532#else 1533 movq K, %rax 1534 subq KK, %rax 1535#endif 1536 movapd POSINV, %xmm7 1537 1538 andq $3, %rax # if (k & 1) 1539 BRANCH 1540 jle .L144 1541 1542.L143: 1543 movapd -16 * SIZE(AO), %xmm0 1544 movapd -16 * SIZE(BO), %xmm2 1545 movapd -14 * SIZE(BO), %xmm3 1546 1547 mulpd %xmm0, %xmm2 1548 mulpd %xmm0, %xmm3 1549 1550 ADD1 %xmm2, %xmm8 1551 ADD2 %xmm3, %xmm9 1552 1553 addq $2 * SIZE, AO 1554 addq $4 * SIZE, BO 1555 subq $1, %rax 1556 jg .L143 1557 ALIGN_4 1558 1559.L144: 1560 addpd %xmm10, %xmm8 1561 addpd %xmm11, %xmm9 1562 1563#if defined(LN) || defined(RT) 1564 movq KK, %rax 1565#ifdef LN 1566 subq $1, %rax 1567#else 1568 subq $1, %rax 1569#endif 1570 1571 movq AORIG, AO 1572 movq BORIG, B 1573 leaq 16 * SIZE + BUFFER, BO 1574 1575 salq $ZBASE_SHIFT, %rax 1576 leaq (AO, %rax, 1), AO 1577 leaq (B, %rax, 1), B 1578 leaq (BO, %rax, 2), BO 1579#endif 1580 1581 SHUFPD_1 %xmm9, %xmm9 1582 1583#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1584 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1585 xorpd %xmm7, %xmm9 1586#else 1587 xorpd %xmm7, %xmm8 1588#endif 1589 1590#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1591 defined(RR) || defined(RC) || defined(CR) || defined(CC) 1592 subpd %xmm9, %xmm8 1593#else 1594 addpd %xmm9, %xmm8 1595#endif 1596 1597 1598#if defined(LN) || defined(LT) 1599 movapd -16 * SIZE(B), %xmm9 1600 1601 subpd %xmm8, %xmm9 1602#else 1603 movapd -16 * SIZE(AO), %xmm9 1604 1605 subpd %xmm8, %xmm9 1606#endif 1607 1608#ifndef CONJ 1609 SHUFPD_1 %xmm7, %xmm7 1610#endif 1611 1612#ifdef LN 1613 movddup -16 * SIZE(AO), %xmm0 1614 movddup -15 * SIZE(AO), %xmm1 1615 1616 pshufd $0x4e, %xmm9, %xmm8 1617 xorpd %xmm7, %xmm8 1618 1619 mulpd %xmm0, %xmm9 1620 mulpd %xmm1, %xmm8 1621 1622 addpd %xmm8, %xmm9 1623#endif 1624 1625#ifdef LT 1626 movddup -16 * SIZE(AO), %xmm0 1627 movddup -15 * SIZE(AO), %xmm1 1628 1629 pshufd $0x4e, %xmm9, %xmm8 1630 1631 xorpd %xmm7, %xmm8 1632 1633 mulpd %xmm0, %xmm9 1634 
	mulpd	%xmm1, %xmm8

	addpd	%xmm8, %xmm9
#endif

/* RN/RT 1x1 solve: same swap/flip/multiply pattern, but the diagonal   */
/* element comes from B rather than from the A tile.                    */
#ifdef RN
	movddup	-16 * SIZE(B), %xmm0	# diag real, duplicated
	movddup	-15 * SIZE(B), %xmm1	# diag imag, duplicated

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	%xmm7, %xmm8

	mulpd	%xmm0, %xmm9
	mulpd	%xmm1, %xmm8

	addpd	%xmm8, %xmm9
#endif

#ifdef RT
	movddup	-16 * SIZE(B), %xmm0	# diag real, duplicated
	movddup	-15 * SIZE(B), %xmm1	# diag imag, duplicated

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	%xmm7, %xmm8

	mulpd	%xmm0, %xmm9
	mulpd	%xmm1, %xmm8

	addpd	%xmm8, %xmm9
#endif

#ifdef LN
	subq	$2 * SIZE, CO1	# LN writes below the running C pointer
#endif

	# Store the solved element to C (two scalar halves: C may be
	# unaligned).
	movsd	%xmm9, 0 * SIZE(CO1)
	movhpd	%xmm9, 1 * SIZE(CO1)

	# Write the result back into the packed operands so subsequent
	# tiles of the substitution see the updated values: B plus the
	# expanded buffer BO for left-side solves, the A tile otherwise.
#if defined(LN) || defined(LT)
	movapd	%xmm9, -16 * SIZE(B)

	movddup	%xmm9, %xmm8
	unpckhpd %xmm9, %xmm9

	movapd	%xmm8, -16 * SIZE(BO)
	movapd	%xmm9, -14 * SIZE(BO)
#else
	movapd	%xmm9, -16 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

	# Per-tile pointer / offset bookkeeping for the next 1x1 tile.
#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO	# skip the rest of the k panel
#ifdef LT
	addq	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* .L130: main row loop - a 2x1 tile (two complex rows) per iteration.  */
.L130:
	movq	M, I
	sarq	$1, I	# i = (m >> 1)
	jle	.L199
	ALIGN_4

.L110:
#ifdef LN
	movq	K, %rax
	salq	$1 + ZBASE_SHIFT, %rax	# K * 2 complex elements
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO	# AO = AORIG + KK * 2 complex elements
#endif

	leaq	16 * SIZE + BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm8, %xmm8	# xmm8/xmm9:  accumulators for row 0
	pxor	%xmm9, %xmm9
	pxor	%xmm12, %xmm12	# xmm12/xmm13: accumulators for row 1
	pxor	%xmm13, %xmm13
	prefetcht0     -3 * SIZE(CO1)

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax	# inner loop is unrolled 4x
	je	.L112

/* .L111: GEMM inner loop for the 2x1 tile, four k-steps per pass.      */
.L111:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1

	movapd	-16 * SIZE(BO), %xmm2
	movapd	%xmm2, %xmm3
	movapd	-14 * SIZE(BO), %xmm4
	movapd	%xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	-12 * SIZE(AO), %xmm0
	movapd	-10 * SIZE(AO), %xmm1

	movapd	-12 * SIZE(BO), %xmm2
	movapd	%xmm2, %xmm3
	movapd	-10 * SIZE(BO), %xmm4
	movapd	%xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	-8 * SIZE(AO), %xmm0
	movapd	-6 * SIZE(AO), %xmm1

	movapd	-8 * SIZE(BO), %xmm2
	movapd	%xmm2, %xmm3
	movapd	-6 * SIZE(BO), %xmm4
	movapd	%xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	movapd	-4 * SIZE(AO), %xmm0
	movapd	-2 * SIZE(AO), %xmm1

	movapd	-4 * SIZE(BO), %xmm2
	movapd	%xmm2, %xmm3
	movapd	-2 * SIZE(BO), %xmm4
	movapd	%xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	subq	$-16 * SIZE, AO
	subq	$-16 * SIZE, BO
	subq	$1, %rax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV, %xmm7	# sign mask for the complex-arithmetic fixups
	andq	$3, %rax	# remaining k iterations (k & 3)
	BRANCH
	jle	.L114

/* .L113: k-remainder loop for the 2x1 tile, one k-step at a time.      */
.L113:
	movapd	-16 * SIZE(AO), %xmm0
	movapd	-14 * SIZE(AO), %xmm1

	movapd	-16 * SIZE(BO), %xmm2
	movapd	%xmm2, %xmm3
	movapd	-14 * SIZE(BO), %xmm4
	movapd	%xmm4, %xmm5

	mulpd	%xmm0, %xmm2
	mulpd	%xmm1, %xmm3
	mulpd	%xmm0, %xmm4
	mulpd	%xmm1, %xmm5

	ADD1	%xmm2, %xmm8
	ADD1	%xmm3, %xmm12
	ADD2	%xmm4, %xmm9
	ADD2	%xmm5, %xmm13

	addq	$4 * SIZE, AO
	addq	$4 * SIZE, BO
	subq	$1, %rax
	jg	.L113
	ALIGN_4

/* .L114: combine accumulators and run the triangular solve for the     */
/* 2x1 tile (a 2x2 system for left-side solves, scalar for right).      */
.L114:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax	# tile is two rows wide on the left side
#else
	subq	$1, %rax	# single column on the right side
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	16 * SIZE + BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B, %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm9, %xmm9	# swap halves of the second accumulators
	SHUFPD_1 %xmm13, %xmm13

	/* Combine the accumulator pairs into complex values; sign fixups  */
	/* depend on the conjugation variant being built.                  */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm7, %xmm9
	xorpd	%xmm7, %xmm13
#else
	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm12
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9, %xmm8
	addpd	%xmm13, %xmm12
#endif

	/* rhs - accumulated product; rhs is in B for LN/LT, in AO else.   */
#if defined(LN) || defined(LT)
	movapd	-16 * SIZE(B), %xmm9
	movapd	-14 * SIZE(B), %xmm13

	subpd	%xmm8, %xmm9
	subpd	%xmm12, %xmm13
#else
	movapd	-16 * SIZE(AO), %xmm9
	movapd	-14 * SIZE(AO), %xmm13

	subpd	%xmm8, %xmm9
	subpd	%xmm12, %xmm13
#endif

#ifndef CONJ
	SHUFPD_1 %xmm7, %xmm7
#endif

/* Each complex multiply below: half-swap (pshufd 0x4e), sign flip via  */
/* xmm7, two mulpd, add.  Diagonal entries are multiplied, not divided  */
/* - presumably pre-inverted by the packing code; TODO confirm against  */
/* the packing routines (not visible in this chunk).                    */
#ifdef LN
	/* Backward substitution over the packed 2x2 A tile: solve the     */
	/* second unknown, eliminate it from the first row, solve first.   */
	movddup	-10 * SIZE(AO), %xmm0	# second diagonal entry, real dup
	movddup	-9 * SIZE(AO), %xmm1	# second diagonal entry, imag dup
	movddup	-12 * SIZE(AO), %xmm2	# off-diagonal entry, real dup
	movddup	-11 * SIZE(AO), %xmm3	# off-diagonal entry, imag dup
	movddup	-16 * SIZE(AO), %xmm4	# first diagonal entry, real dup
	movddup	-15 * SIZE(AO), %xmm5	# first diagonal entry, imag dup

	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	%xmm7, %xmm12

	mulpd	%xmm0, %xmm13
	mulpd	%xmm1, %xmm12

	addpd	%xmm12, %xmm13	# x1 solved

	movapd	%xmm13, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	%xmm7, %xmm12

	mulpd	%xmm2, %xmm8
	mulpd	%xmm3, %xmm12

	subpd	%xmm8, %xmm9	# eliminate x1 from the first equation
	subpd	%xmm12, %xmm9

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	%xmm7, %xmm8

	mulpd	%xmm4, %xmm9
	mulpd	%xmm5, %xmm8

	addpd	%xmm8, %xmm9	# x0 solved
#endif

#ifdef LT
	/* Forward substitution: solve the first unknown, eliminate it     */
	/* from the second row, then solve the second.                     */
	movddup	-16 * SIZE(AO), %xmm0	# first diagonal entry, real dup
	movddup	-15 * SIZE(AO), %xmm1	# first diagonal entry, imag dup
	movddup	-14 * SIZE(AO), %xmm2	# off-diagonal entry, real dup
	movddup	-13 * SIZE(AO), %xmm3	# off-diagonal entry, imag dup
	movddup	-10 * SIZE(AO), %xmm4	# second diagonal entry, real dup
	movddup	-9 * SIZE(AO), %xmm5	# second diagonal entry, imag dup

	pshufd	$0x4e, %xmm9, %xmm8

	xorpd	%xmm7, %xmm8

	mulpd	%xmm0, %xmm9
	mulpd	%xmm1, %xmm8

	addpd	%xmm8, %xmm9	# x0 solved

	movapd	%xmm9, %xmm8
	pshufd	$0x4e, %xmm9, %xmm12

	xorpd	%xmm7, %xmm12

	mulpd	%xmm2, %xmm8
	mulpd	%xmm3, %xmm12

	subpd	%xmm8, %xmm13	# eliminate x0 from the second equation
	subpd	%xmm12, %xmm13

	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	%xmm7, %xmm12

	mulpd	%xmm4, %xmm13
	mulpd	%xmm5, %xmm12

	addpd	%xmm12, %xmm13	# x1 solved
#endif

#ifdef RN
	/* Right-side solve with a 1x1 B diagonal: scale both rows.        */
	movddup	-16 * SIZE(B), %xmm0	# diag real, duplicated
	movddup	-15 * SIZE(B), %xmm1	# diag imag, duplicated

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm12

	mulpd	%xmm0, %xmm9
	mulpd	%xmm1, %xmm8
	mulpd	%xmm0, %xmm13
	mulpd	%xmm1, %xmm12

	addpd	%xmm8, %xmm9
	addpd	%xmm12, %xmm13
#endif

#ifdef RT
	/* Same scalar scaling for the RT variant.                         */
	movddup	-16 * SIZE(B), %xmm0	# diag real, duplicated
	movddup	-15 * SIZE(B), %xmm1	# diag imag, duplicated

	pshufd	$0x4e, %xmm9, %xmm8
	pshufd	$0x4e, %xmm13, %xmm12

	xorpd	%xmm7, %xmm8
	xorpd	%xmm7, %xmm12

	mulpd	%xmm0, %xmm9
	mulpd	%xmm1, %xmm8
	mulpd	%xmm0, %xmm13
	mulpd	%xmm1, %xmm12

	addpd	%xmm8, %xmm9
	addpd	%xmm12, %xmm13
#endif

#ifdef LN
	subq	$4 * SIZE, CO1	# LN writes below the running C pointer
#endif

	# Store both solved elements to C (scalar halves: C may be
	# unaligned).
	movsd	%xmm9, 0 * SIZE(CO1)
	movhpd	%xmm9, 1 * SIZE(CO1)
	movsd	%xmm13, 2 * SIZE(CO1)
	movhpd	%xmm13, 3 * SIZE(CO1)

	# Write the results back into the packed operands so subsequent
	# tiles of the substitution see the updated values: B plus the
	# expanded buffer BO for left-side solves, the A tile otherwise.
#if defined(LN) || defined(LT)
	movapd	%xmm9, -16 * SIZE(B)
	movapd	%xmm13, -14 * SIZE(B)

	movddup	%xmm9, %xmm8
	unpckhpd %xmm9, %xmm9
	movddup	%xmm13, %xmm12
	unpckhpd %xmm13, %xmm13

	movapd	%xmm8, -16 * SIZE(BO)
	movapd	%xmm9, -14 * SIZE(BO)
	movapd	%xmm12, -12 * SIZE(BO)
	movapd	%xmm13, -10 * SIZE(BO)
#else
	movapd	%xmm9, -16 * SIZE(AO)
	movapd	%xmm13, -14 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

	# Per-tile pointer / offset bookkeeping for the next 2x1 tile.
#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO	# skip the rest of the k panel
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif

	decq	I	# i --
	jg	.L110
	ALIGN_4

/* .L199: end of the remainder column - advance B and the KK offset.    */
.L199:
#ifdef LN
	leaq	(, K, SIZE), %rax
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B, %rax, 1 * COMPSIZE), B
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4


/* .L999: epilogue - restore callee-saved state and return.             */
.L999:
	movq	%r15, %rsp	# NOTE(review): %r15 appears to hold the
				# saved stack pointer from the (truncated)
				# prologue - confirm against the full file.

	movq	0(%rsp), %rbx
	movq	8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* Win64 ABI: rdi/rsi and xmm6-xmm15 are callee-saved too.         */
	movq	48(%rsp), %rdi
	movq	56(%rsp), %rsi
	movups	64(%rsp), %xmm6
	movups	80(%rsp), %xmm7
	movups	96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE