1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define OLD_M %rdi 43#define OLD_N %rsi 44#define OLD_K %rdx 45 46#define M %r13 47#define N %r14 48#define K %r15 49 50#define A %rcx 51#define B %r8 52#define C %r9 53#define LDC %r10 54 55#define I %r11 56#define AO %rdi 57#define BO %rsi 58#define CO1 %rbx 59#define CO2 %rbp 60#define KK %rdx 61#define BB %r12 62 63#ifndef WINDOWS_ABI 64 65#define STACKSIZE 128 66 67#define OLD_LDC 8 + STACKSIZE(%rsp) 68#define OLD_OFFSET 16 + STACKSIZE(%rsp) 69 70#define OFFSET 48(%rsp) 71#define J 56(%rsp) 72#define KKK 64(%rsp) 73#define AORIG 72(%rsp) 74 75#else 76 77#define STACKSIZE 512 78 79#define OLD_A 40 + STACKSIZE(%rsp) 80#define OLD_B 48 + STACKSIZE(%rsp) 81#define OLD_C 56 + STACKSIZE(%rsp) 82#define OLD_LDC 64 + STACKSIZE(%rsp) 83#define OLD_OFFSET 72 + STACKSIZE(%rsp) 84 85#define OFFSET 224(%rsp) 86#define J 232(%rsp) 87#define KKK 240(%rsp) 88#define AORIG 248(%rsp) 89 90#endif 91 92#define PREFETCHSIZE (8 * 1 - 4) 93#define PREFETCH prefetcht0 94 95 PROLOGUE 96 PROFCODE 97 98 subq $STACKSIZE, %rsp 99 100 movq %rbx, 0(%rsp) 101 movq %rbp, 8(%rsp) 102 movq %r12, 16(%rsp) 103 movq %r13, 24(%rsp) 104 movq %r14, 32(%rsp) 105 movq %r15, 40(%rsp) 106 107#ifdef WINDOWS_ABI 108 movq %rdi, 48(%rsp) 109 movq %rsi, 56(%rsp) 110 movups %xmm6, 64(%rsp) 111 movups %xmm7, 80(%rsp) 112 movups %xmm8, 96(%rsp) 113 movups %xmm9, 112(%rsp) 114 movups %xmm10, 128(%rsp) 115 movups %xmm11, 144(%rsp) 116 movups %xmm12, 160(%rsp) 117 movups %xmm13, 176(%rsp) 118 movups %xmm14, 192(%rsp) 119 movups %xmm15, 208(%rsp) 120 121 movq ARG1, OLD_M 122 movq ARG2, OLD_N 123 movq ARG3, OLD_K 124 movq OLD_A, A 125 movq OLD_B, B 126 movq OLD_C, C 127#endif 128 129 subq $-16 * SIZE, A 130 subq $-16 * SIZE, B 131 132 movq OLD_M, M 133 movq OLD_N, N 134 movq OLD_K, K 135 136 movq OLD_LDC, LDC 137 movq OLD_OFFSET, KK 138 139 140 leaq (, LDC, SIZE), LDC 141 142 movq KK, OFFSET 143 negq KK 144 145#ifdef LN 146 leaq (, M, SIZE), %rax 147 addq %rax, C 148 imulq K, %rax 149 addq %rax, A 150#endif 151 152#ifdef RT 153 leaq (, N, SIZE), %rax 154 imulq K, %rax 155 addq %rax, B 156 movq N, %rax 157 imulq LDC, %rax 158 addq %rax, C 159#endif 160 161#ifdef RT 162 movq N, %rax 163 subq OFFSET, %rax 164 movq %rax, KK 165#endif 166 167 movq N, J 168 sarq $3, J 169 NOBRANCH 170 jle .L30 171 ALIGN_4 172 173.L01: 174#if defined(LT) || defined(RN) 175 movq A, AO 176#else 177 movq A, AORIG 178#endif 179 180#ifdef RT 181 movq K, %rax 182 salq $3 + BASE_SHIFT, %rax 183 subq %rax, B 184 185 leaq (, LDC, 8), %rax 186 subq %rax, C 187#endif 188 189 movq C, CO1 190 leaq (C, LDC, 4), CO2 191#ifndef RT 192 leaq (C, LDC, 8), C 193#endif 194 195#ifdef LN 196 movq OFFSET, %rax 197 addq M, %rax 198 movq %rax, KK 199#endif 200 201#ifdef LT 202 movq OFFSET, %rax 203 movq %rax, KK 204#endif 205 206 movq K, %rax 207 salq $BASE_SHIFT + 3, %rax 208 leaq (B, %rax), BB 209 210 movq M, I 211 sarq $1, I 212 NOBRANCH 213 jle .L20 214 ALIGN_4 215 216.L11: 217#ifdef LN 218 movq K, %rax 219 salq $1 + BASE_SHIFT, %rax 220 subq %rax, AORIG 221#endif 222 223#if defined(LN) || defined(RT) 224 movq KK, %rax 225 leaq (, %rax, SIZE), %rax 226 movq AORIG, AO 227 leaq (AO, %rax, 2), AO 228 leaq (B, %rax, 8), BO 229#else 230 movq B, BO 231#endif 232 233 prefetcht0 -16 * SIZE(BB) 234 subq $-8 * SIZE, BB 235 236 xorps %xmm1, %xmm1 237 movapd -16 * SIZE(AO), %xmm0 238 xorps %xmm2, %xmm2 239 xorps %xmm3, %xmm3 240 xorps %xmm4, %xmm4 241 242 leaq (LDC, LDC, 2), %rax 243 244 xorps %xmm8, %xmm8 245 prefetcht0 1 * SIZE(CO1) 246 xorps %xmm9, %xmm9 247 prefetcht0 2 * SIZE(CO1, LDC, 1) 248 xorps %xmm10, %xmm10 249 prefetcht0 1 * SIZE(CO1, LDC, 2) 250 xorps %xmm11, %xmm11 251 prefetcht0 2 * SIZE(CO1, %rax, 1) 252 253 xorps %xmm12, %xmm12 254 prefetcht0 1 * SIZE(CO2) 255 xorps %xmm13, %xmm13 256 prefetcht0 2 * SIZE(CO2, LDC, 1) 257 xorps %xmm14, %xmm14 258 prefetcht0 1 * SIZE(CO2, LDC, 2) 259 xorps %xmm15, %xmm15 260 prefetcht0 2 * SIZE(CO2, %rax, 1) 261 262#if defined(LT) || defined(RN) 263 movq KK, %rax 264#else 265 movq K, %rax 266 subq KK, %rax 267#endif 268 sarq $2, %rax 269 NOBRANCH 270 jle .L15 271 ALIGN_3 272 273.L12: 274 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 275 276 addpd %xmm1, %xmm12 277 movaps -16 * SIZE(BO), %xmm6 278 addpd %xmm2, %xmm13 279 pshufd $0x4e, %xmm6, %xmm2 280 mulpd %xmm0, %xmm6 281 mulpd %xmm0, %xmm2 282 283 addpd %xmm3, %xmm14 284 movaps -14 * SIZE(BO), %xmm3 285 addpd %xmm4, %xmm15 286 pshufd $0x4e, %xmm3, %xmm4 287 mulpd %xmm0, %xmm3 288 mulpd %xmm0, %xmm4 289 290 addpd %xmm6, %xmm8 291 movaps -12 * SIZE(BO), %xmm6 292 addpd %xmm2, %xmm9 293 pshufd $0x4e, %xmm6, %xmm2 294 mulpd %xmm0, %xmm6 295 mulpd %xmm0, %xmm2 296 297 addpd %xmm3, %xmm10 298 movaps -10 * SIZE(BO), %xmm3 299 addpd %xmm4, %xmm11 300 pshufd $0x4e, %xmm3, %xmm4 301 mulpd %xmm0, %xmm3 302 mulpd %xmm0, %xmm4 303 304 addpd %xmm6, %xmm12 305 movaps -8 * SIZE(BO), %xmm1 306 addpd %xmm2, %xmm13 307 movaps -14 * SIZE(AO), %xmm5 308 pshufd $0x4e, %xmm1, %xmm2 309 mulpd %xmm5, %xmm1 310 mulpd %xmm5, %xmm2 311 312 addpd %xmm3, %xmm14 313 movaps -6 * SIZE(BO), %xmm3 314 addpd %xmm4, %xmm15 315 pshufd $0x4e, %xmm3, %xmm4 316 mulpd %xmm5, %xmm3 317 mulpd %xmm5, %xmm4 318 319 addpd %xmm1, %xmm8 320 movaps -4 * SIZE(BO), %xmm1 321 addpd %xmm2, %xmm9 322 pshufd $0x4e, %xmm1, %xmm2 323 mulpd %xmm5, %xmm1 324 mulpd %xmm5, %xmm2 325 326 addpd %xmm3, %xmm10 327 movaps -2 * SIZE(BO), %xmm3 328 addpd %xmm4, %xmm11 329 pshufd $0x4e, %xmm3, %xmm4 330 movaps -12 * SIZE(AO), %xmm0 331 mulpd %xmm5, %xmm3 332 mulpd %xmm5, %xmm4 333 334 addpd %xmm1, %xmm12 335 movaps 0 * SIZE(BO), %xmm6 336 addpd %xmm2, %xmm13 337 pshufd $0x4e, %xmm6, %xmm2 338 mulpd %xmm0, %xmm6 339 mulpd %xmm0, %xmm2 340 341 addpd %xmm3, %xmm14 342 movaps 2 * SIZE(BO), %xmm3 343 addpd %xmm4, %xmm15 344 pshufd $0x4e, %xmm3, %xmm4 345 mulpd %xmm0, %xmm3 346 mulpd %xmm0, %xmm4 347 348 addpd %xmm6, %xmm8 349 movaps 4 * SIZE(BO), %xmm6 350 addpd %xmm2, %xmm9 351 pshufd $0x4e, %xmm6, %xmm2 352 mulpd %xmm0, %xmm6 353 mulpd %xmm0, %xmm2 354 355 addpd %xmm3, %xmm10 356 movaps 6 * SIZE(BO), %xmm3 357 addpd %xmm4, %xmm11 358 pshufd $0x4e, %xmm3, %xmm4 359 mulpd %xmm0, %xmm3 360 mulpd %xmm0, %xmm4 361 362 addpd %xmm6, %xmm12 363 movaps 8 * SIZE(BO), %xmm1 364 addpd %xmm2, %xmm13 365 movaps -10 * SIZE(AO), %xmm5 366 pshufd $0x4e, %xmm1, %xmm2 367 mulpd %xmm5, %xmm1 368 mulpd %xmm5, %xmm2 369 370 addpd %xmm3, %xmm14 371 movaps 10 * SIZE(BO), %xmm3 372 addpd %xmm4, %xmm15 373 pshufd $0x4e, %xmm3, %xmm4 374 mulpd %xmm5, %xmm3 375 mulpd %xmm5, %xmm4 376 377 addpd %xmm1, %xmm8 378 movaps 12 * SIZE(BO), %xmm1 379 addpd %xmm2, %xmm9 380 pshufd $0x4e, %xmm1, %xmm2 381 mulpd %xmm5, %xmm1 382 mulpd %xmm5, %xmm2 383 384 addpd %xmm3, %xmm10 385 movaps 14 * SIZE(BO), %xmm3 386 addpd %xmm4, %xmm11 387 pshufd $0x4e, %xmm3, %xmm4 388 movaps -8 * SIZE(AO), %xmm0 389 mulpd %xmm5, %xmm3 390 mulpd %xmm5, %xmm4 391 392 addq $32 * SIZE, BO 393 subq $-8 * SIZE, AO 394 decq %rax 395 BRANCH 396 jg .L12 397 ALIGN_3 398 399.L15: 400#if defined(LT) || defined(RN) 401 movq KK, %rax 402#else 403 movq K, %rax 404 subq KK, %rax 405#endif 406 andq $3, %rax # if (k & 1) 407 BRANCH 408 je .L18 409 ALIGN_3 410 411.L16: 412 addpd %xmm1, %xmm12 413 movaps -16 * SIZE(BO), %xmm1 414 addpd %xmm2, %xmm13 415 pshufd $0x4e, %xmm1, %xmm2 416 mulpd %xmm0, %xmm1 417 mulpd %xmm0, %xmm2 418 419 addpd %xmm3, %xmm14 420 movaps -14 * SIZE(BO), %xmm3 421 addpd %xmm4, %xmm15 422 pshufd $0x4e, %xmm3, %xmm4 423 mulpd %xmm0, %xmm3 424 mulpd %xmm0, %xmm4 425 426 addpd %xmm1, %xmm8 427 movaps -12 * SIZE(BO), %xmm1 428 addpd %xmm2, %xmm9 429 pshufd $0x4e, %xmm1, %xmm2 430 mulpd %xmm0, %xmm1 431 mulpd %xmm0, %xmm2 432 433 addpd %xmm3, %xmm10 434 movaps -10 * SIZE(BO), %xmm3 435 addpd %xmm4, %xmm11 436 pshufd $0x4e, %xmm3, %xmm4 437 mulpd %xmm0, %xmm3 438 mulpd %xmm0, %xmm4 439 440 movaps -14 * SIZE(AO), %xmm0 441 442 addq $2 * SIZE, AO 443 addq $8 * SIZE, BO 444 445 subq $1, %rax 446 BRANCH 447 jg .L16 448 ALIGN_4 449 450.L18: 451#if defined(LN) || defined(RT) 452 movq KK, %rax 453#ifdef LN 454 subq $2, %rax 455#else 456 subq $8, %rax 457#endif 458 459 leaq (, %rax, SIZE), %rax 460 461 movq AORIG, AO 462 leaq (AO, %rax, 2), AO 463 leaq (B, %rax, 8), BO 464#endif 465 466 addpd %xmm1, %xmm12 467 addpd %xmm2, %xmm13 468 addpd %xmm3, %xmm14 469 addpd %xmm4, %xmm15 470 471#if defined(LN) || defined(LT) 472 movaps %xmm8, %xmm0 473 shufpd $0, %xmm9, %xmm8 474 shufpd $3, %xmm0, %xmm9 475 476 movaps %xmm10, %xmm0 477 shufpd $0, %xmm11, %xmm10 478 shufpd $3, %xmm0, %xmm11 479 480 movaps %xmm12, %xmm0 481 shufpd $0, %xmm13, %xmm12 482 shufpd $3, %xmm0, %xmm13 483 484 movaps %xmm14, %xmm0 485 shufpd $0, %xmm15, %xmm14 486 shufpd $3, %xmm0, %xmm15 487 488 movapd -16 * SIZE(BO), %xmm0 489 movapd -14 * SIZE(BO), %xmm2 490 movapd -12 * SIZE(BO), %xmm4 491 movapd -10 * SIZE(BO), %xmm6 492 movapd -8 * SIZE(BO), %xmm1 493 movapd -6 * SIZE(BO), %xmm3 494 movapd -4 * SIZE(BO), %xmm5 495 movapd -2 * SIZE(BO), %xmm7 496#else 497 movaps %xmm8, %xmm0 498 shufpd $2, %xmm9, %xmm8 499 shufpd $2, %xmm0, %xmm9 500 501 movaps %xmm10, %xmm0 502 shufpd $2, %xmm11, %xmm10 503 shufpd $2, %xmm0, %xmm11 504 505 movaps %xmm12, %xmm0 506 shufpd $2, %xmm13, %xmm12 507 shufpd $2, %xmm0, %xmm13 508 509 movaps %xmm14, %xmm0 510 shufpd $2, %xmm15, %xmm14 511 shufpd $2, %xmm0, %xmm15 512 513 movapd -16 * SIZE(AO), %xmm0 514 movapd -14 * SIZE(AO), %xmm1 515 movapd -12 * SIZE(AO), %xmm2 516 movapd -10 * SIZE(AO), %xmm3 517 518 movapd -8 * SIZE(AO), %xmm4 519 movapd -6 * SIZE(AO), %xmm5 520 movapd -4 * SIZE(AO), %xmm6 521 movapd -2 * SIZE(AO), %xmm7 522#endif 523 524 subpd %xmm8, %xmm0 525 subpd %xmm9, %xmm1 526 subpd %xmm10, %xmm2 527 subpd %xmm11, %xmm3 528 subpd %xmm12, %xmm4 529 subpd %xmm13, %xmm5 530 subpd %xmm14, %xmm6 531 subpd %xmm15, %xmm7 532 533#ifdef LN 534 movddup -13 * SIZE(AO), %xmm8 535 mulpd %xmm8, %xmm1 536 mulpd %xmm8, %xmm3 537 mulpd %xmm8, %xmm5 538 mulpd %xmm8, %xmm7 539 540 movddup -14 * SIZE(AO), %xmm12 541 movapd %xmm12, %xmm13 542 movapd %xmm12, %xmm14 543 movapd %xmm12, %xmm15 544 545 mulpd %xmm1, %xmm12 546 mulpd %xmm3, %xmm13 547 mulpd %xmm5, %xmm14 548 mulpd %xmm7, %xmm15 549 550 subpd %xmm12, %xmm0 551 subpd %xmm13, %xmm2 552 subpd %xmm14, %xmm4 553 subpd %xmm15, %xmm6 554 555 movddup -16 * SIZE(AO), %xmm8 556 mulpd %xmm8, %xmm0 557 mulpd %xmm8, %xmm2 558 mulpd %xmm8, %xmm4 559 mulpd %xmm8, %xmm6 560#endif 561 562#ifdef LT 563 movddup -16 * SIZE(AO), %xmm8 564 mulpd %xmm8, %xmm0 565 mulpd %xmm8, %xmm2 566 mulpd %xmm8, %xmm4 567 mulpd %xmm8, %xmm6 568 569 movddup -15 * SIZE(AO), %xmm12 570 movapd %xmm12, %xmm13 571 movapd %xmm12, %xmm14 572 movapd %xmm12, %xmm15 573 574 mulpd %xmm0, %xmm12 575 mulpd %xmm2, %xmm13 576 mulpd %xmm4, %xmm14 577 mulpd %xmm6, %xmm15 578 579 subpd %xmm12, %xmm1 580 subpd %xmm13, %xmm3 581 subpd %xmm14, %xmm5 582 subpd %xmm15, %xmm7 583 584 movddup -13 * SIZE(AO), %xmm8 585 mulpd %xmm8, %xmm1 586 mulpd %xmm8, %xmm3 587 mulpd %xmm8, %xmm5 588 mulpd %xmm8, %xmm7 589#endif 590 591#ifdef RN 592 movddup -16 * SIZE(BO), %xmm8 593 mulpd %xmm8, %xmm0 594 movddup -15 * SIZE(BO), %xmm9 595 mulpd %xmm0, %xmm9 596 subpd %xmm9, %xmm1 597 movddup -14 * SIZE(BO), %xmm10 598 mulpd %xmm0, %xmm10 599 subpd %xmm10, %xmm2 600 movddup -13 * SIZE(BO), %xmm11 601 mulpd %xmm0, %xmm11 602 subpd %xmm11, %xmm3 603 movddup -12 * SIZE(BO), %xmm12 604 mulpd %xmm0, %xmm12 605 subpd %xmm12, %xmm4 606 movddup -11 * SIZE(BO), %xmm13 607 mulpd %xmm0, %xmm13 608 subpd %xmm13, %xmm5 609 movddup -10 * SIZE(BO), %xmm14 610 mulpd %xmm0, %xmm14 611 subpd %xmm14, %xmm6 612 movddup -9 * SIZE(BO), %xmm15 613 mulpd %xmm0, %xmm15 614 subpd %xmm15, %xmm7 615 616 movddup -7 * SIZE(BO), %xmm9 617 mulpd %xmm9, %xmm1 618 movddup -6 * SIZE(BO), %xmm10 619 mulpd %xmm1, %xmm10 620 subpd %xmm10, %xmm2 621 movddup -5 * SIZE(BO), %xmm11 622 mulpd %xmm1, %xmm11 623 subpd %xmm11, %xmm3 624 movddup -4 * SIZE(BO), %xmm12 625 mulpd %xmm1, %xmm12 626 subpd %xmm12, %xmm4 627 movddup -3 * SIZE(BO), %xmm13 628 mulpd %xmm1, %xmm13 629 subpd %xmm13, %xmm5 630 movddup -2 * SIZE(BO), %xmm14 631 mulpd %xmm1, %xmm14 632 subpd %xmm14, %xmm6 633 movddup -1 * SIZE(BO), %xmm15 634 mulpd %xmm1, %xmm15 635 subpd %xmm15, %xmm7 636 637 movddup 2 * SIZE(BO), %xmm10 638 mulpd %xmm10, %xmm2 639 movddup 3 * SIZE(BO), %xmm11 640 mulpd %xmm2, %xmm11 641 subpd %xmm11, %xmm3 642 movddup 4 * SIZE(BO), %xmm12 643 mulpd %xmm2, %xmm12 644 subpd %xmm12, %xmm4 645 movddup 5 * SIZE(BO), %xmm13 646 mulpd %xmm2, %xmm13 647 subpd %xmm13, %xmm5 648 movddup 6 * SIZE(BO), %xmm14 649 mulpd %xmm2, %xmm14 650 subpd %xmm14, %xmm6 651 movddup 7 * SIZE(BO), %xmm15 652 mulpd %xmm2, %xmm15 653 subpd %xmm15, %xmm7 654 655 movddup 11 * SIZE(BO), %xmm11 656 mulpd %xmm11, %xmm3 657 movddup 12 * SIZE(BO), %xmm12 658 mulpd %xmm3, %xmm12 659 subpd %xmm12, %xmm4 660 movddup 13 * SIZE(BO), %xmm13 661 mulpd %xmm3, %xmm13 662 subpd %xmm13, %xmm5 663 movddup 14 * SIZE(BO), %xmm14 664 mulpd %xmm3, %xmm14 665 subpd %xmm14, %xmm6 666 movddup 15 * SIZE(BO), %xmm15 667 mulpd %xmm3, %xmm15 668 subpd %xmm15, %xmm7 669 670 movddup 20 * SIZE(BO), %xmm12 671 mulpd %xmm12, %xmm4 672 movddup 21 * SIZE(BO), %xmm13 673 mulpd %xmm4, %xmm13 674 subpd %xmm13, %xmm5 675 movddup 22 * SIZE(BO), %xmm14 676 mulpd %xmm4, %xmm14 677 subpd %xmm14, %xmm6 678 movddup 23 * SIZE(BO), %xmm15 679 mulpd %xmm4, %xmm15 680 subpd %xmm15, %xmm7 681 682 movddup 29 * SIZE(BO), %xmm13 683 mulpd %xmm13, %xmm5 684 movddup 30 * SIZE(BO), %xmm14 685 mulpd %xmm5, %xmm14 686 subpd %xmm14, %xmm6 687 movddup 31 * SIZE(BO), %xmm15 688 mulpd %xmm5, %xmm15 689 subpd %xmm15, %xmm7 690 691 movddup 38 * SIZE(BO), %xmm14 692 mulpd %xmm14, %xmm6 693 movddup 39 * SIZE(BO), %xmm15 694 mulpd %xmm6, %xmm15 695 subpd %xmm15, %xmm7 696 697 movddup 47 * SIZE(BO), %xmm15 698 mulpd %xmm15, %xmm7 699#endif 700 701#ifdef RT 702 movddup 47 * SIZE(BO), %xmm8 703 mulpd %xmm8, %xmm7 704 movddup 46 * SIZE(BO), %xmm9 705 mulpd %xmm7, %xmm9 706 subpd %xmm9, %xmm6 707 movddup 45 * SIZE(BO), %xmm10 708 mulpd %xmm7, %xmm10 709 subpd %xmm10, %xmm5 710 movddup 44 * SIZE(BO), %xmm11 711 mulpd %xmm7, %xmm11 712 subpd %xmm11, %xmm4 713 movddup 43 * SIZE(BO), %xmm12 714 mulpd %xmm7, %xmm12 715 subpd %xmm12, %xmm3 716 movddup 42 * SIZE(BO), %xmm13 717 mulpd %xmm7, %xmm13 718 subpd %xmm13, %xmm2 719 movddup 41 * SIZE(BO), %xmm14 720 mulpd %xmm7, %xmm14 721 subpd %xmm14, %xmm1 722 movddup 40 * SIZE(BO), %xmm15 723 mulpd %xmm7, %xmm15 724 subpd %xmm15, %xmm0 725 726 movddup 38 * SIZE(BO), %xmm9 727 mulpd %xmm9, %xmm6 728 movddup 37 * SIZE(BO), %xmm10 729 mulpd %xmm6, %xmm10 730 subpd %xmm10, %xmm5 731 movddup 36 * SIZE(BO), %xmm11 732 mulpd %xmm6, %xmm11 733 subpd %xmm11, %xmm4 734 movddup 35 * SIZE(BO), %xmm12 735 mulpd %xmm6, %xmm12 736 subpd %xmm12, %xmm3 737 movddup 34 * SIZE(BO), %xmm13 738 mulpd %xmm6, %xmm13 739 subpd %xmm13, %xmm2 740 movddup 33 * SIZE(BO), %xmm14 741 mulpd %xmm6, %xmm14 742 subpd %xmm14, %xmm1 743 movddup 32 * SIZE(BO), %xmm15 744 mulpd %xmm6, %xmm15 745 subpd %xmm15, %xmm0 746 747 movddup 29 * SIZE(BO), %xmm10 748 mulpd %xmm10, %xmm5 749 movddup 28 * SIZE(BO), %xmm11 750 mulpd %xmm5, %xmm11 751 subpd %xmm11, %xmm4 752 movddup 27 * SIZE(BO), %xmm12 753 mulpd %xmm5, %xmm12 754 subpd %xmm12, %xmm3 755 movddup 26 * SIZE(BO), %xmm13 756 mulpd %xmm5, %xmm13 757 subpd %xmm13, %xmm2 758 movddup 25 * SIZE(BO), %xmm14 759 mulpd %xmm5, %xmm14 760 subpd %xmm14, %xmm1 761 movddup 24 * SIZE(BO), %xmm15 762 mulpd %xmm5, %xmm15 763 subpd %xmm15, %xmm0 764 765 movddup 20 * SIZE(BO), %xmm11 766 mulpd %xmm11, %xmm4 767 movddup 19 * SIZE(BO), %xmm12 768 mulpd %xmm4, %xmm12 769 subpd %xmm12, %xmm3 770 movddup 18 * SIZE(BO), %xmm13 771 mulpd %xmm4, %xmm13 772 subpd %xmm13, %xmm2 773 movddup 17 * SIZE(BO), %xmm14 774 mulpd %xmm4, %xmm14 775 subpd %xmm14, %xmm1 776 movddup 16 * SIZE(BO), %xmm15 777 mulpd %xmm4, %xmm15 778 subpd %xmm15, %xmm0 779 780 movddup 11 * SIZE(BO), %xmm12 781 mulpd %xmm12, %xmm3 782 movddup 10 * SIZE(BO), %xmm13 783 mulpd %xmm3, %xmm13 784 subpd %xmm13, %xmm2 785 movddup 9 * SIZE(BO), %xmm14 786 mulpd %xmm3, %xmm14 787 subpd %xmm14, %xmm1 788 movddup 8 * SIZE(BO), %xmm15 789 mulpd %xmm3, %xmm15 790 subpd %xmm15, %xmm0 791 792 movddup 2 * SIZE(BO), %xmm13 793 mulpd %xmm13, %xmm2 794 movddup 1 * SIZE(BO), %xmm14 795 mulpd %xmm2, %xmm14 796 subpd %xmm14, %xmm1 797 movddup 0 * SIZE(BO), %xmm15 798 mulpd %xmm2, %xmm15 799 subpd %xmm15, %xmm0 800 801 movddup -7 * SIZE(BO), %xmm14 802 mulpd %xmm14, %xmm1 803 movddup -8 * SIZE(BO), %xmm15 804 mulpd %xmm1, %xmm15 805 subpd %xmm15, %xmm0 806 807 movddup -16 * SIZE(BO), %xmm15 808 mulpd %xmm15, %xmm0 809#endif 810 811 812#ifdef LN 813 subq $2 * SIZE, CO1 814 subq $2 * SIZE, CO2 815#endif 816 817#if defined(LN) || defined(LT) 818 movapd %xmm0, -16 * SIZE(BO) 819 movapd %xmm2, -14 * SIZE(BO) 820 movapd %xmm4, -12 * SIZE(BO) 821 movapd %xmm6, -10 * SIZE(BO) 822 movapd %xmm1, -8 * SIZE(BO) 823 movapd %xmm3, -6 * SIZE(BO) 824 movapd %xmm5, -4 * SIZE(BO) 825 movapd %xmm7, -2 * SIZE(BO) 826#else 827 movapd %xmm0, -16 * SIZE(AO) 828 movapd %xmm1, -14 * SIZE(AO) 829 movapd %xmm2, -12 * SIZE(AO) 830 movapd %xmm3, -10 * SIZE(AO) 831 movapd %xmm4, -8 * SIZE(AO) 832 movapd %xmm5 , -6 * SIZE(AO) 833 movapd %xmm6, -4 * SIZE(AO) 834 movapd %xmm7, -2 * SIZE(AO) 835#endif 836 837 leaq (LDC, LDC, 2), %rax 838 839#if defined(LN) || defined(LT) 840 movsd %xmm0, 0 * SIZE(CO1) 841 movsd %xmm1, 1 * SIZE(CO1) 842 movhps %xmm0, 0 * SIZE(CO1, LDC, 1) 843 movhps %xmm1, 1 * SIZE(CO1, LDC, 1) 844 845 movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 846 movsd %xmm3, 1 * SIZE(CO1, LDC, 2) 847 movhps %xmm2, 0 * SIZE(CO1, %rax, 1) 848 movhps %xmm3, 1 * SIZE(CO1, %rax, 1) 849 850 movsd %xmm4, 0 * SIZE(CO2) 851 movsd %xmm5, 1 * SIZE(CO2) 852 movhps %xmm4, 0 * SIZE(CO2, LDC, 1) 853 movhps %xmm5, 1 * SIZE(CO2, LDC, 1) 854 855 movsd %xmm6, 0 * SIZE(CO2, LDC, 2) 856 movsd %xmm7, 1 * SIZE(CO2, LDC, 2) 857 movhps %xmm6, 0 * SIZE(CO2, %rax, 1) 858 movhps %xmm7, 1 * SIZE(CO2, %rax, 1) 859#else 860 movups %xmm0, 0 * SIZE(CO1) 861 movups %xmm1, 0 * SIZE(CO1, LDC, 1) 862 movups %xmm2, 0 * SIZE(CO1, LDC, 2) 863 movups %xmm3, 0 * SIZE(CO1, %rax, 1) 864 movups %xmm4, 0 * SIZE(CO2) 865 movups %xmm5, 0 * SIZE(CO2, LDC, 1) 866 movups %xmm6, 0 * SIZE(CO2, LDC, 2) 867 movups %xmm7, 0 * SIZE(CO2, %rax, 1) 868#endif 869 870#ifndef LN 871 addq $2 * SIZE, CO1 872 addq $2 * SIZE, CO2 873#endif 874 875 876#if defined(LT) || defined(RN) 877 movq K, %rax 878 subq KK, %rax 879 leaq (,%rax, SIZE), %rax 880 leaq (AO, %rax, 2), AO 881 leaq (BO, %rax, 8), BO 882#endif 883 884#ifdef LN 885 subq $2, KK 886#endif 887 888#ifdef LT 889 addq $2, KK 890#endif 891 892#ifdef RT 893 movq K, %rax 894 salq $1 + BASE_SHIFT, %rax 895 addq %rax, AORIG 896#endif 897 898 decq I 899 BRANCH 900 jg .L11 901 ALIGN_4 902 903.L20: 904 testq $1, M 905 BRANCH 906 jle .L29 907 ALIGN_4 908 909#ifdef LN 910 movq K, %rax 911 salq $BASE_SHIFT, %rax 912 subq %rax, AORIG 913#endif 914 915#if defined(LN) || defined(RT) 916 movq KK, %rax 917 leaq (, %rax, SIZE), %rax 918 movq AORIG, AO 919 leaq (AO, %rax, 1), AO 920 leaq (B, %rax, 8), BO 921#else 922 movq B, BO 923#endif 924 925 movddup -16 * SIZE(AO), %xmm0 926 xorps %xmm8, %xmm8 927 movaps -16 * SIZE(BO), %xmm1 928 xorps %xmm9, %xmm9 929 xorps %xmm10, %xmm10 930 xorps %xmm11, %xmm11 931 932#if defined(LT) || defined(RN) 933 movq KK, %rax 934#else 935 movq K, %rax 936 subq KK, %rax 937#endif 938 sarq $2, %rax 939 NOBRANCH 940 jle .L25 941 ALIGN_3 942 943.L22: 944 mulpd %xmm0, %xmm1 945 addpd %xmm1, %xmm8 946 movaps -14 * SIZE(BO), %xmm1 947 mulpd %xmm0, %xmm1 948 addpd %xmm1, %xmm9 949 movaps -12 * SIZE(BO), %xmm1 950 mulpd %xmm0, %xmm1 951 addpd %xmm1, %xmm10 952 movaps -10 * SIZE(BO), %xmm1 953 mulpd %xmm0, %xmm1 954 movddup -15 * SIZE(AO), %xmm0 955 addpd %xmm1, %xmm11 956 movaps -8 * SIZE(BO), %xmm1 957 958 mulpd %xmm0, %xmm1 959 addpd %xmm1, %xmm8 960 movaps -6 * SIZE(BO), %xmm1 961 mulpd %xmm0, %xmm1 962 addpd %xmm1, %xmm9 963 movaps -4 * SIZE(BO), %xmm1 964 mulpd %xmm0, %xmm1 965 addpd %xmm1, %xmm10 966 movaps -2 * SIZE(BO), %xmm1 967 mulpd %xmm0, %xmm1 968 movddup -14 * SIZE(AO), %xmm0 969 addpd %xmm1, %xmm11 970 movaps 0 * SIZE(BO), %xmm1 971 972 mulpd %xmm0, %xmm1 973 addpd %xmm1, %xmm8 974 movaps 2 * SIZE(BO), %xmm1 975 mulpd %xmm0, %xmm1 976 addpd %xmm1, %xmm9 977 movaps 4 * SIZE(BO), %xmm1 978 mulpd %xmm0, %xmm1 979 addpd %xmm1, %xmm10 980 movaps 6 * SIZE(BO), %xmm1 981 mulpd %xmm0, %xmm1 982 movddup -13 * SIZE(AO), %xmm0 983 addpd %xmm1, %xmm11 984 movaps 8 * SIZE(BO), %xmm1 985 986 mulpd %xmm0, %xmm1 987 addpd %xmm1, %xmm8 988 movaps 10 * SIZE(BO), %xmm1 989 mulpd %xmm0, %xmm1 990 addpd %xmm1, %xmm9 991 movaps 12 * SIZE(BO), %xmm1 992 mulpd %xmm0, %xmm1 993 addpd %xmm1, %xmm10 994 movaps 14 * SIZE(BO), %xmm1 995 mulpd %xmm0, %xmm1 996 movddup -12 * SIZE(AO), %xmm0 997 addpd %xmm1, %xmm11 998 movaps 16 * SIZE(BO), %xmm1 999 1000 subq $ -4 * SIZE, AO 1001 subq $-32 * SIZE, BO 1002 1003 subq $1, %rax 1004 BRANCH 1005 jg .L22 1006 ALIGN_3 1007 1008.L25: 1009#if defined(LT) || defined(RN) 1010 movq KK, %rax 1011#else 1012 movq K, %rax 1013 subq KK, %rax 1014#endif 1015 andq $3, %rax # if (k & 1) 1016 BRANCH 1017 je .L28 1018 ALIGN_3 1019 1020.L26: 1021 mulpd %xmm0, %xmm1 1022 addpd %xmm1, %xmm8 1023 movaps -14 * SIZE(BO), %xmm1 1024 mulpd %xmm0, %xmm1 1025 addpd %xmm1, %xmm9 1026 movaps -12 * SIZE(BO), %xmm1 1027 mulpd %xmm0, %xmm1 1028 addpd %xmm1, %xmm10 1029 movaps -10 * SIZE(BO), %xmm1 1030 mulpd %xmm0, %xmm1 1031 movddup -15 * SIZE(AO), %xmm0 1032 addpd %xmm1, %xmm11 1033 movaps -8 * SIZE(BO), %xmm1 1034 1035 addq $1 * SIZE, AO 1036 addq $8 * SIZE, BO 1037 1038 subq $1, %rax 1039 BRANCH 1040 jg .L26 1041 ALIGN_4 1042 1043.L28: 1044#if defined(LN) || defined(RT) 1045 movq KK, %rax 1046#ifdef LN 1047 subq $1, %rax 1048#else 1049 subq $8, %rax 1050#endif 1051 1052 leaq (, %rax, SIZE), %rax 1053 1054 movq AORIG, AO 1055 leaq (AO, %rax, 1), AO 1056 leaq (B, %rax, 8), BO 1057#endif 1058 1059#if defined(LN) || defined(LT) 1060 movapd -16 * SIZE(BO), %xmm0 1061 movapd -14 * SIZE(BO), %xmm1 1062 movapd -12 * SIZE(BO), %xmm2 1063 movapd -10 * SIZE(BO), %xmm3 1064#else 1065 movapd -16 * SIZE(AO), %xmm0 1066 movapd -14 * SIZE(AO), %xmm1 1067 movapd -12 * SIZE(AO), %xmm2 1068 movapd -10 * SIZE(AO), %xmm3 1069#endif 1070 1071 subpd %xmm8, %xmm0 1072 subpd %xmm9, %xmm1 1073 subpd %xmm10, %xmm2 1074 subpd %xmm11, %xmm3 1075 1076#if defined(LN) || defined(LT) 1077 movddup -16 * SIZE(AO), %xmm8 1078 mulpd %xmm8, %xmm0 1079 mulpd %xmm8, %xmm1 1080 mulpd %xmm8, %xmm2 1081 mulpd %xmm8, %xmm3 1082#endif 1083 1084#if defined(RN) || defined(RT) 1085 pshufd $0xe, %xmm3, %xmm7 1086 movaps %xmm3, %xmm6 1087 pshufd $0xe, %xmm2, %xmm5 1088 movaps %xmm2, %xmm4 1089 pshufd $0xe, %xmm1, %xmm3 1090 movaps %xmm1, %xmm2 1091 pshufd $0xe, %xmm0, %xmm1 1092#endif 1093 1094#ifdef RN 1095 movsd -16 * SIZE(BO), %xmm8 1096 mulsd %xmm8, %xmm0 1097 movsd -15 * SIZE(BO), %xmm9 1098 mulsd %xmm0, %xmm9 1099 subsd %xmm9, %xmm1 1100 movsd -14 * SIZE(BO), %xmm10 1101 mulsd %xmm0, %xmm10 1102 subsd %xmm10, %xmm2 1103 movsd -13 * SIZE(BO), %xmm11 1104 mulsd %xmm0, %xmm11 1105 subsd %xmm11, %xmm3 1106 movsd -12 * SIZE(BO), %xmm12 1107 mulsd %xmm0, %xmm12 1108 subsd %xmm12, %xmm4 1109 movsd -11 * SIZE(BO), %xmm13 1110 mulsd %xmm0, %xmm13 1111 subsd %xmm13, %xmm5 1112 movsd -10 * SIZE(BO), %xmm14 1113 mulsd %xmm0, %xmm14 1114 subsd %xmm14, %xmm6 1115 movsd -9 * SIZE(BO), %xmm15 1116 mulsd %xmm0, %xmm15 1117 subsd %xmm15, %xmm7 1118 1119 movsd -7 * SIZE(BO), %xmm9 1120 mulsd %xmm9, %xmm1 1121 movsd -6 * SIZE(BO), %xmm10 1122 mulsd %xmm1, %xmm10 1123 subsd %xmm10, %xmm2 1124 movsd -5 * SIZE(BO), %xmm11 1125 mulsd %xmm1, %xmm11 1126 subsd %xmm11, %xmm3 1127 movsd -4 * SIZE(BO), %xmm12 1128 mulsd %xmm1, %xmm12 1129 subsd %xmm12, %xmm4 1130 movsd -3 * SIZE(BO), %xmm13 1131 mulsd %xmm1, %xmm13 1132 subsd %xmm13, %xmm5 1133 movsd -2 * SIZE(BO), %xmm14 1134 mulsd %xmm1, %xmm14 1135 subsd %xmm14, %xmm6 1136 movsd -1 * SIZE(BO), %xmm15 1137 mulsd %xmm1, %xmm15 1138 subsd %xmm15, %xmm7 1139 1140 movsd 2 * SIZE(BO), %xmm10 1141 mulsd %xmm10, %xmm2 1142 movsd 3 * SIZE(BO), %xmm11 1143 mulsd %xmm2, %xmm11 1144 subsd %xmm11, %xmm3 1145 movsd 4 * SIZE(BO), %xmm12 1146 mulsd %xmm2, %xmm12 1147 subsd %xmm12, %xmm4 1148 movsd 5 * SIZE(BO), %xmm13 1149 mulsd %xmm2, %xmm13 1150 subsd %xmm13, %xmm5 1151 movsd 6 * SIZE(BO), %xmm14 1152 mulsd %xmm2, %xmm14 1153 subsd %xmm14, %xmm6 1154 movsd 7 * SIZE(BO), %xmm15 1155 mulsd %xmm2, %xmm15 1156 subsd %xmm15, %xmm7 1157 1158 movsd 11 * SIZE(BO), %xmm11 1159 mulsd %xmm11, %xmm3 1160 movsd 12 * SIZE(BO), %xmm12 1161 mulsd %xmm3, %xmm12 1162 subsd %xmm12, %xmm4 1163 movsd 13 * SIZE(BO), %xmm13 1164 mulsd %xmm3, %xmm13 1165 subsd %xmm13, %xmm5 1166 movsd 14 * SIZE(BO), %xmm14 1167 mulsd %xmm3, %xmm14 1168 subsd %xmm14, %xmm6 1169 movsd 15 * SIZE(BO), %xmm15 1170 mulsd %xmm3, %xmm15 1171 subsd %xmm15, %xmm7 1172 1173 movsd 20 * SIZE(BO), %xmm12 1174 mulsd %xmm12, %xmm4 1175 movsd 21 * SIZE(BO), %xmm13 1176 mulsd %xmm4, %xmm13 1177 subsd %xmm13, %xmm5 1178 movsd 22 * SIZE(BO), %xmm14 1179 mulsd %xmm4, %xmm14 1180 subsd %xmm14, %xmm6 1181 movsd 23 * SIZE(BO), %xmm15 1182 mulsd %xmm4, %xmm15 1183 subsd %xmm15, %xmm7 1184 1185 movsd 29 * SIZE(BO), %xmm13 1186 mulsd %xmm13, %xmm5 1187 movsd 30 * SIZE(BO), %xmm14 1188 mulsd %xmm5, %xmm14 1189 subsd %xmm14, %xmm6 1190 movsd 31 * SIZE(BO), %xmm15 1191 mulsd %xmm5, %xmm15 1192 subsd %xmm15, %xmm7 1193 1194 movsd 38 * SIZE(BO), %xmm14 1195 mulsd %xmm14, %xmm6 1196 movsd 39 * SIZE(BO), %xmm15 1197 mulsd %xmm6, %xmm15 1198 subsd %xmm15, %xmm7 1199 1200 movsd 47 * SIZE(BO), %xmm15 1201 mulsd %xmm15, %xmm7 1202#endif 1203 1204#ifdef RT 1205 movsd 47 * SIZE(BO), %xmm8 1206 mulsd %xmm8, %xmm7 1207 movsd 46 * SIZE(BO), %xmm9 1208 mulsd %xmm7, %xmm9 1209 subsd %xmm9, %xmm6 1210 movsd 45 * SIZE(BO), %xmm10 1211 mulsd %xmm7, %xmm10 1212 subsd %xmm10, %xmm5 1213 movsd 44 * SIZE(BO), %xmm11 1214 mulsd %xmm7, %xmm11 1215 subsd %xmm11, %xmm4 1216 movsd 43 * SIZE(BO), %xmm12 1217 mulsd %xmm7, %xmm12 1218 subsd %xmm12, %xmm3 1219 movsd 42 * SIZE(BO), %xmm13 1220 mulsd %xmm7, %xmm13 1221 subsd %xmm13, %xmm2 1222 movsd 41 * SIZE(BO), %xmm14 1223 mulsd %xmm7, %xmm14 1224 subsd %xmm14, %xmm1 1225 movsd 40 * SIZE(BO), %xmm15 1226 mulsd %xmm7, %xmm15 1227 subsd %xmm15, %xmm0 1228 1229 movsd 38 * SIZE(BO), %xmm9 1230 mulsd %xmm9, %xmm6 1231 movsd 37 * SIZE(BO), %xmm10 1232 mulsd %xmm6, %xmm10 1233 subsd %xmm10, %xmm5 1234 movsd 36 * SIZE(BO), %xmm11 1235 mulsd %xmm6, %xmm11 1236 subsd %xmm11, %xmm4 1237 movsd 35 * SIZE(BO), %xmm12 1238 mulsd %xmm6, %xmm12 1239 subsd %xmm12, %xmm3 1240 movsd 34 * SIZE(BO), %xmm13 1241 mulsd %xmm6, %xmm13 1242 subsd %xmm13, %xmm2 1243 movsd 33 * SIZE(BO), %xmm14 1244 mulsd %xmm6, %xmm14 1245 subsd %xmm14, %xmm1 1246 movsd 32 * SIZE(BO), %xmm15 1247 mulsd %xmm6, %xmm15 1248 subsd %xmm15, %xmm0 1249 1250 movsd 29 * SIZE(BO), %xmm10 1251 mulsd %xmm10, %xmm5 1252 movsd 28 * SIZE(BO), %xmm11 1253 mulsd %xmm5, %xmm11 1254 subsd %xmm11, %xmm4 1255 movsd 27 * SIZE(BO), %xmm12 1256 mulsd %xmm5, %xmm12 1257 subsd %xmm12, %xmm3 1258 movsd 26 * SIZE(BO), %xmm13 1259 mulsd %xmm5, %xmm13 1260 subsd %xmm13, %xmm2 1261 movsd 25 * SIZE(BO), %xmm14 1262 mulsd %xmm5, %xmm14 1263 subsd %xmm14, %xmm1 1264 movsd 24 * SIZE(BO), %xmm15 1265 mulsd %xmm5, %xmm15 1266 subsd %xmm15, %xmm0 1267 1268 movsd 20 * SIZE(BO), %xmm11 1269 mulsd %xmm11, %xmm4 1270 movsd 19 * SIZE(BO), %xmm12 1271 mulsd %xmm4, %xmm12 1272 subsd %xmm12, %xmm3 1273 movsd 18 * SIZE(BO), %xmm13 1274 mulsd %xmm4, %xmm13 1275 subsd %xmm13, %xmm2 1276 movsd 17 * SIZE(BO), %xmm14 1277 mulsd %xmm4, %xmm14 1278 subsd %xmm14, %xmm1 1279 movsd 16 * SIZE(BO), %xmm15 1280 mulsd %xmm4, %xmm15 1281 subsd %xmm15, %xmm0 1282 1283 movsd 11 * SIZE(BO), %xmm12 1284 mulsd %xmm12, %xmm3 1285 movsd 10 * SIZE(BO), %xmm13 1286 mulsd %xmm3, %xmm13 1287 subsd %xmm13, %xmm2 1288 movsd 9 * SIZE(BO), %xmm14 1289 mulsd %xmm3, %xmm14 1290 subsd %xmm14, %xmm1 1291 movsd 8 * SIZE(BO), %xmm15 1292 mulsd %xmm3, %xmm15 1293 subsd %xmm15, %xmm0 1294 1295 movsd 2 * SIZE(BO), %xmm13 1296 mulsd %xmm13, %xmm2 1297 movsd 1 * SIZE(BO), %xmm14 1298 mulsd %xmm2, %xmm14 1299 subsd %xmm14, %xmm1 1300 movsd 0 * SIZE(BO), %xmm15 1301 mulsd %xmm2, %xmm15 1302 subsd %xmm15, %xmm0 1303 1304 movsd -7 * SIZE(BO), %xmm14 1305 mulsd %xmm14, %xmm1 1306 movsd -8 * SIZE(BO), %xmm15 1307 mulsd %xmm1, %xmm15 1308 subsd %xmm15, %xmm0 1309 1310 movsd -16 * SIZE(BO), %xmm15 1311 mulsd %xmm15, %xmm0 1312#endif 1313 1314#if defined(RN) || defined(RT) 1315 unpcklpd %xmm1, %xmm0 1316 movaps %xmm2, %xmm1 1317 unpcklpd %xmm3, %xmm1 1318 movaps %xmm4, %xmm2 1319 unpcklpd %xmm5, %xmm2 1320 movaps %xmm6, %xmm3 1321 unpcklpd %xmm7, %xmm3 1322#endif 1323 1324#ifdef LN 1325 subq $1 * SIZE, CO1 1326 subq $1 * SIZE, CO2 1327#endif 1328 1329 leaq (LDC, LDC, 2), %rax 1330 1331 movsd %xmm0, 0 * SIZE(CO1) 1332 movhps %xmm0, 0 * SIZE(CO1, LDC, 1) 1333 movsd %xmm1, 0 * SIZE(CO1, LDC, 2) 1334 movhps %xmm1, 0 * SIZE(CO1, %rax, 1) 1335 movsd %xmm2, 0 * SIZE(CO2) 1336 movhps %xmm2, 0 * SIZE(CO2, LDC, 1) 1337 movsd %xmm3, 0 * SIZE(CO2, LDC, 2) 1338 movhps %xmm3, 0 * SIZE(CO2, %rax, 1) 1339 1340#if defined(LN) || defined(LT) 1341 movapd %xmm0, -16 * SIZE(BO) 1342 movapd %xmm1, -14 * SIZE(BO) 1343 movapd %xmm2, -12 * SIZE(BO) 1344 movapd %xmm3, -10 * SIZE(BO) 1345#else 1346 movapd %xmm0, -16 * SIZE(AO) 1347 movapd %xmm1, -14 * SIZE(AO) 1348 movapd %xmm2, -12 * SIZE(AO) 1349 movapd %xmm3, -10 * SIZE(AO) 1350#endif 1351 1352#ifndef LN 1353 addq $1 * SIZE, CO1 1354 addq $1 * SIZE, CO2 1355#endif 1356 1357 1358#if defined(LT) || defined(RN) 1359 movq K, %rax 1360 subq KK, %rax 1361 leaq (,%rax, SIZE), %rax 1362 leaq (AO, %rax, 1), AO 1363 leaq (BO, %rax, 8), BO 1364#endif 1365 1366#ifdef LN 1367 subq $1, KK 1368#endif 1369 1370#ifdef LT 1371 addq $1, KK 1372#endif 1373 1374#ifdef RT 1375 movq K, %rax 1376 salq $BASE_SHIFT, %rax 1377 addq %rax, AORIG 1378#endif 1379 ALIGN_4 1380 1381.L29: 1382#ifdef LN 1383 leaq (, K, SIZE), %rax 1384 leaq (B, %rax, 8), B 1385#endif 1386#if defined(LT) || defined(RN) 1387 movq BO, B 1388#endif 1389 1390#ifdef RN 1391 addq $8, KK 1392#endif 1393 1394#ifdef RT 1395 subq $8, KK 1396#endif 1397 1398 subq $1, J 1399 BRANCH 1400 jg .L01 1401 ALIGN_4 1402 1403.L30: 1404 testq $4, N 1405 jle .L50 1406 ALIGN_4 1407 1408#if defined(LT) || defined(RN) 1409 movq A, AO 1410#else 1411 movq A, AORIG 1412#endif 1413 1414#ifdef RT 1415 movq K, %rax 1416 salq $2 + BASE_SHIFT, %rax 1417 subq %rax, B 1418 1419 leaq (, LDC, 4), %rax 1420 subq %rax, C 1421#endif 1422 1423 movq C, CO1 1424 leaq (C, LDC, 2), CO2 1425#ifndef RT 1426 leaq (C, LDC, 4), C 1427#endif 1428 1429#ifdef LN 1430 movq OFFSET, %rax 1431 addq M, %rax 1432 movq %rax, KK 1433#endif 1434 1435#ifdef LT 1436 movq OFFSET, %rax 1437 movq %rax, KK 1438#endif 1439 1440 movq M, I 1441 sarq $1, I 1442 NOBRANCH 1443 jle .L40 1444 ALIGN_4 1445 1446.L31: 1447#ifdef LN 1448 movq K, %rax 1449 salq $1 + BASE_SHIFT, %rax 1450 subq %rax, AORIG 1451#endif 1452 1453#if defined(LN) || defined(RT) 1454 movq KK, %rax 1455 leaq (, %rax, SIZE), %rax 1456 movq AORIG, AO 1457 leaq (AO, %rax, 2), AO 1458 leaq (B, %rax, 4), BO 1459#else 1460 movq B, BO 1461#endif 1462 1463 xorps %xmm1, %xmm1 1464 movaps -16 * SIZE(AO), %xmm0 1465 xorps %xmm2, %xmm2 1466 xorps %xmm3, %xmm3 1467 xorps %xmm4, %xmm4 1468 1469 xorps %xmm8, %xmm8 1470 prefetcht0 2 * SIZE(CO1) 1471 xorps %xmm9, %xmm9 1472 prefetcht0 2 * SIZE(CO1, LDC, 1) 1473 xorps %xmm10, %xmm10 1474 prefetcht0 2 * SIZE(CO2) 1475 xorps %xmm11, %xmm11 1476 prefetcht0 2 * SIZE(CO2, LDC, 1) 1477 1478#if defined(LT) || defined(RN) 1479 movq KK, %rax 1480#else 1481 movq K, %rax 1482 subq KK, %rax 1483#endif 1484 sarq $2, %rax 1485 NOBRANCH 1486 jle .L35 1487 ALIGN_3 1488 1489.L32: 1490 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1491 1492 addpd %xmm1, %xmm8 1493 movaps -16 * SIZE(BO), %xmm1 1494 addpd %xmm2, %xmm9 1495 pshufd $0x4e, %xmm1, %xmm2 1496 mulpd %xmm0, %xmm1 1497 mulpd %xmm0, %xmm2 1498 1499 addpd %xmm3, %xmm10 1500 movaps -14 * SIZE(BO), %xmm3 1501 addpd %xmm4, %xmm11 1502 pshufd $0x4e, %xmm3, %xmm4 1503 mulpd %xmm0, %xmm3 1504 mulpd %xmm0, %xmm4 1505 1506 movaps -14 * SIZE(AO), %xmm0 1507 1508 addpd %xmm1, %xmm8 1509 movaps -12 * SIZE(BO), %xmm1 1510 addpd %xmm2, %xmm9 1511 pshufd $0x4e, %xmm1, %xmm2 1512 mulpd %xmm0, %xmm1 1513 mulpd %xmm0, %xmm2 1514 1515 addpd %xmm3, %xmm10 1516 movaps -10 * SIZE(BO), %xmm3 1517 addpd %xmm4, %xmm11 1518 pshufd $0x4e, %xmm3, %xmm4 1519 mulpd %xmm0, %xmm3 1520 mulpd %xmm0, %xmm4 1521 1522 movaps -12 * SIZE(AO), %xmm0 1523 1524 addpd %xmm1, %xmm8 1525 movaps -8 * SIZE(BO), %xmm1 1526 addpd %xmm2, %xmm9 1527 pshufd $0x4e, %xmm1, %xmm2 1528 mulpd %xmm0, %xmm1 1529 mulpd %xmm0, %xmm2 1530 1531 addpd %xmm3, %xmm10 1532 movaps -6 * SIZE(BO), %xmm3 1533 addpd %xmm4, %xmm11 1534 pshufd $0x4e, %xmm3, %xmm4 1535 mulpd %xmm0, %xmm3 1536 mulpd %xmm0, %xmm4 1537 1538 movaps -10 * SIZE(AO), %xmm0 1539 1540 addpd %xmm1, %xmm8 1541 movaps -4 * SIZE(BO), %xmm1 1542 addpd %xmm2, %xmm9 1543 pshufd $0x4e, %xmm1, %xmm2 1544 mulpd %xmm0, %xmm1 1545 mulpd %xmm0, %xmm2 1546 1547 addpd %xmm3, %xmm10 1548 movaps -2 * SIZE(BO), %xmm3 1549 addpd %xmm4, %xmm11 1550 pshufd $0x4e, %xmm3, %xmm4 1551 mulpd %xmm0, %xmm3 1552 mulpd %xmm0, %xmm4 1553 1554 movaps -8 * SIZE(AO), %xmm0 1555 1556 subq $-8 * SIZE, AO 1557 subq $-16 * SIZE, BO 1558 1559 subq $1, %rax 1560 BRANCH 1561 jg .L32 1562 ALIGN_3 1563 1564.L35: 1565#if defined(LT) || defined(RN) 1566 movq KK, %rax 1567#else 1568 movq K, %rax 1569 subq KK, %rax 1570#endif 1571 andq $3, %rax # if (k & 1) 1572 BRANCH 1573 je .L38 1574 ALIGN_3 1575 1576.L36: 1577 addpd %xmm1, %xmm8 1578 movaps -16 * SIZE(BO), %xmm1 1579 addpd %xmm2, %xmm9 1580 pshufd $0x4e, %xmm1, %xmm2 1581 mulpd %xmm0, %xmm1 1582 mulpd %xmm0, %xmm2 1583 1584 addpd %xmm3, %xmm10 1585 movaps -14 * SIZE(BO), %xmm3 1586 addpd %xmm4, %xmm11 1587 pshufd $0x4e, %xmm3, %xmm4 1588 mulpd %xmm0, %xmm3 1589 mulpd %xmm0, %xmm4 1590 1591 movaps -14 * SIZE(AO), %xmm0 1592 1593 addq $2 * SIZE, AO 1594 addq $4 * SIZE, BO 1595 1596 subq $1, %rax 1597 BRANCH 1598 jg .L36 1599 ALIGN_4 1600 1601.L38: 1602#if defined(LN) || defined(RT) 1603 movq KK, %rax 1604#ifdef LN 1605 subq $2, %rax 1606#else 1607 subq $4, %rax 1608#endif 1609 1610 leaq (, %rax, SIZE), %rax 1611 1612 movq AORIG, AO 1613 leaq (AO, %rax, 2), AO 1614 leaq (B, %rax, 4), BO 1615#endif 1616 1617 addpd %xmm1, %xmm8 1618 addpd %xmm2, %xmm9 1619 addpd %xmm3, %xmm10 1620 addpd %xmm4, %xmm11 1621 1622#if defined(LN) || defined(LT) 1623 movaps %xmm8, %xmm0 1624 shufpd $0, %xmm9, %xmm8 1625 shufpd $3, %xmm0, %xmm9 1626 1627 movaps %xmm10, %xmm0 1628 shufpd $0, %xmm11, %xmm10 1629 shufpd $3, %xmm0, %xmm11 1630 1631 movapd -16 * SIZE(BO), %xmm0 1632 movapd -14 * SIZE(BO), %xmm2 1633 movapd -12 * SIZE(BO), %xmm1 1634 movapd -10 * SIZE(BO), %xmm3 1635#else 1636 movaps %xmm8, %xmm0 1637 shufpd $2, %xmm9, %xmm8 1638 shufpd $2, %xmm0, %xmm9 1639 1640 movaps %xmm10, %xmm0 1641 shufpd $2, %xmm11, %xmm10 1642 shufpd $2, %xmm0, %xmm11 1643 1644 movapd -16 * SIZE(AO), %xmm0 1645 movapd -14 * SIZE(AO), %xmm1 1646 movapd -12 * SIZE(AO), %xmm2 1647 movapd -10 * SIZE(AO), %xmm3 1648#endif 1649 1650 subpd %xmm8, %xmm0 1651 subpd %xmm9, %xmm1 1652 subpd %xmm10, %xmm2 1653 subpd %xmm11, %xmm3 1654 1655#ifdef LN 1656 movddup -13 * SIZE(AO), %xmm8 1657 mulpd %xmm8, %xmm1 1658 mulpd %xmm8, %xmm3 1659 1660 movddup -14 * SIZE(AO), %xmm12 1661 movapd %xmm12, %xmm13 1662 1663 mulpd %xmm1, %xmm12 1664 mulpd %xmm3, %xmm13 1665 1666 subpd %xmm12, %xmm0 1667 subpd %xmm13, %xmm2 1668 1669 movddup -16 * SIZE(AO), %xmm8 1670 mulpd %xmm8, %xmm0 1671 mulpd %xmm8, %xmm2 1672#endif 1673 1674#ifdef LT 1675 movddup -16 * SIZE(AO), %xmm8 1676 mulpd %xmm8, %xmm0 1677 mulpd %xmm8, %xmm2 1678 1679 movddup -15 * SIZE(AO), %xmm12 1680 movapd %xmm12, %xmm13 1681 1682 mulpd %xmm0, %xmm12 1683 mulpd %xmm2, %xmm13 1684 1685 subpd %xmm12, %xmm1 1686 subpd %xmm13, %xmm3 1687 1688 movddup -13 * SIZE(AO), %xmm8 1689 mulpd %xmm8, %xmm1 1690 mulpd %xmm8, %xmm3 1691#endif 1692 1693#ifdef RN 1694 movddup -16 * SIZE(BO), %xmm8 1695 mulpd %xmm8, %xmm0 1696 movddup -15 * SIZE(BO), %xmm9 1697 mulpd %xmm0, %xmm9 1698 subpd %xmm9, %xmm1 1699 movddup -14 * SIZE(BO), %xmm10 1700 mulpd %xmm0, %xmm10 1701 subpd %xmm10, %xmm2 1702 movddup -13 * SIZE(BO), %xmm11 1703 mulpd %xmm0, %xmm11 1704 subpd %xmm11, %xmm3 1705 1706 movddup -11 * SIZE(BO), %xmm9 1707 mulpd %xmm9, %xmm1 1708 movddup -10 * SIZE(BO), %xmm10 1709 mulpd %xmm1, %xmm10 1710 subpd %xmm10, %xmm2 1711 movddup -9 * SIZE(BO), %xmm11 1712 mulpd %xmm1, %xmm11 1713 subpd %xmm11, %xmm3 1714 1715 movddup -6 * SIZE(BO), %xmm10 1716 mulpd %xmm10, %xmm2 1717 movddup -5 * SIZE(BO), %xmm11 1718 mulpd %xmm2, %xmm11 1719 subpd %xmm11, %xmm3 1720 1721 movddup -1 * SIZE(BO), %xmm11 1722 mulpd %xmm11, %xmm3 1723#endif 1724 1725#ifdef RT 1726 movddup -1 * SIZE(BO), %xmm12 1727 mulpd %xmm12, %xmm3 1728 movddup -2 * SIZE(BO), %xmm13 1729 mulpd %xmm3, %xmm13 1730 subpd %xmm13, %xmm2 1731 movddup -3 * SIZE(BO), %xmm14 1732 mulpd %xmm3, %xmm14 1733 subpd %xmm14, %xmm1 1734 movddup -4 * SIZE(BO), %xmm15 1735 mulpd %xmm3, %xmm15 1736 subpd %xmm15, %xmm0 1737 1738 movddup -6 * SIZE(BO), %xmm13 1739 mulpd %xmm13, %xmm2 1740 movddup -7 * SIZE(BO), %xmm14 1741 mulpd %xmm2, %xmm14 1742 subpd %xmm14, %xmm1 1743 movddup -8 * SIZE(BO), %xmm15 1744 mulpd %xmm2, %xmm15 1745 subpd %xmm15, %xmm0 1746 1747 movddup -11 * SIZE(BO), %xmm14 1748 mulpd %xmm14, %xmm1 1749 movddup -12 * SIZE(BO), %xmm15 1750 mulpd %xmm1, %xmm15 1751 subpd %xmm15, %xmm0 1752 1753 movddup -16 * SIZE(BO), %xmm15 1754 mulpd %xmm15, %xmm0 1755#endif 1756 1757 1758#ifdef LN 1759 subq $2 * SIZE, CO1 1760 subq $2 * SIZE, CO2 1761#endif 1762 1763 leaq (LDC, LDC, 2), %rax 1764 1765#if defined(LN) || defined(LT) 1766 movsd %xmm0, 0 * SIZE(CO1) 1767 movsd %xmm1, 1 * SIZE(CO1) 1768 movhps %xmm0, 0 * SIZE(CO1, LDC, 1) 1769 movhps %xmm1, 1 * SIZE(CO1, LDC, 1) 1770 1771 movsd %xmm2, 0 * SIZE(CO2) 1772 movsd %xmm3, 1 * SIZE(CO2) 1773 movhps %xmm2, 0 * SIZE(CO2, LDC, 1) 1774 movhps %xmm3, 1 * SIZE(CO2, LDC, 1) 1775#else 1776 movsd %xmm0, 0 * SIZE(CO1) 1777 movhps %xmm0, 1 * SIZE(CO1) 1778 movsd %xmm1, 0 * SIZE(CO1, LDC, 1) 1779 movhps %xmm1, 1 * SIZE(CO1, LDC, 1) 1780 1781 movsd %xmm2, 0 * SIZE(CO2) 1782 movhps %xmm2, 1 * SIZE(CO2) 1783 movsd %xmm3, 0 * SIZE(CO2, LDC, 1) 1784 movhps %xmm3, 1 * SIZE(CO2, LDC, 1) 1785#endif 1786 1787#if defined(LN) || defined(LT) 1788 movapd %xmm0, -16 * SIZE(BO) 1789 movapd %xmm2, -14 * SIZE(BO) 1790 movapd %xmm1, -12 * SIZE(BO) 1791 movapd %xmm3, -10 * SIZE(BO) 1792#else 1793 movapd %xmm0, -16 * SIZE(AO) 1794 movapd %xmm1, -14 * SIZE(AO) 1795 movapd %xmm2, -12 * SIZE(AO) 1796 movapd %xmm3, -10 * SIZE(AO) 1797#endif 1798 1799#ifndef LN 1800 addq $2 * SIZE, CO1 1801 addq $2 * SIZE, CO2 1802#endif 1803 1804 1805#if defined(LT) || defined(RN) 1806 movq K, %rax 1807 subq KK, %rax 1808 leaq (,%rax, SIZE), %rax 1809 leaq (AO, %rax, 2), AO 1810 leaq (BO, %rax, 4), BO 1811#endif 1812 1813#ifdef LN 1814 subq $2, KK 1815#endif 1816 1817#ifdef LT 1818 addq $2, KK 1819#endif 1820 1821#ifdef RT 1822 movq K, %rax 1823 salq $1 + BASE_SHIFT, %rax 1824 addq %rax, AORIG 1825#endif 1826 1827 decq I 1828 BRANCH 1829 jg .L31 1830 ALIGN_4 1831 1832.L40: 1833 testq $1, M 1834 BRANCH 1835 jle .L49 1836 ALIGN_4 1837 1838#ifdef LN 1839 movq K, %rax 1840 salq $BASE_SHIFT, %rax 1841 subq %rax, AORIG 1842#endif 1843 1844#if defined(LN) || defined(RT) 1845 movq KK, %rax 1846 leaq (, %rax, SIZE), %rax 1847 movq AORIG, AO 1848 leaq (AO, %rax, 1), AO 1849 leaq (B, %rax, 4), BO 1850#else 1851 movq B, BO 1852#endif 1853 1854 movddup -16 * SIZE(AO), %xmm0 1855 xorps %xmm8, %xmm8 1856 movaps -16 * SIZE(BO), %xmm1 1857 xorps %xmm9, %xmm9 1858 xorps %xmm10, %xmm10 1859 xorps %xmm11, %xmm11 1860 1861#if defined(LT) || defined(RN) 1862 movq KK, %rax 1863#else 1864 movq K, %rax 1865 subq KK, %rax 1866#endif 1867 sarq $2, %rax 1868 NOBRANCH 1869 jle .L45 1870 ALIGN_3 1871 1872.L42: 1873 mulpd %xmm0, %xmm1 1874 addpd %xmm1, %xmm8 1875 movaps -14 * SIZE(BO), %xmm1 1876 mulpd %xmm0, %xmm1 1877 movddup -15 * SIZE(AO), %xmm0 1878 addpd %xmm1, %xmm9 1879 movaps -12 * SIZE(BO), %xmm1 1880 1881 mulpd %xmm0, %xmm1 1882 addpd %xmm1, %xmm10 1883 movaps -10 * SIZE(BO), %xmm1 1884 mulpd %xmm0, %xmm1 1885 movddup -14 * SIZE(AO), %xmm0 1886 addpd %xmm1, %xmm11 1887 movaps -8 * SIZE(BO), %xmm1 1888 1889 mulpd %xmm0, %xmm1 1890 addpd %xmm1, %xmm8 1891 movaps -6 * SIZE(BO), %xmm1 1892 mulpd %xmm0, %xmm1 1893 movddup -13 * SIZE(AO), %xmm0 1894 addpd %xmm1, %xmm9 1895 movaps -4 * SIZE(BO), %xmm1 1896 1897 mulpd %xmm0, %xmm1 1898 addpd %xmm1, %xmm10 1899 movaps -2 * SIZE(BO), %xmm1 1900 mulpd %xmm0, %xmm1 1901 movddup -12 * SIZE(AO), %xmm0 1902 addpd %xmm1, %xmm11 1903 movaps 0 * SIZE(BO), %xmm1 1904 1905 subq $ -4 * SIZE, AO 1906 subq $-16 * SIZE, BO 1907 1908 subq $1, %rax 1909 BRANCH 1910 jg .L42 1911 ALIGN_3 1912 1913.L45: 1914#if defined(LT) || defined(RN) 1915 movq KK, %rax 1916#else 1917 movq K, %rax 1918 subq KK, %rax 1919#endif 1920 andq $3, %rax # if (k & 1) 1921 BRANCH 1922 je .L48 1923 ALIGN_3 1924 1925.L46: 1926 mulpd %xmm0, %xmm1 1927 addpd %xmm1, %xmm8 1928 movaps -14 * SIZE(BO), %xmm1 1929 mulpd %xmm0, %xmm1 1930 movddup -15 * SIZE(AO), %xmm0 1931 addpd %xmm1, %xmm9 1932 movaps -12 * SIZE(BO), %xmm1 1933 1934 addq $1 * SIZE, AO 1935 addq $4 * SIZE, BO 1936 1937 subq $1, %rax 1938 BRANCH 1939 jg .L46 1940 ALIGN_4 1941 1942.L48: 1943#if defined(LN) || defined(RT) 1944 movq KK, %rax 1945#ifdef LN 1946 subq $1, %rax 1947#else 1948 subq $4, %rax 1949#endif 1950 1951 leaq (, %rax, SIZE), %rax 1952 1953 movq AORIG, AO 1954 leaq (AO, %rax, 1), AO 1955 leaq (B, %rax, 4), BO 1956#endif 1957 1958 addpd %xmm10, %xmm8 1959 addpd %xmm11, %xmm9 1960 1961#if defined(LN) || defined(LT) 1962 movapd -16 * SIZE(BO), %xmm0 1963 movapd -14 * SIZE(BO), %xmm1 1964#else 1965 movapd -16 * SIZE(AO), %xmm0 1966 movapd -14 * SIZE(AO), %xmm1 1967#endif 1968 1969 subpd %xmm8, %xmm0 1970 subpd %xmm9, %xmm1 1971 1972#if defined(LN) || defined(LT) 1973 movddup -16 * SIZE(AO), %xmm8 1974 mulpd %xmm8, %xmm0 1975 mulpd %xmm8, %xmm1 1976#endif 1977 1978#if defined(RN) || defined(RT) 1979 pshufd $0xe, %xmm1, %xmm3 1980 movaps %xmm1, %xmm2 1981 pshufd $0xe, %xmm0, %xmm1 1982#endif 1983 1984#ifdef RN 1985 movsd -16 * SIZE(BO), %xmm8 1986 mulsd %xmm8, %xmm0 1987 movsd -15 * SIZE(BO), %xmm9 1988 mulsd %xmm0, %xmm9 1989 subsd %xmm9, %xmm1 1990 movsd -14 * SIZE(BO), %xmm10 1991 mulsd %xmm0, %xmm10 1992 subsd %xmm10, %xmm2 1993 movsd -13 * SIZE(BO), %xmm11 1994 mulsd %xmm0, %xmm11 1995 subsd %xmm11, %xmm3 1996 1997 movsd -11 * SIZE(BO), %xmm9 1998 mulsd %xmm9, %xmm1 1999 movsd -10 * SIZE(BO), %xmm10 2000 mulsd %xmm1, %xmm10 2001 subsd %xmm10, %xmm2 2002 movsd -9 * SIZE(BO), %xmm11 2003 mulsd %xmm1, %xmm11 2004 subsd %xmm11, %xmm3 2005 2006 movsd -6 * SIZE(BO), %xmm10 2007 mulsd %xmm10, %xmm2 2008 movsd -5 * SIZE(BO), %xmm11 2009 mulsd %xmm2, %xmm11 2010 subsd %xmm11, %xmm3 2011 2012 movsd -1 * SIZE(BO), %xmm11 2013 mulsd %xmm11, %xmm3 2014#endif 2015 2016#ifdef RT 2017 movsd -1 * SIZE(BO), %xmm12 2018 mulsd %xmm12, %xmm3 2019 movsd -2 * SIZE(BO), %xmm13 2020 mulsd %xmm3, %xmm13 2021 subsd %xmm13, %xmm2 2022 movsd -3 * SIZE(BO), %xmm14 2023 mulsd %xmm3, %xmm14 2024 subsd %xmm14, %xmm1 2025 movsd -4 * SIZE(BO), %xmm15 2026 mulsd %xmm3, %xmm15 2027 subsd %xmm15, %xmm0 2028 2029 movsd -6 * SIZE(BO), %xmm13 2030 mulsd %xmm13, %xmm2 2031 movsd -7 * SIZE(BO), %xmm14 2032 mulsd %xmm2, %xmm14 2033 subsd %xmm14, %xmm1 2034 movsd -8 * SIZE(BO), %xmm15 2035 mulsd %xmm2, %xmm15 2036 subsd %xmm15, %xmm0 2037 2038 movsd -11 * SIZE(BO), %xmm14 2039 mulsd %xmm14, %xmm1 2040 movsd -12 * SIZE(BO), %xmm15 2041 mulsd %xmm1, %xmm15 2042 subsd %xmm15, %xmm0 2043 2044 movsd -16 * SIZE(BO), %xmm15 2045 mulsd %xmm15, %xmm0 2046#endif 2047 2048#if defined(RN) || defined(RT) 2049 unpcklpd %xmm1, %xmm0 2050 movaps %xmm2, %xmm1 2051 unpcklpd %xmm3, %xmm1 2052#endif 2053 2054#ifdef LN 2055 subq $1 * SIZE, CO1 2056 subq $1 * SIZE, CO2 2057#endif 2058 2059 movsd %xmm0, 0 * SIZE(CO1) 2060 movhps %xmm0, 0 * SIZE(CO1, LDC, 1) 2061 movsd %xmm1, 0 * SIZE(CO2) 2062 movhps %xmm1, 0 * SIZE(CO2, LDC, 1) 2063 2064#if defined(LN) || defined(LT) 2065 movapd %xmm0, -16 * SIZE(BO) 2066 movapd %xmm1, -14 * SIZE(BO) 2067#else 2068 movapd %xmm0, -16 * SIZE(AO) 2069 movapd %xmm1, -14 * SIZE(AO) 2070#endif 2071 2072#ifndef LN 2073 addq $1 * SIZE, CO1 2074 addq $1 * SIZE, CO2 2075#endif 2076 2077#if defined(LT) || defined(RN) 2078 movq K, %rax 2079 subq KK, %rax 2080 leaq (,%rax, SIZE), %rax 2081 leaq (AO, %rax, 1), AO 2082 leaq (BO, %rax, 4), BO 2083#endif 2084 2085#ifdef LN 2086 subq $1, KK 2087#endif 2088 2089#ifdef LT 2090 addq $1, KK 2091#endif 2092 2093#ifdef RT 2094 movq K, %rax 2095 salq $BASE_SHIFT, %rax 2096 addq %rax, AORIG 2097#endif 2098 ALIGN_4 2099 2100.L49: 2101#ifdef LN 2102 leaq (, K, SIZE), %rax 2103 leaq (B, %rax, 4), B 2104#endif 2105#if defined(LT) || defined(RN) 2106 movq BO, B 2107#endif 2108 2109#ifdef RN 2110 addq $4, KK 2111#endif 2112 2113#ifdef RT 2114 subq $4, KK 2115#endif 2116 ALIGN_4 2117 2118.L50: 2119 testq $2, N 2120 jle .L70 2121 ALIGN_4 2122 2123#if defined(LT) || defined(RN) 2124 movq A, AO 2125#else 2126 movq A, AORIG 2127#endif 2128 2129#ifdef RT 2130 movq K, %rax 2131 salq $1 + BASE_SHIFT, %rax 2132 subq %rax, B 2133 2134 leaq (, LDC, 2), %rax 2135 subq %rax, C 2136#endif 2137 2138 movq C, CO1 2139 leaq (C, LDC, 1), CO2 2140#ifndef RT 2141 leaq (C, LDC, 2), C 2142#endif 2143 2144#ifdef LN 2145 movq OFFSET, %rax 2146 addq M, %rax 2147 movq %rax, KK 2148#endif 2149 2150#ifdef LT 2151 movq OFFSET, %rax 2152 movq %rax, KK 2153#endif 2154 2155 movq M, I 2156 sarq $1, I 2157 NOBRANCH 2158 jle .L60 2159 ALIGN_4 2160 2161.L51: 2162#ifdef LN 2163 movq K, %rax 2164 salq $1 + BASE_SHIFT, %rax 2165 subq %rax, AORIG 2166#endif 2167 2168#if defined(LN) || defined(RT) 2169 movq KK, %rax 2170 leaq (, %rax, SIZE), %rax 2171 movq AORIG, AO 2172 leaq (AO, %rax, 2), AO 2173 leaq (B, %rax, 2), BO 2174#else 2175 movq B, BO 2176#endif 2177 2178 xorps %xmm1, %xmm1 2179 movaps -16 * SIZE(AO), %xmm0 2180 xorps %xmm2, %xmm2 2181 2182 xorps %xmm8, %xmm8 2183 prefetcht0 2 * SIZE(CO1) 2184 xorps %xmm9, %xmm9 2185 prefetcht0 2 * SIZE(CO2) 2186 xorps %xmm10, %xmm10 2187 xorps %xmm11, %xmm11 2188 2189#if defined(LT) || defined(RN) 2190 movq KK, %rax 2191#else 2192 movq K, %rax 2193 subq KK, %rax 2194#endif 2195 sarq $2, %rax 2196 NOBRANCH 2197 jle .L55 2198 ALIGN_3 2199 2200.L52: 2201 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2202 2203 addpd %xmm1, %xmm8 2204 movaps -16 * SIZE(BO), %xmm1 2205 addpd %xmm2, %xmm9 2206 pshufd $0x4e, %xmm1, %xmm2 2207 mulpd %xmm0, %xmm1 2208 mulpd %xmm0, %xmm2 2209 movaps -14 * SIZE(AO), %xmm0 2210 2211 addpd %xmm1, %xmm10 2212 movaps -14 * SIZE(BO), %xmm1 2213 addpd %xmm2, %xmm11 2214 pshufd $0x4e, %xmm1, %xmm2 2215 mulpd %xmm0, %xmm1 2216 mulpd %xmm0, %xmm2 2217 movaps -12 * SIZE(AO), %xmm0 2218 2219 addpd %xmm1, %xmm8 2220 movaps -12 * SIZE(BO), %xmm1 2221 addpd %xmm2, %xmm9 2222 pshufd $0x4e, %xmm1, %xmm2 2223 mulpd %xmm0, %xmm1 2224 mulpd %xmm0, %xmm2 2225 movaps -10 * SIZE(AO), %xmm0 2226 2227 addpd %xmm1, %xmm10 2228 movaps -10 * SIZE(BO), %xmm1 2229 addpd %xmm2, %xmm11 2230 pshufd $0x4e, %xmm1, %xmm2 2231 mulpd %xmm0, %xmm1 2232 mulpd %xmm0, %xmm2 2233 movaps -8 * SIZE(AO), %xmm0 2234 2235 subq $-8 * SIZE, AO 2236 subq $-8 * SIZE, BO 2237 2238 subq $1, %rax 2239 BRANCH 2240 jg .L52 2241 2242 addpd %xmm10, %xmm8 2243 addpd %xmm11, %xmm9 2244 ALIGN_3 2245 2246.L55: 2247#if defined(LT) || defined(RN) 2248 movq KK, %rax 2249#else 2250 movq K, %rax 2251 subq KK, %rax 2252#endif 2253 andq $3, %rax # if (k & 1) 2254 BRANCH 2255 je .L58 2256 ALIGN_3 2257 2258.L56: 2259 addpd %xmm1, %xmm8 2260 movaps -16 * SIZE(BO), %xmm1 2261 addpd %xmm2, %xmm9 2262 pshufd $0x4e, %xmm1, %xmm2 2263 mulpd %xmm0, %xmm1 2264 mulpd %xmm0, %xmm2 2265 movaps -14 * SIZE(AO), %xmm0 2266 2267 addq $2 * SIZE, AO 2268 addq $2 * SIZE, BO 2269 2270 subq $1, %rax 2271 BRANCH 2272 jg .L56 2273 ALIGN_4 2274 2275.L58: 2276#if defined(LN) || defined(RT) 2277 movq KK, %rax 2278#ifdef LN 2279 subq $2, %rax 2280#else 2281 subq $2, %rax 2282#endif 2283 2284 leaq (, %rax, SIZE), %rax 2285 2286 movq AORIG, AO 2287 leaq (AO, %rax, 2), AO 2288 leaq (B, %rax, 2), BO 2289#endif 2290 2291 addpd %xmm1, %xmm8 2292 addpd %xmm2, %xmm9 2293 2294#if defined(LN) || defined(LT) 2295 movaps %xmm8, %xmm0 2296 shufpd $0, %xmm9, %xmm8 2297 shufpd $3, %xmm0, %xmm9 2298 2299 movapd -16 * SIZE(BO), %xmm0 2300 movapd -14 * SIZE(BO), %xmm1 2301#else 2302 movaps %xmm8, %xmm0 2303 shufpd $2, %xmm9, %xmm8 2304 shufpd $2, %xmm0, %xmm9 2305 2306 movapd -16 * SIZE(AO), %xmm0 2307 movapd -14 * SIZE(AO), %xmm1 2308#endif 2309 2310 subpd %xmm8, %xmm0 2311 subpd %xmm9, %xmm1 2312 2313#ifdef LN 2314 movddup -13 * SIZE(AO), %xmm8 2315 mulpd %xmm8, %xmm1 2316 movddup -14 * SIZE(AO), %xmm12 2317 mulpd %xmm1, %xmm12 2318 subpd %xmm12, %xmm0 2319 movddup -16 * SIZE(AO), %xmm8 2320 mulpd %xmm8, %xmm0 2321#endif 2322 2323#ifdef LT 2324 movddup -16 * SIZE(AO), %xmm8 2325 mulpd %xmm8, %xmm0 2326 movddup -15 * SIZE(AO), %xmm12 2327 mulpd %xmm0, %xmm12 2328 subpd %xmm12, %xmm1 2329 movddup -13 * SIZE(AO), %xmm8 2330 mulpd %xmm8, %xmm1 2331#endif 2332 2333#ifdef RN 2334 movddup -16 * SIZE(BO), %xmm10 2335 mulpd %xmm10, %xmm0 2336 movddup -15 * SIZE(BO), %xmm11 2337 mulpd %xmm0, %xmm11 2338 subpd %xmm11, %xmm1 2339 2340 movddup -13 * SIZE(BO), %xmm11 2341 mulpd %xmm11, %xmm1 2342#endif 2343 2344#ifdef RT 2345 movddup -13 * SIZE(BO), %xmm14 2346 mulpd %xmm14, %xmm1 2347 movddup -14 * SIZE(BO), %xmm15 2348 mulpd %xmm1, %xmm15 2349 subpd %xmm15, %xmm0 2350 2351 movddup -16 * SIZE(BO), %xmm15 2352 mulpd %xmm15, %xmm0 2353#endif 2354 2355#ifdef LN 2356 subq $2 * SIZE, CO1 2357 subq $2 * SIZE, CO2 2358#endif 2359 2360#if defined(LN) || defined(LT) 2361 movsd %xmm0, 0 * SIZE(CO1) 2362 movsd %xmm1, 1 * SIZE(CO1) 2363 movhps %xmm0, 0 * SIZE(CO2) 2364 movhps %xmm1, 1 * SIZE(CO2) 2365#else 2366 movsd %xmm0, 0 * SIZE(CO1) 2367 movhps %xmm0, 1 * SIZE(CO1) 2368 movsd %xmm1, 0 * SIZE(CO2) 2369 movhps %xmm1, 1 * SIZE(CO2) 2370#endif 2371 2372#if defined(LN) || defined(LT) 2373 movapd %xmm0, -16 * SIZE(BO) 2374 movapd %xmm1, -14 * SIZE(BO) 2375#else 2376 movapd %xmm0, -16 * SIZE(AO) 2377 movapd %xmm1, -14 * SIZE(AO) 2378#endif 2379 2380#ifndef LN 2381 addq $2 * SIZE, CO1 2382 addq $2 * SIZE, CO2 2383#endif 2384 2385 2386#if defined(LT) || defined(RN) 2387 movq K, %rax 2388 subq KK, %rax 2389 leaq (,%rax, SIZE), %rax 2390 leaq (AO, %rax, 2), AO 2391 leaq (BO, %rax, 2), BO 2392#endif 2393 2394#ifdef LN 2395 subq $2, KK 2396#endif 2397 2398#ifdef LT 2399 addq $2, KK 2400#endif 2401 2402#ifdef RT 2403 movq K, %rax 2404 salq $1 + BASE_SHIFT, %rax 2405 addq %rax, AORIG 2406#endif 2407 2408 decq I 2409 BRANCH 2410 jg .L51 2411 ALIGN_4 2412 2413.L60: 2414 testq $1, M 2415 BRANCH 2416 jle .L69 2417 ALIGN_4 2418 2419#ifdef LN 2420 movq K, %rax 2421 salq $BASE_SHIFT, %rax 2422 subq %rax, AORIG 2423#endif 2424 2425#if defined(LN) || defined(RT) 2426 movq KK, %rax 2427 leaq (, %rax, SIZE), %rax 2428 movq AORIG, AO 2429 leaq (AO, %rax, 1), AO 2430 leaq (B, %rax, 2), BO 2431#else 2432 movq B, BO 2433#endif 2434 2435 movddup -16 * SIZE(AO), %xmm0 2436 xorps %xmm8, %xmm8 2437 movaps -16 * SIZE(BO), %xmm1 2438 xorps %xmm9, %xmm9 2439 2440#if defined(LT) || defined(RN) 2441 movq KK, %rax 2442#else 2443 movq K, %rax 2444 subq KK, %rax 2445#endif 2446 sarq $2, %rax 2447 NOBRANCH 2448 jle .L65 2449 ALIGN_3 2450 2451.L62: 2452 mulpd %xmm0, %xmm1 2453 movddup -15 * SIZE(AO), %xmm0 2454 addpd %xmm1, %xmm8 2455 movaps -14 * SIZE(BO), %xmm1 2456 2457 mulpd %xmm0, %xmm1 2458 movddup -14 * SIZE(AO), %xmm0 2459 addpd %xmm1, %xmm9 2460 movaps -12 * SIZE(BO), %xmm1 2461 2462 mulpd %xmm0, %xmm1 2463 movddup -13 * SIZE(AO), %xmm0 2464 addpd %xmm1, %xmm8 2465 movaps -10 * SIZE(BO), %xmm1 2466 2467 mulpd %xmm0, %xmm1 2468 movddup -12 * SIZE(AO), %xmm0 2469 addpd %xmm1, %xmm9 2470 movaps -8 * SIZE(BO), %xmm1 2471 2472 subq $-4 * SIZE, AO 2473 subq $-8 * SIZE, BO 2474 2475 subq $1, %rax 2476 BRANCH 2477 jg .L62 2478 ALIGN_3 2479 2480.L65: 2481#if defined(LT) || defined(RN) 2482 movq KK, %rax 2483#else 2484 movq K, %rax 2485 subq KK, %rax 2486#endif 2487 andq $3, %rax # if (k & 1) 2488 BRANCH 2489 je .L68 2490 ALIGN_3 2491 2492.L66: 2493 mulpd %xmm0, %xmm1 2494 movddup -15 * SIZE(AO), %xmm0 2495 addpd %xmm1, %xmm8 2496 movaps -14 * SIZE(BO), %xmm1 2497 2498 addq $1 * SIZE, AO 2499 addq $2 * SIZE, BO 2500 2501 subq $1, %rax 2502 BRANCH 2503 jg .L66 2504 ALIGN_4 2505 2506.L68: 2507#if defined(LN) || defined(RT) 2508 movq KK, %rax 2509#ifdef LN 2510 subq $1, %rax 2511#else 2512 subq $2, %rax 2513#endif 2514 2515 leaq (, %rax, SIZE), %rax 2516 2517 movq AORIG, AO 2518 leaq (AO, %rax, 1), AO 2519 leaq (B, %rax, 2), BO 2520#endif 2521 2522 addpd %xmm9, %xmm8 2523 2524#if defined(LN) || defined(LT) 2525 movapd -16 * SIZE(BO), %xmm0 2526#else 2527 movapd -16 * SIZE(AO), %xmm0 2528#endif 2529 2530 subpd %xmm8, %xmm0 2531 2532#if defined(LN) || defined(LT) 2533 movddup -16 * SIZE(AO), %xmm8 2534 mulpd %xmm8, %xmm0 2535#endif 2536 2537#if defined(RN) || defined(RT) 2538 pshufd $0xe, %xmm0, %xmm1 2539#endif 2540 2541#ifdef RN 2542 movsd -16 * SIZE(BO), %xmm10 2543 mulsd %xmm10, %xmm0 2544 movsd -15 * SIZE(BO), %xmm11 2545 mulsd %xmm0, %xmm11 2546 subsd %xmm11, %xmm1 2547 2548 movsd -13 * SIZE(BO), %xmm11 2549 mulsd %xmm11, %xmm1 2550#endif 2551 2552#ifdef RT 2553 movsd -13 * SIZE(BO), %xmm14 2554 mulsd %xmm14, %xmm1 2555 movsd -14 * SIZE(BO), %xmm15 2556 mulsd %xmm1, %xmm15 2557 subsd %xmm15, %xmm0 2558 2559 movsd -16 * SIZE(BO), %xmm15 2560 mulsd %xmm15, %xmm0 2561#endif 2562 2563#if defined(RN) || defined(RT) 2564 unpcklpd %xmm1, %xmm0 2565#endif 2566 2567#ifdef LN 2568 subq $1 * SIZE, CO1 2569 subq $1 * SIZE, CO2 2570#endif 2571 2572 movsd %xmm0, 0 * SIZE(CO1) 2573 movhps %xmm0, 0 * SIZE(CO2) 2574 2575#if defined(LN) || defined(LT) 2576 movapd %xmm0, -16 * SIZE(BO) 2577#else 2578 movapd %xmm0, -16 * SIZE(AO) 2579#endif 2580 2581#ifndef LN 2582 addq $1 * SIZE, CO1 2583 addq $1 * SIZE, CO2 2584#endif 2585 2586#if defined(LT) || defined(RN) 2587 movq K, %rax 2588 subq KK, %rax 2589 leaq (,%rax, SIZE), %rax 2590 leaq (AO, %rax, 1), AO 2591 leaq (BO, %rax, 2), BO 2592#endif 2593 2594#ifdef LN 2595 subq $1, KK 2596#endif 2597 2598#ifdef LT 2599 addq $1, KK 2600#endif 2601 2602#ifdef RT 2603 movq K, %rax 2604 salq $BASE_SHIFT, %rax 2605 addq %rax, AORIG 2606#endif 2607 ALIGN_4 2608 2609.L69: 2610#ifdef LN 2611 leaq (, K, SIZE), %rax 2612 leaq (B, %rax, 2), B 2613#endif 2614#if defined(LT) || defined(RN) 2615 movq BO, B 2616#endif 2617 2618#ifdef RN 2619 addq $2, KK 2620#endif 2621 2622#ifdef RT 2623 subq $2, KK 2624#endif 2625 ALIGN_4 2626 2627.L70: 2628 testq $1, N 2629 jle .L999 2630 ALIGN_4 2631 2632#if defined(LT) || defined(RN) 2633 movq A, AO 2634#else 2635 movq A, AORIG 2636#endif 2637 2638#ifdef RT 2639 movq K, %rax 2640 salq $BASE_SHIFT, %rax 2641 subq %rax, B 2642 2643 subq LDC, C 2644#endif 2645 2646 movq C, CO1 2647#ifndef RT 2648 addq LDC, C 2649#endif 2650 2651#ifdef LN 2652 movq OFFSET, %rax 2653 addq M, %rax 2654 movq %rax, KK 2655#endif 2656 2657#ifdef LT 2658 movq OFFSET, %rax 2659 movq %rax, KK 2660#endif 2661 2662 movq M, I 2663 sarq $1, I 2664 NOBRANCH 2665 jle .L80 2666 ALIGN_4 2667 2668.L71: 2669#ifdef LN 2670 movq K, %rax 2671 salq $1 + BASE_SHIFT, %rax 2672 subq %rax, AORIG 2673#endif 2674 2675#if defined(LN) || defined(RT) 2676 movq KK, %rax 2677 leaq (, %rax, SIZE), %rax 2678 movq AORIG, AO 2679 leaq (AO, %rax, 2), AO 2680 leaq (B, %rax, 1), BO 2681#else 2682 movq B, BO 2683#endif 2684 2685 xorps %xmm1, %xmm1 2686 movaps -16 * SIZE(AO), %xmm0 2687 xorps %xmm2, %xmm2 2688 2689 xorps %xmm8, %xmm8 2690 prefetcht0 2 * SIZE(CO1) 2691 xorps %xmm9, %xmm9 2692 xorps %xmm10, %xmm10 2693 xorps %xmm11, %xmm11 2694 2695#if defined(LT) || defined(RN) 2696 movq KK, %rax 2697#else 2698 movq K, %rax 2699 subq KK, %rax 2700#endif 2701 sarq $2, %rax 2702 NOBRANCH 2703 jle .L75 2704 ALIGN_3 2705 2706.L72: 2707 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2708 2709 addpd %xmm1, %xmm8 2710 movddup -16 * SIZE(BO), %xmm1 2711 mulpd %xmm0, %xmm1 2712 movaps -14 * SIZE(AO), %xmm0 2713 2714 addpd %xmm1, %xmm9 2715 movddup -15 * SIZE(BO), %xmm1 2716 mulpd %xmm0, %xmm1 2717 movaps -12 * SIZE(AO), %xmm0 2718 2719 addpd %xmm1, %xmm8 2720 movddup -14 * SIZE(BO), %xmm1 2721 mulpd %xmm0, %xmm1 2722 movaps -10 * SIZE(AO), %xmm0 2723 2724 addpd %xmm1, %xmm9 2725 movddup -13 * SIZE(BO), %xmm1 2726 mulpd %xmm0, %xmm1 2727 movaps -8 * SIZE(AO), %xmm0 2728 2729 subq $-8 * SIZE, AO 2730 subq $-4 * SIZE, BO 2731 2732 subq $1, %rax 2733 BRANCH 2734 jg .L72 2735 2736 addpd %xmm9, %xmm8 2737 ALIGN_3 2738 2739.L75: 2740#if defined(LT) || defined(RN) 2741 movq KK, %rax 2742#else 2743 movq K, %rax 2744 subq KK, %rax 2745#endif 2746 andq $3, %rax # if (k & 1) 2747 BRANCH 2748 je .L78 2749 ALIGN_3 2750 2751.L76: 2752 addpd %xmm1, %xmm8 2753 movddup -16 * SIZE(BO), %xmm1 2754 mulpd %xmm0, %xmm1 2755 movaps -14 * SIZE(AO), %xmm0 2756 2757 addq $2 * SIZE, AO 2758 addq $1 * SIZE, BO 2759 2760 subq $1, %rax 2761 BRANCH 2762 jg .L76 2763 ALIGN_4 2764 2765.L78: 2766#if defined(LN) || defined(RT) 2767 movq KK, %rax 2768#ifdef LN 2769 subq $2, %rax 2770#else 2771 subq $1, %rax 2772#endif 2773 2774 leaq (, %rax, SIZE), %rax 2775 2776 movq AORIG, AO 2777 leaq (AO, %rax, 2), AO 2778 leaq (B, %rax, 1), BO 2779#endif 2780 2781 addpd %xmm1, %xmm8 2782 2783#if defined(LN) || defined(LT) 2784 movapd -16 * SIZE(BO), %xmm0 2785#else 2786 movapd -16 * SIZE(AO), %xmm0 2787#endif 2788 2789 subpd %xmm8, %xmm0 2790 2791#if defined(LN) || defined(LT) 2792 pshufd $0xe, %xmm0, %xmm1 2793#endif 2794 2795#ifdef LN 2796 movsd -13 * SIZE(AO), %xmm8 2797 mulsd %xmm8, %xmm1 2798 movsd -14 * SIZE(AO), %xmm12 2799 mulsd %xmm1, %xmm12 2800 subsd %xmm12, %xmm0 2801 movsd -16 * SIZE(AO), %xmm8 2802 mulsd %xmm8, %xmm0 2803#endif 2804 2805#ifdef LT 2806 movsd -16 * SIZE(AO), %xmm8 2807 mulsd %xmm8, %xmm0 2808 movsd -15 * SIZE(AO), %xmm12 2809 mulsd %xmm0, %xmm12 2810 subsd %xmm12, %xmm1 2811 movsd -13 * SIZE(AO), %xmm8 2812 mulsd %xmm8, %xmm1 2813#endif 2814 2815#if defined(LN) || defined(LT) 2816 unpcklpd %xmm1, %xmm0 2817#endif 2818 2819#if defined(RN) || defined(RT) 2820 movddup -16 * SIZE(BO), %xmm10 2821 mulpd %xmm10, %xmm0 2822#endif 2823 2824#ifdef LN 2825 subq $2 * SIZE, CO1 2826#endif 2827 2828 movsd %xmm0, 0 * SIZE(CO1) 2829 movhps %xmm0, 1 * SIZE(CO1) 2830 2831#if defined(LN) || defined(LT) 2832 movapd %xmm0, -16 * SIZE(BO) 2833#else 2834 movapd %xmm0, -16 * SIZE(AO) 2835#endif 2836 2837#ifndef LN 2838 addq $2 * SIZE, CO1 2839#endif 2840 2841 2842#if defined(LT) || defined(RN) 2843 movq K, %rax 2844 subq KK, %rax 2845 leaq (,%rax, SIZE), %rax 2846 leaq (AO, %rax, 2), AO 2847 leaq (BO, %rax, 1), BO 2848#endif 2849 2850#ifdef LN 2851 subq $2, KK 2852#endif 2853 2854#ifdef LT 2855 addq $2, KK 2856#endif 2857 2858#ifdef RT 2859 movq K, %rax 2860 salq $1 + BASE_SHIFT, %rax 2861 addq %rax, AORIG 2862#endif 2863 2864 decq I 2865 BRANCH 2866 jg .L71 2867 ALIGN_4 2868 2869.L80: 2870 testq $1, M 2871 BRANCH 2872 jle .L89 2873 ALIGN_4 2874 2875#ifdef LN 2876 movq K, %rax 2877 salq $BASE_SHIFT, %rax 2878 subq %rax, AORIG 2879#endif 2880 2881#if defined(LN) || defined(RT) 2882 movq KK, %rax 2883 leaq (, %rax, SIZE), %rax 2884 movq AORIG, AO 2885 leaq (AO, %rax, 1), AO 2886 leaq (B, %rax, 1), BO 2887#else 2888 movq B, BO 2889#endif 2890 2891 movsd -16 * SIZE(AO), %xmm0 2892 movhps -15 * SIZE(AO), %xmm0 2893 xorps %xmm8, %xmm8 2894 movsd -16 * SIZE(BO), %xmm1 2895 movhps -15 * SIZE(BO), %xmm1 2896 xorps %xmm9, %xmm9 2897 2898#if defined(LT) || defined(RN) 2899 movq KK, %rax 2900#else 2901 movq K, %rax 2902 subq KK, %rax 2903#endif 2904 sarq $2, %rax 2905 NOBRANCH 2906 jle .L85 2907 ALIGN_3 2908 2909.L82: 2910 mulpd %xmm0, %xmm1 2911 movsd -14 * SIZE(AO), %xmm0 2912 movhps -13 * SIZE(AO), %xmm0 2913 addpd %xmm1, %xmm8 2914 movsd -14 * SIZE(BO), %xmm1 2915 movhps -13 * SIZE(BO), %xmm1 2916 2917 mulpd %xmm0, %xmm1 2918 movsd -12 * SIZE(AO), %xmm0 2919 movhps -11 * SIZE(AO), %xmm0 2920 addpd %xmm1, %xmm9 2921 movsd -12 * SIZE(BO), %xmm1 2922 movhps -11 * SIZE(BO), %xmm1 2923 2924 subq $-4 * SIZE, AO 2925 subq $-4 * SIZE, BO 2926 2927 subq $1, %rax 2928 BRANCH 2929 jg .L82 2930 2931 addpd %xmm9, %xmm8 2932 ALIGN_3 2933 2934.L85: 2935#if defined(LT) || defined(RN) 2936 movq KK, %rax 2937#else 2938 movq K, %rax 2939 subq KK, %rax 2940#endif 2941 andq $3, %rax # if (k & 1) 2942 BRANCH 2943 je .L88 2944 ALIGN_3 2945 2946.L86: 2947 mulsd %xmm0, %xmm1 2948 movsd -15 * SIZE(AO), %xmm0 2949 addsd %xmm1, %xmm8 2950 movsd -15 * SIZE(BO), %xmm1 2951 2952 addq $1 * SIZE, AO 2953 addq $1 * SIZE, BO 2954 2955 subq $1, %rax 2956 BRANCH 2957 jg .L86 2958 ALIGN_4 2959 2960.L88: 2961#if defined(LN) || defined(RT) 2962 movq KK, %rax 2963 subq $1, %rax 2964 2965 leaq (, %rax, SIZE), %rax 2966 2967 movq AORIG, AO 2968 leaq (AO, %rax, 1), AO 2969 leaq (B, %rax, 1), BO 2970#endif 2971 2972 haddpd %xmm8, %xmm8 2973 2974#if defined(LN) || defined(LT) 2975 movsd -16 * SIZE(BO), %xmm0 2976#else 2977 movsd -16 * SIZE(AO), %xmm0 2978#endif 2979 2980 subsd %xmm8, %xmm0 2981 2982#if defined(LN) || defined(LT) 2983 movsd -16 * SIZE(AO), %xmm8 2984 mulsd %xmm8, %xmm0 2985#endif 2986 2987#if defined(RN) || defined(RT) 2988 movsd -16 * SIZE(BO), %xmm10 2989 mulsd %xmm10, %xmm0 2990#endif 2991 2992#ifdef LN 2993 subq $1 * SIZE, CO1 2994#endif 2995 2996 movsd %xmm0, 0 * SIZE(CO1) 2997 2998#if defined(LN) || defined(LT) 2999 movsd %xmm0, -16 * SIZE(BO) 3000#else 3001 movsd %xmm0, -16 * SIZE(AO) 3002#endif 3003 3004#ifndef LN 3005 addq $1 * SIZE, CO1 3006#endif 3007 3008 3009#if defined(LT) || defined(RN) 3010 movq K, %rax 3011 subq KK, %rax 3012 leaq (,%rax, SIZE), %rax 3013 leaq (AO, %rax, 1), AO 3014 leaq (BO, %rax, 1), BO 3015#endif 3016 3017#ifdef LN 3018 subq $1, KK 3019#endif 3020 3021#ifdef LT 3022 addq $1, KK 3023#endif 3024 3025#ifdef RT 3026 movq K, %rax 3027 salq $BASE_SHIFT, %rax 3028 addq %rax, AORIG 3029#endif 3030 ALIGN_4 3031 3032.L89: 3033#ifdef LN 3034 leaq (, K, SIZE), %rax 3035 leaq (B, %rax, 1), B 3036#endif 3037#if defined(LT) || defined(RN) 3038 movq BO, B 3039#endif 3040 3041#ifdef RN 3042 addq $1, KK 3043#endif 3044 3045#ifdef RT 3046 subq $1, KK 3047#endif 3048 ALIGN_4 3049 3050 3051.L999: 3052 movq 0(%rsp), %rbx 3053 movq 8(%rsp), %rbp 3054 movq 16(%rsp), %r12 3055 movq 24(%rsp), %r13 3056 movq 32(%rsp), %r14 3057 movq 40(%rsp), %r15 3058 3059#ifdef WINDOWS_ABI 3060 movq 48(%rsp), %rdi 3061 movq 56(%rsp), %rsi 3062 movups 64(%rsp), %xmm6 3063 movups 80(%rsp), %xmm7 3064 movups 96(%rsp), %xmm8 3065 movups 112(%rsp), %xmm9 3066 movups 128(%rsp), %xmm10 3067 movups 144(%rsp), %xmm11 3068 movups 160(%rsp), %xmm12 3069 movups 176(%rsp), %xmm13 3070 movups 192(%rsp), %xmm14 3071 movups 208(%rsp), %xmm15 3072#endif 3073 3074 addq $STACKSIZE, %rsp 3075 ret 3076 3077 EPILOGUE 3078