/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define M %rdi 43#define N %rsi 44#define K %rdx 45#define A %rcx 46#define B %r8 47#define C %r9 48#define LDC %r10 49 50#define I %r11 51#define J %r12 52#define AO %r13 53#define BO %r14 54#define CO1 %r15 55#define CO2 %rbp 56 57#ifndef WINDOWS_ABI 58 59#define STACKSIZE 64 60 61#define OLD_LDC 8 + STACKSIZE(%rsp) 62#define OLD_OFFSET 16 + STACKSIZE(%rsp) 63 64#else 65 66#define STACKSIZE 256 67 68#define OLD_A 40 + STACKSIZE(%rsp) 69#define OLD_B 48 + STACKSIZE(%rsp) 70#define OLD_C 56 + STACKSIZE(%rsp) 71#define OLD_LDC 64 + STACKSIZE(%rsp) 72#define OLD_OFFSET 72 + STACKSIZE(%rsp) 73 74#endif 75 76#define ALPHA 0(%rsp) 77#define OFFSET 16(%rsp) 78#define KK 24(%rsp) 79#define KKK 32(%rsp) 80#define AORIG 40(%rsp) 81#define BORIG 48(%rsp) 82#define BUFFER 128(%rsp) 83 84#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 85#define PREFETCH prefetch 86#define PREFETCHW prefetchw 87#define PREFETCHNTA prefetchnta 88#ifndef ALLOC_HUGETLB 89#define PREFETCHSIZE (8 * 4 + 4) 90#else 91#define PREFETCHSIZE (8 * 2 + 4) 92#endif 93#endif 94 95#ifdef GENERIC 96#define PREFETCH prefetcht0 97#define PREFETCHW prefetcht0 98#define PREFETCHNTA prefetchnta 99#define PREFETCHSIZE (8 * 4 + 4) 100#endif 101 102#ifdef OPTERON 103#define movsd movlpd 104#endif 105 106#define KERNEL1(xx) \ 107 mulpd %xmm8, %xmm9 ;\ 108 addpd %xmm9, %xmm0 ;\ 109 movapd 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ 110 mulpd %xmm8, %xmm11 ;\ 111 PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\ 112 addpd %xmm11, %xmm1 ;\ 113 movapd 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 114 mulpd %xmm8, %xmm13 ;\ 115 mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ 116 addpd %xmm13, %xmm2 ;\ 117 movapd 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 118 addpd %xmm8, %xmm3 ;\ 119 movapd 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 120 121#define KERNEL2(xx) \ 122 mulpd %xmm10, %xmm9 ;\ 
123 addpd %xmm9, %xmm4 ;\ 124 movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ 125 mulpd %xmm10, %xmm11 ;\ 126 addpd %xmm11, %xmm5 ;\ 127 movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 128 mulpd %xmm10, %xmm13 ;\ 129 mulpd 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ 130 addpd %xmm13, %xmm6 ;\ 131 movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 132 addpd %xmm10, %xmm7 ;\ 133 movapd 10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 134 135#define KERNEL3(xx) \ 136 mulpd %xmm12, %xmm15 ;\ 137 addpd %xmm15, %xmm0 ;\ 138 movapd 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ 139 mulpd %xmm12, %xmm11 ;\ 140 addpd %xmm11, %xmm1 ;\ 141 movapd 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 142 mulpd %xmm12, %xmm13 ;\ 143 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ 144 addpd %xmm13, %xmm2 ;\ 145 movapd 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 146 addpd %xmm12, %xmm3 ;\ 147 movapd 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 148 149#define KERNEL4(xx) \ 150 mulpd %xmm14, %xmm15 ;\ 151 addpd %xmm15, %xmm4 ;\ 152 movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ 153 mulpd %xmm14, %xmm11 ;\ 154 addpd %xmm11, %xmm5 ;\ 155 movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 156 mulpd %xmm14, %xmm13 ;\ 157 mulpd 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ 158 addpd %xmm13, %xmm6 ;\ 159 movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 160 addpd %xmm14, %xmm7 ;\ 161 movapd 14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 162 163#define KERNEL5(xx) \ 164 mulpd %xmm8, %xmm9 ;\ 165 addpd %xmm9, %xmm0 ;\ 166 movapd 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ 167 mulpd %xmm8, %xmm11 ;\ 168 PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\ 169 addpd %xmm11, %xmm1 ;\ 170 movapd 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 171 mulpd %xmm8, %xmm13 ;\ 172 mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\ 173 addpd %xmm13, %xmm2 ;\ 174 movapd 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 175 addpd %xmm8, %xmm3 ;\ 176 movapd 16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8 177 178#define KERNEL6(xx) \ 179 mulpd %xmm10, %xmm9 ;\ 
180 addpd %xmm9, %xmm4 ;\ 181 movapd 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\ 182 mulpd %xmm10, %xmm11 ;\ 183 addpd %xmm11, %xmm5 ;\ 184 movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 185 mulpd %xmm10, %xmm13 ;\ 186 mulpd 22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\ 187 addpd %xmm13, %xmm6 ;\ 188 movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 189 addpd %xmm10, %xmm7 ;\ 190 movapd 18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10 191 192#define KERNEL7(xx) \ 193 mulpd %xmm12, %xmm15 ;\ 194 addpd %xmm15, %xmm0 ;\ 195 movapd 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ 196 mulpd %xmm12, %xmm11 ;\ 197 addpd %xmm11, %xmm1 ;\ 198 movapd 26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 199 mulpd %xmm12, %xmm13 ;\ 200 mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\ 201 addpd %xmm13, %xmm2 ;\ 202 movapd 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 203 addpd %xmm12, %xmm3 ;\ 204 movapd 20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12 205 206#define KERNEL8(xx) \ 207 mulpd %xmm14, %xmm15 ;\ 208 addpd %xmm15, %xmm4 ;\ 209 movapd 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\ 210 mulpd %xmm14, %xmm11 ;\ 211 addpd %xmm11, %xmm5 ;\ 212 movapd 34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\ 213 mulpd %xmm14, %xmm13 ;\ 214 mulpd 30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\ 215 addpd %xmm13, %xmm6 ;\ 216 movapd 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\ 217 addpd %xmm14, %xmm7 ;\ 218 movapd 22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14 219 220 PROLOGUE 221 PROFCODE 222 223 subq $STACKSIZE, %rsp 224 movq %rbx, 0(%rsp) 225 movq %rbp, 8(%rsp) 226 movq %r12, 16(%rsp) 227 movq %r13, 24(%rsp) 228 movq %r14, 32(%rsp) 229 movq %r15, 40(%rsp) 230 231#ifdef WINDOWS_ABI 232 movq %rdi, 48(%rsp) 233 movq %rsi, 56(%rsp) 234 movups %xmm6, 64(%rsp) 235 movups %xmm7, 80(%rsp) 236 movups %xmm8, 96(%rsp) 237 movups %xmm9, 112(%rsp) 238 movups %xmm10, 128(%rsp) 239 movups %xmm11, 144(%rsp) 240 movups %xmm12, 160(%rsp) 241 movups %xmm13, 176(%rsp) 242 movups %xmm14, 192(%rsp) 243 movups %xmm15, 208(%rsp) 244 245 movq ARG1, M 246 movq ARG2, N 
247 movq ARG3, K 248 movq OLD_A, A 249 movq OLD_B, B 250 movq OLD_C, C 251 movq OLD_LDC, LDC 252 movsd OLD_OFFSET, %xmm4 253 254 movaps %xmm3, %xmm0 255 256#else 257 movq OLD_LDC, LDC 258 movsd OLD_OFFSET, %xmm4 259 260#endif 261 262 movq %rsp, %rbx # save old stack 263 subq $128 + LOCAL_BUFFER_SIZE, %rsp 264 andq $-4096, %rsp # align stack 265 266 STACK_TOUCHING 267 268 movsd %xmm4, OFFSET 269 movsd %xmm4, KK 270 271 leaq (, LDC, SIZE), LDC 272 273#ifdef LN 274 leaq (, M, SIZE), %rax 275 addq %rax, C 276 imulq K, %rax 277 addq %rax, A 278#endif 279 280#ifdef RT 281 leaq (, N, SIZE), %rax 282 imulq K, %rax 283 addq %rax, B 284 movq N, %rax 285 imulq LDC, %rax 286 addq %rax, C 287#endif 288 289#ifdef RN 290 negq KK 291#endif 292 293#ifdef RT 294 movq N, %rax 295 subq OFFSET, %rax 296 movq %rax, KK 297#endif 298 299 testq $1, N 300 je .L40 301 ALIGN_4 302 303.L81: 304/* Copying to Sub Buffer */ 305 306#ifdef LN 307 movq OFFSET, %rax 308 addq M, %rax 309 movq %rax, KK 310#endif 311 312 leaq BUFFER, BO 313 314#ifdef RT 315 movq K, %rax 316 salq $0 + BASE_SHIFT, %rax 317 subq %rax, B 318#endif 319 320#if defined(LN) || defined(RT) 321 movq KK, %rax 322 movq B, BORIG 323 leaq (, %rax, SIZE), %rax 324 leaq (B, %rax, 1), B 325 leaq (BO, %rax, 2), BO 326#endif 327 328#ifdef LT 329 movq OFFSET, %rax 330 movq %rax, KK 331#endif 332 333#if defined(LT) || defined(RN) 334 movq KK, %rax 335#else 336 movq K, %rax 337 subq KK, %rax 338#endif 339 sarq $3, %rax 340 jle .L83 341 ALIGN_4 342 343.L82: 344 PREFETCH 56 * SIZE(B) 345 346 movsd 0 * SIZE(B), %xmm0 347 movsd 1 * SIZE(B), %xmm1 348 movsd 2 * SIZE(B), %xmm2 349 movsd 3 * SIZE(B), %xmm3 350 movsd 4 * SIZE(B), %xmm4 351 movsd 5 * SIZE(B), %xmm5 352 movsd 6 * SIZE(B), %xmm6 353 movsd 7 * SIZE(B), %xmm7 354 355 addq $ 8 * SIZE, B 356 addq $16 * SIZE, BO 357 358 movsd %xmm0, -16 * SIZE(BO) 359 movsd %xmm0, -15 * SIZE(BO) 360 movsd %xmm1, -14 * SIZE(BO) 361 movsd %xmm1, -13 * SIZE(BO) 362 movsd %xmm2, -12 * SIZE(BO) 363 movsd %xmm2, 
-11 * SIZE(BO) 364 movsd %xmm3, -10 * SIZE(BO) 365 movsd %xmm3, -9 * SIZE(BO) 366 movsd %xmm4, -8 * SIZE(BO) 367 movsd %xmm4, -7 * SIZE(BO) 368 movsd %xmm5, -6 * SIZE(BO) 369 movsd %xmm5, -5 * SIZE(BO) 370 movsd %xmm6, -4 * SIZE(BO) 371 movsd %xmm6, -3 * SIZE(BO) 372 movsd %xmm7, -2 * SIZE(BO) 373 movsd %xmm7, -1 * SIZE(BO) 374 375 decq %rax 376 jne .L82 377 ALIGN_4 378 379.L83: 380#if defined(LT) || defined(RN) 381 movq KK, %rax 382#else 383 movq K, %rax 384 subq KK, %rax 385#endif 386 andq $7, %rax 387 BRANCH 388 jle .L90 389 ALIGN_4 390 391.L84: 392 movsd 0 * SIZE(B), %xmm0 393 394 movsd %xmm0, 0 * SIZE(BO) 395 movsd %xmm0, 1 * SIZE(BO) 396 397 addq $1 * SIZE, B 398 addq $2 * SIZE, BO 399 decq %rax 400 jne .L84 401 ALIGN_4 402 403.L90: 404#if defined(LT) || defined(RN) 405 movq A, AO 406#else 407 movq A, AORIG 408#endif 409 410#ifdef RT 411 subq LDC, C 412#endif 413 414 movq C, CO1 # coffset1 = c 415#ifndef RT 416 addq LDC, C 417#endif 418 419 movq M, I 420 sarq $2, I # i = (m >> 2) 421 jle .L100 422 ALIGN_4 423 424.L91: 425#ifdef LN 426 movq K, %rax 427 salq $2 + BASE_SHIFT, %rax 428 subq %rax, AORIG 429#endif 430 431#if defined(LN) || defined(RT) 432 movq KK, %rax 433 movq AORIG, AO 434 leaq (, %rax, SIZE), %rax 435 leaq (AO, %rax, 4), AO 436#endif 437 438 leaq BUFFER, BO 439 440#if defined(LN) || defined(RT) 441 movq KK, %rax 442 salq $0 + BASE_SHIFT, %rax 443 leaq (BO, %rax, 2), BO 444#endif 445 446 movapd 0 * SIZE(AO), %xmm8 447 pxor %xmm0, %xmm0 448 movapd 0 * SIZE(BO), %xmm9 449 pxor %xmm1, %xmm1 450 movapd 8 * SIZE(AO), %xmm10 451 pxor %xmm2, %xmm2 452 movapd 8 * SIZE(BO), %xmm11 453 pxor %xmm3, %xmm3 454 455 movapd 16 * SIZE(AO), %xmm12 456 movapd 24 * SIZE(AO), %xmm14 457 458 PREFETCHW 4 * SIZE(CO1) 459 460#if defined(LT) || defined(RN) 461 movq KK, %rax 462#else 463 movq K, %rax 464 subq KK, %rax 465#endif 466 sarq $3, %rax 467 je .L95 468 ALIGN_4 469 470.L92: 471 mulpd %xmm9, %xmm8 472 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 473 mulpd 2 * SIZE(AO), 
%xmm9 474 addpd %xmm8, %xmm0 475 movapd 4 * SIZE(AO), %xmm8 476 addpd %xmm9, %xmm1 477 movapd 2 * SIZE(BO), %xmm9 478 mulpd %xmm9, %xmm8 479 mulpd 6 * SIZE(AO), %xmm9 480 addpd %xmm8, %xmm2 481 movapd 32 * SIZE(AO), %xmm8 482 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 483 addpd %xmm9, %xmm3 484 movapd 4 * SIZE(BO), %xmm9 485 mulpd %xmm9, %xmm10 486 mulpd 10 * SIZE(AO), %xmm9 487 addpd %xmm10, %xmm0 488 movapd 12 * SIZE(AO), %xmm10 489 addpd %xmm9, %xmm1 490 movapd 6 * SIZE(BO), %xmm9 491 mulpd %xmm9, %xmm10 492 mulpd 14 * SIZE(AO), %xmm9 493 addpd %xmm10, %xmm2 494 movapd 40 * SIZE(AO), %xmm10 495 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 496 addpd %xmm9, %xmm3 497 movapd 16 * SIZE(BO), %xmm9 498 mulpd %xmm11, %xmm12 499 mulpd 18 * SIZE(AO), %xmm11 500 addpd %xmm12, %xmm0 501 movapd 20 * SIZE(AO), %xmm12 502 addpd %xmm11, %xmm1 503 movapd 10 * SIZE(BO), %xmm11 504 mulpd %xmm11, %xmm12 505 mulpd 22 * SIZE(AO), %xmm11 506 addpd %xmm12, %xmm2 507 movapd 48 * SIZE(AO), %xmm12 508 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) 509 addpd %xmm11, %xmm3 510 movapd 12 * SIZE(BO), %xmm11 511 mulpd %xmm11, %xmm14 512 mulpd 26 * SIZE(AO), %xmm11 513 addpd %xmm14, %xmm0 514 movapd 28 * SIZE(AO), %xmm14 515 addpd %xmm11, %xmm1 516 movapd 14 * SIZE(BO), %xmm11 517 mulpd %xmm11, %xmm14 518 mulpd 30 * SIZE(AO), %xmm11 519 addpd %xmm14, %xmm2 520 movapd 56 * SIZE(AO), %xmm14 521 addpd %xmm11, %xmm3 522 movapd 24 * SIZE(BO), %xmm11 523 524 addq $32 * SIZE, AO 525 addq $16 * SIZE, BO 526 decq %rax 527 jne .L92 528 ALIGN_4 529 530.L95: 531#if defined(LT) || defined(RN) 532 movq KK, %rax 533#else 534 movq K, %rax 535 subq KK, %rax 536#endif 537 andq $7, %rax # if (k & 1) 538 BRANCH 539 je .L99 540 ALIGN_4 541 542.L96: 543 mulpd %xmm9, %xmm8 544 mulpd 2 * SIZE(AO), %xmm9 545 addpd %xmm8, %xmm0 546 movapd 4 * SIZE(AO), %xmm8 547 addpd %xmm9, %xmm1 548 movapd 2 * SIZE(BO), %xmm9 549 550 addq $4 * SIZE, AO # aoffset += 4 551 addq $2 * SIZE, BO # boffset1 += 8 552 decq %rax 553 jg .L96 554 ALIGN_4 555 
556.L99: 557 addpd %xmm2, %xmm0 558 addpd %xmm3, %xmm1 559 560#if defined(LN) || defined(RT) 561 movq KK, %rax 562#ifdef LN 563 subq $4, %rax 564#else 565 subq $1, %rax 566#endif 567 568 movq AORIG, AO 569 movq BORIG, B 570 leaq BUFFER, BO 571 572 leaq (, %rax, SIZE), %rax 573 leaq (AO, %rax, 4), AO 574 leaq (B, %rax, 1), B 575 leaq (BO, %rax, 2), BO 576#endif 577 578#if defined(LN) || defined(LT) 579 movapd 0 * SIZE(B), %xmm2 580 movapd 2 * SIZE(B), %xmm3 581 582 subpd %xmm0, %xmm2 583 subpd %xmm1, %xmm3 584#else 585 movapd 0 * SIZE(AO), %xmm2 586 movapd 2 * SIZE(AO), %xmm3 587 588 subpd %xmm0, %xmm2 589 subpd %xmm1, %xmm3 590#endif 591 592#ifdef LN 593 movapd %xmm2, %xmm0 594 unpckhpd %xmm0, %xmm0 595 596 movapd %xmm3, %xmm1 597 unpckhpd %xmm1, %xmm1 598 599 movsd 15 * SIZE(AO), %xmm4 600 mulsd %xmm4, %xmm1 601 602 movsd 14 * SIZE(AO), %xmm5 603 mulsd %xmm1, %xmm5 604 subsd %xmm5, %xmm3 605 movsd 13 * SIZE(AO), %xmm6 606 mulsd %xmm1, %xmm6 607 subsd %xmm6, %xmm0 608 movsd 12 * SIZE(AO), %xmm7 609 mulsd %xmm1, %xmm7 610 subsd %xmm7, %xmm2 611 612 movsd 10 * SIZE(AO), %xmm4 613 mulsd %xmm4, %xmm3 614 615 movsd 9 * SIZE(AO), %xmm5 616 mulsd %xmm3, %xmm5 617 subsd %xmm5, %xmm0 618 movsd 8 * SIZE(AO), %xmm6 619 mulsd %xmm3, %xmm6 620 subsd %xmm6, %xmm2 621 622 movsd 5 * SIZE(AO), %xmm4 623 mulsd %xmm4, %xmm0 624 625 movsd 4 * SIZE(AO), %xmm5 626 mulsd %xmm0, %xmm5 627 subsd %xmm5, %xmm2 628 629 movsd 0 * SIZE(AO), %xmm4 630 mulsd %xmm4, %xmm2 631 632 unpcklpd %xmm0, %xmm2 633 unpcklpd %xmm1, %xmm3 634#endif 635 636#ifdef LT 637 movapd %xmm2, %xmm0 638 unpckhpd %xmm0, %xmm0 639 640 movapd %xmm3, %xmm1 641 unpckhpd %xmm1, %xmm1 642 643 movsd 0 * SIZE(AO), %xmm4 644 mulsd %xmm4, %xmm2 645 646 movsd 1 * SIZE(AO), %xmm5 647 mulsd %xmm2, %xmm5 648 subsd %xmm5, %xmm0 649 movsd 2 * SIZE(AO), %xmm6 650 mulsd %xmm2, %xmm6 651 subsd %xmm6, %xmm3 652 movsd 3 * SIZE(AO), %xmm7 653 mulsd %xmm2, %xmm7 654 subsd %xmm7, %xmm1 655 656 movsd 5 * SIZE(AO), %xmm4 657 mulsd %xmm4, %xmm0 
658 659 movsd 6 * SIZE(AO), %xmm5 660 mulsd %xmm0, %xmm5 661 subsd %xmm5, %xmm3 662 movsd 7 * SIZE(AO), %xmm6 663 mulsd %xmm0, %xmm6 664 subsd %xmm6, %xmm1 665 666 movsd 10 * SIZE(AO), %xmm4 667 mulsd %xmm4, %xmm3 668 669 movsd 11 * SIZE(AO), %xmm5 670 mulsd %xmm3, %xmm5 671 subsd %xmm5, %xmm1 672 673 movsd 15 * SIZE(AO), %xmm4 674 mulsd %xmm4, %xmm1 675 676 unpcklpd %xmm0, %xmm2 677 unpcklpd %xmm1, %xmm3 678#endif 679 680#ifdef RN 681 movlpd 0 * SIZE(B), %xmm0 682 movhpd 0 * SIZE(B), %xmm0 683 mulpd %xmm0, %xmm2 684 mulpd %xmm0, %xmm3 685#endif 686 687#ifdef RT 688 movlpd 0 * SIZE(B), %xmm0 689 movhpd 0 * SIZE(B), %xmm0 690 mulpd %xmm0, %xmm2 691 mulpd %xmm0, %xmm3 692#endif 693 694#ifdef LN 695 subq $4 * SIZE, CO1 696#endif 697 698#if defined(LN) || defined(LT) 699 movsd %xmm2, 0 * SIZE(CO1) 700 movhpd %xmm2, 1 * SIZE(CO1) 701 movsd %xmm3, 2 * SIZE(CO1) 702 movhpd %xmm3, 3 * SIZE(CO1) 703#else 704 movsd %xmm2, 0 * SIZE(CO1) 705 movhpd %xmm2, 1 * SIZE(CO1) 706 movsd %xmm3, 2 * SIZE(CO1) 707 movhpd %xmm3, 3 * SIZE(CO1) 708#endif 709 710#if defined(LN) || defined(LT) 711 movapd %xmm2, 0 * SIZE(B) 712 movapd %xmm3, 2 * SIZE(B) 713 714 movlpd %xmm2, 0 * SIZE(BO) 715 movlpd %xmm2, 1 * SIZE(BO) 716 movhpd %xmm2, 2 * SIZE(BO) 717 movhpd %xmm2, 3 * SIZE(BO) 718 movlpd %xmm3, 4 * SIZE(BO) 719 movlpd %xmm3, 5 * SIZE(BO) 720 movhpd %xmm3, 6 * SIZE(BO) 721 movhpd %xmm3, 7 * SIZE(BO) 722#else 723 movapd %xmm2, 0 * SIZE(AO) 724 movapd %xmm3, 2 * SIZE(AO) 725#endif 726 727#ifndef LN 728 addq $4 * SIZE, CO1 729#endif 730 731#if defined(LT) || defined(RN) 732 movq K, %rax 733 subq KK, %rax 734 leaq (,%rax, SIZE), %rax 735 leaq (AO, %rax, 4), AO 736#ifdef LT 737 addq $4 * SIZE, B 738#endif 739#endif 740 741#ifdef LN 742 subq $4, KK 743 movq BORIG, B 744#endif 745 746#ifdef LT 747 addq $4, KK 748#endif 749 750#ifdef RT 751 movq K, %rax 752 movq BORIG, B 753 salq $2 + BASE_SHIFT, %rax 754 addq %rax, AORIG 755#endif 756 757 decq I # i -- 758 jg .L91 759 ALIGN_4 760 761.L100: 762 testq 
$2, M 763 je .L110 764 ALIGN_4 765 766.L101: 767#ifdef LN 768 movq K, %rax 769 salq $1 + BASE_SHIFT, %rax 770 subq %rax, AORIG 771#endif 772 773#if defined(LN) || defined(RT) 774 movq KK, %rax 775 movq AORIG, AO 776 leaq (, %rax, SIZE), %rax 777 leaq (AO, %rax, 2), AO 778#endif 779 780 leaq BUFFER, BO 781 782#if defined(LN) || defined(RT) 783 movq KK, %rax 784 salq $0 + BASE_SHIFT, %rax 785 leaq (BO, %rax, 2), BO 786#endif 787 788 movapd 0 * SIZE(AO), %xmm8 789 pxor %xmm0, %xmm0 790 movapd 0 * SIZE(BO), %xmm9 791 pxor %xmm1, %xmm1 792 movapd 8 * SIZE(AO), %xmm10 793 pxor %xmm2, %xmm2 794 movapd 8 * SIZE(BO), %xmm11 795 pxor %xmm3, %xmm3 796 797#if defined(LT) || defined(RN) 798 movq KK, %rax 799#else 800 movq K, %rax 801 subq KK, %rax 802#endif 803 sarq $3, %rax 804 je .L105 805 ALIGN_4 806 807.L102: 808 mulpd %xmm8, %xmm9 809 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 810 movapd 2 * SIZE(AO), %xmm8 811 mulpd 2 * SIZE(BO), %xmm8 812 addpd %xmm9, %xmm0 813 movapd 16 * SIZE(BO), %xmm9 814 addpd %xmm8, %xmm1 815 movapd 4 * SIZE(AO), %xmm8 816 mulpd 4 * SIZE(BO), %xmm8 817 addpd %xmm8, %xmm2 818 movapd 6 * SIZE(AO), %xmm8 819 mulpd 6 * SIZE(BO), %xmm8 820 addpd %xmm8, %xmm3 821 movapd 16 * SIZE(AO), %xmm8 822 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 823 mulpd %xmm10, %xmm11 824 movapd 10 * SIZE(AO), %xmm10 825 mulpd 10 * SIZE(BO), %xmm10 826 addpd %xmm11, %xmm0 827 movapd 24 * SIZE(BO), %xmm11 828 addpd %xmm10, %xmm1 829 movapd 12 * SIZE(AO), %xmm10 830 mulpd 12 * SIZE(BO), %xmm10 831 addpd %xmm10, %xmm2 832 movapd 14 * SIZE(AO), %xmm10 833 mulpd 14 * SIZE(BO), %xmm10 834 addpd %xmm10, %xmm3 835 movapd 24 * SIZE(AO), %xmm10 836 837 addq $16 * SIZE, AO 838 addq $16 * SIZE, BO 839 decq %rax 840 jne .L102 841 ALIGN_4 842 843.L105: 844#if defined(LT) || defined(RN) 845 movq KK, %rax 846#else 847 movq K, %rax 848 subq KK, %rax 849#endif 850 andq $7, %rax # if (k & 1) 851 BRANCH 852 je .L109 853 ALIGN_4 854 855.L106: 856 mulpd %xmm8, %xmm9 857 addpd %xmm9, %xmm0 858 movapd 2 * 
SIZE(AO), %xmm8 859 movapd 2 * SIZE(BO), %xmm9 860 861 addq $2 * SIZE, AO # aoffset += 4 862 addq $2 * SIZE, BO # boffset1 += 8 863 decq %rax 864 jg .L106 865 ALIGN_4 866 867.L109: 868 addpd %xmm1, %xmm0 869 addpd %xmm3, %xmm2 870 addpd %xmm2, %xmm0 871 872#if defined(LN) || defined(RT) 873 movq KK, %rax 874#ifdef LN 875 subq $2, %rax 876#else 877 subq $1, %rax 878#endif 879 880 movq AORIG, AO 881 movq BORIG, B 882 leaq BUFFER, BO 883 884 leaq (, %rax, SIZE), %rax 885 leaq (AO, %rax, 2), AO 886 leaq (B, %rax, 1), B 887 leaq (BO, %rax, 2), BO 888#endif 889 890#if defined(LN) || defined(LT) 891 movapd 0 * SIZE(B), %xmm2 892 subpd %xmm0, %xmm2 893#else 894 movapd 0 * SIZE(AO), %xmm2 895 subpd %xmm0, %xmm2 896#endif 897 898#ifdef LN 899 movapd %xmm2, %xmm0 900 unpckhpd %xmm0, %xmm0 901 902 movsd 3 * SIZE(AO), %xmm4 903 mulsd %xmm4, %xmm0 904 905 movsd 2 * SIZE(AO), %xmm5 906 mulsd %xmm0, %xmm5 907 subsd %xmm5, %xmm2 908 909 movsd 0 * SIZE(AO), %xmm4 910 mulsd %xmm4, %xmm2 911 912 unpcklpd %xmm0, %xmm2 913#endif 914 915#ifdef LT 916 movapd %xmm2, %xmm0 917 unpckhpd %xmm0, %xmm0 918 919 movsd 0 * SIZE(AO), %xmm4 920 mulsd %xmm4, %xmm2 921 922 movsd 1 * SIZE(AO), %xmm5 923 mulsd %xmm2, %xmm5 924 subsd %xmm5, %xmm0 925 926 movsd 3 * SIZE(AO), %xmm4 927 mulsd %xmm4, %xmm0 928 929 unpcklpd %xmm0, %xmm2 930#endif 931 932#ifdef RN 933 movlpd 0 * SIZE(B), %xmm0 934 movhpd 0 * SIZE(B), %xmm0 935 mulpd %xmm0, %xmm2 936#endif 937 938#ifdef RT 939 movlpd 0 * SIZE(B), %xmm0 940 movhpd 0 * SIZE(B), %xmm0 941 mulpd %xmm0, %xmm2 942#endif 943 944#ifdef LN 945 subq $2 * SIZE, CO1 946#endif 947 948#if defined(LN) || defined(LT) 949 movsd %xmm2, 0 * SIZE(CO1) 950 movhpd %xmm2, 1 * SIZE(CO1) 951#else 952 movsd %xmm2, 0 * SIZE(CO1) 953 movhpd %xmm2, 1 * SIZE(CO1) 954#endif 955 956#if defined(LN) || defined(LT) 957 movapd %xmm2, 0 * SIZE(B) 958 959 movlpd %xmm2, 0 * SIZE(BO) 960 movlpd %xmm2, 1 * SIZE(BO) 961 movhpd %xmm2, 2 * SIZE(BO) 962 movhpd %xmm2, 3 * SIZE(BO) 963#else 964 movapd 
%xmm2, 0 * SIZE(AO) 965#endif 966 967#ifndef LN 968 addq $2 * SIZE, CO1 969#endif 970 971#if defined(LT) || defined(RN) 972 movq K, %rax 973 subq KK, %rax 974 leaq (,%rax, SIZE), %rax 975 leaq (AO, %rax, 2), AO 976#ifdef LT 977 addq $2 * SIZE, B 978#endif 979#endif 980 981#ifdef LN 982 subq $2, KK 983 movq BORIG, B 984#endif 985 986#ifdef LT 987 addq $2, KK 988#endif 989 990#ifdef RT 991 movq K, %rax 992 movq BORIG, B 993 salq $1 + BASE_SHIFT, %rax 994 addq %rax, AORIG 995#endif 996 ALIGN_4 997 998.L110: 999 testq $1, M 1000 je .L119 1001 ALIGN_4 1002 1003.L111: 1004#ifdef LN 1005 movq K, %rax 1006 salq $0 + BASE_SHIFT, %rax 1007 subq %rax, AORIG 1008#endif 1009 1010#if defined(LN) || defined(RT) 1011 movq KK, %rax 1012 movq AORIG, AO 1013 leaq (, %rax, SIZE), %rax 1014 leaq (AO, %rax, 1), AO 1015#endif 1016 1017 leaq BUFFER, BO 1018 1019#if defined(LN) || defined(RT) 1020 movq KK, %rax 1021 salq $0 + BASE_SHIFT, %rax 1022 leaq (BO, %rax, 2), BO 1023#endif 1024 1025 movsd 0 * SIZE(AO), %xmm8 1026 pxor %xmm0, %xmm0 1027 movsd 0 * SIZE(BO), %xmm9 1028 pxor %xmm1, %xmm1 1029 movsd 4 * SIZE(AO), %xmm10 1030 pxor %xmm2, %xmm2 1031 movsd 8 * SIZE(BO), %xmm11 1032 pxor %xmm3, %xmm3 1033 1034#if defined(LT) || defined(RN) 1035 movq KK, %rax 1036#else 1037 movq K, %rax 1038 subq KK, %rax 1039#endif 1040 sarq $3, %rax 1041 je .L115 1042 ALIGN_4 1043 1044.L112: 1045 mulsd %xmm8, %xmm9 1046 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1047 movsd 1 * SIZE(AO), %xmm8 1048 addsd %xmm9, %xmm0 1049 movsd 16 * SIZE(BO), %xmm9 1050 mulsd 2 * SIZE(BO), %xmm8 1051 addsd %xmm8, %xmm1 1052 movsd 2 * SIZE(AO), %xmm8 1053 mulsd 4 * SIZE(BO), %xmm8 1054 addsd %xmm8, %xmm2 1055 movsd 3 * SIZE(AO), %xmm8 1056 mulsd 6 * SIZE(BO), %xmm8 1057 addsd %xmm8, %xmm3 1058 movsd 8 * SIZE(AO), %xmm8 1059 mulsd %xmm10, %xmm11 1060 movsd 5 * SIZE(AO), %xmm10 1061 addsd %xmm11, %xmm0 1062 movsd 24 * SIZE(BO), %xmm11 1063 mulsd 10 * SIZE(BO), %xmm10 1064 addsd %xmm10, %xmm1 1065 movsd 6 * SIZE(AO), %xmm10 1066 
mulsd 12 * SIZE(BO), %xmm10 1067 addsd %xmm10, %xmm2 1068 movsd 7 * SIZE(AO), %xmm10 1069 mulsd 14 * SIZE(BO), %xmm10 1070 addsd %xmm10, %xmm3 1071 movsd 12 * SIZE(AO), %xmm10 1072 1073 addq $ 8 * SIZE, AO 1074 addq $16 * SIZE, BO 1075 decq %rax 1076 jne .L112 1077 ALIGN_4 1078 1079.L115: 1080#if defined(LT) || defined(RN) 1081 movq KK, %rax 1082#else 1083 movq K, %rax 1084 subq KK, %rax 1085#endif 1086 andq $7, %rax # if (k & 1) 1087 BRANCH 1088 je .L118 1089 ALIGN_4 1090 1091.L116: 1092 mulsd %xmm8, %xmm9 1093 movsd 1 * SIZE(AO), %xmm8 1094 addsd %xmm9, %xmm0 1095 movsd 2 * SIZE(BO), %xmm9 1096 1097 addq $1 * SIZE, AO # aoffset += 4 1098 addq $2 * SIZE, BO # boffset1 += 8 1099 decq %rax 1100 jg .L116 1101 ALIGN_4 1102 1103.L118: 1104 addsd %xmm2, %xmm0 1105 addsd %xmm3, %xmm1 1106 addsd %xmm1, %xmm0 1107 1108#if defined(LN) || defined(RT) 1109 movq KK, %rax 1110#ifdef LN 1111 subq $1, %rax 1112#else 1113 subq $1, %rax 1114#endif 1115 1116 movq AORIG, AO 1117 movq BORIG, B 1118 leaq BUFFER, BO 1119 1120 leaq (, %rax, SIZE), %rax 1121 leaq (AO, %rax, 1), AO 1122 leaq (B, %rax, 1), B 1123 leaq (BO, %rax, 2), BO 1124#endif 1125 1126#if defined(LN) || defined(LT) 1127 movsd 0 * SIZE(B), %xmm2 1128 subsd %xmm0, %xmm2 1129#else 1130 movsd 0 * SIZE(AO), %xmm2 1131 subsd %xmm0, %xmm2 1132#endif 1133 1134#ifdef LN 1135 movsd 0 * SIZE(AO), %xmm4 1136 mulsd %xmm4, %xmm2 1137#endif 1138 1139#ifdef LT 1140 movsd 0 * SIZE(AO), %xmm4 1141 mulsd %xmm4, %xmm2 1142#endif 1143 1144#ifdef RN 1145 movsd 0 * SIZE(B), %xmm0 1146 mulsd %xmm0, %xmm2 1147#endif 1148 1149#ifdef RT 1150 movsd 0 * SIZE(B), %xmm0 1151 mulsd %xmm0, %xmm2 1152#endif 1153 1154#ifdef LN 1155 subq $1 * SIZE, CO1 1156#endif 1157 1158#if defined(LN) || defined(LT) 1159 movsd %xmm2, 0 * SIZE(CO1) 1160#else 1161 movsd %xmm2, 0 * SIZE(CO1) 1162#endif 1163 1164#if defined(LN) || defined(LT) 1165 movsd %xmm2, 0 * SIZE(B) 1166 1167 movlpd %xmm2, 0 * SIZE(BO) 1168 movlpd %xmm2, 1 * SIZE(BO) 1169#else 1170 movsd %xmm2, 0 * 
SIZE(AO) 1171#endif 1172 1173#ifndef LN 1174 addq $1 * SIZE, CO1 1175#endif 1176 1177#if defined(LT) || defined(RN) 1178 movq K, %rax 1179 subq KK, %rax 1180 leaq (,%rax, SIZE), %rax 1181 leaq (AO, %rax, 1), AO 1182#ifdef LT 1183 addq $1 * SIZE, B 1184#endif 1185#endif 1186 1187#ifdef LN 1188 subq $1, KK 1189 movq BORIG, B 1190#endif 1191 1192#ifdef LT 1193 addq $1, KK 1194#endif 1195 1196#ifdef RT 1197 movq K, %rax 1198 movq BORIG, B 1199 salq $0 + BASE_SHIFT, %rax 1200 addq %rax, AORIG 1201#endif 1202 ALIGN_4 1203 1204.L119: 1205#ifdef LN 1206 leaq (, K, SIZE), %rax 1207 leaq (B, %rax, 1), B 1208#endif 1209 1210#if defined(LT) || defined(RN) 1211 movq K, %rax 1212 subq KK, %rax 1213 leaq (,%rax, SIZE), %rax 1214 leaq (B, %rax, 1), B 1215#endif 1216 1217#ifdef RN 1218 addq $1, KK 1219#endif 1220 1221#ifdef RT 1222 subq $1, KK 1223#endif 1224 ALIGN_4 1225 1226 1227.L40: 1228 testq $2, N 1229 je .L80 1230 ALIGN_4 1231 1232.L41: 1233/* Copying to Sub Buffer */ 1234 1235#ifdef LN 1236 movq OFFSET, %rax 1237 addq M, %rax 1238 movq %rax, KK 1239#endif 1240 1241 leaq BUFFER, BO 1242 1243#ifdef RT 1244 movq K, %rax 1245 salq $1 + BASE_SHIFT, %rax 1246 subq %rax, B 1247#endif 1248 1249#if defined(LN) || defined(RT) 1250 movq KK, %rax 1251 movq B, BORIG 1252 leaq (, %rax, SIZE), %rax 1253 leaq (B, %rax, 2), B 1254 leaq (BO, %rax, 4), BO 1255#endif 1256 1257#ifdef LT 1258 movq OFFSET, %rax 1259 movq %rax, KK 1260#endif 1261 1262#if defined(LT) || defined(RN) 1263 movq KK, %rax 1264#else 1265 movq K, %rax 1266 subq KK, %rax 1267#endif 1268 sarq $2, %rax 1269 jle .L43 1270 ALIGN_4 1271 1272.L42: 1273 PREFETCH 56 * SIZE(B) 1274 1275 movsd 0 * SIZE(B), %xmm0 1276 movsd 1 * SIZE(B), %xmm1 1277 movsd 2 * SIZE(B), %xmm2 1278 movsd 3 * SIZE(B), %xmm3 1279 movsd 4 * SIZE(B), %xmm4 1280 movsd 5 * SIZE(B), %xmm5 1281 movsd 6 * SIZE(B), %xmm6 1282 movsd 7 * SIZE(B), %xmm7 1283 1284 addq $ 8 * SIZE, B 1285 addq $16 * SIZE, BO 1286 1287 movsd %xmm0, -16 * SIZE(BO) 1288 movsd %xmm0, -15 * 
SIZE(BO) 1289 movsd %xmm1, -14 * SIZE(BO) 1290 movsd %xmm1, -13 * SIZE(BO) 1291 movsd %xmm2, -12 * SIZE(BO) 1292 movsd %xmm2, -11 * SIZE(BO) 1293 movsd %xmm3, -10 * SIZE(BO) 1294 movsd %xmm3, -9 * SIZE(BO) 1295 movsd %xmm4, -8 * SIZE(BO) 1296 movsd %xmm4, -7 * SIZE(BO) 1297 movsd %xmm5, -6 * SIZE(BO) 1298 movsd %xmm5, -5 * SIZE(BO) 1299 movsd %xmm6, -4 * SIZE(BO) 1300 movsd %xmm6, -3 * SIZE(BO) 1301 movsd %xmm7, -2 * SIZE(BO) 1302 movsd %xmm7, -1 * SIZE(BO) 1303 1304 decq %rax 1305 jne .L42 1306 ALIGN_4 1307 1308.L43: 1309#if defined(LT) || defined(RN) 1310 movq KK, %rax 1311#else 1312 movq K, %rax 1313 subq KK, %rax 1314#endif 1315 andq $3, %rax 1316 BRANCH 1317 jle .L50 1318 ALIGN_4 1319 1320.L44: 1321 movsd 0 * SIZE(B), %xmm0 1322 movsd 1 * SIZE(B), %xmm1 1323 1324 movsd %xmm0, 0 * SIZE(BO) 1325 movsd %xmm0, 1 * SIZE(BO) 1326 movsd %xmm1, 2 * SIZE(BO) 1327 movsd %xmm1, 3 * SIZE(BO) 1328 1329 addq $2 * SIZE, B 1330 addq $4 * SIZE, BO 1331 decq %rax 1332 jne .L44 1333 ALIGN_4 1334 1335.L50: 1336#if defined(LT) || defined(RN) 1337 movq A, AO 1338#else 1339 movq A, AORIG 1340#endif 1341 1342#ifdef RT 1343 leaq (, LDC, 2), %rax 1344 subq %rax, C 1345#endif 1346 1347 movq C, CO1 # coffset1 = c 1348 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1349#ifndef RT 1350 leaq (C, LDC, 2), C 1351#endif 1352 1353 movq M, I 1354 sarq $2, I # i = (m >> 2) 1355 jle .L60 1356 ALIGN_4 1357 1358.L51: 1359#ifdef LN 1360 movq K, %rax 1361 salq $2 + BASE_SHIFT, %rax 1362 subq %rax, AORIG 1363#endif 1364 1365#if defined(LN) || defined(RT) 1366 movq KK, %rax 1367 movq AORIG, AO 1368 leaq (, %rax, SIZE), %rax 1369 leaq (AO, %rax, 4), AO 1370#endif 1371 1372 leaq BUFFER, BO 1373 1374#if defined(LN) || defined(RT) 1375 movq KK, %rax 1376 salq $1 + BASE_SHIFT, %rax 1377 leaq (BO, %rax, 2), BO 1378#endif 1379 1380 movapd 0 * SIZE(AO), %xmm8 1381 pxor %xmm0, %xmm0 1382 movapd 0 * SIZE(BO), %xmm9 1383 pxor %xmm1, %xmm1 1384 movapd 8 * SIZE(AO), %xmm10 1385 pxor %xmm4, %xmm4 1386 movapd 8 * 
SIZE(BO), %xmm11 1387 pxor %xmm5, %xmm5 1388 1389 movapd 16 * SIZE(AO), %xmm12 1390 movapd 16 * SIZE(BO), %xmm13 1391 movapd 24 * SIZE(AO), %xmm14 1392 movapd 24 * SIZE(BO), %xmm15 1393 1394 PREFETCHW 4 * SIZE(CO1) 1395 PREFETCHW 4 * SIZE(CO2) 1396 1397#if defined(LT) || defined(RN) 1398 movq KK, %rax 1399#else 1400 movq K, %rax 1401 subq KK, %rax 1402#endif 1403 sarq $3, %rax 1404 je .L55 1405 ALIGN_4 1406 1407.L52: 1408 mulpd %xmm8, %xmm9 1409 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1410 mulpd 2 * SIZE(BO), %xmm8 1411 addpd %xmm9, %xmm0 1412 movapd 0 * SIZE(BO), %xmm9 1413 addpd %xmm8, %xmm1 1414 movapd 2 * SIZE(AO), %xmm8 1415 mulpd %xmm8, %xmm9 1416 mulpd 2 * SIZE(BO), %xmm8 1417 addpd %xmm9, %xmm4 1418 movapd 4 * SIZE(BO), %xmm9 1419 addpd %xmm8, %xmm5 1420 movapd 4 * SIZE(AO), %xmm8 1421 1422 mulpd %xmm8, %xmm9 1423 mulpd 6 * SIZE(BO), %xmm8 1424 addpd %xmm9, %xmm0 1425 movapd 4 * SIZE(BO), %xmm9 1426 addpd %xmm8, %xmm1 1427 movapd 6 * SIZE(AO), %xmm8 1428 mulpd %xmm8, %xmm9 1429 mulpd 6 * SIZE(BO), %xmm8 1430 addpd %xmm9, %xmm4 1431 movapd 32 * SIZE(BO), %xmm9 1432 addpd %xmm8, %xmm5 1433 movapd 32 * SIZE(AO), %xmm8 1434 1435 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1436 mulpd %xmm10, %xmm11 1437 mulpd 10 * SIZE(BO), %xmm10 1438 addpd %xmm11, %xmm0 1439 movapd 8 * SIZE(BO), %xmm11 1440 addpd %xmm10, %xmm1 1441 movapd 10 * SIZE(AO), %xmm10 1442 mulpd %xmm10, %xmm11 1443 mulpd 10 * SIZE(BO), %xmm10 1444 addpd %xmm11, %xmm4 1445 movapd 12 * SIZE(BO), %xmm11 1446 addpd %xmm10, %xmm5 1447 movapd 12 * SIZE(AO), %xmm10 1448 1449 mulpd %xmm10, %xmm11 1450 mulpd 14 * SIZE(BO), %xmm10 1451 addpd %xmm11, %xmm0 1452 movapd 12 * SIZE(BO), %xmm11 1453 addpd %xmm10, %xmm1 1454 movapd 14 * SIZE(AO), %xmm10 1455 mulpd %xmm10, %xmm11 1456 mulpd 14 * SIZE(BO), %xmm10 1457 addpd %xmm11, %xmm4 1458 movapd 40 * SIZE(BO), %xmm11 1459 addpd %xmm10, %xmm5 1460 movapd 40 * SIZE(AO), %xmm10 1461 1462 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1463 mulpd %xmm12, %xmm13 1464 mulpd 18 * 
SIZE(BO), %xmm12 1465 addpd %xmm13, %xmm0 1466 movapd 16 * SIZE(BO), %xmm13 1467 addpd %xmm12, %xmm1 1468 movapd 18 * SIZE(AO), %xmm12 1469 mulpd %xmm12, %xmm13 1470 mulpd 18 * SIZE(BO), %xmm12 1471 addpd %xmm13, %xmm4 1472 movapd 20 * SIZE(BO), %xmm13 1473 addpd %xmm12, %xmm5 1474 movapd 20 * SIZE(AO), %xmm12 1475 1476 mulpd %xmm12, %xmm13 1477 mulpd 22 * SIZE(BO), %xmm12 1478 addpd %xmm13, %xmm0 1479 movapd 20 * SIZE(BO), %xmm13 1480 addpd %xmm12, %xmm1 1481 movapd 22 * SIZE(AO), %xmm12 1482 mulpd %xmm12, %xmm13 1483 mulpd 22 * SIZE(BO), %xmm12 1484 addpd %xmm13, %xmm4 1485 movapd 48 * SIZE(BO), %xmm13 1486 addpd %xmm12, %xmm5 1487 movapd 48 * SIZE(AO), %xmm12 1488 1489 PREFETCH (PREFETCHSIZE + 24) * SIZE(AO) 1490 mulpd %xmm14, %xmm15 1491 mulpd 26 * SIZE(BO), %xmm14 1492 addpd %xmm15, %xmm0 1493 movapd 24 * SIZE(BO), %xmm15 1494 addpd %xmm14, %xmm1 1495 movapd 26 * SIZE(AO), %xmm14 1496 mulpd %xmm14, %xmm15 1497 mulpd 26 * SIZE(BO), %xmm14 1498 addpd %xmm15, %xmm4 1499 movapd 28 * SIZE(BO), %xmm15 1500 addpd %xmm14, %xmm5 1501 movapd 28 * SIZE(AO), %xmm14 1502 1503 mulpd %xmm14, %xmm15 1504 mulpd 30 * SIZE(BO), %xmm14 1505 addpd %xmm15, %xmm0 1506 movapd 28 * SIZE(BO), %xmm15 1507 addpd %xmm14, %xmm1 1508 movapd 30 * SIZE(AO), %xmm14 1509 mulpd %xmm14, %xmm15 1510 mulpd 30 * SIZE(BO), %xmm14 1511 addpd %xmm15, %xmm4 1512 movapd 56 * SIZE(BO), %xmm15 1513 addpd %xmm14, %xmm5 1514 movapd 56 * SIZE(AO), %xmm14 1515 1516 addq $32 * SIZE, AO 1517 addq $32 * SIZE, BO 1518 decq %rax 1519 jne .L52 1520 ALIGN_4 1521 1522.L55: 1523#if defined(LT) || defined(RN) 1524 movq KK, %rax 1525#else 1526 movq K, %rax 1527 subq KK, %rax 1528#endif 1529 andq $7, %rax # if (k & 1) 1530 BRANCH 1531 je .L59 1532 ALIGN_4 1533 1534.L56: 1535 movapd 0 * SIZE(BO), %xmm9 1536 mulpd %xmm8, %xmm9 1537 addpd %xmm9, %xmm0 1538 mulpd 2 * SIZE(BO), %xmm8 1539 addpd %xmm8, %xmm1 1540 movapd 2 * SIZE(AO), %xmm8 1541 movapd 0 * SIZE(BO), %xmm9 1542 mulpd %xmm8, %xmm9 1543 addpd %xmm9, %xmm4 1544 
mulpd 2 * SIZE(BO), %xmm8 1545 addpd %xmm8, %xmm5 1546 movapd 4 * SIZE(AO), %xmm8 1547 1548 addq $4 * SIZE, AO # aoffset += 4 1549 addq $4 * SIZE, BO # boffset1 += 8 1550 decq %rax 1551 jg .L56 1552 ALIGN_4 1553 1554.L59: 1555#if defined(LN) || defined(RT) 1556 movq KK, %rax 1557#ifdef LN 1558 subq $4, %rax 1559#else 1560 subq $2, %rax 1561#endif 1562 1563 movq AORIG, AO 1564 movq BORIG, B 1565 leaq BUFFER, BO 1566 1567 leaq (, %rax, SIZE), %rax 1568 leaq (AO, %rax, 4), AO 1569 leaq (B, %rax, 2), B 1570 leaq (BO, %rax, 4), BO 1571#endif 1572 1573#if defined(LN) || defined(LT) 1574 movapd %xmm0, %xmm8 1575 unpcklpd %xmm1, %xmm0 1576 unpckhpd %xmm1, %xmm8 1577 1578 movapd %xmm4, %xmm12 1579 unpcklpd %xmm5, %xmm4 1580 unpckhpd %xmm5, %xmm12 1581 1582 movapd 0 * SIZE(B), %xmm1 1583 movapd 2 * SIZE(B), %xmm5 1584 movapd 4 * SIZE(B), %xmm9 1585 movapd 6 * SIZE(B), %xmm13 1586 1587 subpd %xmm0, %xmm1 1588 subpd %xmm8, %xmm5 1589 subpd %xmm4, %xmm9 1590 subpd %xmm12, %xmm13 1591#else 1592 movapd 0 * SIZE(AO), %xmm8 1593 movapd 2 * SIZE(AO), %xmm9 1594 movapd 4 * SIZE(AO), %xmm10 1595 movapd 6 * SIZE(AO), %xmm11 1596 1597 subpd %xmm0, %xmm8 1598 subpd %xmm4, %xmm9 1599 subpd %xmm1, %xmm10 1600 subpd %xmm5, %xmm11 1601#endif 1602 1603#ifdef LN 1604 movlpd 15 * SIZE(AO), %xmm0 1605 movhpd 15 * SIZE(AO), %xmm0 1606 mulpd %xmm0, %xmm13 1607 movlpd 14 * SIZE(AO), %xmm2 1608 movhpd 14 * SIZE(AO), %xmm2 1609 mulpd %xmm13, %xmm2 1610 subpd %xmm2, %xmm9 1611 movlpd 13 * SIZE(AO), %xmm4 1612 movhpd 13 * SIZE(AO), %xmm4 1613 mulpd %xmm13, %xmm4 1614 subpd %xmm4, %xmm5 1615 movlpd 12 * SIZE(AO), %xmm6 1616 movhpd 12 * SIZE(AO), %xmm6 1617 mulpd %xmm13, %xmm6 1618 subpd %xmm6, %xmm1 1619 1620 movlpd 10 * SIZE(AO), %xmm0 1621 movhpd 10 * SIZE(AO), %xmm0 1622 mulpd %xmm0, %xmm9 1623 movlpd 9 * SIZE(AO), %xmm2 1624 movhpd 9 * SIZE(AO), %xmm2 1625 mulpd %xmm9, %xmm2 1626 subpd %xmm2, %xmm5 1627 movlpd 8 * SIZE(AO), %xmm4 1628 movhpd 8 * SIZE(AO), %xmm4 1629 mulpd %xmm9, %xmm4 1630 subpd 
%xmm4, %xmm1 1631 1632 movlpd 5 * SIZE(AO), %xmm0 1633 movhpd 5 * SIZE(AO), %xmm0 1634 mulpd %xmm0, %xmm5 1635 movlpd 4 * SIZE(AO), %xmm2 1636 movhpd 4 * SIZE(AO), %xmm2 1637 mulpd %xmm5, %xmm2 1638 subpd %xmm2, %xmm1 1639 1640 movlpd 0 * SIZE(AO), %xmm0 1641 movhpd 0 * SIZE(AO), %xmm0 1642 mulpd %xmm0, %xmm1 1643#endif 1644 1645#ifdef LT 1646 movlpd 0 * SIZE(AO), %xmm0 1647 movhpd 0 * SIZE(AO), %xmm0 1648 mulpd %xmm0, %xmm1 1649 movlpd 1 * SIZE(AO), %xmm2 1650 movhpd 1 * SIZE(AO), %xmm2 1651 mulpd %xmm1, %xmm2 1652 subpd %xmm2, %xmm5 1653 movlpd 2 * SIZE(AO), %xmm4 1654 movhpd 2 * SIZE(AO), %xmm4 1655 mulpd %xmm1, %xmm4 1656 subpd %xmm4, %xmm9 1657 movlpd 3 * SIZE(AO), %xmm6 1658 movhpd 3 * SIZE(AO), %xmm6 1659 mulpd %xmm1, %xmm6 1660 subpd %xmm6, %xmm13 1661 1662 1663 movlpd 5 * SIZE(AO), %xmm0 1664 movhpd 5 * SIZE(AO), %xmm0 1665 mulpd %xmm0, %xmm5 1666 1667 movlpd 6 * SIZE(AO), %xmm2 1668 movhpd 6 * SIZE(AO), %xmm2 1669 mulpd %xmm5, %xmm2 1670 subpd %xmm2, %xmm9 1671 movlpd 7 * SIZE(AO), %xmm4 1672 movhpd 7 * SIZE(AO), %xmm4 1673 mulpd %xmm5, %xmm4 1674 subpd %xmm4, %xmm13 1675 1676 movlpd 10 * SIZE(AO), %xmm0 1677 movhpd 10 * SIZE(AO), %xmm0 1678 mulpd %xmm0, %xmm9 1679 movlpd 11 * SIZE(AO), %xmm2 1680 movhpd 11 * SIZE(AO), %xmm2 1681 mulpd %xmm9, %xmm2 1682 subpd %xmm2, %xmm13 1683 1684 movlpd 15 * SIZE(AO), %xmm0 1685 movhpd 15 * SIZE(AO), %xmm0 1686 mulpd %xmm0, %xmm13 1687#endif 1688 1689#ifdef RN 1690 movlpd 0 * SIZE(B), %xmm0 1691 movhpd 0 * SIZE(B), %xmm0 1692 mulpd %xmm0, %xmm8 1693 mulpd %xmm0, %xmm9 1694 1695 movlpd 1 * SIZE(B), %xmm1 1696 movhpd 1 * SIZE(B), %xmm1 1697 mulpd %xmm8, %xmm1 1698 subpd %xmm1, %xmm10 1699 movlpd 1 * SIZE(B), %xmm1 1700 movhpd 1 * SIZE(B), %xmm1 1701 mulpd %xmm9, %xmm1 1702 subpd %xmm1, %xmm11 1703 1704 movlpd 3 * SIZE(B), %xmm0 1705 movhpd 3 * SIZE(B), %xmm0 1706 mulpd %xmm0, %xmm10 1707 mulpd %xmm0, %xmm11 1708#endif 1709 1710#ifdef RT 1711 movlpd 3 * SIZE(B), %xmm0 1712 movhpd 3 * SIZE(B), %xmm0 1713 mulpd %xmm0, 
%xmm10 1714 mulpd %xmm0, %xmm11 1715 1716 movlpd 2 * SIZE(B), %xmm1 1717 movhpd 2 * SIZE(B), %xmm1 1718 mulpd %xmm10, %xmm1 1719 subpd %xmm1, %xmm8 1720 movlpd 2 * SIZE(B), %xmm1 1721 movhpd 2 * SIZE(B), %xmm1 1722 mulpd %xmm11, %xmm1 1723 subpd %xmm1, %xmm9 1724 1725 movlpd 0 * SIZE(B), %xmm0 1726 movhpd 0 * SIZE(B), %xmm0 1727 mulpd %xmm0, %xmm8 1728 mulpd %xmm0, %xmm9 1729#endif 1730 1731#ifdef LN 1732 subq $4 * SIZE, CO1 1733 subq $4 * SIZE, CO2 1734#endif 1735 1736#if defined(LN) || defined(LT) 1737 movsd %xmm1, 0 * SIZE(CO1) 1738 movsd %xmm5, 1 * SIZE(CO1) 1739 movsd %xmm9, 2 * SIZE(CO1) 1740 movsd %xmm13, 3 * SIZE(CO1) 1741 1742 movhpd %xmm1, 0 * SIZE(CO2) 1743 movhpd %xmm5, 1 * SIZE(CO2) 1744 movhpd %xmm9, 2 * SIZE(CO2) 1745 movhpd %xmm13, 3 * SIZE(CO2) 1746#else 1747 movsd %xmm8, 0 * SIZE(CO1) 1748 movhpd %xmm8, 1 * SIZE(CO1) 1749 movsd %xmm9, 2 * SIZE(CO1) 1750 movhpd %xmm9, 3 * SIZE(CO1) 1751 1752 movsd %xmm10, 0 * SIZE(CO2) 1753 movhpd %xmm10, 1 * SIZE(CO2) 1754 movsd %xmm11, 2 * SIZE(CO2) 1755 movhpd %xmm11, 3 * SIZE(CO2) 1756#endif 1757 1758#if defined(LN) || defined(LT) 1759 movapd %xmm1, 0 * SIZE(B) 1760 movapd %xmm5, 2 * SIZE(B) 1761 movapd %xmm9, 4 * SIZE(B) 1762 movapd %xmm13, 6 * SIZE(B) 1763 1764 movlpd %xmm1, 0 * SIZE(BO) 1765 movlpd %xmm1, 1 * SIZE(BO) 1766 movhpd %xmm1, 2 * SIZE(BO) 1767 movhpd %xmm1, 3 * SIZE(BO) 1768 movlpd %xmm5, 4 * SIZE(BO) 1769 movlpd %xmm5, 5 * SIZE(BO) 1770 movhpd %xmm5, 6 * SIZE(BO) 1771 movhpd %xmm5, 7 * SIZE(BO) 1772 movlpd %xmm9, 8 * SIZE(BO) 1773 movlpd %xmm9, 9 * SIZE(BO) 1774 movhpd %xmm9, 10 * SIZE(BO) 1775 movhpd %xmm9, 11 * SIZE(BO) 1776 movlpd %xmm13, 12 * SIZE(BO) 1777 movlpd %xmm13, 13 * SIZE(BO) 1778 movhpd %xmm13, 14 * SIZE(BO) 1779 movhpd %xmm13, 15 * SIZE(BO) 1780#else 1781 movapd %xmm8, 0 * SIZE(AO) 1782 movapd %xmm9, 2 * SIZE(AO) 1783 movapd %xmm10, 4 * SIZE(AO) 1784 movapd %xmm11, 6 * SIZE(AO) 1785#endif 1786 1787#ifndef LN 1788 addq $4 * SIZE, CO1 1789 addq $4 * SIZE, CO2 1790#endif 1791 1792#if 
defined(LT) || defined(RN) 1793 movq K, %rax 1794 subq KK, %rax 1795 leaq (,%rax, SIZE), %rax 1796 leaq (AO, %rax, 4), AO 1797#ifdef LT 1798 addq $8 * SIZE, B 1799#endif 1800#endif 1801 1802#ifdef LN 1803 subq $4, KK 1804 movq BORIG, B 1805#endif 1806 1807#ifdef LT 1808 addq $4, KK 1809#endif 1810 1811#ifdef RT 1812 movq K, %rax 1813 movq BORIG, B 1814 salq $2 + BASE_SHIFT, %rax 1815 addq %rax, AORIG 1816#endif 1817 1818 decq I # i -- 1819 jg .L51 1820 ALIGN_4 1821 1822.L60: 1823 testq $2, M 1824 je .L70 1825 ALIGN_4 1826 1827.L61: 1828#ifdef LN 1829 movq K, %rax 1830 salq $1 + BASE_SHIFT, %rax 1831 subq %rax, AORIG 1832#endif 1833 1834#if defined(LN) || defined(RT) 1835 movq KK, %rax 1836 movq AORIG, AO 1837 leaq (, %rax, SIZE), %rax 1838 leaq (AO, %rax, 2), AO 1839#endif 1840 1841 leaq BUFFER, BO 1842 1843#if defined(LN) || defined(RT) 1844 movq KK, %rax 1845 salq $1 + BASE_SHIFT, %rax 1846 leaq (BO, %rax, 2), BO 1847#endif 1848 1849 movapd 0 * SIZE(AO), %xmm8 1850 pxor %xmm0, %xmm0 1851 movapd 0 * SIZE(BO), %xmm9 1852 pxor %xmm1, %xmm1 1853 movapd 8 * SIZE(AO), %xmm10 1854 pxor %xmm2, %xmm2 1855 movapd 8 * SIZE(BO), %xmm11 1856 pxor %xmm3, %xmm3 1857 1858 movapd 16 * SIZE(BO), %xmm13 1859 movapd 24 * SIZE(BO), %xmm15 1860 1861#if defined(LT) || defined(RN) 1862 movq KK, %rax 1863#else 1864 movq K, %rax 1865 subq KK, %rax 1866#endif 1867 sarq $3, %rax 1868 je .L65 1869 ALIGN_4 1870 1871.L62: 1872 mulpd %xmm8, %xmm9 1873 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1874 mulpd 2 * SIZE(BO), %xmm8 1875 addpd %xmm9, %xmm0 1876 movapd 4 * SIZE(BO), %xmm9 1877 addpd %xmm8, %xmm1 1878 movapd 2 * SIZE(AO), %xmm8 1879 1880 mulpd %xmm8, %xmm9 1881 mulpd 6 * SIZE(BO), %xmm8 1882 addpd %xmm9, %xmm2 1883 movapd 32 * SIZE(BO), %xmm9 1884 addpd %xmm8, %xmm3 1885 movapd 4 * SIZE(AO), %xmm8 1886 1887 mulpd %xmm8, %xmm11 1888 mulpd 10 * SIZE(BO), %xmm8 1889 addpd %xmm11, %xmm0 1890 movapd 12 * SIZE(BO), %xmm11 1891 addpd %xmm8, %xmm1 1892 movapd 6 * SIZE(AO), %xmm8 1893 1894 mulpd 
%xmm8, %xmm11 1895 mulpd 14 * SIZE(BO), %xmm8 1896 addpd %xmm11, %xmm2 1897 movapd 40 * SIZE(BO), %xmm11 1898 addpd %xmm8, %xmm3 1899 movapd 16 * SIZE(AO), %xmm8 1900 1901 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1902 mulpd %xmm10, %xmm13 1903 mulpd 18 * SIZE(BO), %xmm10 1904 addpd %xmm13, %xmm0 1905 movapd 20 * SIZE(BO), %xmm13 1906 addpd %xmm10, %xmm1 1907 movapd 10 * SIZE(AO), %xmm10 1908 1909 mulpd %xmm10, %xmm13 1910 mulpd 22 * SIZE(BO), %xmm10 1911 addpd %xmm13, %xmm2 1912 movapd 48 * SIZE(BO), %xmm13 1913 addpd %xmm10, %xmm3 1914 movapd 12 * SIZE(AO), %xmm10 1915 1916 mulpd %xmm10, %xmm15 1917 mulpd 26 * SIZE(BO), %xmm10 1918 addpd %xmm15, %xmm0 1919 movapd 28 * SIZE(BO), %xmm15 1920 addpd %xmm10, %xmm1 1921 movapd 14 * SIZE(AO), %xmm10 1922 1923 mulpd %xmm10, %xmm15 1924 mulpd 30 * SIZE(BO), %xmm10 1925 addpd %xmm15, %xmm2 1926 movapd 56 * SIZE(BO), %xmm15 1927 addpd %xmm10, %xmm3 1928 movapd 24 * SIZE(AO), %xmm10 1929 1930 addq $16 * SIZE, AO 1931 addq $32 * SIZE, BO 1932 decq %rax 1933 jne .L62 1934 ALIGN_4 1935 1936.L65: 1937#if defined(LT) || defined(RN) 1938 movq KK, %rax 1939#else 1940 movq K, %rax 1941 subq KK, %rax 1942#endif 1943 andq $7, %rax # if (k & 1) 1944 BRANCH 1945 je .L69 1946 ALIGN_4 1947 1948.L66: 1949 mulpd %xmm8, %xmm9 1950 mulpd 2 * SIZE(BO), %xmm8 1951 addpd %xmm9, %xmm0 1952 movapd 4 * SIZE(BO), %xmm9 1953 addpd %xmm8, %xmm1 1954 movapd 2 * SIZE(AO), %xmm8 1955 1956 addq $2 * SIZE, AO # aoffset += 4 1957 addq $4 * SIZE, BO # boffset1 += 8 1958 decq %rax 1959 jg .L66 1960 ALIGN_4 1961 1962.L69: 1963 addpd %xmm2, %xmm0 1964 addpd %xmm3, %xmm1 1965 1966#if defined(LN) || defined(RT) 1967 movq KK, %rax 1968#ifdef LN 1969 subq $2, %rax 1970#else 1971 subq $2, %rax 1972#endif 1973 1974 movq AORIG, AO 1975 movq BORIG, B 1976 leaq BUFFER, BO 1977 1978 leaq (, %rax, SIZE), %rax 1979 leaq (AO, %rax, 2), AO 1980 leaq (B, %rax, 2), B 1981 leaq (BO, %rax, 4), BO 1982#endif 1983 1984#if defined(LN) || defined(LT) 1985 movapd %xmm0, %xmm8 1986 
unpcklpd %xmm1, %xmm0 1987 unpckhpd %xmm1, %xmm8 1988 1989 movapd 0 * SIZE(B), %xmm1 1990 movapd 2 * SIZE(B), %xmm5 1991 1992 subpd %xmm0, %xmm1 1993 subpd %xmm8, %xmm5 1994#else 1995 movapd 0 * SIZE(AO), %xmm8 1996 movapd 2 * SIZE(AO), %xmm10 1997 1998 subpd %xmm0, %xmm8 1999 subpd %xmm1, %xmm10 2000#endif 2001 2002 2003#ifdef LN 2004 movlpd 3 * SIZE(AO), %xmm0 2005 movhpd 3 * SIZE(AO), %xmm0 2006 mulpd %xmm0, %xmm5 2007 2008 movlpd 2 * SIZE(AO), %xmm2 2009 movhpd 2 * SIZE(AO), %xmm2 2010 mulpd %xmm5, %xmm2 2011 subpd %xmm2, %xmm1 2012 2013 movlpd 0 * SIZE(AO), %xmm0 2014 movhpd 0 * SIZE(AO), %xmm0 2015 mulpd %xmm0, %xmm1 2016#endif 2017 2018#ifdef LT 2019 movlpd 0 * SIZE(AO), %xmm0 2020 movhpd 0 * SIZE(AO), %xmm0 2021 mulpd %xmm0, %xmm1 2022 2023 movlpd 1 * SIZE(AO), %xmm2 2024 movhpd 1 * SIZE(AO), %xmm2 2025 mulpd %xmm1, %xmm2 2026 subpd %xmm2, %xmm5 2027 2028 movlpd 3 * SIZE(AO), %xmm0 2029 movhpd 3 * SIZE(AO), %xmm0 2030 mulpd %xmm0, %xmm5 2031#endif 2032 2033#ifdef RN 2034 movlpd 0 * SIZE(B), %xmm0 2035 movhpd 0 * SIZE(B), %xmm0 2036 mulpd %xmm0, %xmm8 2037 2038 movlpd 1 * SIZE(B), %xmm1 2039 movhpd 1 * SIZE(B), %xmm1 2040 mulpd %xmm8, %xmm1 2041 subpd %xmm1, %xmm10 2042 2043 movlpd 3 * SIZE(B), %xmm0 2044 movhpd 3 * SIZE(B), %xmm0 2045 mulpd %xmm0, %xmm10 2046#endif 2047 2048#ifdef RT 2049 movlpd 3 * SIZE(B), %xmm0 2050 movhpd 3 * SIZE(B), %xmm0 2051 mulpd %xmm0, %xmm10 2052 2053 movlpd 2 * SIZE(B), %xmm1 2054 movhpd 2 * SIZE(B), %xmm1 2055 mulpd %xmm10, %xmm1 2056 subpd %xmm1, %xmm8 2057 2058 movlpd 0 * SIZE(B), %xmm0 2059 movhpd 0 * SIZE(B), %xmm0 2060 mulpd %xmm0, %xmm8 2061#endif 2062 2063#ifdef LN 2064 subq $2 * SIZE, CO1 2065 subq $2 * SIZE, CO2 2066#endif 2067 2068#if defined(LN) || defined(LT) 2069 movsd %xmm1, 0 * SIZE(CO1) 2070 movsd %xmm5, 1 * SIZE(CO1) 2071 2072 movhpd %xmm1, 0 * SIZE(CO2) 2073 movhpd %xmm5, 1 * SIZE(CO2) 2074#else 2075 movsd %xmm8, 0 * SIZE(CO1) 2076 movhpd %xmm8, 1 * SIZE(CO1) 2077 2078 movsd %xmm10, 0 * SIZE(CO2) 2079 movhpd 
%xmm10, 1 * SIZE(CO2) 2080#endif 2081 2082#if defined(LN) || defined(LT) 2083 movapd %xmm1, 0 * SIZE(B) 2084 movapd %xmm5, 2 * SIZE(B) 2085 2086 movlpd %xmm1, 0 * SIZE(BO) 2087 movlpd %xmm1, 1 * SIZE(BO) 2088 movhpd %xmm1, 2 * SIZE(BO) 2089 movhpd %xmm1, 3 * SIZE(BO) 2090 movlpd %xmm5, 4 * SIZE(BO) 2091 movlpd %xmm5, 5 * SIZE(BO) 2092 movhpd %xmm5, 6 * SIZE(BO) 2093 movhpd %xmm5, 7 * SIZE(BO) 2094#else 2095 movapd %xmm8, 0 * SIZE(AO) 2096 movapd %xmm10, 2 * SIZE(AO) 2097#endif 2098 2099#ifndef LN 2100 addq $2 * SIZE, CO1 2101 addq $2 * SIZE, CO2 2102#endif 2103 2104#if defined(LT) || defined(RN) 2105 movq K, %rax 2106 subq KK, %rax 2107 leaq (,%rax, SIZE), %rax 2108 leaq (AO, %rax, 2), AO 2109#ifdef LT 2110 addq $4 * SIZE, B 2111#endif 2112#endif 2113 2114#ifdef LN 2115 subq $2, KK 2116 movq BORIG, B 2117#endif 2118 2119#ifdef LT 2120 addq $2, KK 2121#endif 2122 2123#ifdef RT 2124 movq K, %rax 2125 movq BORIG, B 2126 salq $1 + BASE_SHIFT, %rax 2127 addq %rax, AORIG 2128#endif 2129 ALIGN_4 2130 2131.L70: 2132 testq $1, M 2133 je .L79 2134 ALIGN_4 2135 2136.L71: 2137#ifdef LN 2138 movq K, %rax 2139 salq $0 + BASE_SHIFT, %rax 2140 subq %rax, AORIG 2141#endif 2142 2143#if defined(LN) || defined(RT) 2144 movq KK, %rax 2145 movq AORIG, AO 2146 leaq (, %rax, SIZE), %rax 2147 leaq (AO, %rax, 1), AO 2148#endif 2149 2150 leaq BUFFER, BO 2151 2152#if defined(LN) || defined(RT) 2153 movq KK, %rax 2154 salq $1 + BASE_SHIFT, %rax 2155 leaq (BO, %rax, 2), BO 2156#endif 2157 2158 movsd 0 * SIZE(AO), %xmm8 2159 pxor %xmm0, %xmm0 2160 movsd 0 * SIZE(BO), %xmm9 2161 pxor %xmm1, %xmm1 2162 movsd 4 * SIZE(AO), %xmm10 2163 pxor %xmm2, %xmm2 2164 movsd 8 * SIZE(BO), %xmm11 2165 pxor %xmm3, %xmm3 2166 2167 movsd 16 * SIZE(BO), %xmm13 2168 movsd 24 * SIZE(BO), %xmm15 2169 2170#if defined(LT) || defined(RN) 2171 movq KK, %rax 2172#else 2173 movq K, %rax 2174 subq KK, %rax 2175#endif 2176 sarq $3, %rax 2177 je .L75 2178 ALIGN_4 2179 2180.L72: 2181 mulsd %xmm8, %xmm9 2182 PREFETCH 
(PREFETCHSIZE + 0) * SIZE(AO) 2183 mulsd 2 * SIZE(BO), %xmm8 2184 addsd %xmm9, %xmm0 2185 movsd 4 * SIZE(BO), %xmm9 2186 addsd %xmm8, %xmm1 2187 movsd 1 * SIZE(AO), %xmm8 2188 2189 mulsd %xmm8, %xmm9 2190 mulsd 6 * SIZE(BO), %xmm8 2191 addsd %xmm9, %xmm2 2192 movsd 32 * SIZE(BO), %xmm9 2193 addsd %xmm8, %xmm3 2194 movsd 2 * SIZE(AO), %xmm8 2195 2196 mulsd %xmm8, %xmm11 2197 mulsd 10 * SIZE(BO), %xmm8 2198 addsd %xmm11, %xmm0 2199 movsd 12 * SIZE(BO), %xmm11 2200 addsd %xmm8, %xmm1 2201 movsd 3 * SIZE(AO), %xmm8 2202 2203 mulsd %xmm8, %xmm11 2204 mulsd 14 * SIZE(BO), %xmm8 2205 addsd %xmm11, %xmm2 2206 movsd 40 * SIZE(BO), %xmm11 2207 addsd %xmm8, %xmm3 2208 movsd 8 * SIZE(AO), %xmm8 2209 2210 mulsd %xmm10, %xmm13 2211 mulsd 18 * SIZE(BO), %xmm10 2212 addsd %xmm13, %xmm0 2213 movsd 20 * SIZE(BO), %xmm13 2214 addsd %xmm10, %xmm1 2215 movsd 5 * SIZE(AO), %xmm10 2216 2217 mulsd %xmm10, %xmm13 2218 mulsd 22 * SIZE(BO), %xmm10 2219 addsd %xmm13, %xmm2 2220 movsd 48 * SIZE(BO), %xmm13 2221 addsd %xmm10, %xmm3 2222 movsd 6 * SIZE(AO), %xmm10 2223 2224 mulsd %xmm10, %xmm15 2225 mulsd 26 * SIZE(BO), %xmm10 2226 addsd %xmm15, %xmm0 2227 movsd 28 * SIZE(BO), %xmm15 2228 addsd %xmm10, %xmm1 2229 movsd 7 * SIZE(AO), %xmm10 2230 2231 mulsd %xmm10, %xmm15 2232 mulsd 30 * SIZE(BO), %xmm10 2233 addsd %xmm15, %xmm2 2234 movsd 56 * SIZE(BO), %xmm15 2235 addsd %xmm10, %xmm3 2236 movsd 12 * SIZE(AO), %xmm10 2237 2238 addq $ 8 * SIZE, AO 2239 addq $32 * SIZE, BO 2240 decq %rax 2241 jne .L72 2242 ALIGN_4 2243 2244.L75: 2245#if defined(LT) || defined(RN) 2246 movq KK, %rax 2247#else 2248 movq K, %rax 2249 subq KK, %rax 2250#endif 2251 andq $7, %rax # if (k & 1) 2252 BRANCH 2253 je .L78 2254 ALIGN_4 2255 2256.L76: 2257 mulsd %xmm8, %xmm9 2258 mulsd 2 * SIZE(BO), %xmm8 2259 addsd %xmm9, %xmm0 2260 addsd %xmm8, %xmm1 2261 movsd 1 * SIZE(AO), %xmm8 2262 movsd 4 * SIZE(BO), %xmm9 2263 2264 addq $1 * SIZE, AO # aoffset += 4 2265 addq $4 * SIZE, BO # boffset1 += 8 2266 decq %rax 2267 jg .L76 2268 
ALIGN_4 2269 2270.L78: 2271 addsd %xmm2, %xmm0 2272 addsd %xmm3, %xmm1 2273 2274#if defined(LN) || defined(RT) 2275 movq KK, %rax 2276#ifdef LN 2277 subq $1, %rax 2278#else 2279 subq $2, %rax 2280#endif 2281 2282 movq AORIG, AO 2283 movq BORIG, B 2284 leaq BUFFER, BO 2285 2286 leaq (, %rax, SIZE), %rax 2287 leaq (AO, %rax, 1), AO 2288 leaq (B, %rax, 2), B 2289 leaq (BO, %rax, 4), BO 2290#endif 2291 2292#if defined(LN) || defined(LT) 2293 movsd 0 * SIZE(B), %xmm4 2294 movsd 1 * SIZE(B), %xmm5 2295#else 2296 movsd 0 * SIZE(AO), %xmm4 2297 movsd 1 * SIZE(AO), %xmm5 2298#endif 2299 2300 subsd %xmm0, %xmm4 2301 subsd %xmm1, %xmm5 2302 2303#ifdef LN 2304 movsd 0 * SIZE(AO), %xmm0 2305 2306 mulsd %xmm0, %xmm4 2307 mulsd %xmm0, %xmm5 2308#endif 2309 2310#ifdef LT 2311 movsd 0 * SIZE(AO), %xmm0 2312 2313 mulsd %xmm0, %xmm4 2314 mulsd %xmm0, %xmm5 2315#endif 2316 2317#ifdef RN 2318 mulsd 0 * SIZE(B), %xmm4 2319 movsd 1 * SIZE(B), %xmm1 2320 mulsd %xmm4, %xmm1 2321 subsd %xmm1, %xmm5 2322 2323 mulsd 3 * SIZE(B), %xmm5 2324#endif 2325 2326#ifdef RT 2327 mulsd 3 * SIZE(B), %xmm5 2328 2329 movlpd 2 * SIZE(B), %xmm1 2330 mulsd %xmm5, %xmm1 2331 subsd %xmm1, %xmm4 2332 2333 mulsd 0 * SIZE(B), %xmm4 2334#endif 2335 2336#ifdef LN 2337 subq $1 * SIZE, CO1 2338 subq $1 * SIZE, CO2 2339#endif 2340 2341 movsd %xmm4, 0 * SIZE(CO1) 2342 movsd %xmm5, 0 * SIZE(CO2) 2343 2344#if defined(LN) || defined(LT) 2345 movsd %xmm4, 0 * SIZE(B) 2346 movsd %xmm5, 1 * SIZE(B) 2347 2348 movsd %xmm4, 0 * SIZE(BO) 2349 movsd %xmm4, 1 * SIZE(BO) 2350 movsd %xmm5, 2 * SIZE(BO) 2351 movsd %xmm5, 3 * SIZE(BO) 2352#else 2353 movsd %xmm4, 0 * SIZE(AO) 2354 movsd %xmm5, 1 * SIZE(AO) 2355#endif 2356 2357#ifndef LN 2358 addq $1 * SIZE, CO1 2359 addq $1 * SIZE, CO2 2360#endif 2361 2362#if defined(LT) || defined(RN) 2363 movq K, %rax 2364 subq KK, %rax 2365 leaq (,%rax, SIZE), %rax 2366 leaq (AO, %rax, 1), AO 2367#ifdef LT 2368 addq $2 * SIZE, B 2369#endif 2370#endif 2371 2372#ifdef LN 2373 subq $1, KK 2374 movq 
BORIG, B 2375#endif 2376 2377#ifdef LT 2378 addq $1, KK 2379#endif 2380 2381#ifdef RT 2382 movq K, %rax 2383 movq BORIG, B 2384 salq $0 + BASE_SHIFT, %rax 2385 addq %rax, AORIG 2386#endif 2387 ALIGN_4 2388 2389.L79: 2390#ifdef LN 2391 leaq (, K, SIZE), %rax 2392 leaq (B, %rax, 2), B 2393#endif 2394 2395#if defined(LT) || defined(RN) 2396 movq K, %rax 2397 subq KK, %rax 2398 leaq (,%rax, SIZE), %rax 2399 leaq (B, %rax, 2), B 2400#endif 2401 2402#ifdef RN 2403 addq $2, KK 2404#endif 2405 2406#ifdef RT 2407 subq $2, KK 2408#endif 2409 ALIGN_4 2410 2411.L80: 2412 movq N, J 2413 sarq $2, J # j = (n >> 2) 2414 jle .L999 2415 2416.L01: 2417/* Copying to Sub Buffer */ 2418 2419#ifdef LN 2420 movq OFFSET, %rax 2421 addq M, %rax 2422 movq %rax, KK 2423#endif 2424 2425 leaq BUFFER, BO 2426 2427#ifdef RT 2428 movq K, %rax 2429 salq $2 + BASE_SHIFT, %rax 2430 subq %rax, B 2431#endif 2432 2433#if defined(LN) || defined(RT) 2434 movq KK, %rax 2435 movq B, BORIG 2436 leaq (, %rax, SIZE), %rax 2437 leaq (B, %rax, 4), B 2438 leaq (BO, %rax, 8), BO 2439#endif 2440 2441#ifdef LT 2442 movq OFFSET, %rax 2443 movq %rax, KK 2444#endif 2445 2446#if defined(LT) || defined(RN) 2447 movq KK, %rax 2448#else 2449 movq K, %rax 2450 subq KK, %rax 2451#endif 2452 sarq $2, %rax 2453 jle .L03 2454 2455 addq %rax, %rax 2456 ALIGN_4 2457 2458.L02: 2459 PREFETCHNTA 40 * SIZE(B) 2460 2461 movsd 0 * SIZE(B), %xmm0 2462 movsd 1 * SIZE(B), %xmm1 2463 movsd 2 * SIZE(B), %xmm2 2464 movsd 3 * SIZE(B), %xmm3 2465 movsd 4 * SIZE(B), %xmm4 2466 movsd 5 * SIZE(B), %xmm5 2467 movsd 6 * SIZE(B), %xmm6 2468 movsd 7 * SIZE(B), %xmm7 2469 2470 addq $16 * SIZE, BO 2471 addq $ 8 * SIZE, B 2472 2473 movsd %xmm0, -16 * SIZE(BO) 2474 movsd %xmm0, -15 * SIZE(BO) 2475 movsd %xmm1, -14 * SIZE(BO) 2476 movsd %xmm1, -13 * SIZE(BO) 2477 movsd %xmm2, -12 * SIZE(BO) 2478 movsd %xmm2, -11 * SIZE(BO) 2479 movsd %xmm3, -10 * SIZE(BO) 2480 movsd %xmm3, -9 * SIZE(BO) 2481 movsd %xmm4, -8 * SIZE(BO) 2482 movsd %xmm4, -7 * SIZE(BO) 2483 
movsd %xmm5, -6 * SIZE(BO) 2484 movsd %xmm5, -5 * SIZE(BO) 2485 movsd %xmm6, -4 * SIZE(BO) 2486 movsd %xmm6, -3 * SIZE(BO) 2487 movsd %xmm7, -2 * SIZE(BO) 2488 movsd %xmm7, -1 * SIZE(BO) 2489 2490 decq %rax 2491 jne .L02 2492 ALIGN_4 2493 2494.L03: 2495#if defined(LT) || defined(RN) 2496 movq KK, %rax 2497#else 2498 movq K, %rax 2499 subq KK, %rax 2500#endif 2501 andq $3, %rax 2502 BRANCH 2503 jle .L10 2504 ALIGN_4 2505 2506.L04: 2507 movsd 0 * SIZE(B), %xmm0 2508 movsd 1 * SIZE(B), %xmm1 2509 movsd 2 * SIZE(B), %xmm2 2510 movsd 3 * SIZE(B), %xmm3 2511 2512 movsd %xmm0, 0 * SIZE(BO) 2513 movsd %xmm0, 1 * SIZE(BO) 2514 movsd %xmm1, 2 * SIZE(BO) 2515 movsd %xmm1, 3 * SIZE(BO) 2516 movsd %xmm2, 4 * SIZE(BO) 2517 movsd %xmm2, 5 * SIZE(BO) 2518 movsd %xmm3, 6 * SIZE(BO) 2519 movsd %xmm3, 7 * SIZE(BO) 2520 2521 addq $4 * SIZE, B 2522 addq $8 * SIZE, BO 2523 decq %rax 2524 jne .L04 2525 ALIGN_4 2526 2527.L10: 2528#if defined(LT) || defined(RN) 2529 movq A, AO 2530#else 2531 movq A, AORIG 2532#endif 2533 2534#ifdef RT 2535 leaq (, LDC, 4), %rax 2536 subq %rax, C 2537#endif 2538 2539 movq C, CO1 # coffset1 = c 2540 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 2541#ifndef RT 2542 leaq (C, LDC, 4), C 2543#endif 2544 2545 movq M, I 2546 sarq $2, I # i = (m >> 2) 2547 jle .L20 2548 ALIGN_4 2549 2550.L11: 2551#ifdef LN 2552 movq K, %rax 2553 salq $2 + BASE_SHIFT, %rax 2554 subq %rax, AORIG 2555#endif 2556 2557#if defined(LN) || defined(RT) 2558 movq KK, %rax 2559 movq AORIG, AO 2560 leaq (, %rax, SIZE), %rax 2561 leaq (AO, %rax, 4), AO 2562#endif 2563 2564 leaq BUFFER, BO 2565 2566#if defined(LN) || defined(RT) 2567 movq KK, %rax 2568 salq $2 + BASE_SHIFT, %rax 2569 leaq (BO, %rax, 2), BO 2570#endif 2571 2572 movapd 0 * SIZE(BO), %xmm9 2573 movapd 2 * SIZE(BO), %xmm11 2574 movapd 4 * SIZE(BO), %xmm13 2575 movapd 8 * SIZE(BO), %xmm15 2576 2577 movapd 0 * SIZE(AO), %xmm8 2578 pxor %xmm0, %xmm0 2579 movapd 2 * SIZE(AO), %xmm10 2580 pxor %xmm1, %xmm1 2581 movapd 4 * SIZE(AO), %xmm12 
2582 pxor %xmm2, %xmm2 2583 movapd 6 * SIZE(AO), %xmm14 2584 pxor %xmm3, %xmm3 2585 2586 PREFETCHW 4 * SIZE(CO1) 2587 pxor %xmm4, %xmm4 2588 PREFETCHW 4 * SIZE(CO2) 2589 pxor %xmm5, %xmm5 2590 PREFETCHW 4 * SIZE(CO1, LDC, 2) 2591 pxor %xmm6, %xmm6 2592 PREFETCHW 4 * SIZE(CO2, LDC, 2) 2593 pxor %xmm7, %xmm7 2594 2595#if defined(LT) || defined(RN) 2596 movq KK, %rax 2597#else 2598 movq K, %rax 2599 subq KK, %rax 2600#endif 2601 andq $-8, %rax 2602 salq $4, %rax 2603 je .L15 2604.L1X: 2605 KERNEL1(16 * 0) 2606 KERNEL2(16 * 0) 2607 KERNEL3(16 * 0) 2608 KERNEL4(16 * 0) 2609 KERNEL5(16 * 0) 2610 KERNEL6(16 * 0) 2611 KERNEL7(16 * 0) 2612 KERNEL8(16 * 0) 2613 KERNEL1(16 * 1) 2614 KERNEL2(16 * 1) 2615 KERNEL3(16 * 1) 2616 KERNEL4(16 * 1) 2617 KERNEL5(16 * 1) 2618 KERNEL6(16 * 1) 2619 KERNEL7(16 * 1) 2620 KERNEL8(16 * 1) 2621 cmpq $64 * 2, %rax 2622 jle .L12 2623 KERNEL1(16 * 2) 2624 KERNEL2(16 * 2) 2625 KERNEL3(16 * 2) 2626 KERNEL4(16 * 2) 2627 KERNEL5(16 * 2) 2628 KERNEL6(16 * 2) 2629 KERNEL7(16 * 2) 2630 KERNEL8(16 * 2) 2631 KERNEL1(16 * 3) 2632 KERNEL2(16 * 3) 2633 KERNEL3(16 * 3) 2634 KERNEL4(16 * 3) 2635 KERNEL5(16 * 3) 2636 KERNEL6(16 * 3) 2637 KERNEL7(16 * 3) 2638 KERNEL8(16 * 3) 2639 cmpq $64 * 4, %rax 2640 jle .L12 2641 KERNEL1(16 * 4) 2642 KERNEL2(16 * 4) 2643 KERNEL3(16 * 4) 2644 KERNEL4(16 * 4) 2645 KERNEL5(16 * 4) 2646 KERNEL6(16 * 4) 2647 KERNEL7(16 * 4) 2648 KERNEL8(16 * 4) 2649 KERNEL1(16 * 5) 2650 KERNEL2(16 * 5) 2651 KERNEL3(16 * 5) 2652 KERNEL4(16 * 5) 2653 KERNEL5(16 * 5) 2654 KERNEL6(16 * 5) 2655 KERNEL7(16 * 5) 2656 KERNEL8(16 * 5) 2657 cmpq $64 * 6, %rax 2658 jle .L12 2659 KERNEL1(16 * 6) 2660 KERNEL2(16 * 6) 2661 KERNEL3(16 * 6) 2662 KERNEL4(16 * 6) 2663 KERNEL5(16 * 6) 2664 KERNEL6(16 * 6) 2665 KERNEL7(16 * 6) 2666 KERNEL8(16 * 6) 2667 KERNEL1(16 * 7) 2668 KERNEL2(16 * 7) 2669 KERNEL3(16 * 7) 2670 KERNEL4(16 * 7) 2671 KERNEL5(16 * 7) 2672 KERNEL6(16 * 7) 2673 KERNEL7(16 * 7) 2674 KERNEL8(16 * 7) 2675 2676 addq $16 * 8 * SIZE, AO 2677 addq $32 * 8 * 
SIZE, BO 2678 subq $64 * 8, %rax 2679 jg .L1X 2680 2681.L12: 2682 leaq (AO, %rax, 2), AO # * 16 2683 leaq (BO, %rax, 4), BO # * 64 2684 ALIGN_4 2685 2686.L15: 2687#if defined(LT) || defined(RN) 2688 movq KK, %rax 2689#else 2690 movq K, %rax 2691 subq KK, %rax 2692#endif 2693 andq $7, %rax # if (k & 1) 2694 BRANCH 2695 je .L19 2696 ALIGN_4 2697 2698.L16: 2699 mulpd %xmm8, %xmm9 2700 addpd %xmm9, %xmm0 2701 movapd 2 * SIZE(BO), %xmm9 2702 mulpd %xmm8, %xmm9 2703 addpd %xmm9, %xmm1 2704 movapd 4 * SIZE(BO), %xmm9 2705 mulpd %xmm8, %xmm9 2706 mulpd 6 * SIZE(BO), %xmm8 2707 addpd %xmm9, %xmm2 2708 movapd 0 * SIZE(BO), %xmm9 2709 addpd %xmm8, %xmm3 2710 movapd 4 * SIZE(AO), %xmm8 2711 mulpd %xmm10, %xmm9 2712 addpd %xmm9, %xmm4 2713 movapd 2 * SIZE(BO), %xmm9 2714 mulpd %xmm10, %xmm9 2715 addpd %xmm9, %xmm5 2716 movapd 4 * SIZE(BO), %xmm9 2717 mulpd %xmm10, %xmm9 2718 mulpd 6 * SIZE(BO), %xmm10 2719 addpd %xmm9, %xmm6 2720 movapd 8 * SIZE(BO), %xmm9 2721 addpd %xmm10, %xmm7 2722 movapd 6 * SIZE(AO), %xmm10 2723 2724 addq $4 * SIZE, AO # aoffset += 4 2725 addq $8 * SIZE, BO # boffset1 += 8 2726 decq %rax 2727 jg .L16 2728 ALIGN_4 2729 2730.L19: 2731#if defined(LN) || defined(RT) 2732 movq KK, %rax 2733#ifdef LN 2734 subq $4, %rax 2735#else 2736 subq $4, %rax 2737#endif 2738 2739 movq AORIG, AO 2740 movq BORIG, B 2741 leaq BUFFER, BO 2742 2743 leaq (, %rax, SIZE), %rax 2744 leaq (AO, %rax, 4), AO 2745 leaq (B, %rax, 4), B 2746 leaq (BO, %rax, 8), BO 2747#endif 2748 2749#if defined(LN) || defined(LT) 2750 movapd %xmm0, %xmm8 2751 unpcklpd %xmm1, %xmm0 2752 unpckhpd %xmm1, %xmm8 2753 2754 movapd %xmm2, %xmm10 2755 unpcklpd %xmm3, %xmm2 2756 unpckhpd %xmm3, %xmm10 2757 2758 movapd %xmm4, %xmm12 2759 unpcklpd %xmm5, %xmm4 2760 unpckhpd %xmm5, %xmm12 2761 2762 movapd %xmm6, %xmm14 2763 unpcklpd %xmm7, %xmm6 2764 unpckhpd %xmm7, %xmm14 2765 2766 movapd 0 * SIZE(B), %xmm1 2767 movapd 2 * SIZE(B), %xmm3 2768 movapd 4 * SIZE(B), %xmm5 2769 movapd 6 * SIZE(B), %xmm7 2770 movapd 8 * 
SIZE(B), %xmm9 2771 movapd 10 * SIZE(B), %xmm11 2772 movapd 12 * SIZE(B), %xmm13 2773 movapd 14 * SIZE(B), %xmm15 2774 2775 subpd %xmm0, %xmm1 2776 subpd %xmm2, %xmm3 2777 subpd %xmm8, %xmm5 2778 subpd %xmm10, %xmm7 2779 subpd %xmm4, %xmm9 2780 subpd %xmm6, %xmm11 2781 subpd %xmm12, %xmm13 2782 subpd %xmm14, %xmm15 2783#else 2784 movapd 0 * SIZE(AO), %xmm8 2785 movapd 2 * SIZE(AO), %xmm9 2786 movapd 4 * SIZE(AO), %xmm10 2787 movapd 6 * SIZE(AO), %xmm11 2788 2789 movapd 8 * SIZE(AO), %xmm12 2790 movapd 10 * SIZE(AO), %xmm13 2791 movapd 12 * SIZE(AO), %xmm14 2792 movapd 14 * SIZE(AO), %xmm15 2793 2794 subpd %xmm0, %xmm8 2795 subpd %xmm4, %xmm9 2796 subpd %xmm1, %xmm10 2797 subpd %xmm5, %xmm11 2798 subpd %xmm2, %xmm12 2799 subpd %xmm6, %xmm13 2800 subpd %xmm3, %xmm14 2801 subpd %xmm7, %xmm15 2802#endif 2803 2804#ifdef LN 2805 movlpd 15 * SIZE(AO), %xmm0 2806 movhpd 15 * SIZE(AO), %xmm0 2807 mulpd %xmm0, %xmm13 2808 mulpd %xmm0, %xmm15 2809 2810 movlpd 14 * SIZE(AO), %xmm2 2811 movhpd 14 * SIZE(AO), %xmm2 2812 mulpd %xmm13, %xmm2 2813 subpd %xmm2, %xmm9 2814 movlpd 14 * SIZE(AO), %xmm2 2815 movhpd 14 * SIZE(AO), %xmm2 2816 mulpd %xmm15, %xmm2 2817 subpd %xmm2, %xmm11 2818 2819 movlpd 13 * SIZE(AO), %xmm4 2820 movhpd 13 * SIZE(AO), %xmm4 2821 mulpd %xmm13, %xmm4 2822 subpd %xmm4, %xmm5 2823 movlpd 13 * SIZE(AO), %xmm4 2824 movhpd 13 * SIZE(AO), %xmm4 2825 mulpd %xmm15, %xmm4 2826 subpd %xmm4, %xmm7 2827 2828 movlpd 12 * SIZE(AO), %xmm6 2829 movhpd 12 * SIZE(AO), %xmm6 2830 mulpd %xmm13, %xmm6 2831 subpd %xmm6, %xmm1 2832 movlpd 12 * SIZE(AO), %xmm6 2833 movhpd 12 * SIZE(AO), %xmm6 2834 mulpd %xmm15, %xmm6 2835 subpd %xmm6, %xmm3 2836 2837 movlpd 10 * SIZE(AO), %xmm0 2838 movhpd 10 * SIZE(AO), %xmm0 2839 mulpd %xmm0, %xmm9 2840 mulpd %xmm0, %xmm11 2841 2842 movlpd 9 * SIZE(AO), %xmm2 2843 movhpd 9 * SIZE(AO), %xmm2 2844 mulpd %xmm9, %xmm2 2845 subpd %xmm2, %xmm5 2846 movlpd 9 * SIZE(AO), %xmm2 2847 movhpd 9 * SIZE(AO), %xmm2 2848 mulpd %xmm11, %xmm2 2849 subpd %xmm2, 
%xmm7 2850 2851 movlpd 8 * SIZE(AO), %xmm4 2852 movhpd 8 * SIZE(AO), %xmm4 2853 mulpd %xmm9, %xmm4 2854 subpd %xmm4, %xmm1 2855 movlpd 8 * SIZE(AO), %xmm4 2856 movhpd 8 * SIZE(AO), %xmm4 2857 mulpd %xmm11, %xmm4 2858 subpd %xmm4, %xmm3 2859 2860 movlpd 5 * SIZE(AO), %xmm0 2861 movhpd 5 * SIZE(AO), %xmm0 2862 mulpd %xmm0, %xmm5 2863 mulpd %xmm0, %xmm7 2864 2865 movlpd 4 * SIZE(AO), %xmm2 2866 movhpd 4 * SIZE(AO), %xmm2 2867 mulpd %xmm5, %xmm2 2868 subpd %xmm2, %xmm1 2869 movlpd 4 * SIZE(AO), %xmm2 2870 movhpd 4 * SIZE(AO), %xmm2 2871 mulpd %xmm7, %xmm2 2872 subpd %xmm2, %xmm3 2873 2874 movlpd 0 * SIZE(AO), %xmm0 2875 movhpd 0 * SIZE(AO), %xmm0 2876 mulpd %xmm0, %xmm1 2877 mulpd %xmm0, %xmm3 2878#endif 2879 2880#ifdef LT 2881 movlpd 0 * SIZE(AO), %xmm0 2882 movhpd 0 * SIZE(AO), %xmm0 2883 mulpd %xmm0, %xmm1 2884 mulpd %xmm0, %xmm3 2885 2886 movlpd 1 * SIZE(AO), %xmm2 2887 movhpd 1 * SIZE(AO), %xmm2 2888 mulpd %xmm1, %xmm2 2889 subpd %xmm2, %xmm5 2890 2891 movlpd 1 * SIZE(AO), %xmm2 2892 movhpd 1 * SIZE(AO), %xmm2 2893 mulpd %xmm3, %xmm2 2894 subpd %xmm2, %xmm7 2895 2896 movlpd 2 * SIZE(AO), %xmm4 2897 movhpd 2 * SIZE(AO), %xmm4 2898 mulpd %xmm1, %xmm4 2899 subpd %xmm4, %xmm9 2900 movlpd 2 * SIZE(AO), %xmm4 2901 movhpd 2 * SIZE(AO), %xmm4 2902 mulpd %xmm3, %xmm4 2903 subpd %xmm4, %xmm11 2904 2905 movlpd 3 * SIZE(AO), %xmm6 2906 movhpd 3 * SIZE(AO), %xmm6 2907 mulpd %xmm1, %xmm6 2908 subpd %xmm6, %xmm13 2909 movlpd 3 * SIZE(AO), %xmm6 2910 movhpd 3 * SIZE(AO), %xmm6 2911 mulpd %xmm3, %xmm6 2912 subpd %xmm6, %xmm15 2913 2914 movlpd 5 * SIZE(AO), %xmm0 2915 movhpd 5 * SIZE(AO), %xmm0 2916 mulpd %xmm0, %xmm5 2917 mulpd %xmm0, %xmm7 2918 2919 movlpd 6 * SIZE(AO), %xmm2 2920 movhpd 6 * SIZE(AO), %xmm2 2921 mulpd %xmm5, %xmm2 2922 subpd %xmm2, %xmm9 2923 movlpd 6 * SIZE(AO), %xmm2 2924 movhpd 6 * SIZE(AO), %xmm2 2925 mulpd %xmm7, %xmm2 2926 subpd %xmm2, %xmm11 2927 2928 movlpd 7 * SIZE(AO), %xmm4 2929 movhpd 7 * SIZE(AO), %xmm4 2930 mulpd %xmm5, %xmm4 2931 subpd %xmm4, 
%xmm13 2932 movlpd 7 * SIZE(AO), %xmm4 2933 movhpd 7 * SIZE(AO), %xmm4 2934 mulpd %xmm7, %xmm4 2935 subpd %xmm4, %xmm15 2936 2937 movlpd 10 * SIZE(AO), %xmm0 2938 movhpd 10 * SIZE(AO), %xmm0 2939 mulpd %xmm0, %xmm9 2940 mulpd %xmm0, %xmm11 2941 2942 movlpd 11 * SIZE(AO), %xmm2 2943 movhpd 11 * SIZE(AO), %xmm2 2944 mulpd %xmm9, %xmm2 2945 subpd %xmm2, %xmm13 2946 movlpd 11 * SIZE(AO), %xmm2 2947 movhpd 11 * SIZE(AO), %xmm2 2948 mulpd %xmm11, %xmm2 2949 subpd %xmm2, %xmm15 2950 2951 movlpd 15 * SIZE(AO), %xmm0 2952 movhpd 15 * SIZE(AO), %xmm0 2953 mulpd %xmm0, %xmm13 2954 mulpd %xmm0, %xmm15 2955#endif 2956 2957 2958#ifdef RN 2959 movlpd 0 * SIZE(B), %xmm0 2960 movhpd 0 * SIZE(B), %xmm0 2961 mulpd %xmm0, %xmm8 2962 mulpd %xmm0, %xmm9 2963 2964 movlpd 1 * SIZE(B), %xmm1 2965 movhpd 1 * SIZE(B), %xmm1 2966 mulpd %xmm8, %xmm1 2967 subpd %xmm1, %xmm10 2968 movlpd 1 * SIZE(B), %xmm1 2969 movhpd 1 * SIZE(B), %xmm1 2970 mulpd %xmm9, %xmm1 2971 subpd %xmm1, %xmm11 2972 2973 movlpd 2 * SIZE(B), %xmm2 2974 movhpd 2 * SIZE(B), %xmm2 2975 mulpd %xmm8, %xmm2 2976 subpd %xmm2, %xmm12 2977 movlpd 2 * SIZE(B), %xmm2 2978 movhpd 2 * SIZE(B), %xmm2 2979 mulpd %xmm9, %xmm2 2980 subpd %xmm2, %xmm13 2981 2982 movlpd 3 * SIZE(B), %xmm3 2983 movhpd 3 * SIZE(B), %xmm3 2984 mulpd %xmm8, %xmm3 2985 subpd %xmm3, %xmm14 2986 movlpd 3 * SIZE(B), %xmm3 2987 movhpd 3 * SIZE(B), %xmm3 2988 mulpd %xmm9, %xmm3 2989 subpd %xmm3, %xmm15 2990 2991 movlpd 5 * SIZE(B), %xmm0 2992 movhpd 5 * SIZE(B), %xmm0 2993 mulpd %xmm0, %xmm10 2994 mulpd %xmm0, %xmm11 2995 2996 movlpd 6 * SIZE(B), %xmm1 2997 movhpd 6 * SIZE(B), %xmm1 2998 mulpd %xmm10, %xmm1 2999 subpd %xmm1, %xmm12 3000 movlpd 6 * SIZE(B), %xmm1 3001 movhpd 6 * SIZE(B), %xmm1 3002 mulpd %xmm11, %xmm1 3003 subpd %xmm1, %xmm13 3004 3005 movlpd 7 * SIZE(B), %xmm2 3006 movhpd 7 * SIZE(B), %xmm2 3007 mulpd %xmm10, %xmm2 3008 subpd %xmm2, %xmm14 3009 movlpd 7 * SIZE(B), %xmm2 3010 movhpd 7 * SIZE(B), %xmm2 3011 mulpd %xmm11, %xmm2 3012 subpd %xmm2, %xmm15 
3013 3014 movlpd 10 * SIZE(B), %xmm0 3015 movhpd 10 * SIZE(B), %xmm0 3016 mulpd %xmm0, %xmm12 3017 mulpd %xmm0, %xmm13 3018 3019 movlpd 11 * SIZE(B), %xmm1 3020 movhpd 11 * SIZE(B), %xmm1 3021 mulpd %xmm12, %xmm1 3022 subpd %xmm1, %xmm14 3023 movlpd 11 * SIZE(B), %xmm1 3024 movhpd 11 * SIZE(B), %xmm1 3025 mulpd %xmm13, %xmm1 3026 subpd %xmm1, %xmm15 3027 3028 movlpd 15 * SIZE(B), %xmm0 3029 movhpd 15 * SIZE(B), %xmm0 3030 mulpd %xmm0, %xmm14 3031 mulpd %xmm0, %xmm15 3032#endif 3033 3034#ifdef RT 3035 movlpd 15 * SIZE(B), %xmm0 3036 movhpd 15 * SIZE(B), %xmm0 3037 mulpd %xmm0, %xmm14 3038 mulpd %xmm0, %xmm15 3039 3040 movlpd 14 * SIZE(B), %xmm1 3041 movhpd 14 * SIZE(B), %xmm1 3042 mulpd %xmm14, %xmm1 3043 subpd %xmm1, %xmm12 3044 movlpd 14 * SIZE(B), %xmm1 3045 movhpd 14 * SIZE(B), %xmm1 3046 mulpd %xmm15, %xmm1 3047 subpd %xmm1, %xmm13 3048 3049 movlpd 13 * SIZE(B), %xmm2 3050 movhpd 13 * SIZE(B), %xmm2 3051 mulpd %xmm14, %xmm2 3052 subpd %xmm2, %xmm10 3053 movlpd 13 * SIZE(B), %xmm2 3054 movhpd 13 * SIZE(B), %xmm2 3055 mulpd %xmm15, %xmm2 3056 subpd %xmm2, %xmm11 3057 3058 movlpd 12 * SIZE(B), %xmm3 3059 movhpd 12 * SIZE(B), %xmm3 3060 mulpd %xmm14, %xmm3 3061 subpd %xmm3, %xmm8 3062 movlpd 12 * SIZE(B), %xmm3 3063 movhpd 12 * SIZE(B), %xmm3 3064 mulpd %xmm15, %xmm3 3065 subpd %xmm3, %xmm9 3066 3067 movlpd 10 * SIZE(B), %xmm0 3068 movhpd 10 * SIZE(B), %xmm0 3069 mulpd %xmm0, %xmm12 3070 mulpd %xmm0, %xmm13 3071 3072 movlpd 9 * SIZE(B), %xmm1 3073 movhpd 9 * SIZE(B), %xmm1 3074 mulpd %xmm12, %xmm1 3075 subpd %xmm1, %xmm10 3076 movlpd 9 * SIZE(B), %xmm1 3077 movhpd 9 * SIZE(B), %xmm1 3078 mulpd %xmm13, %xmm1 3079 subpd %xmm1, %xmm11 3080 3081 movlpd 8 * SIZE(B), %xmm2 3082 movhpd 8 * SIZE(B), %xmm2 3083 mulpd %xmm12, %xmm2 3084 subpd %xmm2, %xmm8 3085 movlpd 8 * SIZE(B), %xmm2 3086 movhpd 8 * SIZE(B), %xmm2 3087 mulpd %xmm13, %xmm2 3088 subpd %xmm2, %xmm9 3089 3090 movlpd 5 * SIZE(B), %xmm0 3091 movhpd 5 * SIZE(B), %xmm0 3092 mulpd %xmm0, %xmm10 3093 mulpd %xmm0, 
%xmm11 3094 3095 movlpd 4 * SIZE(B), %xmm1 3096 movhpd 4 * SIZE(B), %xmm1 3097 mulpd %xmm10, %xmm1 3098 subpd %xmm1, %xmm8 3099 movlpd 4 * SIZE(B), %xmm1 3100 movhpd 4 * SIZE(B), %xmm1 3101 mulpd %xmm11, %xmm1 3102 subpd %xmm1, %xmm9 3103 3104 movlpd 0 * SIZE(B), %xmm0 3105 movhpd 0 * SIZE(B), %xmm0 3106 mulpd %xmm0, %xmm8 3107 mulpd %xmm0, %xmm9 3108#endif 3109 3110#ifdef LN 3111 subq $4 * SIZE, CO1 3112 subq $4 * SIZE, CO2 3113#endif 3114 3115#if defined(LN) || defined(LT) 3116 movsd %xmm1, 0 * SIZE(CO1) 3117 movsd %xmm5, 1 * SIZE(CO1) 3118 movsd %xmm9, 2 * SIZE(CO1) 3119 movsd %xmm13, 3 * SIZE(CO1) 3120 3121 movhpd %xmm1, 0 * SIZE(CO2) 3122 movhpd %xmm5, 1 * SIZE(CO2) 3123 movhpd %xmm9, 2 * SIZE(CO2) 3124 movhpd %xmm13, 3 * SIZE(CO2) 3125 3126 movsd %xmm3, 0 * SIZE(CO1, LDC, 2) 3127 movsd %xmm7, 1 * SIZE(CO1, LDC, 2) 3128 movsd %xmm11, 2 * SIZE(CO1, LDC, 2) 3129 movsd %xmm15, 3 * SIZE(CO1, LDC, 2) 3130 3131 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 3132 movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) 3133 movhpd %xmm11, 2 * SIZE(CO2, LDC, 2) 3134 movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) 3135#else 3136 movsd %xmm8, 0 * SIZE(CO1) 3137 movhpd %xmm8, 1 * SIZE(CO1) 3138 movsd %xmm9, 2 * SIZE(CO1) 3139 movhpd %xmm9, 3 * SIZE(CO1) 3140 3141 movsd %xmm10, 0 * SIZE(CO2) 3142 movhpd %xmm10, 1 * SIZE(CO2) 3143 movsd %xmm11, 2 * SIZE(CO2) 3144 movhpd %xmm11, 3 * SIZE(CO2) 3145 3146 movsd %xmm12, 0 * SIZE(CO1, LDC, 2) 3147 movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) 3148 movsd %xmm13, 2 * SIZE(CO1, LDC, 2) 3149 movhpd %xmm13, 3 * SIZE(CO1, LDC, 2) 3150 3151 movsd %xmm14, 0 * SIZE(CO2, LDC, 2) 3152 movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) 3153 movsd %xmm15, 2 * SIZE(CO2, LDC, 2) 3154 movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) 3155#endif 3156 3157#if defined(LN) || defined(LT) 3158 movapd %xmm1, 0 * SIZE(B) 3159 movapd %xmm3, 2 * SIZE(B) 3160 movapd %xmm5, 4 * SIZE(B) 3161 movapd %xmm7, 6 * SIZE(B) 3162 movapd %xmm9, 8 * SIZE(B) 3163 movapd %xmm11, 10 * SIZE(B) 3164 movapd %xmm13, 12 * SIZE(B) 3165 movapd %xmm15, 
14 * SIZE(B) 3166 3167 movlpd %xmm1, 0 * SIZE(BO) 3168 movlpd %xmm1, 1 * SIZE(BO) 3169 movhpd %xmm1, 2 * SIZE(BO) 3170 movhpd %xmm1, 3 * SIZE(BO) 3171 movlpd %xmm3, 4 * SIZE(BO) 3172 movlpd %xmm3, 5 * SIZE(BO) 3173 movhpd %xmm3, 6 * SIZE(BO) 3174 movhpd %xmm3, 7 * SIZE(BO) 3175 movlpd %xmm5, 8 * SIZE(BO) 3176 movlpd %xmm5, 9 * SIZE(BO) 3177 movhpd %xmm5, 10 * SIZE(BO) 3178 movhpd %xmm5, 11 * SIZE(BO) 3179 movlpd %xmm7, 12 * SIZE(BO) 3180 movlpd %xmm7, 13 * SIZE(BO) 3181 movhpd %xmm7, 14 * SIZE(BO) 3182 movhpd %xmm7, 15 * SIZE(BO) 3183 movlpd %xmm9, 16 * SIZE(BO) 3184 movlpd %xmm9, 17 * SIZE(BO) 3185 movhpd %xmm9, 18 * SIZE(BO) 3186 movhpd %xmm9, 19 * SIZE(BO) 3187 movlpd %xmm11, 20 * SIZE(BO) 3188 movlpd %xmm11, 21 * SIZE(BO) 3189 movhpd %xmm11, 22 * SIZE(BO) 3190 movhpd %xmm11, 23 * SIZE(BO) 3191 movlpd %xmm13, 24 * SIZE(BO) 3192 movlpd %xmm13, 25 * SIZE(BO) 3193 movhpd %xmm13, 26 * SIZE(BO) 3194 movhpd %xmm13, 27 * SIZE(BO) 3195 movlpd %xmm15, 28 * SIZE(BO) 3196 movlpd %xmm15, 29 * SIZE(BO) 3197 movhpd %xmm15, 30 * SIZE(BO) 3198 movhpd %xmm15, 31 * SIZE(BO) 3199#else 3200 movapd %xmm8, 0 * SIZE(AO) 3201 movapd %xmm9, 2 * SIZE(AO) 3202 movapd %xmm10, 4 * SIZE(AO) 3203 movapd %xmm11, 6 * SIZE(AO) 3204 movapd %xmm12, 8 * SIZE(AO) 3205 movapd %xmm13, 10 * SIZE(AO) 3206 movapd %xmm14, 12 * SIZE(AO) 3207 movapd %xmm15, 14 * SIZE(AO) 3208#endif 3209 3210#ifndef LN 3211 addq $4 * SIZE, CO1 3212 addq $4 * SIZE, CO2 3213#endif 3214 3215#if defined(LT) || defined(RN) 3216 movq K, %rax 3217 subq KK, %rax 3218 leaq (,%rax, SIZE), %rax 3219 leaq (AO, %rax, 4), AO 3220#ifdef LT 3221 addq $16 * SIZE, B 3222#endif 3223#endif 3224 3225#ifdef LN 3226 subq $4, KK 3227 movq BORIG, B 3228#endif 3229 3230#ifdef LT 3231 addq $4, KK 3232#endif 3233 3234#ifdef RT 3235 movq K, %rax 3236 movq BORIG, B 3237 salq $2 + BASE_SHIFT, %rax 3238 addq %rax, AORIG 3239#endif 3240 3241 decq I # i -- 3242 jg .L11 3243 ALIGN_4 3244 3245.L20: 3246 testq $3, M 3247 je .L39 3248 3249 testq $2, M 3250 je 
.L30 3251 ALIGN_4 3252 3253.L21: 3254#ifdef LN 3255 movq K, %rax 3256 salq $1 + BASE_SHIFT, %rax 3257 subq %rax, AORIG 3258#endif 3259 3260#if defined(LN) || defined(RT) 3261 movq KK, %rax 3262 movq AORIG, AO 3263 leaq (, %rax, SIZE), %rax 3264 leaq (AO, %rax, 2), AO 3265#endif 3266 3267 leaq BUFFER, BO 3268 3269#if defined(LN) || defined(RT) 3270 movq KK, %rax 3271 salq $2 + BASE_SHIFT, %rax 3272 leaq (BO, %rax, 2), BO 3273#endif 3274 3275 movapd 0 * SIZE(AO), %xmm8 3276 pxor %xmm0, %xmm0 3277 movapd 0 * SIZE(BO), %xmm9 3278 pxor %xmm1, %xmm1 3279 movapd 8 * SIZE(AO), %xmm10 3280 pxor %xmm2, %xmm2 3281 movapd 8 * SIZE(BO), %xmm11 3282 pxor %xmm3, %xmm3 3283 3284 movapd 16 * SIZE(BO), %xmm13 3285 movapd 24 * SIZE(BO), %xmm15 3286 3287#if defined(LT) || defined(RN) 3288 movq KK, %rax 3289#else 3290 movq K, %rax 3291 subq KK, %rax 3292#endif 3293 sarq $3, %rax 3294 je .L25 3295 ALIGN_4 3296 3297.L22: 3298 mulpd %xmm8, %xmm9 3299 addpd %xmm9, %xmm0 3300 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3301 movapd 2 * SIZE(BO), %xmm9 3302 mulpd %xmm8, %xmm9 3303 addpd %xmm9, %xmm1 3304 movapd 4 * SIZE(BO), %xmm9 3305 mulpd %xmm8, %xmm9 3306 mulpd 6 * SIZE(BO), %xmm8 3307 addpd %xmm9, %xmm2 3308 movapd 32 * SIZE(BO), %xmm9 3309 addpd %xmm8, %xmm3 3310 movapd 2 * SIZE(AO), %xmm8 3311 3312 mulpd %xmm8, %xmm11 3313 addpd %xmm11, %xmm0 3314 movapd 10 * SIZE(BO), %xmm11 3315 mulpd %xmm8, %xmm11 3316 addpd %xmm11, %xmm1 3317 movapd 12 * SIZE(BO), %xmm11 3318 mulpd %xmm8, %xmm11 3319 mulpd 14 * SIZE(BO), %xmm8 3320 addpd %xmm11, %xmm2 3321 movapd 40 * SIZE(BO), %xmm11 3322 addpd %xmm8, %xmm3 3323 movapd 4 * SIZE(AO), %xmm8 3324 3325 mulpd %xmm8, %xmm13 3326 addpd %xmm13, %xmm0 3327 movapd 18 * SIZE(BO), %xmm13 3328 mulpd %xmm8, %xmm13 3329 addpd %xmm13, %xmm1 3330 movapd 20 * SIZE(BO), %xmm13 3331 mulpd %xmm8, %xmm13 3332 mulpd 22 * SIZE(BO), %xmm8 3333 addpd %xmm13, %xmm2 3334 movapd 48 * SIZE(BO), %xmm13 3335 addpd %xmm8, %xmm3 3336 movapd 6 * SIZE(AO), %xmm8 3337 3338 mulpd %xmm8, 
%xmm15 3339 addpd %xmm15, %xmm0 3340 movapd 26 * SIZE(BO), %xmm15 3341 mulpd %xmm8, %xmm15 3342 addpd %xmm15, %xmm1 3343 movapd 28 * SIZE(BO), %xmm15 3344 mulpd %xmm8, %xmm15 3345 mulpd 30 * SIZE(BO), %xmm8 3346 addpd %xmm15, %xmm2 3347 movapd 56 * SIZE(BO), %xmm15 3348 addpd %xmm8, %xmm3 3349 movapd 16 * SIZE(AO), %xmm8 3350 3351 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 3352 mulpd %xmm10, %xmm9 3353 addpd %xmm9, %xmm0 3354 movapd 34 * SIZE(BO), %xmm9 3355 mulpd %xmm10, %xmm9 3356 addpd %xmm9, %xmm1 3357 movapd 36 * SIZE(BO), %xmm9 3358 mulpd %xmm10, %xmm9 3359 mulpd 38 * SIZE(BO), %xmm10 3360 addpd %xmm9, %xmm2 3361 movapd 64 * SIZE(BO), %xmm9 3362 addpd %xmm10, %xmm3 3363 movapd 10 * SIZE(AO), %xmm10 3364 3365 mulpd %xmm10, %xmm11 3366 addpd %xmm11, %xmm0 3367 movapd 42 * SIZE(BO), %xmm11 3368 mulpd %xmm10, %xmm11 3369 addpd %xmm11, %xmm1 3370 movapd 44 * SIZE(BO), %xmm11 3371 mulpd %xmm10, %xmm11 3372 mulpd 46 * SIZE(BO), %xmm10 3373 addpd %xmm11, %xmm2 3374 movapd 72 * SIZE(BO), %xmm11 3375 addpd %xmm10, %xmm3 3376 movapd 12 * SIZE(AO), %xmm10 3377 3378 mulpd %xmm10, %xmm13 3379 addpd %xmm13, %xmm0 3380 movapd 50 * SIZE(BO), %xmm13 3381 mulpd %xmm10, %xmm13 3382 addpd %xmm13, %xmm1 3383 movapd 52 * SIZE(BO), %xmm13 3384 mulpd %xmm10, %xmm13 3385 mulpd 54 * SIZE(BO), %xmm10 3386 addpd %xmm13, %xmm2 3387 movapd 80 * SIZE(BO), %xmm13 3388 addpd %xmm10, %xmm3 3389 movapd 14 * SIZE(AO), %xmm10 3390 3391 mulpd %xmm10, %xmm15 3392 addpd %xmm15, %xmm0 3393 movapd 58 * SIZE(BO), %xmm15 3394 mulpd %xmm10, %xmm15 3395 addpd %xmm15, %xmm1 3396 movapd 60 * SIZE(BO), %xmm15 3397 mulpd %xmm10, %xmm15 3398 mulpd 62 * SIZE(BO), %xmm10 3399 addpd %xmm15, %xmm2 3400 movapd 88 * SIZE(BO), %xmm15 3401 addpd %xmm10, %xmm3 3402 movapd 24 * SIZE(AO), %xmm10 3403 3404 addq $16 * SIZE, AO 3405 addq $64 * SIZE, BO 3406 decq %rax 3407 jne .L22 3408 ALIGN_4 3409 3410.L25: 3411#if defined(LT) || defined(RN) 3412 movq KK, %rax 3413#else 3414 movq K, %rax 3415 subq KK, %rax 3416#endif 3417 andq 
$7, %rax # if (k & 1) 3418 BRANCH 3419 je .L29 3420 ALIGN_4 3421 3422.L26: 3423 mulpd %xmm8, %xmm9 3424 addpd %xmm9, %xmm0 3425 movapd 2 * SIZE(BO), %xmm9 3426 mulpd %xmm8, %xmm9 3427 addpd %xmm9, %xmm1 3428 movapd 4 * SIZE(BO), %xmm9 3429 mulpd %xmm8, %xmm9 3430 mulpd 6 * SIZE(BO), %xmm8 3431 addpd %xmm9, %xmm2 3432 movapd 8 * SIZE(BO), %xmm9 3433 addpd %xmm8, %xmm3 3434 movapd 2 * SIZE(AO), %xmm8 3435 3436 addq $2 * SIZE, AO # aoffset += 4 3437 addq $8 * SIZE, BO # boffset1 += 8 3438 decq %rax 3439 jg .L26 3440 ALIGN_4 3441 3442.L29: 3443#if defined(LN) || defined(RT) 3444 movq KK, %rax 3445#ifdef LN 3446 subq $2, %rax 3447#else 3448 subq $4, %rax 3449#endif 3450 3451 movq AORIG, AO 3452 movq BORIG, B 3453 leaq BUFFER, BO 3454 3455 leaq (, %rax, SIZE), %rax 3456 leaq (AO, %rax, 2), AO 3457 leaq (B, %rax, 4), B 3458 leaq (BO, %rax, 8), BO 3459#endif 3460 3461#if defined(LN) || defined(LT) 3462 movapd %xmm0, %xmm8 3463 unpcklpd %xmm1, %xmm0 3464 unpckhpd %xmm1, %xmm8 3465 3466 movapd %xmm2, %xmm10 3467 unpcklpd %xmm3, %xmm2 3468 unpckhpd %xmm3, %xmm10 3469 3470 movapd 0 * SIZE(B), %xmm1 3471 movapd 2 * SIZE(B), %xmm3 3472 movapd 4 * SIZE(B), %xmm5 3473 movapd 6 * SIZE(B), %xmm7 3474 3475 subpd %xmm0, %xmm1 3476 subpd %xmm2, %xmm3 3477 subpd %xmm8, %xmm5 3478 subpd %xmm10, %xmm7 3479#else 3480 movapd 0 * SIZE(AO), %xmm8 3481 movapd 2 * SIZE(AO), %xmm10 3482 movapd 4 * SIZE(AO), %xmm12 3483 movapd 6 * SIZE(AO), %xmm14 3484 3485 subpd %xmm0, %xmm8 3486 subpd %xmm1, %xmm10 3487 subpd %xmm2, %xmm12 3488 subpd %xmm3, %xmm14 3489#endif 3490 3491#ifdef LN 3492 movlpd 3 * SIZE(AO), %xmm0 3493 movhpd 3 * SIZE(AO), %xmm0 3494 mulpd %xmm0, %xmm5 3495 mulpd %xmm0, %xmm7 3496 3497 movlpd 2 * SIZE(AO), %xmm2 3498 movhpd 2 * SIZE(AO), %xmm2 3499 mulpd %xmm5, %xmm2 3500 subpd %xmm2, %xmm1 3501 movlpd 2 * SIZE(AO), %xmm2 3502 movhpd 2 * SIZE(AO), %xmm2 3503 mulpd %xmm7, %xmm2 3504 subpd %xmm2, %xmm3 3505 3506 movlpd 0 * SIZE(AO), %xmm0 3507 movhpd 0 * SIZE(AO), %xmm0 3508 mulpd 
%xmm0, %xmm1 3509 mulpd %xmm0, %xmm3 3510#endif 3511 3512#ifdef LT 3513 movlpd 0 * SIZE(AO), %xmm0 3514 movhpd 0 * SIZE(AO), %xmm0 3515 mulpd %xmm0, %xmm1 3516 mulpd %xmm0, %xmm3 3517 3518 movlpd 1 * SIZE(AO), %xmm2 3519 movhpd 1 * SIZE(AO), %xmm2 3520 mulpd %xmm1, %xmm2 3521 subpd %xmm2, %xmm5 3522 movlpd 1 * SIZE(AO), %xmm2 3523 movhpd 1 * SIZE(AO), %xmm2 3524 mulpd %xmm3, %xmm2 3525 subpd %xmm2, %xmm7 3526 3527 movlpd 3 * SIZE(AO), %xmm0 3528 movhpd 3 * SIZE(AO), %xmm0 3529 mulpd %xmm0, %xmm5 3530 mulpd %xmm0, %xmm7 3531#endif 3532 3533#ifdef RN 3534 movlpd 0 * SIZE(B), %xmm0 3535 movhpd 0 * SIZE(B), %xmm0 3536 mulpd %xmm0, %xmm8 3537 3538 movlpd 1 * SIZE(B), %xmm1 3539 movhpd 1 * SIZE(B), %xmm1 3540 mulpd %xmm8, %xmm1 3541 subpd %xmm1, %xmm10 3542 movlpd 2 * SIZE(B), %xmm2 3543 movhpd 2 * SIZE(B), %xmm2 3544 mulpd %xmm8, %xmm2 3545 subpd %xmm2, %xmm12 3546 movlpd 3 * SIZE(B), %xmm3 3547 movhpd 3 * SIZE(B), %xmm3 3548 mulpd %xmm8, %xmm3 3549 subpd %xmm3, %xmm14 3550 3551 movlpd 5 * SIZE(B), %xmm0 3552 movhpd 5 * SIZE(B), %xmm0 3553 mulpd %xmm0, %xmm10 3554 movlpd 6 * SIZE(B), %xmm1 3555 movhpd 6 * SIZE(B), %xmm1 3556 mulpd %xmm10, %xmm1 3557 subpd %xmm1, %xmm12 3558 movlpd 7 * SIZE(B), %xmm2 3559 movhpd 7 * SIZE(B), %xmm2 3560 mulpd %xmm10, %xmm2 3561 subpd %xmm2, %xmm14 3562 3563 movlpd 10 * SIZE(B), %xmm0 3564 movhpd 10 * SIZE(B), %xmm0 3565 mulpd %xmm0, %xmm12 3566 3567 movlpd 11 * SIZE(B), %xmm1 3568 movhpd 11 * SIZE(B), %xmm1 3569 mulpd %xmm12, %xmm1 3570 subpd %xmm1, %xmm14 3571 3572 movlpd 15 * SIZE(B), %xmm0 3573 movhpd 15 * SIZE(B), %xmm0 3574 mulpd %xmm0, %xmm14 3575#endif 3576 3577#ifdef RT 3578 movlpd 15 * SIZE(B), %xmm0 3579 movhpd 15 * SIZE(B), %xmm0 3580 mulpd %xmm0, %xmm14 3581 3582 movlpd 14 * SIZE(B), %xmm1 3583 movhpd 14 * SIZE(B), %xmm1 3584 mulpd %xmm14, %xmm1 3585 subpd %xmm1, %xmm12 3586 movlpd 13 * SIZE(B), %xmm2 3587 movhpd 13 * SIZE(B), %xmm2 3588 mulpd %xmm14, %xmm2 3589 subpd %xmm2, %xmm10 3590 movlpd 12 * SIZE(B), %xmm3 3591 movhpd 
12 * SIZE(B), %xmm3 3592 mulpd %xmm14, %xmm3 3593 subpd %xmm3, %xmm8 3594 3595 movlpd 10 * SIZE(B), %xmm0 3596 movhpd 10 * SIZE(B), %xmm0 3597 mulpd %xmm0, %xmm12 3598 movlpd 9 * SIZE(B), %xmm1 3599 movhpd 9 * SIZE(B), %xmm1 3600 mulpd %xmm12, %xmm1 3601 subpd %xmm1, %xmm10 3602 movlpd 8 * SIZE(B), %xmm2 3603 movhpd 8 * SIZE(B), %xmm2 3604 mulpd %xmm12, %xmm2 3605 subpd %xmm2, %xmm8 3606 3607 movlpd 5 * SIZE(B), %xmm0 3608 movhpd 5 * SIZE(B), %xmm0 3609 mulpd %xmm0, %xmm10 3610 movlpd 4 * SIZE(B), %xmm1 3611 movhpd 4 * SIZE(B), %xmm1 3612 mulpd %xmm10, %xmm1 3613 subpd %xmm1, %xmm8 3614 3615 movlpd 0 * SIZE(B), %xmm0 3616 movhpd 0 * SIZE(B), %xmm0 3617 mulpd %xmm0, %xmm8 3618#endif 3619 3620#ifdef LN 3621 subq $2 * SIZE, CO1 3622 subq $2 * SIZE, CO2 3623#endif 3624 3625#if defined(LN) || defined(LT) 3626 movsd %xmm1, 0 * SIZE(CO1) 3627 movsd %xmm5, 1 * SIZE(CO1) 3628 3629 movhpd %xmm1, 0 * SIZE(CO2) 3630 movhpd %xmm5, 1 * SIZE(CO2) 3631 3632 movsd %xmm3, 0 * SIZE(CO1, LDC, 2) 3633 movsd %xmm7, 1 * SIZE(CO1, LDC, 2) 3634 3635 movhpd %xmm3, 0 * SIZE(CO2, LDC, 2) 3636 movhpd %xmm7, 1 * SIZE(CO2, LDC, 2) 3637#else 3638 movsd %xmm8, 0 * SIZE(CO1) 3639 movhpd %xmm8, 1 * SIZE(CO1) 3640 3641 movsd %xmm10, 0 * SIZE(CO2) 3642 movhpd %xmm10, 1 * SIZE(CO2) 3643 3644 movsd %xmm12, 0 * SIZE(CO1, LDC, 2) 3645 movhpd %xmm12, 1 * SIZE(CO1, LDC, 2) 3646 3647 movsd %xmm14, 0 * SIZE(CO2, LDC, 2) 3648 movhpd %xmm14, 1 * SIZE(CO2, LDC, 2) 3649#endif 3650 3651#if defined(LN) || defined(LT) 3652 movapd %xmm1, 0 * SIZE(B) 3653 movapd %xmm3, 2 * SIZE(B) 3654 movapd %xmm5, 4 * SIZE(B) 3655 movapd %xmm7, 6 * SIZE(B) 3656 3657 movlpd %xmm1, 0 * SIZE(BO) 3658 movlpd %xmm1, 1 * SIZE(BO) 3659 movhpd %xmm1, 2 * SIZE(BO) 3660 movhpd %xmm1, 3 * SIZE(BO) 3661 movlpd %xmm3, 4 * SIZE(BO) 3662 movlpd %xmm3, 5 * SIZE(BO) 3663 movhpd %xmm3, 6 * SIZE(BO) 3664 movhpd %xmm3, 7 * SIZE(BO) 3665 movlpd %xmm5, 8 * SIZE(BO) 3666 movlpd %xmm5, 9 * SIZE(BO) 3667 movhpd %xmm5, 10 * SIZE(BO) 3668 movhpd %xmm5, 11 * 
SIZE(BO) 3669 movlpd %xmm7, 12 * SIZE(BO) 3670 movlpd %xmm7, 13 * SIZE(BO) 3671 movhpd %xmm7, 14 * SIZE(BO) 3672 movhpd %xmm7, 15 * SIZE(BO) 3673#else 3674 movapd %xmm8, 0 * SIZE(AO) 3675 movapd %xmm10, 2 * SIZE(AO) 3676 movapd %xmm12, 4 * SIZE(AO) 3677 movapd %xmm14, 6 * SIZE(AO) 3678#endif 3679 3680#ifndef LN 3681 addq $2 * SIZE, CO1 3682 addq $2 * SIZE, CO2 3683#endif 3684 3685#if defined(LT) || defined(RN) 3686 movq K, %rax 3687 subq KK, %rax 3688 leaq (,%rax, SIZE), %rax 3689 leaq (AO, %rax, 2), AO 3690#ifdef LT 3691 addq $8 * SIZE, B 3692#endif 3693#endif 3694 3695#ifdef LN 3696 subq $2, KK 3697 movq BORIG, B 3698#endif 3699 3700#ifdef LT 3701 addq $2, KK 3702#endif 3703 3704#ifdef RT 3705 movq K, %rax 3706 movq BORIG, B 3707 salq $1 + BASE_SHIFT, %rax 3708 addq %rax, AORIG 3709#endif 3710 ALIGN_4 3711 3712.L30: 3713 testq $1, M 3714 je .L39 3715 ALIGN_4 3716 3717.L31: 3718#ifdef LN 3719 movq K, %rax 3720 salq $0 + BASE_SHIFT, %rax 3721 subq %rax, AORIG 3722#endif 3723 3724#if defined(LN) || defined(RT) 3725 movq KK, %rax 3726 movq AORIG, AO 3727 leaq (, %rax, SIZE), %rax 3728 leaq (AO, %rax, 1), AO 3729#endif 3730 3731 leaq BUFFER, BO 3732 3733#if defined(LN) || defined(RT) 3734 movq KK, %rax 3735 salq $2 + BASE_SHIFT, %rax 3736 leaq (BO, %rax, 2), BO 3737#endif 3738 3739 movsd 0 * SIZE(AO), %xmm8 3740 pxor %xmm0, %xmm0 3741 movsd 0 * SIZE(BO), %xmm9 3742 pxor %xmm1, %xmm1 3743 movsd 8 * SIZE(AO), %xmm10 3744 pxor %xmm2, %xmm2 3745 movsd 8 * SIZE(BO), %xmm11 3746 pxor %xmm3, %xmm3 3747 3748 movsd 16 * SIZE(BO), %xmm13 3749 movsd 24 * SIZE(BO), %xmm15 3750 3751#if defined(LT) || defined(RN) 3752 movq KK, %rax 3753#else 3754 movq K, %rax 3755 subq KK, %rax 3756#endif 3757 sarq $3, %rax 3758 je .L35 3759 ALIGN_4 3760 3761.L32: 3762 mulsd %xmm8, %xmm9 3763 addsd %xmm9, %xmm0 3764 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3765 movsd 2 * SIZE(BO), %xmm9 3766 mulsd %xmm8, %xmm9 3767 addsd %xmm9, %xmm1 3768 movsd 4 * SIZE(BO), %xmm9 3769 mulsd %xmm8, %xmm9 3770 mulsd 6 
* SIZE(BO), %xmm8 3771 addsd %xmm9, %xmm2 3772 movsd 32 * SIZE(BO), %xmm9 3773 addsd %xmm8, %xmm3 3774 movsd 1 * SIZE(AO), %xmm8 3775 3776 mulsd %xmm8, %xmm11 3777 addsd %xmm11, %xmm0 3778 movsd 10 * SIZE(BO), %xmm11 3779 mulsd %xmm8, %xmm11 3780 addsd %xmm11, %xmm1 3781 movsd 12 * SIZE(BO), %xmm11 3782 mulsd %xmm8, %xmm11 3783 mulsd 14 * SIZE(BO), %xmm8 3784 addsd %xmm11, %xmm2 3785 movsd 40 * SIZE(BO), %xmm11 3786 addsd %xmm8, %xmm3 3787 movsd 2 * SIZE(AO), %xmm8 3788 3789 mulsd %xmm8, %xmm13 3790 addsd %xmm13, %xmm0 3791 movsd 18 * SIZE(BO), %xmm13 3792 mulsd %xmm8, %xmm13 3793 addsd %xmm13, %xmm1 3794 movsd 20 * SIZE(BO), %xmm13 3795 mulsd %xmm8, %xmm13 3796 mulsd 22 * SIZE(BO), %xmm8 3797 addsd %xmm13, %xmm2 3798 movsd 48 * SIZE(BO), %xmm13 3799 addsd %xmm8, %xmm3 3800 movsd 3 * SIZE(AO), %xmm8 3801 3802 mulsd %xmm8, %xmm15 3803 addsd %xmm15, %xmm0 3804 movsd 26 * SIZE(BO), %xmm15 3805 mulsd %xmm8, %xmm15 3806 addsd %xmm15, %xmm1 3807 movsd 28 * SIZE(BO), %xmm15 3808 mulsd %xmm8, %xmm15 3809 mulsd 30 * SIZE(BO), %xmm8 3810 addsd %xmm15, %xmm2 3811 movsd 56 * SIZE(BO), %xmm15 3812 addsd %xmm8, %xmm3 3813 movsd 4 * SIZE(AO), %xmm8 3814 3815 mulsd %xmm8, %xmm9 3816 addsd %xmm9, %xmm0 3817 movsd 34 * SIZE(BO), %xmm9 3818 mulsd %xmm8, %xmm9 3819 addsd %xmm9, %xmm1 3820 movsd 36 * SIZE(BO), %xmm9 3821 mulsd %xmm8, %xmm9 3822 mulsd 38 * SIZE(BO), %xmm8 3823 addsd %xmm9, %xmm2 3824 movsd 64 * SIZE(BO), %xmm9 3825 addsd %xmm8, %xmm3 3826 movsd 5 * SIZE(AO), %xmm8 3827 3828 mulsd %xmm8, %xmm11 3829 addsd %xmm11, %xmm0 3830 movsd 42 * SIZE(BO), %xmm11 3831 mulsd %xmm8, %xmm11 3832 addsd %xmm11, %xmm1 3833 movsd 44 * SIZE(BO), %xmm11 3834 mulsd %xmm8, %xmm11 3835 mulsd 46 * SIZE(BO), %xmm8 3836 addsd %xmm11, %xmm2 3837 movsd 72 * SIZE(BO), %xmm11 3838 addsd %xmm8, %xmm3 3839 movsd 6 * SIZE(AO), %xmm8 3840 3841 mulsd %xmm8, %xmm13 3842 addsd %xmm13, %xmm0 3843 movsd 50 * SIZE(BO), %xmm13 3844 mulsd %xmm8, %xmm13 3845 addsd %xmm13, %xmm1 3846 movsd 52 * SIZE(BO), %xmm13 
3847 mulsd %xmm8, %xmm13 3848 mulsd 54 * SIZE(BO), %xmm8 3849 addsd %xmm13, %xmm2 3850 movsd 80 * SIZE(BO), %xmm13 3851 addsd %xmm8, %xmm3 3852 movsd 7 * SIZE(AO), %xmm8 3853 3854 mulsd %xmm8, %xmm15 3855 addsd %xmm15, %xmm0 3856 movsd 58 * SIZE(BO), %xmm15 3857 mulsd %xmm8, %xmm15 3858 addsd %xmm15, %xmm1 3859 movsd 60 * SIZE(BO), %xmm15 3860 mulsd %xmm8, %xmm15 3861 mulsd 62 * SIZE(BO), %xmm8 3862 addsd %xmm15, %xmm2 3863 movsd 88 * SIZE(BO), %xmm15 3864 addsd %xmm8, %xmm3 3865 movsd 8 * SIZE(AO), %xmm8 3866 3867 addq $ 8 * SIZE, AO 3868 addq $64 * SIZE, BO 3869 decq %rax 3870 jne .L32 3871 ALIGN_4 3872 3873.L35: 3874#if defined(LT) || defined(RN) 3875 movq KK, %rax 3876#else 3877 movq K, %rax 3878 subq KK, %rax 3879#endif 3880 andq $7, %rax # if (k & 1) 3881 BRANCH 3882 je .L38 3883 ALIGN_4 3884 3885.L36: 3886 mulsd %xmm8, %xmm9 3887 addsd %xmm9, %xmm0 3888 movsd 2 * SIZE(BO), %xmm9 3889 mulsd %xmm8, %xmm9 3890 addsd %xmm9, %xmm1 3891 movsd 4 * SIZE(BO), %xmm9 3892 mulsd %xmm8, %xmm9 3893 mulsd 6 * SIZE(BO), %xmm8 3894 addsd %xmm9, %xmm2 3895 movsd 8 * SIZE(BO), %xmm9 3896 addsd %xmm8, %xmm3 3897 movsd 1 * SIZE(AO), %xmm8 3898 3899 addq $1 * SIZE, AO # aoffset += 4 3900 addq $8 * SIZE, BO # boffset1 += 8 3901 decq %rax 3902 jg .L36 3903 ALIGN_4 3904 3905.L38: 3906#if defined(LN) || defined(RT) 3907 movq KK, %rax 3908#ifdef LN 3909 subq $1, %rax 3910#else 3911 subq $4, %rax 3912#endif 3913 3914 movq AORIG, AO 3915 movq BORIG, B 3916 leaq BUFFER, BO 3917 3918 leaq (, %rax, SIZE), %rax 3919 leaq (AO, %rax, 1), AO 3920 leaq (B, %rax, 4), B 3921 leaq (BO, %rax, 8), BO 3922#endif 3923 3924#if defined(LN) || defined(LT) 3925 movsd 0 * SIZE(B), %xmm4 3926 movsd 1 * SIZE(B), %xmm5 3927 movsd 2 * SIZE(B), %xmm6 3928 movsd 3 * SIZE(B), %xmm7 3929#else 3930 movsd 0 * SIZE(AO), %xmm4 3931 movsd 1 * SIZE(AO), %xmm5 3932 movsd 2 * SIZE(AO), %xmm6 3933 movsd 3 * SIZE(AO), %xmm7 3934#endif 3935 3936 subsd %xmm0, %xmm4 3937 subsd %xmm1, %xmm5 3938 subsd %xmm2, %xmm6 3939 subsd 
%xmm3, %xmm7 3940 3941#ifdef LN 3942 movsd 0 * SIZE(AO), %xmm0 3943 3944 mulsd %xmm0, %xmm4 3945 mulsd %xmm0, %xmm5 3946 mulsd %xmm0, %xmm6 3947 mulsd %xmm0, %xmm7 3948#endif 3949 3950#ifdef LT 3951 movsd 0 * SIZE(AO), %xmm0 3952 3953 mulsd %xmm0, %xmm4 3954 mulsd %xmm0, %xmm5 3955 mulsd %xmm0, %xmm6 3956 mulsd %xmm0, %xmm7 3957#endif 3958 3959#ifdef RN 3960 mulsd 0 * SIZE(B), %xmm4 3961 movlpd 1 * SIZE(B), %xmm1 3962 mulsd %xmm4, %xmm1 3963 subsd %xmm1, %xmm5 3964 movlpd 2 * SIZE(B), %xmm2 3965 mulsd %xmm4, %xmm2 3966 subsd %xmm2, %xmm6 3967 movlpd 3 * SIZE(B), %xmm3 3968 mulsd %xmm4, %xmm3 3969 subsd %xmm3, %xmm7 3970 3971 mulsd 5 * SIZE(B), %xmm5 3972 movlpd 6 * SIZE(B), %xmm1 3973 mulsd %xmm5, %xmm1 3974 subsd %xmm1, %xmm6 3975 movlpd 7 * SIZE(B), %xmm2 3976 mulsd %xmm5, %xmm2 3977 subsd %xmm2, %xmm7 3978 3979 mulsd 10 * SIZE(B), %xmm6 3980 movlpd 11 * SIZE(B), %xmm1 3981 mulsd %xmm6, %xmm1 3982 subsd %xmm1, %xmm7 3983 3984 mulsd 15 * SIZE(B), %xmm7 3985#endif 3986 3987#ifdef RT 3988 mulsd 15 * SIZE(B), %xmm7 3989 3990 movlpd 14 * SIZE(B), %xmm1 3991 mulsd %xmm7, %xmm1 3992 subsd %xmm1, %xmm6 3993 movlpd 13 * SIZE(B), %xmm2 3994 mulsd %xmm7, %xmm2 3995 subsd %xmm2, %xmm5 3996 movlpd 12 * SIZE(B), %xmm3 3997 mulsd %xmm7, %xmm3 3998 subsd %xmm3, %xmm4 3999 4000 mulsd 10 * SIZE(B), %xmm6 4001 4002 movlpd 9 * SIZE(B), %xmm1 4003 mulsd %xmm6, %xmm1 4004 subsd %xmm1, %xmm5 4005 movlpd 8 * SIZE(B), %xmm2 4006 mulsd %xmm6, %xmm2 4007 subsd %xmm2, %xmm4 4008 4009 mulsd 5 * SIZE(B), %xmm5 4010 4011 movlpd 4 * SIZE(B), %xmm1 4012 mulsd %xmm5, %xmm1 4013 subsd %xmm1, %xmm4 4014 4015 mulsd 0 * SIZE(B), %xmm4 4016#endif 4017 4018#ifdef LN 4019 subq $1 * SIZE, CO1 4020 subq $1 * SIZE, CO2 4021#endif 4022 4023 movsd %xmm4, 0 * SIZE(CO1) 4024 movsd %xmm5, 0 * SIZE(CO2) 4025 movsd %xmm6, 0 * SIZE(CO1, LDC, 2) 4026 movsd %xmm7, 0 * SIZE(CO2, LDC, 2) 4027 4028#if defined(LN) || defined(LT) 4029 movsd %xmm4, 0 * SIZE(B) 4030 movsd %xmm5, 1 * SIZE(B) 4031 movsd %xmm6, 2 * SIZE(B) 
4032 movsd %xmm7, 3 * SIZE(B) 4033 4034 movsd %xmm4, 0 * SIZE(BO) 4035 movsd %xmm4, 1 * SIZE(BO) 4036 movsd %xmm5, 2 * SIZE(BO) 4037 movsd %xmm5, 3 * SIZE(BO) 4038 movsd %xmm6, 4 * SIZE(BO) 4039 movsd %xmm6, 5 * SIZE(BO) 4040 movsd %xmm7, 6 * SIZE(BO) 4041 movsd %xmm7, 7 * SIZE(BO) 4042#else 4043 movsd %xmm4, 0 * SIZE(AO) 4044 movsd %xmm5, 1 * SIZE(AO) 4045 movsd %xmm6, 2 * SIZE(AO) 4046 movsd %xmm7, 3 * SIZE(AO) 4047#endif 4048 4049#ifndef LN 4050 addq $1 * SIZE, CO1 4051 addq $1 * SIZE, CO2 4052#endif 4053 4054#if defined(LT) || defined(RN) 4055 movq K, %rax 4056 subq KK, %rax 4057 leaq (,%rax, SIZE), %rax 4058 leaq (AO, %rax, 1), AO 4059#ifdef LT 4060 addq $4 * SIZE, B 4061#endif 4062#endif 4063 4064#ifdef LN 4065 subq $1, KK 4066 movq BORIG, B 4067#endif 4068 4069#ifdef LT 4070 addq $1, KK 4071#endif 4072 4073#ifdef RT 4074 movq K, %rax 4075 movq BORIG, B 4076 salq $0 + BASE_SHIFT, %rax 4077 addq %rax, AORIG 4078#endif 4079 ALIGN_4 4080 4081.L39: 4082#ifdef LN 4083 leaq (, K, SIZE), %rax 4084 leaq (B, %rax, 4), B 4085#endif 4086 4087#if defined(LT) || defined(RN) 4088 movq K, %rax 4089 subq KK, %rax 4090 leaq (,%rax, SIZE), %rax 4091 leaq (B, %rax, 4), B 4092#endif 4093 4094#ifdef RN 4095 addq $4, KK 4096#endif 4097 4098#ifdef RT 4099 subq $4, KK 4100#endif 4101 4102 decq J # j -- 4103 jg .L01 4104 ALIGN_4 4105 4106.L999: 4107 movq %rbx, %rsp 4108 4109 movq 0(%rsp), %rbx 4110 movq 8(%rsp), %rbp 4111 movq 16(%rsp), %r12 4112 movq 24(%rsp), %r13 4113 movq 32(%rsp), %r14 4114 movq 40(%rsp), %r15 4115 4116#ifdef WINDOWS_ABI 4117 movq 48(%rsp), %rdi 4118 movq 56(%rsp), %rsi 4119 movups 64(%rsp), %xmm6 4120 movups 80(%rsp), %xmm7 4121 movups 96(%rsp), %xmm8 4122 movups 112(%rsp), %xmm9 4123 movups 128(%rsp), %xmm10 4124 movups 144(%rsp), %xmm11 4125 movups 160(%rsp), %xmm12 4126 movups 176(%rsp), %xmm13 4127 movups 192(%rsp), %xmm14 4128 movups 208(%rsp), %xmm15 4129#endif 4130 4131 addq $STACKSIZE, %rsp 4132 ret 4133 4134 EPILOGUE 4135