1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Incoming integer-register arguments (SysV: m in rdi, n in rsi).
   They are copied into callee-saved registers early because rdi/rsi
   are reused below as the AO/BO panel pointers. */
#define OLD_M %rdi
#define OLD_N %rsi

/* Register roles held for the whole kernel. */
#define M %r13
#define N %r14
#define K %rdx
#define A %rcx        /* A panel base */
#define B %r8         /* B panel base */
#define C %r9         /* C matrix base */
#define LDC %r10      /* leading dimension of C (scaled by ZBASE_SHIFT later) */

#define I %r11        /* m-loop counter */
#define J %r12        /* n-loop counter */
#define AO %rdi       /* current position in packed A */
#define BO %rsi       /* current position in the B buffer */
#define CO1 %r15      /* pointer to C, first column of the 2-wide strip */
#define CO2 %rbp      /* pointer to C, second column of the strip */

#ifndef WINDOWS_ABI

#define STACKSIZE 64

/* SysV: arguments 7+ arrive on the stack, above our save area. */
#define OLD_LDC 8 + STACKSIZE(%rsp)
#define OLD_OFFSET 16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

/* Windows x64: most arguments are read from the caller's stack area. */
#define OLD_ALPHA_I 40 + STACKSIZE(%rsp)
#define OLD_A 48 + STACKSIZE(%rsp)
#define OLD_B 56 + STACKSIZE(%rsp)
#define OLD_C 64 + STACKSIZE(%rsp)
#define OLD_LDC 72 + STACKSIZE(%rsp)
#define OLD_OFFSET 80 + STACKSIZE(%rsp)

#endif

/* Scratch slots on the re-aligned local stack frame. */
#define POSINV 0(%rsp)    /* {0, sign-bit} mask built in the prologue; used to
                             flip the sign of imaginary parts */
#define ALPHA_R 16(%rsp)
#define ALPHA_I 32(%rsp)
#define OFFSET 40(%rsp)
#define KK 48(%rsp)
#define KKK 56(%rsp)
#define AORIG 64(%rsp)    /* saved A position for LN/RT traversal */
#define BORIG 72(%rsp)    /* saved B position while BO walks the buffer */
#define BUFFER 128(%rsp)  /* B repacked with every scalar duplicated into a pair */

/* Prefetch flavor per micro-architecture. */
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif

#ifdef GENERIC
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0          /* prefetchw is AMD-specific */
#define PREFETCHNTA prefetchnta
#define PREFETCHSIZE (8 * 6 + 4)
#endif

/* KERNEL1..KERNEL8 together form one fully-unrolled group of the 2x2
   inner product loop.  Each macro multiplies one A operand
   (xmm8/xmm10/xmm12/xmm14) against four duplicated B values and folds
   the products into the eight accumulators xmm0..xmm7, while already
   loading the A/B operands for the next step (software pipelining —
   the exact instruction order is deliberate).  (xx) is the unroll
   offset in elements. */
#define KERNEL1(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	 0 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	 2 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	 4 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	 8 * SIZE + 1 * (xx) * SIZE(AO), %xmm8

/* Second A operand (xmm10) into accumulators xmm4..xmm7. */
#define KERNEL2(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	 6 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	10 * SIZE + 1 * (xx) * SIZE(AO), %xmm10

/* Third A operand (xmm12) against the next B group; B operands for this
   half of the pipeline are staged in xmm15/xmm11/xmm13. */
#define KERNEL3(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	 8 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	12 * SIZE + 1 * (xx) * SIZE(AO), %xmm12

/* Fourth A operand (xmm14) into xmm4..xmm7. */
#define KERNEL4(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	14 * SIZE + 1 * (xx) * SIZE(AO), %xmm14

/* Second half of the unrolled group: same dataflow as KERNEL1..4 but
   shifted 8 A-elements / 16 B-elements further into the panels.  The
   prefetch here targets PREFETCHSIZE + 8 to keep a steady stride. */
#define KERNEL5(xx) \
	mulpd	%xmm8, %xmm9 ;\
	addpd	%xmm9, %xmm0 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm8, %xmm11 ;\
	PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm8, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm8 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm8, %xmm3 ;\
	movapd	16 * SIZE + 1 * (xx) * SIZE(AO), %xmm8

/* Mirrors KERNEL2 at the shifted offsets. */
#define KERNEL6(xx) \
	mulpd	%xmm10, %xmm9 ;\
	addpd	%xmm9, %xmm4 ;\
	movapd	32 * SIZE + 2 * (xx) * SIZE(BO), %xmm9 ;\
	mulpd	%xmm10, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm10, %xmm13 ;\
	mulpd	22 * SIZE + 2 * (xx) * SIZE(BO), %xmm10 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm10, %xmm7 ;\
	movapd	18 * SIZE + 1 * (xx) * SIZE(AO), %xmm10

/* Mirrors KERNEL3 at the shifted offsets. */
#define KERNEL7(xx) \
	mulpd	%xmm12, %xmm15 ;\
	addpd	%xmm15, %xmm0 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm12, %xmm11 ;\
	addpd	%xmm11, %xmm1 ;\
	movapd	26 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm12, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm12 ;\
	addpd	%xmm13, %xmm2 ;\
	movapd	28 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm12, %xmm3 ;\
	movapd	20 * SIZE + 1 * (xx) * SIZE(AO), %xmm12

/* Mirrors KERNEL4 at the shifted offsets; closes the 8-step group. */
#define KERNEL8(xx) \
	mulpd	%xmm14, %xmm15 ;\
	addpd	%xmm15, %xmm4 ;\
	movapd	40 * SIZE + 2 * (xx) * SIZE(BO), %xmm15 ;\
	mulpd	%xmm14, %xmm11 ;\
	addpd	%xmm11, %xmm5 ;\
	movapd	34 * SIZE + 2 * (xx) * SIZE(BO), %xmm11 ;\
	mulpd	%xmm14, %xmm13 ;\
	mulpd	30 * SIZE + 2 * (xx) * SIZE(BO), %xmm14 ;\
	addpd	%xmm13, %xmm6 ;\
	movapd	36 * SIZE + 2 * (xx) * SIZE(BO), %xmm13 ;\
	addpd	%xmm14, %xmm7 ;\
	movapd	22 * SIZE + 1 * (xx) * SIZE(AO), %xmm14


/* Select the conjugation variant used by the sign-fix blocks below:
   NN when no conjugation is requested; with CONJ, pick CN or NC
   depending on whether the left (LN/LT) or right side is involved. */
#ifndef CONJ
#define NN
#else
#if defined(LN) || defined(LT)
#define CN
#else
#define NC
#endif
#endif

	PROLOGUE
	PROFCODE

	/* Reserve the fixed save area and preserve the callee-saved
	   integer registers (SysV: rbx, rbp, r12-r15). */
	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	/* Windows x64 additionally treats rdi, rsi and xmm6-xmm15 as
	   callee-saved, so spill them too. */
	movq	%rdi, 48(%rsp)
	movq	%rsi, 56(%rsp)
	movups	%xmm6,  64(%rsp)
	movups	%xmm7,  80(%rsp)
	movups	%xmm8,  96(%rsp)
	movups	%xmm9, 112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups
%xmm12, 160(%rsp) 233 movups %xmm13, 176(%rsp) 234 movups %xmm14, 192(%rsp) 235 movups %xmm15, 208(%rsp) 236 237 movq ARG1, OLD_M 238 movq ARG2, OLD_N 239 movq ARG3, K 240 movq OLD_A, A 241 movq OLD_B, B 242 movq OLD_C, C 243 movq OLD_LDC, LDC 244 movsd OLD_OFFSET, %xmm4 245 246 movaps %xmm3, %xmm0 247 248#else 249 movq OLD_LDC, LDC 250 movsd OLD_OFFSET, %xmm4 251 252#endif 253 254 movq %rsp, %rbx # save old stack 255 subq $128 + LOCAL_BUFFER_SIZE, %rsp 256 andq $-4096, %rsp # align stack 257 258 STACK_TOUCHING 259 260 movq OLD_M, M 261 movq OLD_N, N 262 263 pcmpeqb %xmm15, %xmm15 264 psllq $63, %xmm15 # Generate mask 265 pxor %xmm2, %xmm2 266 267 movlpd %xmm2, 0 + POSINV 268 movlpd %xmm15, 8 + POSINV 269 270 movlpd %xmm4, OFFSET 271 movlpd %xmm4, KK 272 273 salq $ZBASE_SHIFT, LDC 274 275#ifdef LN 276 movq M, %rax 277 salq $ZBASE_SHIFT, %rax 278 addq %rax, C 279 imulq K, %rax 280 addq %rax, A 281#endif 282 283#ifdef RT 284 movq N, %rax 285 salq $ZBASE_SHIFT, %rax 286 imulq K, %rax 287 addq %rax, B 288 289 movq N, %rax 290 imulq LDC, %rax 291 addq %rax, C 292#endif 293 294#ifdef RN 295 negq KK 296#endif 297 298#ifdef RT 299 movq N, %rax 300 subq OFFSET, %rax 301 movq %rax, KK 302#endif 303 304 movq N, J 305 sarq $1, J # j = (n >> 2) 306 jle .L100 307 ALIGN_4 308 309.L01: 310#ifdef LN 311 movq OFFSET, %rax 312 addq M, %rax 313 movq %rax, KK 314#endif 315 316 leaq BUFFER, BO 317 318#ifdef RT 319 movq K, %rax 320 salq $1 + ZBASE_SHIFT, %rax 321 subq %rax, B 322#endif 323 324#if defined(LN) || defined(RT) 325 movq KK, %rax 326 movq B, BORIG 327 salq $ZBASE_SHIFT, %rax 328 leaq (B, %rax, 2), B 329 leaq (BO, %rax, 4), BO 330#endif 331 332#if defined(LT) 333 movq OFFSET, %rax 334 movq %rax, KK 335#endif 336 337#if defined(LT) || defined(RN) 338 movq KK, %rax 339#else 340 movq K, %rax 341 subq KK, %rax 342#endif 343 sarq $2, %rax 344 jle .L03 345 346 addq %rax, %rax 347 ALIGN_4 348 349.L02: 350 PREFETCHNTA 56 * SIZE(B) 351 352 movlpd 0 * SIZE(B), %xmm0 353 movlpd 1 * 
SIZE(B), %xmm1 354 movlpd 2 * SIZE(B), %xmm2 355 movlpd 3 * SIZE(B), %xmm3 356 movlpd 4 * SIZE(B), %xmm4 357 movlpd 5 * SIZE(B), %xmm5 358 movlpd 6 * SIZE(B), %xmm6 359 movlpd 7 * SIZE(B), %xmm7 360 361 movlpd %xmm0, 0 * SIZE(BO) 362 movlpd %xmm0, 1 * SIZE(BO) 363 movlpd %xmm1, 2 * SIZE(BO) 364 movlpd %xmm1, 3 * SIZE(BO) 365 movlpd %xmm2, 4 * SIZE(BO) 366 movlpd %xmm2, 5 * SIZE(BO) 367 movlpd %xmm3, 6 * SIZE(BO) 368 movlpd %xmm3, 7 * SIZE(BO) 369 movlpd %xmm4, 8 * SIZE(BO) 370 movlpd %xmm4, 9 * SIZE(BO) 371 movlpd %xmm5, 10 * SIZE(BO) 372 movlpd %xmm5, 11 * SIZE(BO) 373 movlpd %xmm6, 12 * SIZE(BO) 374 movlpd %xmm6, 13 * SIZE(BO) 375 movlpd %xmm7, 14 * SIZE(BO) 376 movlpd %xmm7, 15 * SIZE(BO) 377 378 subq $-16 * SIZE, BO 379 addq $ 8 * SIZE, B 380 decq %rax 381 jne .L02 382 ALIGN_4 383 384.L03: 385#if defined(LT) || defined(RN) 386 movq KK, %rax 387#else 388 movq K, %rax 389 subq KK, %rax 390#endif 391 andq $3, %rax 392 BRANCH 393 jle .L05 394 ALIGN_4 395 396.L04: 397 movlpd 0 * SIZE(B), %xmm0 398 movlpd 1 * SIZE(B), %xmm1 399 movlpd 2 * SIZE(B), %xmm2 400 movlpd 3 * SIZE(B), %xmm3 401 402 movlpd %xmm0, 0 * SIZE(BO) 403 movlpd %xmm0, 1 * SIZE(BO) 404 movlpd %xmm1, 2 * SIZE(BO) 405 movlpd %xmm1, 3 * SIZE(BO) 406 movlpd %xmm2, 4 * SIZE(BO) 407 movlpd %xmm2, 5 * SIZE(BO) 408 movlpd %xmm3, 6 * SIZE(BO) 409 movlpd %xmm3, 7 * SIZE(BO) 410 411 addq $ 4 * SIZE, B 412 addq $ 8 * SIZE, BO 413 414 decq %rax 415 jne .L04 416 ALIGN_4 417 418.L05: 419#if defined(LT) || defined(RN) 420 movq A, AO 421#else 422 movq A, AORIG 423#endif 424 425#ifdef RT 426 leaq (, LDC, 2), %rax 427 subq %rax, C 428#endif 429 430 movq C, CO1 431 leaq (C, LDC, 1), CO2 432 433#ifndef RT 434 leaq (C, LDC, 2), C 435#endif 436 437 movq M, I 438 sarq $1, I # i = (m >> 2) 439 jle .L30 440 ALIGN_4 441 442.L10: 443#ifdef LN 444 movq K, %rax 445 salq $1 + ZBASE_SHIFT, %rax 446 subq %rax, AORIG 447#endif 448 449#if defined(LN) || defined(RT) 450 movq KK, %rax 451 movq AORIG, AO 452 salq $ZBASE_SHIFT, %rax 453 
leaq (AO, %rax, 2), AO 454#endif 455 456 leaq BUFFER, BO 457 458#if defined(LN) || defined(RT) 459 movq KK, %rax 460 salq $1 + ZBASE_SHIFT, %rax 461 leaq (BO, %rax, 2), BO 462#endif 463 464 movapd 0 * SIZE(AO), %xmm8 465 pxor %xmm0, %xmm0 466 movapd 2 * SIZE(AO), %xmm10 467 pxor %xmm1, %xmm1 468 movapd 4 * SIZE(AO), %xmm12 469 pxor %xmm2, %xmm2 470 movapd 6 * SIZE(AO), %xmm14 471 pxor %xmm3, %xmm3 472 473 movapd 0 * SIZE(BO), %xmm9 474 pxor %xmm4, %xmm4 475 movapd 2 * SIZE(BO), %xmm11 476 pxor %xmm5, %xmm5 477 movapd 4 * SIZE(BO), %xmm13 478 movapd 8 * SIZE(BO), %xmm15 479 480 PREFETCHW 4 * SIZE(CO1) 481 pxor %xmm6, %xmm6 482 PREFETCHW 4 * SIZE(CO2) 483 pxor %xmm7, %xmm7 484 485#if defined(LT) || defined(RN) 486 movq KK, %rax 487#else 488 movq K, %rax 489 subq KK, %rax 490#endif 491 andq $-8, %rax 492 salq $4, %rax 493 je .L15 494.L1X: 495 KERNEL1(16 * 0) 496 KERNEL2(16 * 0) 497 KERNEL3(16 * 0) 498 KERNEL4(16 * 0) 499 KERNEL5(16 * 0) 500 KERNEL6(16 * 0) 501 KERNEL7(16 * 0) 502 KERNEL8(16 * 0) 503 KERNEL1(16 * 1) 504 KERNEL2(16 * 1) 505 KERNEL3(16 * 1) 506 KERNEL4(16 * 1) 507 KERNEL5(16 * 1) 508 KERNEL6(16 * 1) 509 KERNEL7(16 * 1) 510 KERNEL8(16 * 1) 511 cmpq $64 * 2, %rax 512 jle .L12 513 KERNEL1(16 * 2) 514 KERNEL2(16 * 2) 515 KERNEL3(16 * 2) 516 KERNEL4(16 * 2) 517 KERNEL5(16 * 2) 518 KERNEL6(16 * 2) 519 KERNEL7(16 * 2) 520 KERNEL8(16 * 2) 521 KERNEL1(16 * 3) 522 KERNEL2(16 * 3) 523 KERNEL3(16 * 3) 524 KERNEL4(16 * 3) 525 KERNEL5(16 * 3) 526 KERNEL6(16 * 3) 527 KERNEL7(16 * 3) 528 KERNEL8(16 * 3) 529 cmpq $64 * 4, %rax 530 jle .L12 531 KERNEL1(16 * 4) 532 KERNEL2(16 * 4) 533 KERNEL3(16 * 4) 534 KERNEL4(16 * 4) 535 KERNEL5(16 * 4) 536 KERNEL6(16 * 4) 537 KERNEL7(16 * 4) 538 KERNEL8(16 * 4) 539 KERNEL1(16 * 5) 540 KERNEL2(16 * 5) 541 KERNEL3(16 * 5) 542 KERNEL4(16 * 5) 543 KERNEL5(16 * 5) 544 KERNEL6(16 * 5) 545 KERNEL7(16 * 5) 546 KERNEL8(16 * 5) 547 cmpq $64 * 6, %rax 548 jle .L12 549 KERNEL1(16 * 6) 550 KERNEL2(16 * 6) 551 KERNEL3(16 * 6) 552 KERNEL4(16 * 6) 553 
KERNEL5(16 * 6) 554 KERNEL6(16 * 6) 555 KERNEL7(16 * 6) 556 KERNEL8(16 * 6) 557 KERNEL1(16 * 7) 558 KERNEL2(16 * 7) 559 KERNEL3(16 * 7) 560 KERNEL4(16 * 7) 561 KERNEL5(16 * 7) 562 KERNEL6(16 * 7) 563 KERNEL7(16 * 7) 564 KERNEL8(16 * 7) 565 566 addq $16 * 8 * SIZE, AO 567 addq $32 * 8 * SIZE, BO 568 subq $64 * 8, %rax 569 jg .L1X 570 571.L12: 572 leaq (AO, %rax, 2), AO # * 16 573 leaq (BO, %rax, 4), BO # * 64 574 ALIGN_4 575 576.L15: 577#if defined(LT) || defined(RN) 578 movq KK, %rax 579#else 580 movq K, %rax 581 subq KK, %rax 582#endif 583 movapd POSINV, %xmm15 584 andq $7, %rax # if (k & 1) 585 BRANCH 586 je .L19 587 ALIGN_4 588 589.L16: 590 mulpd %xmm8, %xmm9 591 addpd %xmm9, %xmm0 592 movapd 2 * SIZE(BO), %xmm9 593 mulpd %xmm8, %xmm9 594 addpd %xmm9, %xmm1 595 movapd 4 * SIZE(BO), %xmm9 596 mulpd %xmm8, %xmm9 597 mulpd 6 * SIZE(BO), %xmm8 598 addpd %xmm9, %xmm2 599 movapd 0 * SIZE(BO), %xmm9 600 addpd %xmm8, %xmm3 601 movapd 4 * SIZE(AO), %xmm8 602 mulpd %xmm10, %xmm9 603 addpd %xmm9, %xmm4 604 movapd 2 * SIZE(BO), %xmm9 605 mulpd %xmm10, %xmm9 606 addpd %xmm9, %xmm5 607 movapd 4 * SIZE(BO), %xmm9 608 mulpd %xmm10, %xmm9 609 mulpd 6 * SIZE(BO), %xmm10 610 addpd %xmm9, %xmm6 611 movapd 8 * SIZE(BO), %xmm9 612 addpd %xmm10, %xmm7 613 movapd 6 * SIZE(AO), %xmm10 614 615 addq $4 * SIZE, AO # aoffset += 4 616 addq $8 * SIZE, BO # boffset1 += 8 617 decq %rax 618 jg .L16 619 ALIGN_4 620 621.L19: 622#if defined(LN) || defined(RT) 623 movq KK, %rax 624#ifdef LN 625 subq $2, %rax 626#else 627 subq $2, %rax 628#endif 629 630 movq AORIG, AO 631 movq BORIG, B 632 leaq BUFFER, BO 633 634 salq $ZBASE_SHIFT, %rax 635 leaq (AO, %rax, 2), AO 636 leaq (B, %rax, 2), B 637 leaq (BO, %rax, 4), BO 638#endif 639 640 SHUFPD_1 %xmm1, %xmm1 641 SHUFPD_1 %xmm3, %xmm3 642 SHUFPD_1 %xmm5, %xmm5 643 SHUFPD_1 %xmm7, %xmm7 644 645#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 646 defined(NR) || defined(NC) || defined(TR) || defined(TC) 647 xorpd %xmm15, %xmm1 648 xorpd 
%xmm15, %xmm3 649 xorpd %xmm15, %xmm5 650 xorpd %xmm15, %xmm7 651#else 652 xorpd %xmm15, %xmm0 653 xorpd %xmm15, %xmm2 654 xorpd %xmm15, %xmm4 655 xorpd %xmm15, %xmm6 656#endif 657 658#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 659 defined(RR) || defined(RC) || defined(CR) || defined(CC) 660 subpd %xmm1, %xmm0 661 subpd %xmm3, %xmm2 662 subpd %xmm5, %xmm4 663 subpd %xmm7, %xmm6 664#else 665 addpd %xmm1, %xmm0 666 addpd %xmm3, %xmm2 667 addpd %xmm5, %xmm4 668 addpd %xmm7, %xmm6 669#endif 670 671#if defined(LN) || defined(LT) 672 movapd 0 * SIZE(B), %xmm1 673 movapd 2 * SIZE(B), %xmm3 674 movapd 4 * SIZE(B), %xmm5 675 movapd 6 * SIZE(B), %xmm7 676 677 subpd %xmm0, %xmm1 678 subpd %xmm2, %xmm3 679 subpd %xmm4, %xmm5 680 subpd %xmm6, %xmm7 681#else 682 movapd 0 * SIZE(AO), %xmm1 683 movapd 2 * SIZE(AO), %xmm5 684 movapd 4 * SIZE(AO), %xmm3 685 movapd 6 * SIZE(AO), %xmm7 686 687 subpd %xmm0, %xmm1 688 subpd %xmm2, %xmm3 689 subpd %xmm4, %xmm5 690 subpd %xmm6, %xmm7 691#endif 692 693#ifndef CONJ 694 SHUFPD_1 %xmm15, %xmm15 695#endif 696 697#ifdef LN 698 movlpd 6 * SIZE(AO), %xmm8 699 movhpd 6 * SIZE(AO), %xmm8 700 movlpd 7 * SIZE(AO), %xmm9 701 movhpd 7 * SIZE(AO), %xmm9 702 movlpd 4 * SIZE(AO), %xmm10 703 movhpd 4 * SIZE(AO), %xmm10 704 movlpd 5 * SIZE(AO), %xmm11 705 movhpd 5 * SIZE(AO), %xmm11 706 movlpd 0 * SIZE(AO), %xmm12 707 movhpd 0 * SIZE(AO), %xmm12 708 movlpd 1 * SIZE(AO), %xmm13 709 movhpd 1 * SIZE(AO), %xmm13 710 711 pshufd $0x4e, %xmm5, %xmm4 712 pshufd $0x4e, %xmm7, %xmm6 713 714 xorpd %xmm15, %xmm4 715 xorpd %xmm15, %xmm6 716 717 mulpd %xmm8, %xmm5 718 mulpd %xmm9, %xmm4 719 mulpd %xmm8, %xmm7 720 mulpd %xmm9, %xmm6 721 722 addpd %xmm4, %xmm5 723 addpd %xmm6, %xmm7 724 725 movapd %xmm5, %xmm0 726 movapd %xmm7, %xmm2 727 pshufd $0x4e, %xmm5, %xmm4 728 pshufd $0x4e, %xmm7, %xmm6 729 730 xorpd %xmm15, %xmm4 731 xorpd %xmm15, %xmm6 732 733 mulpd %xmm10, %xmm0 734 mulpd %xmm10, %xmm2 735 mulpd %xmm11, %xmm4 736 mulpd %xmm11, %xmm6 737 738 
subpd %xmm0, %xmm1 739 subpd %xmm2, %xmm3 740 subpd %xmm4, %xmm1 741 subpd %xmm6, %xmm3 742 743 pshufd $0x4e, %xmm1, %xmm0 744 pshufd $0x4e, %xmm3, %xmm2 745 746 xorpd %xmm15, %xmm0 747 xorpd %xmm15, %xmm2 748 749 mulpd %xmm12, %xmm1 750 mulpd %xmm13, %xmm0 751 mulpd %xmm12, %xmm3 752 mulpd %xmm13, %xmm2 753 754 addpd %xmm0, %xmm1 755 addpd %xmm2, %xmm3 756#endif 757 758#ifdef LT 759 movlpd 0 * SIZE(AO), %xmm8 760 movhpd 0 * SIZE(AO), %xmm8 761 movlpd 1 * SIZE(AO), %xmm9 762 movhpd 1 * SIZE(AO), %xmm9 763 movlpd 2 * SIZE(AO), %xmm10 764 movhpd 2 * SIZE(AO), %xmm10 765 movlpd 3 * SIZE(AO), %xmm11 766 movhpd 3 * SIZE(AO), %xmm11 767 movlpd 6 * SIZE(AO), %xmm12 768 movhpd 6 * SIZE(AO), %xmm12 769 movlpd 7 * SIZE(AO), %xmm13 770 movhpd 7 * SIZE(AO), %xmm13 771 772 pshufd $0x4e, %xmm1, %xmm0 773 pshufd $0x4e, %xmm3, %xmm2 774 775 xorpd %xmm15, %xmm0 776 xorpd %xmm15, %xmm2 777 778 mulpd %xmm8, %xmm1 779 mulpd %xmm9, %xmm0 780 mulpd %xmm8, %xmm3 781 mulpd %xmm9, %xmm2 782 783 addpd %xmm0, %xmm1 784 addpd %xmm2, %xmm3 785 786 movapd %xmm1, %xmm0 787 movapd %xmm3, %xmm2 788 pshufd $0x4e, %xmm1, %xmm4 789 pshufd $0x4e, %xmm3, %xmm6 790 791 xorpd %xmm15, %xmm4 792 xorpd %xmm15, %xmm6 793 794 mulpd %xmm10, %xmm0 795 mulpd %xmm10, %xmm2 796 mulpd %xmm11, %xmm4 797 mulpd %xmm11, %xmm6 798 799 subpd %xmm0, %xmm5 800 subpd %xmm2, %xmm7 801 subpd %xmm4, %xmm5 802 subpd %xmm6, %xmm7 803 804 pshufd $0x4e, %xmm5, %xmm4 805 pshufd $0x4e, %xmm7, %xmm6 806 807 xorpd %xmm15, %xmm4 808 xorpd %xmm15, %xmm6 809 810 mulpd %xmm12, %xmm5 811 mulpd %xmm13, %xmm4 812 mulpd %xmm12, %xmm7 813 mulpd %xmm13, %xmm6 814 815 addpd %xmm4, %xmm5 816 addpd %xmm6, %xmm7 817#endif 818 819#ifdef RN 820 movlpd 0 * SIZE(B), %xmm8 821 movhpd 0 * SIZE(B), %xmm8 822 movlpd 1 * SIZE(B), %xmm9 823 movhpd 1 * SIZE(B), %xmm9 824 movlpd 2 * SIZE(B), %xmm10 825 movhpd 2 * SIZE(B), %xmm10 826 movlpd 3 * SIZE(B), %xmm11 827 movhpd 3 * SIZE(B), %xmm11 828 movlpd 6 * SIZE(B), %xmm12 829 movhpd 6 * SIZE(B), %xmm12 830 
movlpd 7 * SIZE(B), %xmm13 831 movhpd 7 * SIZE(B), %xmm13 832 833 pshufd $0x4e, %xmm1, %xmm0 834 pshufd $0x4e, %xmm5, %xmm4 835 836 xorpd %xmm15, %xmm0 837 xorpd %xmm15, %xmm4 838 839 mulpd %xmm8, %xmm1 840 mulpd %xmm9, %xmm0 841 mulpd %xmm8, %xmm5 842 mulpd %xmm9, %xmm4 843 844 addpd %xmm0, %xmm1 845 addpd %xmm4, %xmm5 846 847 movapd %xmm1, %xmm0 848 movapd %xmm5, %xmm2 849 pshufd $0x4e, %xmm1, %xmm4 850 pshufd $0x4e, %xmm5, %xmm6 851 852 xorpd %xmm15, %xmm4 853 xorpd %xmm15, %xmm6 854 855 mulpd %xmm10, %xmm0 856 mulpd %xmm10, %xmm2 857 mulpd %xmm11, %xmm4 858 mulpd %xmm11, %xmm6 859 860 subpd %xmm0, %xmm3 861 subpd %xmm2, %xmm7 862 subpd %xmm4, %xmm3 863 subpd %xmm6, %xmm7 864 865 pshufd $0x4e, %xmm3, %xmm2 866 pshufd $0x4e, %xmm7, %xmm6 867 868 xorpd %xmm15, %xmm2 869 xorpd %xmm15, %xmm6 870 871 mulpd %xmm12, %xmm3 872 mulpd %xmm13, %xmm2 873 mulpd %xmm12, %xmm7 874 mulpd %xmm13, %xmm6 875 876 addpd %xmm2, %xmm3 877 addpd %xmm6, %xmm7 878#endif 879 880#ifdef RT 881 movlpd 6 * SIZE(B), %xmm8 882 movhpd 6 * SIZE(B), %xmm8 883 movlpd 7 * SIZE(B), %xmm9 884 movhpd 7 * SIZE(B), %xmm9 885 movlpd 4 * SIZE(B), %xmm10 886 movhpd 4 * SIZE(B), %xmm10 887 movlpd 5 * SIZE(B), %xmm11 888 movhpd 5 * SIZE(B), %xmm11 889 movlpd 0 * SIZE(B), %xmm12 890 movhpd 0 * SIZE(B), %xmm12 891 movlpd 1 * SIZE(B), %xmm13 892 movhpd 1 * SIZE(B), %xmm13 893 894 pshufd $0x4e, %xmm3, %xmm2 895 pshufd $0x4e, %xmm7, %xmm6 896 897 xorpd %xmm15, %xmm2 898 xorpd %xmm15, %xmm6 899 900 mulpd %xmm8, %xmm3 901 mulpd %xmm9, %xmm2 902 mulpd %xmm8, %xmm7 903 mulpd %xmm9, %xmm6 904 905 addpd %xmm2, %xmm3 906 addpd %xmm6, %xmm7 907 908 movapd %xmm3, %xmm0 909 movapd %xmm7, %xmm2 910 pshufd $0x4e, %xmm3, %xmm4 911 pshufd $0x4e, %xmm7, %xmm6 912 913 xorpd %xmm15, %xmm4 914 xorpd %xmm15, %xmm6 915 916 mulpd %xmm10, %xmm0 917 mulpd %xmm10, %xmm2 918 mulpd %xmm11, %xmm4 919 mulpd %xmm11, %xmm6 920 921 subpd %xmm0, %xmm1 922 subpd %xmm2, %xmm5 923 subpd %xmm4, %xmm1 924 subpd %xmm6, %xmm5 925 926 pshufd $0x4e, 
%xmm1, %xmm0 927 pshufd $0x4e, %xmm5, %xmm4 928 929 xorpd %xmm15, %xmm0 930 xorpd %xmm15, %xmm4 931 932 mulpd %xmm12, %xmm1 933 mulpd %xmm13, %xmm0 934 mulpd %xmm12, %xmm5 935 mulpd %xmm13, %xmm4 936 937 addpd %xmm0, %xmm1 938 addpd %xmm4, %xmm5 939#endif 940 941#ifdef LN 942 subq $4 * SIZE, CO1 943 subq $4 * SIZE, CO2 944#endif 945 946 movsd %xmm1, 0 * SIZE(CO1) 947 movhpd %xmm1, 1 * SIZE(CO1) 948 movsd %xmm5, 2 * SIZE(CO1) 949 movhpd %xmm5, 3 * SIZE(CO1) 950 951 movsd %xmm3, 0 * SIZE(CO2) 952 movhpd %xmm3, 1 * SIZE(CO2) 953 movsd %xmm7, 2 * SIZE(CO2) 954 movhpd %xmm7, 3 * SIZE(CO2) 955 956#if defined(LN) || defined(LT) 957 movapd %xmm1, 0 * SIZE(B) 958 movapd %xmm3, 2 * SIZE(B) 959 movapd %xmm5, 4 * SIZE(B) 960 movapd %xmm7, 6 * SIZE(B) 961 962 movlpd %xmm1, 0 * SIZE(BO) 963 movlpd %xmm1, 1 * SIZE(BO) 964 movhpd %xmm1, 2 * SIZE(BO) 965 movhpd %xmm1, 3 * SIZE(BO) 966 movlpd %xmm3, 4 * SIZE(BO) 967 movlpd %xmm3, 5 * SIZE(BO) 968 movhpd %xmm3, 6 * SIZE(BO) 969 movhpd %xmm3, 7 * SIZE(BO) 970 movlpd %xmm5, 8 * SIZE(BO) 971 movlpd %xmm5, 9 * SIZE(BO) 972 movhpd %xmm5, 10 * SIZE(BO) 973 movhpd %xmm5, 11 * SIZE(BO) 974 movlpd %xmm7, 12 * SIZE(BO) 975 movlpd %xmm7, 13 * SIZE(BO) 976 movhpd %xmm7, 14 * SIZE(BO) 977 movhpd %xmm7, 15 * SIZE(BO) 978#else 979 movapd %xmm1, 0 * SIZE(AO) 980 movapd %xmm5, 2 * SIZE(AO) 981 movapd %xmm3, 4 * SIZE(AO) 982 movapd %xmm7, 6 * SIZE(AO) 983#endif 984 985#ifndef LN 986 addq $4 * SIZE, CO1 987 addq $4 * SIZE, CO2 988#endif 989 990#if defined(LT) || defined(RN) 991 movq K, %rax 992 subq KK, %rax 993 salq $ZBASE_SHIFT, %rax 994 leaq (AO, %rax, 2), AO 995#ifdef LT 996 addq $8 * SIZE, B 997#endif 998#endif 999 1000#ifdef LN 1001 subq $2, KK 1002 movq BORIG, B 1003#endif 1004 1005#ifdef LT 1006 addq $2, KK 1007#endif 1008 1009#ifdef RT 1010 movq K, %rax 1011 movq BORIG, B 1012 salq $1 + ZBASE_SHIFT, %rax 1013 addq %rax, AORIG 1014#endif 1015 1016 decq I # i -- 1017 jg .L10 1018 ALIGN_4 1019 1020.L30: 1021 testq $1, M 1022 jle .L99 1023 
1024#ifdef LN 1025 movq K, %rax 1026 salq $0 + ZBASE_SHIFT, %rax 1027 subq %rax, AORIG 1028#endif 1029 1030#if defined(LN) || defined(RT) 1031 movq KK, %rax 1032 movq AORIG, AO 1033 salq $ZBASE_SHIFT, %rax 1034 addq %rax, AO 1035#endif 1036 1037 leaq BUFFER, BO 1038 1039#if defined(LN) || defined(RT) 1040 movq KK, %rax 1041 salq $1 + ZBASE_SHIFT, %rax 1042 leaq (BO, %rax, 2), BO 1043#endif 1044 1045 pxor %xmm0, %xmm0 1046 pxor %xmm1, %xmm1 1047 pxor %xmm2, %xmm2 1048 pxor %xmm3, %xmm3 1049 1050#if defined(LT) || defined(RN) 1051 movq KK, %rax 1052#else 1053 movq K, %rax 1054 subq KK, %rax 1055#endif 1056 sarq $2, %rax 1057 je .L42 1058 1059.L41: 1060 movapd 0 * SIZE(AO), %xmm8 1061 1062 movapd 0 * SIZE(BO), %xmm9 1063 mulpd %xmm8, %xmm9 1064 addpd %xmm9, %xmm0 1065 1066 movapd 2 * SIZE(BO), %xmm9 1067 mulpd %xmm8, %xmm9 1068 addpd %xmm9, %xmm1 1069 1070 movapd 4 * SIZE(BO), %xmm9 1071 mulpd %xmm8, %xmm9 1072 addpd %xmm9, %xmm2 1073 1074 movapd 6 * SIZE(BO), %xmm9 1075 mulpd %xmm8, %xmm9 1076 addpd %xmm9, %xmm3 1077 1078 movapd 2 * SIZE(AO), %xmm8 1079 1080 movapd 8 * SIZE(BO), %xmm9 1081 mulpd %xmm8, %xmm9 1082 addpd %xmm9, %xmm0 1083 1084 movapd 10 * SIZE(BO), %xmm9 1085 mulpd %xmm8, %xmm9 1086 addpd %xmm9, %xmm1 1087 1088 movapd 12 * SIZE(BO), %xmm9 1089 mulpd %xmm8, %xmm9 1090 addpd %xmm9, %xmm2 1091 1092 movapd 14 * SIZE(BO), %xmm9 1093 mulpd %xmm8, %xmm9 1094 addpd %xmm9, %xmm3 1095 1096 movapd 4 * SIZE(AO), %xmm8 1097 1098 movapd 16 * SIZE(BO), %xmm9 1099 mulpd %xmm8, %xmm9 1100 addpd %xmm9, %xmm0 1101 1102 movapd 18 * SIZE(BO), %xmm9 1103 mulpd %xmm8, %xmm9 1104 addpd %xmm9, %xmm1 1105 1106 movapd 20 * SIZE(BO), %xmm9 1107 mulpd %xmm8, %xmm9 1108 addpd %xmm9, %xmm2 1109 1110 movapd 22 * SIZE(BO), %xmm9 1111 mulpd %xmm8, %xmm9 1112 addpd %xmm9, %xmm3 1113 1114 movapd 6 * SIZE(AO), %xmm8 1115 1116 movapd 24 * SIZE(BO), %xmm9 1117 mulpd %xmm8, %xmm9 1118 addpd %xmm9, %xmm0 1119 1120 movapd 26 * SIZE(BO), %xmm9 1121 mulpd %xmm8, %xmm9 1122 addpd %xmm9, %xmm1 
1123 1124 movapd 28 * SIZE(BO), %xmm9 1125 mulpd %xmm8, %xmm9 1126 addpd %xmm9, %xmm2 1127 1128 movapd 30 * SIZE(BO), %xmm9 1129 mulpd %xmm8, %xmm9 1130 addpd %xmm9, %xmm3 1131 1132 addq $ 8 * SIZE, AO 1133 addq $32 * SIZE, BO 1134 decq %rax 1135 jne .L41 1136 1137.L42: 1138#if defined(LT) || defined(RN) 1139 movq KK, %rax 1140#else 1141 movq K, %rax 1142 subq KK, %rax 1143#endif 1144 movapd POSINV, %xmm15 1145 andq $3, %rax # if (k & 1) 1146 BRANCH 1147 jle .L44 1148 1149.L43: 1150 movapd 0 * SIZE(AO), %xmm8 1151 1152 movapd 0 * SIZE(BO), %xmm9 1153 mulpd %xmm8, %xmm9 1154 addpd %xmm9, %xmm0 1155 1156 movapd 2 * SIZE(BO), %xmm9 1157 mulpd %xmm8, %xmm9 1158 addpd %xmm9, %xmm1 1159 1160 movapd 4 * SIZE(BO), %xmm9 1161 mulpd %xmm8, %xmm9 1162 addpd %xmm9, %xmm2 1163 1164 movapd 6 * SIZE(BO), %xmm9 1165 mulpd %xmm8, %xmm9 1166 addpd %xmm9, %xmm3 1167 1168 addq $2 * SIZE, AO # aoffset += 4 1169 addq $8 * SIZE, BO # boffset1 += 8 1170 1171 decq %rax 1172 jg .L43 1173 ALIGN_4 1174 1175.L44: 1176#if defined(LN) || defined(RT) 1177 movq KK, %rax 1178#ifdef LN 1179 subq $1, %rax 1180#else 1181 subq $2, %rax 1182#endif 1183 1184 movq AORIG, AO 1185 movq BORIG, B 1186 leaq BUFFER, BO 1187 1188 salq $ZBASE_SHIFT, %rax 1189 leaq (AO, %rax, 1), AO 1190 leaq (B, %rax, 2), B 1191 leaq (BO, %rax, 4), BO 1192#endif 1193 1194 SHUFPD_1 %xmm1, %xmm1 1195 SHUFPD_1 %xmm3, %xmm3 1196 1197#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1198 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1199 xorpd %xmm15, %xmm1 1200 xorpd %xmm15, %xmm3 1201#else 1202 xorpd %xmm15, %xmm0 1203 xorpd %xmm15, %xmm2 1204#endif 1205 1206#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1207 defined(RR) || defined(RC) || defined(CR) || defined(CC) 1208 subpd %xmm1, %xmm0 1209 subpd %xmm3, %xmm2 1210#else 1211 addpd %xmm1, %xmm0 1212 addpd %xmm3, %xmm2 1213#endif 1214 1215#if defined(LN) || defined(LT) 1216 movapd 0 * SIZE(B), %xmm1 1217 movapd 2 * SIZE(B), %xmm3 1218 
1219 subpd %xmm0, %xmm1 1220 subpd %xmm2, %xmm3 1221#else 1222 movapd 0 * SIZE(AO), %xmm1 1223 movapd 2 * SIZE(AO), %xmm3 1224 1225 subpd %xmm0, %xmm1 1226 subpd %xmm2, %xmm3 1227#endif 1228 1229#ifndef CONJ 1230 SHUFPD_1 %xmm15, %xmm15 1231#endif 1232 1233#if defined(LN) || defined(LT) 1234 movlpd 0 * SIZE(AO), %xmm8 1235 movhpd 0 * SIZE(AO), %xmm8 1236 movlpd 1 * SIZE(AO), %xmm9 1237 movhpd 1 * SIZE(AO), %xmm9 1238 1239 pshufd $0x4e, %xmm1, %xmm0 1240 pshufd $0x4e, %xmm3, %xmm2 1241 1242 xorpd %xmm15, %xmm0 1243 xorpd %xmm15, %xmm2 1244 1245 mulpd %xmm8, %xmm1 1246 mulpd %xmm9, %xmm0 1247 mulpd %xmm8, %xmm3 1248 mulpd %xmm9, %xmm2 1249 1250 addpd %xmm0, %xmm1 1251 addpd %xmm2, %xmm3 1252#endif 1253 1254#ifdef RN 1255 movlpd 0 * SIZE(B), %xmm8 1256 movhpd 0 * SIZE(B), %xmm8 1257 movlpd 1 * SIZE(B), %xmm9 1258 movhpd 1 * SIZE(B), %xmm9 1259 movlpd 2 * SIZE(B), %xmm10 1260 movhpd 2 * SIZE(B), %xmm10 1261 movlpd 3 * SIZE(B), %xmm11 1262 movhpd 3 * SIZE(B), %xmm11 1263 movlpd 6 * SIZE(B), %xmm12 1264 movhpd 6 * SIZE(B), %xmm12 1265 movlpd 7 * SIZE(B), %xmm13 1266 movhpd 7 * SIZE(B), %xmm13 1267 1268 pshufd $0x4e, %xmm1, %xmm0 1269 1270 xorpd %xmm15, %xmm0 1271 1272 mulpd %xmm8, %xmm1 1273 mulpd %xmm9, %xmm0 1274 1275 addpd %xmm0, %xmm1 1276 1277 movapd %xmm1, %xmm0 1278 pshufd $0x4e, %xmm1, %xmm4 1279 1280 xorpd %xmm15, %xmm4 1281 1282 mulpd %xmm10, %xmm0 1283 mulpd %xmm11, %xmm4 1284 1285 subpd %xmm0, %xmm3 1286 subpd %xmm4, %xmm3 1287 1288 pshufd $0x4e, %xmm3, %xmm2 1289 1290 xorpd %xmm15, %xmm2 1291 1292 mulpd %xmm12, %xmm3 1293 mulpd %xmm13, %xmm2 1294 1295 addpd %xmm2, %xmm3 1296#endif 1297 1298#ifdef RT 1299 movlpd 6 * SIZE(B), %xmm8 1300 movhpd 6 * SIZE(B), %xmm8 1301 movlpd 7 * SIZE(B), %xmm9 1302 movhpd 7 * SIZE(B), %xmm9 1303 movlpd 4 * SIZE(B), %xmm10 1304 movhpd 4 * SIZE(B), %xmm10 1305 movlpd 5 * SIZE(B), %xmm11 1306 movhpd 5 * SIZE(B), %xmm11 1307 movlpd 0 * SIZE(B), %xmm12 1308 movhpd 0 * SIZE(B), %xmm12 1309 movlpd 1 * SIZE(B), %xmm13 1310 movhpd 1 * 
SIZE(B), %xmm13 1311 1312 pshufd $0x4e, %xmm3, %xmm2 1313 1314 xorpd %xmm15, %xmm2 1315 1316 mulpd %xmm8, %xmm3 1317 mulpd %xmm9, %xmm2 1318 1319 addpd %xmm2, %xmm3 1320 1321 movapd %xmm3, %xmm0 1322 pshufd $0x4e, %xmm3, %xmm4 1323 1324 xorpd %xmm15, %xmm4 1325 1326 mulpd %xmm10, %xmm0 1327 mulpd %xmm11, %xmm4 1328 1329 subpd %xmm0, %xmm1 1330 subpd %xmm4, %xmm1 1331 1332 pshufd $0x4e, %xmm1, %xmm0 1333 1334 xorpd %xmm15, %xmm0 1335 1336 mulpd %xmm12, %xmm1 1337 mulpd %xmm13, %xmm0 1338 1339 addpd %xmm0, %xmm1 1340#endif 1341 1342#ifdef LN 1343 subq $2 * SIZE, CO1 1344 subq $2 * SIZE, CO2 1345#endif 1346 1347 movsd %xmm1, 0 * SIZE(CO1) 1348 movhpd %xmm1, 1 * SIZE(CO1) 1349 1350 movsd %xmm3, 0 * SIZE(CO2) 1351 movhpd %xmm3, 1 * SIZE(CO2) 1352 1353#if defined(LN) || defined(LT) 1354 movapd %xmm1, 0 * SIZE(B) 1355 movapd %xmm3, 2 * SIZE(B) 1356 1357 movlpd %xmm1, 0 * SIZE(BO) 1358 movlpd %xmm1, 1 * SIZE(BO) 1359 movhpd %xmm1, 2 * SIZE(BO) 1360 movhpd %xmm1, 3 * SIZE(BO) 1361 movlpd %xmm3, 4 * SIZE(BO) 1362 movlpd %xmm3, 5 * SIZE(BO) 1363 movhpd %xmm3, 6 * SIZE(BO) 1364 movhpd %xmm3, 7 * SIZE(BO) 1365#else 1366 movapd %xmm1, 0 * SIZE(AO) 1367 movapd %xmm3, 2 * SIZE(AO) 1368 1369#endif 1370 1371#ifndef LN 1372 addq $2 * SIZE, CO1 1373 addq $2 * SIZE, CO2 1374#endif 1375 1376#if defined(LT) || defined(RN) 1377 movq K, %rax 1378 subq KK, %rax 1379 salq $ZBASE_SHIFT, %rax 1380 leaq (AO, %rax, 1), AO 1381#ifdef LT 1382 addq $4 * SIZE, B 1383#endif 1384#endif 1385 1386#ifdef LN 1387 subq $1, KK 1388 movq BORIG, B 1389#endif 1390 1391#ifdef LT 1392 addq $1, KK 1393#endif 1394 1395#ifdef RT 1396 movq K, %rax 1397 movq BORIG, B 1398 salq $0 + ZBASE_SHIFT, %rax 1399 addq %rax, AORIG 1400#endif 1401 ALIGN_4 1402 1403.L99: 1404#ifdef LN 1405 leaq (, K, SIZE), %rax 1406 leaq (B, %rax, 4), B 1407#endif 1408 1409#if defined(LT) || defined(RN) 1410 movq K, %rax 1411 subq KK, %rax 1412 leaq (,%rax, SIZE), %rax 1413 leaq (B, %rax, 2 * COMPSIZE), B 1414#endif 1415 1416#ifdef RN 1417 addq 
$2, KK				# (tail of the preceding conditional KK update; head of this line is outside this chunk)
#endif

#ifdef RT
	subq	$2, KK			# RT: two columns of the right-hand triangular factor consumed
#endif

	decq	J			# j -- ; next 2-column panel of N
	jg	.L01

/*--------------------------------------------------------------------*/
/* N odd: handle the final single column.                             */
/* NOTE(review): roles inferred from visible code — K,M,N sizes;      */
/* KK running diagonal offset; A/B/C matrix pointers; BUFFER an       */
/* on-stack area B is expanded into (each scalar duplicated into a    */
/* [x,x] pair for pairwise mulpd); ZBASE_SHIFT = shift for one        */
/* complex element; LN/LT/RN/RT select the trsm variant at compile    */
/* time — confirm against common.h / the generator that builds this.  */
/*--------------------------------------------------------------------*/
.L100:
	testq	$1, N
	jle	.L999			# no odd column -> restore registers and return

.L101:
#ifdef LN
	movq	OFFSET, %rax		# LN: KK starts at M + OFFSET and counts down
	addq	M, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

#ifdef RT
	movq	K, %rax			# RT: step B back one full column (K complex elements)
	salq	$0 + ZBASE_SHIFT, %rax
	subq	%rax, B
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	B, BORIG		# remember column base; B/BO advanced to row KK
	salq	$ZBASE_SHIFT, %rax
	leaq	(B,  %rax, 1), B	# B  advances 1x (packed source)
	leaq	(BO, %rax, 2), BO	# BO advances 2x (duplicated copy is twice as wide)
#endif

#if defined(LT)
	movq	OFFSET, %rax		# NOTE(review): sibling kernels usually write
	movq	%rax, KK		# `#if defined(LT) || defined(RN)` here — confirm the
#endif					# RN build initializes KK elsewhere before this point

#if defined(LT) || defined(RN)
	movq	KK, %rax		# forward variants: copy only the first KK rows now
#else
	movq	K, %rax			# backward variants: copy the remaining K - KK rows
	subq	KK, %rax
#endif
	sarq	$2, %rax		# unroll the copy by 4 complex elements
	jle	.L103
	ALIGN_4

/* Copy loop: duplicate each scalar of B into an aligned [x,x] pair in BO */
.L102:
	movlpd	0 * SIZE(B), %xmm0
	movlpd	1 * SIZE(B), %xmm1
	movlpd	2 * SIZE(B), %xmm2
	movlpd	3 * SIZE(B), %xmm3
	movlpd	4 * SIZE(B), %xmm4
	movlpd	5 * SIZE(B), %xmm5
	movlpd	6 * SIZE(B), %xmm6
	movlpd	7 * SIZE(B), %xmm7

	movlpd	%xmm0,  0 * SIZE(BO)	# each source scalar stored twice ->
	movlpd	%xmm0,  1 * SIZE(BO)	# broadcast operand for mulpd in the kernels
	movlpd	%xmm1,  2 * SIZE(BO)
	movlpd	%xmm1,  3 * SIZE(BO)
	movlpd	%xmm2,  4 * SIZE(BO)
	movlpd	%xmm2,  5 * SIZE(BO)
	movlpd	%xmm3,  6 * SIZE(BO)
	movlpd	%xmm3,  7 * SIZE(BO)
	movlpd	%xmm4,  8 * SIZE(BO)
	movlpd	%xmm4,  9 * SIZE(BO)
	movlpd	%xmm5, 10 * SIZE(BO)
	movlpd	%xmm5, 11 * SIZE(BO)
	movlpd	%xmm6, 12 * SIZE(BO)
	movlpd	%xmm6, 13 * SIZE(BO)
	movlpd	%xmm7, 14 * SIZE(BO)
	movlpd	%xmm7, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO		# += 16 (subq of negative keeps an 8-bit immediate)
	addq	$ 8 * SIZE, B
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# leftover 1..3 complex elements of the copy
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	0 * SIZE(B), %xmm0
	movlpd	1 * SIZE(B), %xmm1

	movlpd	%xmm0, 0 * SIZE(BO)
	movlpd	%xmm0, 1 * SIZE(BO)
	movlpd	%xmm1, 2 * SIZE(BO)
	movlpd	%xmm1, 3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

/* ------------- loop over rows: m unrolled by 2 (.L110) ------------- */
.L105:
#if defined(LT) || defined(RN)
	movq	A, AO
#else
	movq	A, AORIG		# backward variants recompute AO per block from AORIG
#endif

#ifdef RT
	subq	LDC, C			# RT walks C backward one column
#endif

	movq	C, CO1
#ifndef RT
	addq	LDC, C
#endif

	movq	M, I
	sarq	$1, I			# i = (m >> 1): number of 2-row blocks
	jle	.L130
	ALIGN_4

.L110:
#ifdef LN
	movq	K, %rax			# LN: step AORIG back one 2-row panel (K*2 complex)
	salq	$1 + ZBASE_SHIFT, %rax
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO		# position AO at row-block KK
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax		# position BO at element KK of the duplicated buffer
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm0, %xmm0		# xmm0/xmm1: row 0 accumulators (real-ish / imag-ish parts)
	pxor	%xmm1, %xmm1
	pxor	%xmm4, %xmm4		# xmm4/xmm5: row 1 accumulators
	pxor	%xmm5, %xmm5
	PREFETCHW	4 * SIZE(CO1)	# warm the C destination line for the later stores

#if defined(LT) || defined(RN)
	movq	KK, %rax		# inner product length for forward variants
#else
	movq	K, %rax			# ... and K-KK for backward variants
	subq	KK, %rax
#endif
	sarq	$2, %rax		# kernel unrolled 4x over k
	je	.L112

/* GEMM update, m=2/n=1, 4 k-iterations per pass */
.L111:
	movapd	0 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	2 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	4 * SIZE(AO), %xmm8
	movapd	4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	6 * SIZE(AO), %xmm8
	movapd	4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	8 * SIZE(AO), %xmm8
	movapd	8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	10 * SIZE(AO), %xmm8
	movapd	8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	movapd	12 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	14 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	addq	$16 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L111
	ALIGN_4

.L112:
#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	movapd	POSINV, %xmm15		# sign mask for the complex-arithmetic fixups below
	andq	$3, %rax		# k & 3 leftover iterations
	BRANCH
	jle	.L114

.L113:
	movapd	0 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	2 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm4
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm5

	addq	$4 * SIZE, AO		# aoffset  += 4 (one k-step, 2 complex elements of A)
	addq	$4 * SIZE, BO		# boffset1 += 4 (one duplicated element of B)
	decq	%rax
	jg	.L113
	ALIGN_4

/* Rewind pointers to the diagonal block, then solve the 2x2/1x1 system */
.L114:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$2, %rax		# LN: diagonal block is 2 rows above KK
#else
	subq	$1, %rax		# RT: one column back
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm1, %xmm1		# swap re/im halves of the cross-term accumulators
	SHUFPD_1 %xmm5, %xmm5

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1		# flip sign of one lane (conjugation bookkeeping)
	xorpd	%xmm15, %xmm5
#else
	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm4
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0		# combine into full complex products ab
	subpd	%xmm5, %xmm4
#else
	addpd	%xmm1, %xmm0
	addpd	%xmm5, %xmm4
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(B), %xmm1	# rhs = b - (accumulated update)
	movapd	2 * SIZE(B), %xmm5

	subpd	%xmm0, %xmm1
	subpd	%xmm4, %xmm5
#else
	movapd	0 * SIZE(AO), %xmm1	# right-side variants: rhs lives in the A panel
	movapd	2 * SIZE(AO), %xmm5

	subpd	%xmm0, %xmm1
	subpd	%xmm4, %xmm5
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15		# move sign mask to the other lane for the divides
#endif

#ifdef LN
	/* LN: backward substitution with the 2x2 lower-triangular diagonal
	   block of A; a(i,j) loaded as splatted re (xmm8/10/12) and im
	   (xmm9/11/13) parts.  pshufd $0x4e swaps the re/im halves. */
	movlpd	6 * SIZE(AO), %xmm8
	movhpd	6 * SIZE(AO), %xmm8
	movlpd	7 * SIZE(AO), %xmm9
	movhpd	7 * SIZE(AO), %xmm9
	movlpd	4 * SIZE(AO), %xmm10
	movhpd	4 * SIZE(AO), %xmm10
	movlpd	5 * SIZE(AO), %xmm11
	movhpd	5 * SIZE(AO), %xmm11
	movlpd	0 * SIZE(AO), %xmm12
	movhpd	0 * SIZE(AO), %xmm12
	movlpd	1 * SIZE(AO), %xmm13
	movhpd	1 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm15, %xmm4

	mulpd	%xmm8, %xmm5		# x1 = rhs1 * inv-diag(1,1)  (complex multiply)
	mulpd	%xmm9, %xmm4

	addpd	%xmm4, %xmm5

	movapd	%xmm5, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm15, %xmm4

	mulpd	%xmm10, %xmm0		# rhs0 -= a(1,0) * x1
	mulpd	%xmm11, %xmm4

	subpd	%xmm0, %xmm1
	subpd	%xmm4, %xmm1

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	%xmm15, %xmm0

	mulpd	%xmm12, %xmm1		# x0 = rhs0 * inv-diag(0,0)
	mulpd	%xmm13, %xmm0

	addpd	%xmm0, %xmm1
#endif

#ifdef LT
	/* LT: forward substitution, mirror image of the LN block above */
	movlpd	0 * SIZE(AO), %xmm8
	movhpd	0 * SIZE(AO), %xmm8
	movlpd	1 * SIZE(AO), %xmm9
	movhpd	1 * SIZE(AO), %xmm9
	movlpd	2 * SIZE(AO), %xmm10
	movhpd	2 * SIZE(AO), %xmm10
	movlpd	3 * SIZE(AO), %xmm11
	movhpd	3 * SIZE(AO), %xmm11
	movlpd	6 * SIZE(AO), %xmm12
	movhpd	6 * SIZE(AO), %xmm12
	movlpd	7 * SIZE(AO), %xmm13
	movhpd	7 * SIZE(AO), %xmm13

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	%xmm15, %xmm0

	mulpd	%xmm8, %xmm1		# x0 = rhs0 * inv-diag(0,0)
	mulpd	%xmm9, %xmm0

	addpd	%xmm0, %xmm1

	movapd	%xmm1, %xmm0
	pshufd	$0x4e, %xmm1, %xmm4

	xorpd	%xmm15, %xmm4

	mulpd	%xmm10, %xmm0		# rhs1 -= a(0,1) * x0
	mulpd	%xmm11, %xmm4

	subpd	%xmm0, %xmm5
	subpd	%xmm4, %xmm5

	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm15, %xmm4

	mulpd	%xmm12, %xmm5		# x1 = rhs1 * inv-diag(1,1)
	mulpd	%xmm13, %xmm4

	addpd	%xmm4, %xmm5
#endif

#ifdef RN
	/* RN/RT with n=1: scale both rows by the single 1x1 diagonal of B */
	movlpd	0 * SIZE(B), %xmm8
	movhpd	0 * SIZE(B), %xmm8
	movlpd	1 * SIZE(B), %xmm9
	movhpd	1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm4

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0
	mulpd	%xmm8, %xmm5
	mulpd	%xmm9, %xmm4

	addpd	%xmm0, %xmm1
	addpd	%xmm4, %xmm5
#endif

#ifdef RT
	movlpd	0 * SIZE(B), %xmm8	# identical to the RN block: 1x1 diagonal, no
	movhpd	0 * SIZE(B), %xmm8	# off-diagonal term when n == 1
	movlpd	1 * SIZE(B), %xmm9
	movhpd	1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	pshufd	$0x4e, %xmm5, %xmm4

	xorpd	%xmm15, %xmm0
	xorpd	%xmm15, %xmm4

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0
	mulpd	%xmm8, %xmm5
	mulpd	%xmm9, %xmm4

	addpd	%xmm0, %xmm1
	addpd	%xmm4, %xmm5
#endif

#ifdef LN
	subq	$4 * SIZE, CO1		# LN writes C backward
#endif

	movsd	%xmm1, 0 * SIZE(CO1)	# store 2 complex results to C (unaligned-safe halves)
	movhpd	%xmm1, 1 * SIZE(CO1)
	movsd	%xmm5, 2 * SIZE(CO1)
	movhpd	%xmm5, 3 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm1, 0 * SIZE(B)	# write solution back to packed B ...
	movapd	%xmm5, 2 * SIZE(B)

	movlpd	%xmm1, 0 * SIZE(BO)	# ... and to the duplicated buffer, kept in sync
	movlpd	%xmm1, 1 * SIZE(BO)
	movhpd	%xmm1, 2 * SIZE(BO)
	movhpd	%xmm1, 3 * SIZE(BO)
	movlpd	%xmm5, 4 * SIZE(BO)
	movlpd	%xmm5, 5 * SIZE(BO)
	movhpd	%xmm5, 6 * SIZE(BO)
	movhpd	%xmm5, 7 * SIZE(BO)
#else
	movapd	%xmm1, 0 * SIZE(AO)	# right-side variants update the A panel instead
	movapd	%xmm5, 2 * SIZE(AO)
#endif

#ifndef LN
	addq	$4 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax			# advance AO past the rest of this row panel
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 2), AO
#ifdef LT
	addq	$4 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$2, KK			# two rows solved (counting down)
	movq	BORIG, B
#endif

#ifdef LT
	addq	$2, KK			# two rows solved (counting up)
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$1 + ZBASE_SHIFT, %rax	# RT: move AORIG forward one 2-row panel
	addq	%rax, AORIG
#endif

	decq	I			# i --
	jg	.L110
	ALIGN_4

/* ----------------- odd final row: m=1, n=1 (.L140) ----------------- */
.L130:
	testq	$1, M
	jle	.L199
	ALIGN_4

.L140:
#ifdef LN
	movq	K, %rax
	salq	$0 + ZBASE_SHIFT, %rax	# one row of K complex elements
	subq	%rax, AORIG
#endif

#if defined(LN) || defined(RT)
	movq	KK, %rax
	movq	AORIG, AO
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#endif

	leaq	BUFFER, BO

#if defined(LN) || defined(RT)
	movq	KK, %rax
	salq	$0 + ZBASE_SHIFT, %rax
	leaq	(BO, %rax, 2), BO
#endif

	pxor	%xmm0, %xmm0		# two accumulator pairs; folded together at .L142
	pxor	%xmm1, %xmm1
	pxor	%xmm2, %xmm2
	pxor	%xmm3, %xmm3

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	sarq	$2, %rax
	je	.L142

/* GEMM update, m=1/n=1, 4 k-iterations per pass (2 accumulator pairs) */
.L141:
	movapd	0 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	2 * SIZE(AO), %xmm8
	movapd	4 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	mulpd	6 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3

	movapd	4 * SIZE(AO), %xmm8
	movapd	8 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	10 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	movapd	6 * SIZE(AO), %xmm8
	movapd	12 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm2
	mulpd	14 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm3

	addq	$ 8 * SIZE, AO
	addq	$16 * SIZE, BO
	decq	%rax
	jne	.L141

.L142:
	addpd	%xmm2, %xmm0		# fold the two accumulator pairs
	addpd	%xmm3, %xmm1

	movapd	POSINV, %xmm15

#if defined(LT) || defined(RN)
	movq	KK, %rax
#else
	movq	K, %rax
	subq	KK, %rax
#endif
	andq	$3, %rax		# k & 3 leftover iterations
	BRANCH
	jle	.L144

.L143:
	movapd	0 * SIZE(AO), %xmm8
	movapd	0 * SIZE(BO), %xmm9
	mulpd	%xmm8, %xmm9
	addpd	%xmm9, %xmm0
	mulpd	2 * SIZE(BO), %xmm8
	addpd	%xmm8, %xmm1

	addq	$2 * SIZE, AO		# aoffset  += 2 (one complex element of A)
	addq	$4 * SIZE, BO		# boffset1 += 4 (one duplicated element of B)
	decq	%rax
	jg	.L143
	ALIGN_4

.L144:
#if defined(LN) || defined(RT)
	movq	KK, %rax
#ifdef LN
	subq	$1, %rax		# NOTE(review): both arms are `subq $1` for the 1x1
#else
	subq	$1, %rax		# block — the #ifdef is redundant but harmless
#endif

	movq	AORIG, AO
	movq	BORIG, B
	leaq	BUFFER, BO

	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
	leaq	(B,  %rax, 1), B
	leaq	(BO, %rax, 2), BO
#endif

	SHUFPD_1 %xmm1, %xmm1		# swap re/im halves of the cross-term accumulator

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm15, %xmm1
#else
	xorpd	%xmm15, %xmm0
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm1, %xmm0
#else
	addpd	%xmm1, %xmm0
#endif


#if defined(LN) || defined(LT)
	movapd	0 * SIZE(B), %xmm1	# rhs = b - (accumulated update)

	subpd	%xmm0, %xmm1
#else
	movapd	0 * SIZE(AO), %xmm1

	subpd	%xmm0, %xmm1
#endif

#ifndef CONJ
	SHUFPD_1 %xmm15, %xmm15
#endif

/* 1x1 diagonal solve: one complex multiply by the (inverted) diagonal.
   All four variants reduce to the same arithmetic; only the operand
   source (AO for LN/LT, B for RN/RT) differs. */
#ifdef LN
	movlpd	0 * SIZE(AO), %xmm8
	movhpd	0 * SIZE(AO), %xmm8
	movlpd	1 * SIZE(AO), %xmm9
	movhpd	1 * SIZE(AO), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0
	xorpd	%xmm15, %xmm0

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0

	addpd	%xmm0, %xmm1
#endif

#ifdef LT
	movlpd	0 * SIZE(AO), %xmm8
	movhpd	0 * SIZE(AO), %xmm8
	movlpd	1 * SIZE(AO), %xmm9
	movhpd	1 * SIZE(AO), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	%xmm15, %xmm0

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0

	addpd	%xmm0, %xmm1
#endif

#ifdef RN
	movlpd	0 * SIZE(B), %xmm8
	movhpd	0 * SIZE(B), %xmm8
	movlpd	1 * SIZE(B), %xmm9
	movhpd	1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	%xmm15, %xmm0

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0

	addpd	%xmm0, %xmm1
#endif

#ifdef RT
	movlpd	0 * SIZE(B), %xmm8
	movhpd	0 * SIZE(B), %xmm8
	movlpd	1 * SIZE(B), %xmm9
	movhpd	1 * SIZE(B), %xmm9

	pshufd	$0x4e, %xmm1, %xmm0

	xorpd	%xmm15, %xmm0

	mulpd	%xmm8, %xmm1
	mulpd	%xmm9, %xmm0

	addpd	%xmm0, %xmm1
#endif

#ifdef LN
	subq	$2 * SIZE, CO1
#endif

	movsd	%xmm1, 0 * SIZE(CO1)	# store the single complex result to C
	movhpd	%xmm1, 1 * SIZE(CO1)

#if defined(LN) || defined(LT)
	movapd	%xmm1, 0 * SIZE(B)	# write back to packed B and duplicated buffer

	movlpd	%xmm1, 0 * SIZE(BO)
	movlpd	%xmm1, 1 * SIZE(BO)
	movhpd	%xmm1, 2 * SIZE(BO)
	movhpd	%xmm1, 3 * SIZE(BO)
#else
	movapd	%xmm1, 0 * SIZE(AO)
#endif

#ifndef LN
	addq	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	salq	$ZBASE_SHIFT, %rax
	leaq	(AO, %rax, 1), AO
#ifdef LT
	addq	$2 * SIZE, B
#endif
#endif

#ifdef LN
	subq	$1, KK
	movq	BORIG, B
#endif

#ifdef LT
	addq	$1, KK
#endif

#ifdef RT
	movq	K, %rax
	movq	BORIG, B
	salq	$0 + ZBASE_SHIFT, %rax
	addq	%rax, AORIG
#endif
	ALIGN_4

/* End-of-column bookkeeping for the n=1 strip */
.L199:
#ifdef LN
	leaq	(, K, SIZE), %rax	# B += K * 2*SIZE (one full complex column)
	leaq	(B, %rax, 2), B
#endif

#if defined(LT) || defined(RN)
	movq	K, %rax
	subq	KK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(B, %rax, 1 * COMPSIZE), B	# skip the unprocessed tail of the column
#endif

#ifdef RN
	addq	$1, KK
#endif

#ifdef RT
	subq	$1, KK
#endif
	ALIGN_4


/*--------------------------------------------------------------------*/
/* Epilogue: restore callee-saved state and return.                   */
/* %rbx holds the original stack pointer saved by the prologue (not   */
/* visible in this chunk) — restoring it first unwinds the BUFFER     */
/* allocation, then the saved GPRs are reloaded from below it.        */
/*--------------------------------------------------------------------*/
.L999:
	movq	%rbx, %rsp
	movq	 0(%rsp), %rbx
	movq	 8(%rsp), %rbp
	movq	16(%rsp), %r12
	movq	24(%rsp), %r13
	movq	32(%rsp), %r14
	movq	40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	48(%rsp), %rdi		# Win64: rdi/rsi and xmm6-15 are callee-saved too
	movq	56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE