/*********************************************************************/
/*                                                                   */
/*               Optimized BLAS libraries                            */
/*               By Kazushige Goto <kgoto@tacc.utexas.edu>           */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.
*/ 20/*********************************************************************/ 21 22#define ASSEMBLER 23#include "common.h" 24 25#define STACK 16 26#define ARGS 16 27 28#define M 4 + STACK + ARGS(%esp) 29#define N 8 + STACK + ARGS(%esp) 30#define K 12 + STACK + ARGS(%esp) 31#define ALPHA 16 + STACK + ARGS(%esp) 32#define A 24 + STACK + ARGS(%esp) 33#define ARG_B 28 + STACK + ARGS(%esp) 34#define C 32 + STACK + ARGS(%esp) 35#define ARG_LDC 36 + STACK + ARGS(%esp) 36#define OFFSET 40 + STACK + ARGS(%esp) 37 38#define J 0 + STACK(%esp) 39#define KK 4 + STACK(%esp) 40#define KKK 8 + STACK(%esp) 41#define AORIG 12 + STACK(%esp) 42 43#ifdef PENTIUM4 44#define PREFETCH prefetcht1 45#define PREFETCHSIZE 84 46#endif 47 48#if defined(PENRYN) || defined(DUNNINGTON) 49#define PREFETCH prefetcht1 50#define PREFETCHSIZE 84 51#endif 52 53#ifdef PENTIUMM 54#define PREFETCH prefetcht1 55#define PREFETCHSIZE 84 56#endif 57 58#define AA %edx 59#define BB %ecx 60#define LDC %ebp 61#define B %edi 62#define CO1 %esi 63 64 PROLOGUE 65 66 subl $ARGS, %esp 67 68 pushl %ebp 69 pushl %edi 70 pushl %esi 71 pushl %ebx 72 73 PROFCODE 74 75 movl ARG_B, B 76 movl ARG_LDC, LDC 77 78 movl OFFSET, %eax 79#ifdef RN 80 negl %eax 81#endif 82 movl %eax, KK 83 84 leal (, LDC, SIZE), LDC 85 86#ifdef LN 87 movl M, %eax 88 leal (, %eax, SIZE), %eax 89 addl %eax, C 90 imull K, %eax 91 addl %eax, A 92#endif 93 94#ifdef RT 95 movl N, %eax 96 leal (, %eax, SIZE), %eax 97 imull K, %eax 98 addl %eax, B 99 movl N, %eax 100 imull LDC, %eax 101 addl %eax, C 102#endif 103 104#ifdef RT 105 movl N, %eax 106 subl OFFSET, %eax 107 movl %eax, KK 108#endif 109 110 movl N, %eax 111 sarl $2, %eax 112 movl %eax, J 113 jle .L30 114 ALIGN_2 115 116.L10: 117#if defined(LT) || defined(RN) 118 movl A, AA 119#else 120 movl A, %eax 121 movl %eax, AORIG 122#endif 123 124#ifdef RT 125 movl K, %eax 126 sall $2 + BASE_SHIFT, %eax 127 subl %eax, B 128#endif 129 130 leal (, LDC, 4), %eax 131 132#ifdef RT 133 subl %eax, C 134#endif 135 movl 
C, CO1 136#ifndef RT 137 addl %eax, C 138#endif 139 140#ifdef LN 141 movl OFFSET, %eax 142 addl M, %eax 143 movl %eax, KK 144#endif 145 146#ifdef LT 147 movl OFFSET, %eax 148 movl %eax, KK 149#endif 150 151 movl M, %ebx 152 testl $1, %ebx # i = (m >> 2) 153 jle .L20 154 155#ifdef LN 156 movl K, %eax 157 sall $BASE_SHIFT, %eax 158 subl %eax, AORIG 159#endif 160 161#if defined(LN) || defined(RT) 162 movl KK, %eax 163 movl AORIG, AA 164 leal (AA, %eax, SIZE), AA 165#endif 166 167 movl B, BB 168 169#if defined(LN) || defined(RT) 170 movl KK, %eax 171 sall $2 + BASE_SHIFT, %eax 172 addl %eax, BB 173#endif 174 175 movddup 0 * SIZE(AA), %xmm0 176 pxor %xmm4, %xmm4 177 movddup 8 * SIZE(AA), %xmm1 178 pxor %xmm5, %xmm5 179 movapd 0 * SIZE(BB), %xmm2 180 pxor %xmm6, %xmm6 181 movapd 8 * SIZE(BB), %xmm3 182 pxor %xmm7, %xmm7 183 184#if defined(LT) || defined(RN) 185 movl KK, %eax 186#else 187 movl K, %eax 188 subl KK, %eax 189#endif 190 sarl $4, %eax 191 je .L25 192 ALIGN_4 193 194.L22: 195 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 196 mulpd %xmm0, %xmm2 197 mulpd 2 * SIZE(BB), %xmm0 198 addpd %xmm2, %xmm4 199 movapd 4 * SIZE(BB), %xmm2 200 addpd %xmm0, %xmm5 201 movddup 1 * SIZE(AA), %xmm0 202 mulpd %xmm0, %xmm2 203 mulpd 6 * SIZE(BB), %xmm0 204 addpd %xmm2, %xmm6 205 movapd 16 * SIZE(BB), %xmm2 206 addpd %xmm0, %xmm7 207 movddup 2 * SIZE(AA), %xmm0 208 mulpd %xmm0, %xmm3 209 mulpd 10 * SIZE(BB), %xmm0 210 addpd %xmm3, %xmm4 211 movapd 12 * SIZE(BB), %xmm3 212 addpd %xmm0, %xmm5 213 movddup 3 * SIZE(AA), %xmm0 214 mulpd %xmm0, %xmm3 215 mulpd 14 * SIZE(BB), %xmm0 216 addpd %xmm3, %xmm6 217 movapd 24 * SIZE(BB), %xmm3 218 addpd %xmm0, %xmm7 219 movddup 4 * SIZE(AA), %xmm0 220 mulpd %xmm0, %xmm2 221 mulpd 18 * SIZE(BB), %xmm0 222 addpd %xmm2, %xmm4 223 movapd 20 * SIZE(BB), %xmm2 224 addpd %xmm0, %xmm5 225 movddup 5 * SIZE(AA), %xmm0 226 mulpd %xmm0, %xmm2 227 mulpd 22 * SIZE(BB), %xmm0 228 addpd %xmm2, %xmm6 229 movapd 32 * SIZE(BB), %xmm2 230 addpd %xmm0, %xmm7 231 movddup 6 * 
SIZE(AA), %xmm0 232 mulpd %xmm0, %xmm3 233 mulpd 26 * SIZE(BB), %xmm0 234 addpd %xmm3, %xmm4 235 movapd 28 * SIZE(BB), %xmm3 236 addpd %xmm0, %xmm5 237 movddup 7 * SIZE(AA), %xmm0 238 mulpd %xmm0, %xmm3 239 mulpd 30 * SIZE(BB), %xmm0 240 addpd %xmm3, %xmm6 241 movapd 40 * SIZE(BB), %xmm3 242 addpd %xmm0, %xmm7 243 movddup 16 * SIZE(AA), %xmm0 244 mulpd %xmm1, %xmm2 245 mulpd 34 * SIZE(BB), %xmm1 246 addpd %xmm2, %xmm4 247 movapd 36 * SIZE(BB), %xmm2 248 addpd %xmm1, %xmm5 249 movddup 9 * SIZE(AA), %xmm1 250 mulpd %xmm1, %xmm2 251 mulpd 38 * SIZE(BB), %xmm1 252 addpd %xmm2, %xmm6 253 movapd 48 * SIZE(BB), %xmm2 254 addpd %xmm1, %xmm7 255 movddup 10 * SIZE(AA), %xmm1 256 mulpd %xmm1, %xmm3 257 mulpd 42 * SIZE(BB), %xmm1 258 addpd %xmm3, %xmm4 259 movapd 44 * SIZE(BB), %xmm3 260 addpd %xmm1, %xmm5 261 movddup 11 * SIZE(AA), %xmm1 262 mulpd %xmm1, %xmm3 263 mulpd 46 * SIZE(BB), %xmm1 264 addpd %xmm3, %xmm6 265 movapd 56 * SIZE(BB), %xmm3 266 addpd %xmm1, %xmm7 267 movddup 12 * SIZE(AA), %xmm1 268 mulpd %xmm1, %xmm2 269 mulpd 50 * SIZE(BB), %xmm1 270 addpd %xmm2, %xmm4 271 movapd 52 * SIZE(BB), %xmm2 272 addpd %xmm1, %xmm5 273 movddup 13 * SIZE(AA), %xmm1 274 mulpd %xmm1, %xmm2 275 mulpd 54 * SIZE(BB), %xmm1 276 addpd %xmm2, %xmm6 277 movapd 64 * SIZE(BB), %xmm2 278 addpd %xmm1, %xmm7 279 movddup 14 * SIZE(AA), %xmm1 280 mulpd %xmm1, %xmm3 281 mulpd 58 * SIZE(BB), %xmm1 282 addpd %xmm3, %xmm4 283 movapd 60 * SIZE(BB), %xmm3 284 addpd %xmm1, %xmm5 285 movddup 15 * SIZE(AA), %xmm1 286 mulpd %xmm1, %xmm3 287 mulpd 62 * SIZE(BB), %xmm1 288 addpd %xmm3, %xmm6 289 movapd 72 * SIZE(BB), %xmm3 290 addpd %xmm1, %xmm7 291 movddup 24 * SIZE(AA), %xmm1 292 293 addl $16 * SIZE, AA 294 addl $64 * SIZE, BB 295 decl %eax 296 jne .L22 297 ALIGN_4 298 299.L25: 300#if defined(LT) || defined(RN) 301 movl KK, %eax 302#else 303 movl K, %eax 304 subl KK, %eax 305#endif 306 andl $15, %eax # if (k & 1) 307 BRANCH 308 je .L28 309 310.L26: 311 mulpd %xmm0, %xmm2 312 mulpd 2 * SIZE(BB), %xmm0 313 
addpd %xmm2, %xmm4 314 movapd 4 * SIZE(BB), %xmm2 315 addpd %xmm0, %xmm5 316 movddup 1 * SIZE(AA), %xmm0 317 318 addl $1 * SIZE, AA 319 addl $4 * SIZE, BB 320 321 decl %eax 322 jg .L26 323 ALIGN_4 324 325.L28: 326 addpd %xmm6, %xmm4 327 addpd %xmm7, %xmm5 328 329#if defined(LN) || defined(RT) 330 movl KK, %eax 331#ifdef LN 332 subl $1, %eax 333#else 334 subl $4, %eax 335#endif 336 337 movl AORIG, AA 338 339 leal (, %eax, SIZE), %eax 340 leal (AA, %eax, 1), AA 341 leal (B, %eax, 4), BB 342#endif 343 344#if defined(LN) || defined(LT) 345 movapd 0 * SIZE(BB), %xmm0 346 movapd 2 * SIZE(BB), %xmm1 347 348 subpd %xmm4, %xmm0 349 subpd %xmm5, %xmm1 350#else 351 movapd 0 * SIZE(AA), %xmm1 352 movapd 2 * SIZE(AA), %xmm3 353 354 subpd %xmm4, %xmm1 355 subpd %xmm5, %xmm3 356 357 movapd %xmm1, %xmm0 358 unpckhpd %xmm1, %xmm1 359 movapd %xmm3, %xmm2 360 unpckhpd %xmm3, %xmm3 361#endif 362 363#ifdef LN 364 movddup 0 * SIZE(AA), %xmm4 365 mulpd %xmm4, %xmm0 366 mulpd %xmm4, %xmm1 367#endif 368 369#ifdef LT 370 movddup 0 * SIZE(AA), %xmm4 371 mulpd %xmm4, %xmm0 372 mulpd %xmm4, %xmm1 373#endif 374 375#ifdef RN 376 movsd 0 * SIZE(BB), %xmm4 377 mulsd %xmm4, %xmm0 378 movsd 1 * SIZE(BB), %xmm4 379 mulsd %xmm0, %xmm4 380 subsd %xmm4, %xmm1 381 movsd 2 * SIZE(BB), %xmm4 382 mulsd %xmm0, %xmm4 383 subsd %xmm4, %xmm2 384 movsd 3 * SIZE(BB), %xmm4 385 mulsd %xmm0, %xmm4 386 subsd %xmm4, %xmm3 387 388 movsd 5 * SIZE(BB), %xmm4 389 mulsd %xmm4, %xmm1 390 movsd 6 * SIZE(BB), %xmm4 391 mulsd %xmm1, %xmm4 392 subsd %xmm4, %xmm2 393 movsd 7 * SIZE(BB), %xmm4 394 mulsd %xmm1, %xmm4 395 subsd %xmm4, %xmm3 396 397 movsd 10 * SIZE(BB), %xmm4 398 mulsd %xmm4, %xmm2 399 movsd 11 * SIZE(BB), %xmm4 400 mulsd %xmm2, %xmm4 401 subsd %xmm4, %xmm3 402 403 movsd 15 * SIZE(BB), %xmm4 404 mulsd %xmm4, %xmm3 405#endif 406 407#ifdef RT 408 movsd 15 * SIZE(BB), %xmm4 409 mulsd %xmm4, %xmm3 410 movsd 14 * SIZE(BB), %xmm4 411 mulsd %xmm3, %xmm4 412 subsd %xmm4, %xmm2 413 movsd 13 * SIZE(BB), %xmm4 414 mulsd 
%xmm3, %xmm4 415 subsd %xmm4, %xmm1 416 movsd 12 * SIZE(BB), %xmm4 417 mulsd %xmm3, %xmm4 418 subsd %xmm4, %xmm0 419 420 movsd 10 * SIZE(BB), %xmm4 421 mulsd %xmm4, %xmm2 422 movsd 9 * SIZE(BB), %xmm4 423 mulsd %xmm2, %xmm4 424 subsd %xmm4, %xmm1 425 movsd 8 * SIZE(BB), %xmm4 426 mulsd %xmm2, %xmm4 427 subsd %xmm4, %xmm0 428 429 movsd 5 * SIZE(BB), %xmm4 430 mulsd %xmm4, %xmm1 431 movsd 4 * SIZE(BB), %xmm4 432 mulsd %xmm1, %xmm4 433 subsd %xmm4, %xmm0 434 435 movsd 0 * SIZE(BB), %xmm4 436 mulsd %xmm4, %xmm0 437#endif 438 439#if defined(LN) || defined(LT) 440 movapd %xmm0, 0 * SIZE(BB) 441 movapd %xmm1, 2 * SIZE(BB) 442#else 443 movsd %xmm0, 0 * SIZE(AA) 444 movsd %xmm1, 1 * SIZE(AA) 445 movsd %xmm2, 2 * SIZE(AA) 446 movsd %xmm3, 3 * SIZE(AA) 447#endif 448 449#ifdef LN 450 subl $1 * SIZE, CO1 451#endif 452 453 leal (LDC, LDC, 2), %eax 454 455#if defined(LN) || defined(LT) 456 movsd %xmm0, 0 * SIZE(CO1) 457 movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) 458 movsd %xmm1, 0 * SIZE(CO1, LDC, 2) 459 movhpd %xmm1, 0 * SIZE(CO1, %eax, 1) 460#else 461 movsd %xmm0, 0 * SIZE(CO1) 462 movsd %xmm1, 0 * SIZE(CO1, LDC, 1) 463 movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 464 movsd %xmm3, 0 * SIZE(CO1, %eax, 1) 465#endif 466 467#ifndef LN 468 addl $1 * SIZE, CO1 469#endif 470 471#if defined(LT) || defined(RN) 472 movl K, %eax 473 subl KK, %eax 474 leal (,%eax, SIZE), %eax 475 leal (AA, %eax, 1), AA 476 leal (BB, %eax, 4), BB 477#endif 478 479#ifdef LN 480 subl $1, KK 481#endif 482 483#ifdef LT 484 addl $1, KK 485#endif 486 487#ifdef RT 488 movl K, %eax 489 sall $BASE_SHIFT, %eax 490 addl %eax, AORIG 491#endif 492 ALIGN_4 493 494.L20: 495 movl M, %ebx 496 sarl $1, %ebx # i = (m >> 2) 497 jle .L29 498 ALIGN_4 499 500.L11: 501#ifdef LN 502 movl K, %eax 503 sall $1 + BASE_SHIFT, %eax 504 subl %eax, AORIG 505#endif 506 507#if defined(LN) || defined(RT) 508 movl KK, %eax 509 movl AORIG, AA 510 leal (, %eax, SIZE), %eax 511 leal (AA, %eax, 2), AA 512#endif 513 514 movl B, BB 515 516#if defined(LN) || 
defined(RT) 517 movl KK, %eax 518 sall $2 + BASE_SHIFT, %eax 519 addl %eax, BB 520#endif 521 522 movapd 0 * SIZE(AA), %xmm0 523 pxor %xmm4, %xmm4 524 movapd 8 * SIZE(AA), %xmm1 525 pxor %xmm5, %xmm5 526 movddup 0 * SIZE(BB), %xmm2 527 pxor %xmm6, %xmm6 528 movddup 8 * SIZE(BB), %xmm3 529 pxor %xmm7, %xmm7 530 531 leal (LDC, LDC, 2), %eax 532 533#ifdef LN 534 prefetchnta -2 * SIZE(CO1) 535 prefetchnta -2 * SIZE(CO1, LDC, 1) 536 prefetchnta -2 * SIZE(CO1, LDC, 2) 537 prefetchnta -2 * SIZE(CO1, %eax, 1) 538#else 539 prefetchnta 2 * SIZE(CO1) 540 prefetchnta 2 * SIZE(CO1, LDC, 1) 541 prefetchnta 2 * SIZE(CO1, LDC, 2) 542 prefetchnta 2 * SIZE(CO1, %eax, 1) 543#endif 544 545#if defined(LT) || defined(RN) 546 movl KK, %eax 547#else 548 movl K, %eax 549 subl KK, %eax 550#endif 551 sarl $3, %eax 552 je .L15 553 ALIGN_4 554 555.L12: 556 mulpd %xmm0, %xmm2 557 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 558 addpd %xmm2, %xmm4 559 movddup 1 * SIZE(BB), %xmm2 560 mulpd %xmm0, %xmm2 561 addpd %xmm2, %xmm5 562 movddup 2 * SIZE(BB), %xmm2 563 mulpd %xmm0, %xmm2 564 addpd %xmm2, %xmm6 565 movddup 3 * SIZE(BB), %xmm2 566 mulpd %xmm0, %xmm2 567 movapd 2 * SIZE(AA), %xmm0 568 addpd %xmm2, %xmm7 569 movddup 4 * SIZE(BB), %xmm2 570 mulpd %xmm0, %xmm2 571 addpd %xmm2, %xmm4 572 movddup 5 * SIZE(BB), %xmm2 573 mulpd %xmm0, %xmm2 574 addpd %xmm2, %xmm5 575 movddup 6 * SIZE(BB), %xmm2 576 mulpd %xmm0, %xmm2 577 addpd %xmm2, %xmm6 578 movddup 7 * SIZE(BB), %xmm2 579 mulpd %xmm0, %xmm2 580 movapd 4 * SIZE(AA), %xmm0 581 addpd %xmm2, %xmm7 582 movddup 16 * SIZE(BB), %xmm2 583 mulpd %xmm0, %xmm3 584 addpd %xmm3, %xmm4 585 movddup 9 * SIZE(BB), %xmm3 586 mulpd %xmm0, %xmm3 587 addpd %xmm3, %xmm5 588 movddup 10 * SIZE(BB), %xmm3 589 mulpd %xmm0, %xmm3 590 addpd %xmm3, %xmm6 591 movddup 11 * SIZE(BB), %xmm3 592 mulpd %xmm0, %xmm3 593 movapd 6 * SIZE(AA), %xmm0 594 addpd %xmm3, %xmm7 595 movddup 12 * SIZE(BB), %xmm3 596 mulpd %xmm0, %xmm3 597 addpd %xmm3, %xmm4 598 movddup 13 * SIZE(BB), %xmm3 599 mulpd 
%xmm0, %xmm3 600 addpd %xmm3, %xmm5 601 movddup 14 * SIZE(BB), %xmm3 602 mulpd %xmm0, %xmm3 603 addpd %xmm3, %xmm6 604 movddup 15 * SIZE(BB), %xmm3 605 mulpd %xmm0, %xmm3 606 movapd 16 * SIZE(AA), %xmm0 607 addpd %xmm3, %xmm7 608 movddup 24 * SIZE(BB), %xmm3 609 mulpd %xmm1, %xmm2 610 addpd %xmm2, %xmm4 611 movddup 17 * SIZE(BB), %xmm2 612 mulpd %xmm1, %xmm2 613 addpd %xmm2, %xmm5 614 movddup 18 * SIZE(BB), %xmm2 615 mulpd %xmm1, %xmm2 616 addpd %xmm2, %xmm6 617 movddup 19 * SIZE(BB), %xmm2 618 mulpd %xmm1, %xmm2 619 movapd 10 * SIZE(AA), %xmm1 620 addpd %xmm2, %xmm7 621 movddup 20 * SIZE(BB), %xmm2 622 mulpd %xmm1, %xmm2 623 addpd %xmm2, %xmm4 624 movddup 21 * SIZE(BB), %xmm2 625 mulpd %xmm1, %xmm2 626 addpd %xmm2, %xmm5 627 movddup 22 * SIZE(BB), %xmm2 628 mulpd %xmm1, %xmm2 629 addpd %xmm2, %xmm6 630 movddup 23 * SIZE(BB), %xmm2 631 mulpd %xmm1, %xmm2 632 movapd 12 * SIZE(AA), %xmm1 633 addpd %xmm2, %xmm7 634 movddup 32 * SIZE(BB), %xmm2 635 mulpd %xmm1, %xmm3 636 addpd %xmm3, %xmm4 637 movddup 25 * SIZE(BB), %xmm3 638 mulpd %xmm1, %xmm3 639 addpd %xmm3, %xmm5 640 movddup 26 * SIZE(BB), %xmm3 641 mulpd %xmm1, %xmm3 642 addpd %xmm3, %xmm6 643 movddup 27 * SIZE(BB), %xmm3 644 mulpd %xmm1, %xmm3 645 movapd 14 * SIZE(AA), %xmm1 646 addpd %xmm3, %xmm7 647 movddup 28 * SIZE(BB), %xmm3 648 mulpd %xmm1, %xmm3 649 addpd %xmm3, %xmm4 650 movddup 29 * SIZE(BB), %xmm3 651 mulpd %xmm1, %xmm3 652 addpd %xmm3, %xmm5 653 movddup 30 * SIZE(BB), %xmm3 654 mulpd %xmm1, %xmm3 655 addpd %xmm3, %xmm6 656 movddup 31 * SIZE(BB), %xmm3 657 mulpd %xmm1, %xmm3 658 movapd 24 * SIZE(AA), %xmm1 659 addpd %xmm3, %xmm7 660 movddup 40 * SIZE(BB), %xmm3 661 662 addl $32 * SIZE, BB 663 addl $16 * SIZE, AA 664 decl %eax 665 jne .L12 666 ALIGN_4 667 668.L15: 669#if defined(LT) || defined(RN) 670 movl KK, %eax 671#else 672 movl K, %eax 673 subl KK, %eax 674#endif 675 andl $7, %eax # if (k & 1) 676 BRANCH 677 je .L18 678 ALIGN_3 679 680.L16: 681 mulpd %xmm0, %xmm2 682 addpd %xmm2, %xmm4 683 movddup 1 
* SIZE(BB), %xmm2 684 mulpd %xmm0, %xmm2 685 addpd %xmm2, %xmm5 686 movddup 2 * SIZE(BB), %xmm2 687 mulpd %xmm0, %xmm2 688 addpd %xmm2, %xmm6 689 movddup 3 * SIZE(BB), %xmm2 690 mulpd %xmm0, %xmm2 691 movapd 2 * SIZE(AA), %xmm0 692 addpd %xmm2, %xmm7 693 movddup 4 * SIZE(BB), %xmm2 694 695 addl $2 * SIZE, AA 696 addl $4 * SIZE, BB 697 decl %eax 698 jg .L16 699 ALIGN_4 700 701.L18: 702#if defined(LN) || defined(RT) 703 movl KK, %eax 704#ifdef LN 705 subl $2, %eax 706#else 707 subl $4, %eax 708#endif 709 710 movl AORIG, AA 711 712 leal (, %eax, SIZE), %eax 713 leal (AA, %eax, 2), AA 714 leal (B, %eax, 4), BB 715#endif 716 717#if defined(LN) || defined(LT) 718 movapd %xmm4, %xmm0 719 unpcklpd %xmm5, %xmm4 720 unpckhpd %xmm5, %xmm0 721 722 movapd %xmm6, %xmm1 723 unpcklpd %xmm7, %xmm6 724 unpckhpd %xmm7, %xmm1 725 726 movapd 0 * SIZE(BB), %xmm2 727 movapd 2 * SIZE(BB), %xmm5 728 movapd 4 * SIZE(BB), %xmm3 729 movapd 6 * SIZE(BB), %xmm7 730 731 subpd %xmm4, %xmm2 732 subpd %xmm6, %xmm5 733 subpd %xmm0, %xmm3 734 subpd %xmm1, %xmm7 735#else 736 movapd 0 * SIZE(AA), %xmm0 737 movapd 2 * SIZE(AA), %xmm1 738 movapd 4 * SIZE(AA), %xmm2 739 movapd 6 * SIZE(AA), %xmm3 740 741 subpd %xmm4, %xmm0 742 subpd %xmm5, %xmm1 743 subpd %xmm6, %xmm2 744 subpd %xmm7, %xmm3 745#endif 746 747#ifdef LN 748 movddup 3 * SIZE(AA), %xmm4 749 mulpd %xmm4, %xmm3 750 mulpd %xmm4, %xmm7 751 752 movddup 2 * SIZE(AA), %xmm4 753 movapd %xmm4, %xmm6 754 mulpd %xmm3, %xmm4 755 subpd %xmm4, %xmm2 756 mulpd %xmm7, %xmm6 757 subpd %xmm6, %xmm5 758 759 movddup 0 * SIZE(AA), %xmm4 760 mulpd %xmm4, %xmm2 761 mulpd %xmm4, %xmm5 762 763#endif 764 765#ifdef LT 766 movddup 0 * SIZE(AA), %xmm4 767 mulpd %xmm4, %xmm2 768 mulpd %xmm4, %xmm5 769 770 movddup 1 * SIZE(AA), %xmm4 771 movapd %xmm4, %xmm6 772 mulpd %xmm2, %xmm4 773 subpd %xmm4, %xmm3 774 mulpd %xmm5, %xmm6 775 subpd %xmm6, %xmm7 776 777 movddup 3 * SIZE(AA), %xmm4 778 mulpd %xmm4, %xmm3 779 mulpd %xmm4, %xmm7 780#endif 781 782#ifdef RN 783 movddup 0 * 
SIZE(BB), %xmm4 784 mulpd %xmm4, %xmm0 785 movddup 1 * SIZE(BB), %xmm4 786 mulpd %xmm0, %xmm4 787 subpd %xmm4, %xmm1 788 movddup 2 * SIZE(BB), %xmm4 789 mulpd %xmm0, %xmm4 790 subpd %xmm4, %xmm2 791 movddup 3 * SIZE(BB), %xmm4 792 mulpd %xmm0, %xmm4 793 subpd %xmm4, %xmm3 794 795 movddup 5 * SIZE(BB), %xmm4 796 mulpd %xmm4, %xmm1 797 movddup 6 * SIZE(BB), %xmm4 798 mulpd %xmm1, %xmm4 799 subpd %xmm4, %xmm2 800 movddup 7 * SIZE(BB), %xmm4 801 mulpd %xmm1, %xmm4 802 subpd %xmm4, %xmm3 803 804 movddup 10 * SIZE(BB), %xmm4 805 mulpd %xmm4, %xmm2 806 movddup 11 * SIZE(BB), %xmm4 807 mulpd %xmm2, %xmm4 808 subpd %xmm4, %xmm3 809 810 movddup 15 * SIZE(BB), %xmm4 811 mulpd %xmm4, %xmm3 812#endif 813 814#ifdef RT 815 movddup 15 * SIZE(BB), %xmm4 816 mulpd %xmm4, %xmm3 817 movddup 14 * SIZE(BB), %xmm4 818 mulpd %xmm3, %xmm4 819 subpd %xmm4, %xmm2 820 movddup 13 * SIZE(BB), %xmm4 821 mulpd %xmm3, %xmm4 822 subpd %xmm4, %xmm1 823 movddup 12 * SIZE(BB), %xmm4 824 mulpd %xmm3, %xmm4 825 subpd %xmm4, %xmm0 826 827 movddup 10 * SIZE(BB), %xmm4 828 mulpd %xmm4, %xmm2 829 movddup 9 * SIZE(BB), %xmm4 830 mulpd %xmm2, %xmm4 831 subpd %xmm4, %xmm1 832 movddup 8 * SIZE(BB), %xmm4 833 mulpd %xmm2, %xmm4 834 subpd %xmm4, %xmm0 835 836 movddup 5 * SIZE(BB), %xmm4 837 mulpd %xmm4, %xmm1 838 movddup 4 * SIZE(BB), %xmm4 839 mulpd %xmm1, %xmm4 840 subpd %xmm4, %xmm0 841 842 movddup 0 * SIZE(BB), %xmm4 843 mulpd %xmm4, %xmm0 844#endif 845 846#if defined(LN) || defined(LT) 847 movapd %xmm2, 0 * SIZE(BB) 848 movapd %xmm5, 2 * SIZE(BB) 849 movapd %xmm3, 4 * SIZE(BB) 850 movapd %xmm7, 6 * SIZE(BB) 851#else 852 movapd %xmm0, 0 * SIZE(AA) 853 movapd %xmm1, 2 * SIZE(AA) 854 movapd %xmm2, 4 * SIZE(AA) 855 movapd %xmm3, 6 * SIZE(AA) 856#endif 857 858#ifdef LN 859 subl $2 * SIZE, CO1 860#endif 861 862 leal (LDC, LDC, 2), %eax 863 864#if defined(LN) || defined(LT) 865 movsd %xmm2, 0 * SIZE(CO1) 866 movsd %xmm3, 1 * SIZE(CO1) 867 movhpd %xmm2, 0 * SIZE(CO1, LDC, 1) 868 movhpd %xmm3, 1 * SIZE(CO1, LDC, 1) 
869 movsd %xmm5, 0 * SIZE(CO1, LDC, 2) 870 movsd %xmm7, 1 * SIZE(CO1, LDC, 2) 871 movhpd %xmm5, 0 * SIZE(CO1, %eax, 1) 872 movhpd %xmm7, 1 * SIZE(CO1, %eax, 1) 873#else 874 movsd %xmm0, 0 * SIZE(CO1) 875 movhpd %xmm0, 1 * SIZE(CO1) 876 movsd %xmm1, 0 * SIZE(CO1, LDC, 1) 877 movhpd %xmm1, 1 * SIZE(CO1, LDC, 1) 878 movsd %xmm2, 0 * SIZE(CO1, LDC, 2) 879 movhpd %xmm2, 1 * SIZE(CO1, LDC, 2) 880 movsd %xmm3, 0 * SIZE(CO1, %eax, 1) 881 movhpd %xmm3, 1 * SIZE(CO1, %eax, 1) 882#endif 883 884#ifndef LN 885 addl $2 * SIZE, CO1 886#endif 887 888#if defined(LT) || defined(RN) 889 movl K, %eax 890 subl KK, %eax 891 leal (,%eax, SIZE), %eax 892 leal (AA, %eax, 2), AA 893 leal (BB, %eax, 4), BB 894#endif 895 896#ifdef LN 897 subl $2, KK 898#endif 899 900#ifdef LT 901 addl $2, KK 902#endif 903 904#ifdef RT 905 movl K, %eax 906 sall $1 + BASE_SHIFT, %eax 907 addl %eax, AORIG 908#endif 909 910 decl %ebx # i -- 911 jg .L11 912 ALIGN_4 913 914.L29: 915#ifdef LN 916 movl K, %eax 917 leal (, %eax, SIZE), %eax 918 leal (B, %eax, 4), B 919#endif 920 921#if defined(LT) || defined(RN) 922 movl BB, B 923#endif 924 925#ifdef RN 926 addl $4, KK 927#endif 928 929#ifdef RT 930 subl $4, KK 931#endif 932 933 decl J # j -- 934 jg .L10 935 ALIGN_4 936 937.L30: 938 testl $2, N 939 je .L60 940 941#if defined(LT) || defined(RN) 942 movl A, AA 943#else 944 movl A, %eax 945 movl %eax, AORIG 946#endif 947 948#ifdef RT 949 movl K, %eax 950 sall $1 + BASE_SHIFT, %eax 951 subl %eax, B 952#endif 953 954 leal (, LDC, 2), %eax 955 956#ifdef RT 957 subl %eax, C 958#endif 959 movl C, CO1 960#ifndef RT 961 addl %eax, C 962#endif 963 964#ifdef LN 965 movl OFFSET, %eax 966 addl M, %eax 967 movl %eax, KK 968#endif 969 970#ifdef LT 971 movl OFFSET, %eax 972 movl %eax, KK 973#endif 974 975 movl M, %ebx 976 testl $1, %ebx # i = (m >> 2) 977 jle .L50 978 979#ifdef LN 980 movl K, %eax 981 sall $BASE_SHIFT, %eax 982 subl %eax, AORIG 983#endif 984 985#if defined(LN) || defined(RT) 986 movl KK, %eax 987 movl AORIG, AA 988 
leal (AA, %eax, SIZE), AA 989#endif 990 991 movl B, BB 992 993#if defined(LN) || defined(RT) 994 movl KK, %eax 995 sall $1 + BASE_SHIFT, %eax 996 addl %eax, BB 997#endif 998 999 movddup 0 * SIZE(AA), %xmm0 1000 pxor %xmm4, %xmm4 1001 movddup 8 * SIZE(AA), %xmm1 1002 pxor %xmm5, %xmm5 1003 movapd 0 * SIZE(BB), %xmm2 1004 pxor %xmm6, %xmm6 1005 movapd 8 * SIZE(BB), %xmm3 1006 pxor %xmm7, %xmm7 1007 1008#if defined(LT) || defined(RN) 1009 movl KK, %eax 1010#else 1011 movl K, %eax 1012 subl KK, %eax 1013#endif 1014 sarl $4, %eax 1015 je .L55 1016 ALIGN_4 1017 1018.L52: 1019 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1020 mulpd %xmm0, %xmm2 1021 movddup 1 * SIZE(AA), %xmm0 1022 addpd %xmm2, %xmm4 1023 mulpd 2 * SIZE(BB), %xmm0 1024 movapd 16 * SIZE(BB), %xmm2 1025 addpd %xmm0, %xmm5 1026 movddup 2 * SIZE(AA), %xmm0 1027 mulpd 4 * SIZE(BB), %xmm0 1028 addpd %xmm0, %xmm6 1029 movddup 3 * SIZE(AA), %xmm0 1030 mulpd 6 * SIZE(BB), %xmm0 1031 addpd %xmm0, %xmm7 1032 movddup 4 * SIZE(AA), %xmm0 1033 mulpd %xmm0, %xmm3 1034 movddup 5 * SIZE(AA), %xmm0 1035 addpd %xmm3, %xmm4 1036 mulpd 10 * SIZE(BB), %xmm0 1037 movapd 24 * SIZE(BB), %xmm3 1038 addpd %xmm0, %xmm5 1039 movddup 6 * SIZE(AA), %xmm0 1040 mulpd 12 * SIZE(BB), %xmm0 1041 addpd %xmm0, %xmm6 1042 movddup 7 * SIZE(AA), %xmm0 1043 mulpd 14 * SIZE(BB), %xmm0 1044 addpd %xmm0, %xmm7 1045 movddup 16 * SIZE(AA), %xmm0 1046 mulpd %xmm1, %xmm2 1047 movddup 9 * SIZE(AA), %xmm1 1048 addpd %xmm2, %xmm4 1049 mulpd 18 * SIZE(BB), %xmm1 1050 movapd 32 * SIZE(BB), %xmm2 1051 addpd %xmm1, %xmm5 1052 movddup 10 * SIZE(AA), %xmm1 1053 mulpd 20 * SIZE(BB), %xmm1 1054 addpd %xmm1, %xmm6 1055 movddup 11 * SIZE(AA), %xmm1 1056 mulpd 22 * SIZE(BB), %xmm1 1057 addpd %xmm1, %xmm7 1058 movddup 12 * SIZE(AA), %xmm1 1059 mulpd %xmm1, %xmm3 1060 movddup 13 * SIZE(AA), %xmm1 1061 addpd %xmm3, %xmm4 1062 mulpd 26 * SIZE(BB), %xmm1 1063 movapd 40 * SIZE(BB), %xmm3 1064 addpd %xmm1, %xmm5 1065 movddup 14 * SIZE(AA), %xmm1 1066 mulpd 28 * SIZE(BB), %xmm1 
1067 addpd %xmm1, %xmm6 1068 movddup 15 * SIZE(AA), %xmm1 1069 mulpd 30 * SIZE(BB), %xmm1 1070 addpd %xmm1, %xmm7 1071 movddup 24 * SIZE(AA), %xmm1 1072 1073 addl $16 * SIZE, AA 1074 addl $32 * SIZE, BB 1075 decl %eax 1076 jne .L52 1077 ALIGN_4 1078 1079.L55: 1080#if defined(LT) || defined(RN) 1081 movl KK, %eax 1082#else 1083 movl K, %eax 1084 subl KK, %eax 1085#endif 1086 andl $15, %eax # if (k & 1) 1087 BRANCH 1088 je .L58 1089 1090.L56: 1091 mulpd %xmm0, %xmm2 1092 movddup 1 * SIZE(AA), %xmm0 1093 addpd %xmm2, %xmm4 1094 movapd 2 * SIZE(BB), %xmm2 1095 1096 addl $1 * SIZE, AA 1097 addl $2 * SIZE, BB 1098 decl %eax 1099 jg .L56 1100 ALIGN_4 1101 1102.L58: 1103 addpd %xmm5, %xmm4 1104 addpd %xmm7, %xmm6 1105 addpd %xmm6, %xmm4 1106 1107#if defined(LN) || defined(RT) 1108 movl KK, %eax 1109#ifdef LN 1110 subl $1, %eax 1111#else 1112 subl $2, %eax 1113#endif 1114 1115 movl AORIG, AA 1116 1117 leal (, %eax, SIZE), %eax 1118 addl %eax, AA 1119 leal (B, %eax, 2), BB 1120#endif 1121 1122#if defined(LN) || defined(LT) 1123 movapd 0 * SIZE(BB), %xmm0 1124 1125 subpd %xmm4, %xmm0 1126#else 1127 movapd 0 * SIZE(AA), %xmm1 1128 1129 subpd %xmm4, %xmm1 1130 1131 movapd %xmm1, %xmm0 1132 unpckhpd %xmm1, %xmm1 1133#endif 1134 1135#ifdef LN 1136 movddup 0 * SIZE(AA), %xmm4 1137 mulpd %xmm4, %xmm0 1138#endif 1139 1140#ifdef LT 1141 movddup 0 * SIZE(AA), %xmm4 1142 mulpd %xmm4, %xmm0 1143#endif 1144 1145#ifdef RN 1146 movsd 0 * SIZE(BB), %xmm4 1147 mulsd %xmm4, %xmm0 1148 1149 movsd 1 * SIZE(BB), %xmm4 1150 mulsd %xmm0, %xmm4 1151 subsd %xmm4, %xmm1 1152 1153 movsd 3 * SIZE(BB), %xmm4 1154 mulsd %xmm4, %xmm1 1155#endif 1156 1157#ifdef RT 1158 movsd 3 * SIZE(BB), %xmm4 1159 mulsd %xmm4, %xmm1 1160 1161 movsd 2 * SIZE(BB), %xmm4 1162 mulsd %xmm1, %xmm4 1163 subsd %xmm4, %xmm0 1164 1165 movsd 0 * SIZE(BB), %xmm4 1166 mulsd %xmm4, %xmm0 1167#endif 1168 1169#if defined(LN) || defined(LT) 1170 movapd %xmm0, 0 * SIZE(BB) 1171#else 1172 movsd %xmm0, 0 * SIZE(AA) 1173 movsd %xmm1, 1 * 
SIZE(AA) 1174#endif 1175 1176#ifdef LN 1177 subl $1 * SIZE, CO1 1178#endif 1179 1180#if defined(LN) || defined(LT) 1181 movsd %xmm0, 0 * SIZE(CO1) 1182 movhpd %xmm0, 0 * SIZE(CO1, LDC, 1) 1183#else 1184 movsd %xmm0, 0 * SIZE(CO1) 1185 movsd %xmm1, 0 * SIZE(CO1, LDC, 1) 1186#endif 1187 1188#ifndef LN 1189 addl $1 * SIZE, CO1 1190#endif 1191 1192#if defined(LT) || defined(RN) 1193 movl K, %eax 1194 subl KK, %eax 1195 leal (,%eax, SIZE), %eax 1196 leal (AA, %eax, 1), AA 1197 leal (BB, %eax, 2), BB 1198#endif 1199 1200#ifdef LN 1201 subl $1, KK 1202#endif 1203 1204#ifdef LT 1205 addl $1, KK 1206#endif 1207 1208#ifdef RT 1209 movl K, %eax 1210 sall $1 + BASE_SHIFT, %eax 1211 addl %eax, AORIG 1212#endif 1213 ALIGN_4 1214 1215.L50: 1216 movl M, %ebx 1217 sarl $1, %ebx # i = (m >> 2) 1218 jle .L59 1219 ALIGN_4 1220 1221.L41: 1222#ifdef LN 1223 movl K, %eax 1224 sall $1 + BASE_SHIFT, %eax 1225 subl %eax, AORIG 1226#endif 1227 1228#if defined(LN) || defined(RT) 1229 movl KK, %eax 1230 movl AORIG, AA 1231 leal (, %eax, SIZE), %eax 1232 leal (AA, %eax, 2), AA 1233#endif 1234 1235 movl B, BB 1236 1237#if defined(LN) || defined(RT) 1238 movl KK, %eax 1239 sall $1 + BASE_SHIFT, %eax 1240 addl %eax, BB 1241#endif 1242 1243 movapd 0 * SIZE(AA), %xmm0 1244 pxor %xmm4, %xmm4 1245 movapd 8 * SIZE(AA), %xmm1 1246 pxor %xmm5, %xmm5 1247 movddup 0 * SIZE(BB), %xmm2 1248 pxor %xmm6, %xmm6 1249 movddup 8 * SIZE(BB), %xmm3 1250 pxor %xmm7, %xmm7 1251 1252#ifdef LN 1253 prefetchnta -2 * SIZE(CO1) 1254 prefetchnta -2 * SIZE(CO1, LDC, 1) 1255#else 1256 prefetchnta 2 * SIZE(CO1) 1257 prefetchnta 2 * SIZE(CO1, LDC, 1) 1258#endif 1259 1260#if defined(LT) || defined(RN) 1261 movl KK, %eax 1262#else 1263 movl K, %eax 1264 subl KK, %eax 1265#endif 1266 sarl $3, %eax 1267 je .L45 1268 ALIGN_4 1269 1270.L42: 1271 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1272 mulpd %xmm0, %xmm2 1273 addpd %xmm2, %xmm4 1274 movddup 1 * SIZE(BB), %xmm2 1275 mulpd %xmm0, %xmm2 1276 movapd 2 * SIZE(AA), %xmm0 1277 addpd 
%xmm2, %xmm5 1278 movddup 2 * SIZE(BB), %xmm2 1279 mulpd %xmm0, %xmm2 1280 addpd %xmm2, %xmm6 1281 movddup 3 * SIZE(BB), %xmm2 1282 mulpd %xmm0, %xmm2 1283 movapd 4 * SIZE(AA), %xmm0 1284 addpd %xmm2, %xmm7 1285 movddup 4 * SIZE(BB), %xmm2 1286 mulpd %xmm0, %xmm2 1287 addpd %xmm2, %xmm4 1288 movddup 5 * SIZE(BB), %xmm2 1289 mulpd %xmm0, %xmm2 1290 movapd 6 * SIZE(AA), %xmm0 1291 addpd %xmm2, %xmm5 1292 movddup 6 * SIZE(BB), %xmm2 1293 mulpd %xmm0, %xmm2 1294 addpd %xmm2, %xmm6 1295 movddup 7 * SIZE(BB), %xmm2 1296 mulpd %xmm0, %xmm2 1297 movapd 16 * SIZE(AA), %xmm0 1298 addpd %xmm2, %xmm7 1299 movddup 16 * SIZE(BB), %xmm2 1300 mulpd %xmm1, %xmm3 1301 addpd %xmm3, %xmm4 1302 movddup 9 * SIZE(BB), %xmm3 1303 mulpd %xmm1, %xmm3 1304 movapd 10 * SIZE(AA), %xmm1 1305 addpd %xmm3, %xmm5 1306 movddup 10 * SIZE(BB), %xmm3 1307 mulpd %xmm1, %xmm3 1308 addpd %xmm3, %xmm6 1309 movddup 11 * SIZE(BB), %xmm3 1310 mulpd %xmm1, %xmm3 1311 movapd 12 * SIZE(AA), %xmm1 1312 addpd %xmm3, %xmm7 1313 movddup 12 * SIZE(BB), %xmm3 1314 mulpd %xmm1, %xmm3 1315 addpd %xmm3, %xmm4 1316 movddup 13 * SIZE(BB), %xmm3 1317 mulpd %xmm1, %xmm3 1318 movapd 14 * SIZE(AA), %xmm1 1319 addpd %xmm3, %xmm5 1320 movddup 14 * SIZE(BB), %xmm3 1321 mulpd %xmm1, %xmm3 1322 addpd %xmm3, %xmm6 1323 movddup 15 * SIZE(BB), %xmm3 1324 mulpd %xmm1, %xmm3 1325 movapd 24 * SIZE(AA), %xmm1 1326 addpd %xmm3, %xmm7 1327 movddup 24 * SIZE(BB), %xmm3 1328 1329 addl $16 * SIZE, AA 1330 addl $16 * SIZE, BB 1331 decl %eax 1332 jne .L42 1333 ALIGN_4 1334 1335.L45: 1336#if defined(LT) || defined(RN) 1337 movl KK, %eax 1338#else 1339 movl K, %eax 1340 subl KK, %eax 1341#endif 1342 andl $7, %eax # if (k & 1) 1343 BRANCH 1344 je .L48 1345 ALIGN_3 1346 1347.L46: 1348 mulpd %xmm0, %xmm2 1349 addpd %xmm2, %xmm4 1350 movddup 1 * SIZE(BB), %xmm2 1351 mulpd %xmm0, %xmm2 1352 movapd 2 * SIZE(AA), %xmm0 1353 addpd %xmm2, %xmm5 1354 movddup 2 * SIZE(BB), %xmm2 1355 1356 addl $2 * SIZE, AA 1357 addl $2 * SIZE, BB 1358 decl %eax 1359 jg 
.L46 1360 ALIGN_4 1361 1362.L48: 1363 addpd %xmm6, %xmm4 1364 addpd %xmm7, %xmm5 1365 1366#if defined(LN) || defined(RT) 1367 movl KK, %eax 1368#ifdef LN 1369 subl $2, %eax 1370#else 1371 subl $2, %eax 1372#endif 1373 1374 movl AORIG, AA 1375 1376 leal (, %eax, SIZE), %eax 1377 leal (AA, %eax, 2), AA 1378 leal (B, %eax, 2), BB 1379#endif 1380 1381#if defined(LN) || defined(LT) 1382 movapd %xmm4, %xmm0 1383 unpcklpd %xmm5, %xmm4 1384 unpckhpd %xmm5, %xmm0 1385 1386 movapd 0 * SIZE(BB), %xmm2 1387 movapd 2 * SIZE(BB), %xmm3 1388 1389 subpd %xmm4, %xmm2 1390 subpd %xmm0, %xmm3 1391#else 1392 movapd 0 * SIZE(AA), %xmm0 1393 movapd 2 * SIZE(AA), %xmm1 1394 1395 subpd %xmm4, %xmm0 1396 subpd %xmm5, %xmm1 1397#endif 1398 1399#ifdef LN 1400 movddup 3 * SIZE(AA), %xmm4 1401 mulpd %xmm4, %xmm3 1402 1403 movddup 2 * SIZE(AA), %xmm4 1404 mulpd %xmm3, %xmm4 1405 subpd %xmm4, %xmm2 1406 1407 movddup 0 * SIZE(AA), %xmm4 1408 mulpd %xmm4, %xmm2 1409 1410#endif 1411 1412#ifdef LT 1413 movddup 0 * SIZE(AA), %xmm4 1414 mulpd %xmm4, %xmm2 1415 1416 movddup 1 * SIZE(AA), %xmm4 1417 mulpd %xmm2, %xmm4 1418 subpd %xmm4, %xmm3 1419 1420 movddup 3 * SIZE(AA), %xmm4 1421 mulpd %xmm4, %xmm3 1422#endif 1423 1424#ifdef RN 1425 movddup 0 * SIZE(BB), %xmm4 1426 mulpd %xmm4, %xmm0 1427 1428 movddup 1 * SIZE(BB), %xmm4 1429 mulpd %xmm0, %xmm4 1430 subpd %xmm4, %xmm1 1431 1432 movddup 3 * SIZE(BB), %xmm4 1433 mulpd %xmm4, %xmm1 1434#endif 1435 1436#ifdef RT 1437 movddup 3 * SIZE(BB), %xmm4 1438 mulpd %xmm4, %xmm1 1439 1440 movddup 2 * SIZE(BB), %xmm4 1441 mulpd %xmm1, %xmm4 1442 subpd %xmm4, %xmm0 1443 1444 movddup 0 * SIZE(BB), %xmm4 1445 mulpd %xmm4, %xmm0 1446#endif 1447 1448#if defined(LN) || defined(LT) 1449 movapd %xmm2, 0 * SIZE(BB) 1450 movapd %xmm3, 2 * SIZE(BB) 1451#else 1452 movapd %xmm0, 0 * SIZE(AA) 1453 movapd %xmm1, 2 * SIZE(AA) 1454#endif 1455 1456#ifdef LN 1457 subl $2 * SIZE, CO1 1458#endif 1459 1460#if defined(LN) || defined(LT) 1461 movsd %xmm2, 0 * SIZE(CO1) 1462 movsd %xmm3, 1 
* SIZE(CO1)	/* NOTE(review): tail of a movsd store split at the chunk boundary —
			   the mnemonic and first operand are in the preceding (unseen) part
			   of the file; verify against the full source. */
	movhpd	%xmm2, 0 * SIZE(CO1, LDC, 1)	/* high halves -> second column (C + LDC) */
	movhpd	%xmm3, 1 * SIZE(CO1, LDC, 1)
#else
	/* RN/RT variants: xmm0 holds column 0 pair, xmm1 holds column 1 pair */
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
	movsd	%xmm1, 0 * SIZE(CO1, LDC, 1)
	movhpd	%xmm1, 1 * SIZE(CO1, LDC, 1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1		/* advance C pointer by the 2 rows just solved
					   (LN walks C backwards and decrements instead) */
#endif

#if defined(LT) || defined(RN)
	/* skip AA/BB past the K-KK columns not touched by this panel */
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	/* A panel is 2 rows wide */
	leal	(BB, %eax, 2), BB	/* B panel is 2 columns wide */
#endif

#ifdef LN
	subl	$2, KK			/* KK = running diagonal offset; 2 rows consumed */
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax	/* 2 * K * SIZE bytes */
	addl	%eax, AORIG
#endif

	decl	%ebx			/* i-- : next 2-row block of M */
	jg	.L41
	ALIGN_4

/* ---- epilogue of the N%4==2 column-pair loop: fix up B and KK ---- */
.L59:
#ifdef LN
	movl	K, %eax
	leal	(, %eax, SIZE), %eax
	leal	(B, %eax, 2), B		/* B += 2*K elements (the pair of columns) */
#endif

#if defined(LT) || defined(RN)
	movl	BB, B			/* BB already walked to the end of this panel */
#endif

#ifdef RN
	addl	$2, KK
#endif

#ifdef RT
	subl	$2, KK
#endif
	ALIGN_4

/* ================= last single column (N odd) ================= */
.L60:
	testl	$1, N
	je	.L999			/* no leftover column -> done */

#if defined(LT) || defined(RN)
	movl	A, AA
#else
	movl	A, %eax
	movl	%eax, AORIG		/* LN/RT re-derive AA per row block from AORIG */
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, B			/* step B back one K-long column */
#endif

#ifdef RT
	subl	LDC, C			/* RT walks C right-to-left */
#endif
	movl	C, CO1
#ifndef RT
	addl	LDC, C
#endif

#ifdef LN
	movl	OFFSET, %eax
	addl	M, %eax
	movl	%eax, KK		/* LN starts from the bottom: KK = M + OFFSET */
#endif

#ifdef LT
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	/* ---- M odd: 1x1 micro-kernel (scalar dot product + solve) ---- */
	movl	M, %ebx
	testl	$1, %ebx		/* m & 1 */
	jle	.L80

#ifdef LN
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	subl	%eax, AORIG		/* step back one K-long row of A */
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(AA, %eax, SIZE), AA	/* AA = AORIG + KK elements */
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	/* preload A/B pairs; xmm4..7 are the four accumulators */
	movsd	0 * SIZE(AA), %xmm0
	movhpd	1 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movsd	8 * SIZE(AA), %xmm1
	movhpd	9 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movsd	0 * SIZE(BB), %xmm2
	movhpd	1 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movsd	8 * SIZE(BB), %xmm3
	movhpd	9 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#if defined(LT) || defined(RN)
	movl	KK, %eax		/* inner length = KK */
#else
	movl	K, %eax
	subl	KK, %eax		/* inner length = K - KK */
#endif
	sarl	$4, %eax		/* main loop handles 16 k-steps per pass */
	je	.L85
	ALIGN_4

/* 1x1 main loop: 16 packed-pair multiply-adds per iteration,
   software-pipelined (next A/B tiles loaded while current ones multiply) */
.L82:
	PREFETCH  (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	%xmm0, %xmm2
	movapd	2 * SIZE(AA), %xmm0
	addpd	%xmm2, %xmm4
	mulpd	2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm5
	movapd	4 * SIZE(AA), %xmm0
	mulpd	4 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm6
	movapd	6 * SIZE(AA), %xmm0
	mulpd	6 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0
	mulpd	%xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	%xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm1
	movapd	24 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm5
	movapd	12 * SIZE(AA), %xmm1
	mulpd	12 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm6
	movapd	14 * SIZE(AA), %xmm1
	mulpd	14 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1

	addl	$16 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L82
	ALIGN_4

.L85:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$15, %eax		/* k & 15 : leftover iterations */
	BRANCH
	je	.L88

/* scalar remainder: one multiply-accumulate per k */
.L86:
	mulsd	%xmm0, %xmm2
	movsd	1 * SIZE(AA), %xmm0
	addsd	%xmm2, %xmm4
	movsd	1 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4

/* reduce the four accumulators to one scalar dot product in xmm4 */
.L88:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	haddpd	%xmm4, %xmm4		/* horizontal add of the two lanes */

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$1, %eax		/* NOTE(review): both branches are identical for
					   the 1x1 case; kept as written */
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	addl	%eax, AA		/* point AA/BB at the diagonal element */
	leal	(B, %eax, 1), BB
#endif

	/* b (or a) := stored value - accumulated dot product */
#if defined(LN) || defined(LT)
	movsd	0 * SIZE(BB), %xmm0
	subsd	%xmm4, %xmm0
#else
	movsd	0 * SIZE(AA), %xmm0
	subsd	%xmm4, %xmm0
#endif

	/* multiply by the (pre-inverted, per BLAS kernel convention — verify)
	   diagonal entry for each variant */
#ifdef LN
	movsd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#ifdef LT
	movsd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#ifdef RN
	movsd	0 * SIZE(BB), %xmm4
	mulsd	%xmm4, %xmm0
#endif

#ifdef RT
	movsd	0 * SIZE(BB), %xmm4
	mulsd	%xmm4, %xmm0
#endif

	/* write the solved value back into the packed panel ... */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(BB)
#else
	movsd	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$1 * SIZE, CO1
#endif

	/* ... and into C (both branches store the same single element) */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
#endif

#ifndef LN
	addl	$1 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	addl	%eax, AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$1, KK
#endif

#ifdef LT
	addl	$1, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif
	ALIGN_4


/* ---- 2-row blocks of M for the single remaining column ---- */
.L80:
	movl	M, %ebx
	sarl	$1, %ebx		/* i = m >> 1 : number of 2-row blocks */
	jle	.L89
	ALIGN_4

.L71:
#ifdef LN
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax	/* 2 * K * SIZE bytes */
	subl	%eax, AORIG
#endif

#if defined(LN) || defined(RT)
	movl	KK, %eax
	movl	AORIG, AA
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	/* AA = AORIG + 2*KK elements */
#endif

	movl	B, BB

#if defined(LN) || defined(RT)
	movl	KK, %eax
	sall	$BASE_SHIFT, %eax
	addl	%eax, BB
#endif

	/* preload; movddup broadcasts the B scalar to both lanes */
	movapd	0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movddup	0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movddup	4 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifdef LN
	prefetchnta	-2 * SIZE(CO1)	/* LN stores below CO1 */
#else
	prefetchnta	2 * SIZE(CO1)
#endif

#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	sarl	$3, %eax		/* main loop handles 8 k-steps per pass */
	je	.L75
	ALIGN_4

/* 2x1 main loop: 8 broadcast-multiply-adds per iteration
   (A advances 16 elements, B advances 8) */
.L72:
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
	mulpd	%xmm2, %xmm0
	movddup	1 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm4
	movapd	16 * SIZE(AA), %xmm0
	mulpd	2 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm5
	movddup	2 * SIZE(BB), %xmm2
	mulpd	4 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm6
	movddup	3 * SIZE(BB), %xmm2
	mulpd	6 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm7
	movddup	8 * SIZE(BB), %xmm2
	mulpd	%xmm3, %xmm1
	movddup	5 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm4
	movapd	24 * SIZE(AA), %xmm1
	mulpd	10 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm5
	movddup	6 * SIZE(BB), %xmm3
	mulpd	12 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm6
	movddup	7 * SIZE(BB), %xmm3
	mulpd	14 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3

	addl	$16 * SIZE, AA
	addl	$ 8 * SIZE, BB
	decl	%eax
	jne	.L72
	ALIGN_4

.L75:
#if defined(LT) || defined(RN)
	movl	KK, %eax
#else
	movl	K, %eax
	subl	KK, %eax
#endif
	andl	$7, %eax		/* k & 7 : leftover iterations */
	BRANCH
	je	.L78
	ALIGN_3

/* remainder: one 2-wide multiply-accumulate per k */
.L76:
	mulpd	%xmm2, %xmm0
	movddup	1 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm4
	movapd	2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

/* reduce, then solve the 2x2 triangular system for this block */
.L78:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

#if defined(LN) || defined(RT)
	movl	KK, %eax
#ifdef LN
	subl	$2, %eax		/* diagonal block starts 2 rows up for LN */
#else
	subl	$1, %eax
#endif

	movl	AORIG, AA

	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(B, %eax, 1), BB
#endif

#if defined(LN) || defined(LT)
	movapd	0 * SIZE(BB), %xmm1

	subpd	%xmm4, %xmm1		/* rhs - accumulated products */

	movapd	%xmm1, %xmm0		/* xmm0 = low element, xmm1 = high element */
	unpckhpd %xmm1, %xmm1
#else
	movapd	0 * SIZE(AA), %xmm0

	subpd	%xmm4, %xmm0
#endif

#ifdef LN
	/* back-substitution: solve row 1 first, then eliminate into row 0
	   (A holds the 2x2 diagonal block; diagonal assumed pre-inverted — verify) */
	movsd	3 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm1

	movsd	2 * SIZE(AA), %xmm4
	mulsd	%xmm1, %xmm4
	subsd	%xmm4, %xmm0

	movsd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm0

#endif

#ifdef LT
	/* forward substitution: row 0 first, then eliminate into row 1 */
	movsd	0 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm0

	movsd	1 * SIZE(AA), %xmm4
	mulsd	%xmm0, %xmm4
	subsd	%xmm4, %xmm1

	movsd	3 * SIZE(AA), %xmm4
	mulsd	%xmm4, %xmm1
#endif

#ifdef RN
	/* single column: just scale both rows by the B diagonal entry */
	movddup	0 * SIZE(BB), %xmm4
	mulpd	%xmm4, %xmm0
#endif

#ifdef RT
	movddup	0 * SIZE(BB), %xmm4
	mulpd	%xmm4, %xmm0
#endif

	/* write solved values back into the packed panel ... */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(BB)
	movsd	%xmm1, 1 * SIZE(BB)
#else
	movapd	%xmm0, 0 * SIZE(AA)
#endif

#ifdef LN
	subl	$2 * SIZE, CO1
#endif

	/* ... and into C */
#if defined(LN) || defined(LT)
	movsd	%xmm0, 0 * SIZE(CO1)
	movsd	%xmm1, 1 * SIZE(CO1)
#else
	movsd	%xmm0, 0 * SIZE(CO1)
	movhpd	%xmm0, 1 * SIZE(CO1)
#endif

#ifndef LN
	addl	$2 * SIZE, CO1
#endif

#if defined(LT) || defined(RN)
	movl	K, %eax
	subl	KK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	addl	%eax, BB
#endif

#ifdef LN
	subl	$2, KK
#endif

#ifdef LT
	addl	$2, KK
#endif

#ifdef RT
	movl	K, %eax
	sall	$1 + BASE_SHIFT, %eax
	addl	%eax, AORIG
#endif

	decl	%ebx			/* i-- : next 2-row block */
	jg	.L71
	ALIGN_4

/* ---- epilogue of the single-column pass: fix up B and KK ---- */
.L89:
#ifdef LN
	movl	K, %eax
	leal	(B, %eax, SIZE), B	/* B += K elements (one column) */
#endif

#if defined(LT) || defined(RN)
	movl	BB, B
#endif

#ifdef RN
	addl	$1, KK
#endif

#ifdef RT
	subl	$1, KK
#endif
	ALIGN_4

/* ---- function epilogue: restore callee-saved registers (cdecl i386)
   in reverse push order, release the locals area, return ---- */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE