/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Scalar SSE2 matrix-vector kernel, 32-bit x86, AT&T syntax (fed through
 * the C preprocessor).
 *
 * What the code below does:
 *   1. Zero a work area at BUFFER.
 *   2. For every column j of A (two columns per pass, then one leftover
 *      column when N is odd):
 *          BUFFER[i] += (ALPHA * x[j]) * A[i, j]      for i = 0 .. M-1
 *      where consecutive rows of a column are SIZE bytes apart and
 *      consecutive columns are LDA elements apart.
 *   3. y[i * incy] += BUFFER[i]                       for i = 0 .. M-1
 *
 * Stack arguments (offsets are relative to %esp at entry; STACKSIZE
 * accounts for the four registers pushed in the prologue):
 *    4  M       row count
 *    8  N       column count
 *   16  ALPHA   scalar, loaded with movsd (8 bytes)
 *   24  A       matrix base pointer
 *   28  LDA     column stride in elements
 *   32  X       input vector pointer
 *   36  INCX    x stride in elements
 *   40  Y       output vector pointer
 *   44  INCY    y stride in elements
 *   48  BUFFER  work area; written with movapd, so it must be 16-byte
 *               aligned and hold at least ((M + 16) >> 4) * 16 elements
 *
 * Saved/restored: %ebp, %edi, %esi, %ebx.
 * Clobbered:      %eax, %ecx, %edx, %xmm0-%xmm7, flags, and the A
 *                 argument slot on the stack.
 *
 * NOTE(review): PROLOGUE, PROFCODE, EPILOGUE, SIZE, BASE_SHIFT, ALIGN_3,
 * ALIGN_4 and BRANCH come from common.h, which is not visible here.
 * SIZE is presumably the element size in bytes (8 for the movsd/mulsd
 * arithmetic used below) and BASE_SHIFT its log2 -- confirm there.
 */

#define ASSEMBLER
#include "common.h"

#ifdef ATOM
#define PREFETCH	prefetchnta
#define PREFETCHW	prefetcht0
#define PREFETCHSIZE	(8 * 6)
#endif

#define STACKSIZE	16

#define M		 4 + STACKSIZE(%esp)
#define N		 8 + STACKSIZE(%esp)
#define ALPHA		16 + STACKSIZE(%esp)
#define A		24 + STACKSIZE(%esp)
#define STACK_LDA	28 + STACKSIZE(%esp)
#define STACK_X		32 + STACKSIZE(%esp)
#define STACK_INCX	36 + STACKSIZE(%esp)
#define Y		40 + STACKSIZE(%esp)
#define STACK_INCY	44 + STACKSIZE(%esp)
#define BUFFER		48 + STACKSIZE(%esp)

/* I: row-block counter.  J: column-pair counter. */
#define I	%eax
#define J	%ebx

/* INCY shares %ebx with J; it is only live after the column loops. */
#define INCX	%ecx
#define INCY	J

/* A1: current column pointer.  Y1: current buffer / y pointer. */
#define A1	%esi
#define X	%edx
#define Y1	%edi
#define LDA	%ebp

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_LDA,  LDA
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

	/* Convert both strides from elements to bytes. */
	leal	(,INCX, SIZE), INCX
	leal	(,LDA,  SIZE), LDA

	/* Bias A by +16 elements; every A1 access below uses a -16..-7 offset. */
	subl	$-16 * SIZE, A

	/* Nothing to do for an empty matrix. */
	cmpl	$0, N
	jle	.L999
	cmpl	$0, M
	jle	.L999

	movl	BUFFER, Y1

	pxor	%xmm7, %xmm7

	/* Number of 16-element chunks to clear: (M + 16) >> 4. */
	movl	M,   %eax
	addl	$16, %eax
	sarl	$4,  %eax
	ALIGN_3

/* Clear the work buffer, 16 elements per iteration. */
.L01:
	movapd	%xmm7,  0 * SIZE(Y1)
	movapd	%xmm7,  2 * SIZE(Y1)
	movapd	%xmm7,  4 * SIZE(Y1)
	movapd	%xmm7,  6 * SIZE(Y1)
	movapd	%xmm7,  8 * SIZE(Y1)
	movapd	%xmm7, 10 * SIZE(Y1)
	movapd	%xmm7, 12 * SIZE(Y1)
	movapd	%xmm7, 14 * SIZE(Y1)
	subl	$-16 * SIZE, Y1
	decl	%eax
	jg	.L01
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Two columns per pass: J = N >> 1.                                   */
/* ------------------------------------------------------------------ */
.L10:
	movl	N,  J
	sarl	$1, J
	jle	.L20
	ALIGN_3

.L11:
	/* Y1 = BUFFER biased by +16 elements (matches the bias on A). */
	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	/* A1 = current column pair; advance the saved A by two columns. */
	movl	A,  A1
	leal	(A1,  LDA, 2), %eax
	movl	%eax, A

	/* xmm6 = ALPHA * x[j], xmm7 = ALPHA * x[j + 1]. */
	movsd	(X), %xmm6
	addl	INCX, X
	movsd	(X), %xmm7
	addl	INCX, X

	movsd	ALPHA, %xmm0

	mulsd	%xmm0, %xmm6
	mulsd	%xmm0, %xmm7

	/* Preload the first two accumulators from the buffer. */
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1

	movl	M,   I
	sarl	$3,  I
	jle	.L15

	/* Software pipeline: load and start the first products early. */
	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movsd	-15 * SIZE(A1, LDA), %xmm5

	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3

	decl	I
	jle	.L14
	ALIGN_3

/* Steady state: 8 rows x 2 columns per iteration. */
.L13:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-14 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	-13 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	-11 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	-11 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1, LDA)
#endif

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	 -9 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-10 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	 -9 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	 -8 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	 -7 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	 -8 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	 -7 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L13
	ALIGN_3

/* Pipeline drain: last 8-row block, no A loads beyond the block. */
.L14:
	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-14 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	-13 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-12 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	-11 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-12 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	-11 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	 -9 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-10 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	 -9 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1

	/* Store, then preload the accumulators for the row remainder. */
	movlpd	%xmm0, -10 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	movlpd	%xmm1,  -9 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

/* Row remainder, 4 rows (bit 2 of M). */
.L15:
	testl	$4, M
	je	.L16

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movsd	-15 * SIZE(A1, LDA), %xmm5

	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	addsd	%xmm4, %xmm0
	movsd	-14 * SIZE(A1, LDA), %xmm4
	mulsd	%xmm6, %xmm2
	addsd	%xmm5, %xmm1
	movsd	-13 * SIZE(A1, LDA), %xmm5
	mulsd	%xmm6, %xmm3

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-13 * SIZE(Y1), %xmm1

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1

	movlpd	%xmm0, -14 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -13 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

/* Row remainder, 2 rows (bit 1 of M). */
.L16:
	testl	$2, M
	je	.L17

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3
	movsd	-16 * SIZE(A1, LDA), %xmm4
	movsd	-15 * SIZE(A1, LDA), %xmm5

	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3

	mulsd	%xmm7, %xmm4
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm5
	addsd	%xmm3, %xmm1

	addsd	%xmm4, %xmm0
	addsd	%xmm5, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

/* Row remainder, 1 row (bit 0 of M). */
.L17:
	testl	$1, M
	je	.L19

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(A1, LDA), %xmm3

	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0
	mulsd	%xmm7, %xmm3
	addsd	%xmm3, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

.L19:
	decl	J
	jg	.L11
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Leftover single column when N is odd.                               */
/* ------------------------------------------------------------------ */
.L20:
	/* testl clears OF and the mask keeps SF clear, so jle acts as je. */
	testl	$1, N
	jle	.L990

	movl	BUFFER, Y1
	addl	$16 * SIZE, Y1

	movl	A,  A1

	/*
	 * Only one x element remains.  Loading a second one here (as the
	 * two-column path does) would read one stride past the end of x;
	 * the value was never used, so the load is simply omitted.  X and
	 * the saved A are not advanced either: neither is read again
	 * before .L990 reassigns X.
	 */
	movsd	(X), %xmm6

	movsd	ALPHA, %xmm0

	/* xmm6 = ALPHA * x[N - 1]. */
	mulsd	%xmm0, %xmm6

	/* Four accumulators this time: xmm0, xmm1, xmm4, xmm5. */
	movsd	-16 * SIZE(Y1), %xmm0
	movsd	-15 * SIZE(Y1), %xmm1
	movsd	-14 * SIZE(Y1), %xmm4
	movsd	-13 * SIZE(Y1), %xmm5

	movl	M,   I
	sarl	$3,  I
	jle	.L25

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3

	decl	I
	jle	.L24
	ALIGN_3

/* Steady state: 8 rows x 1 column per iteration. */
.L23:
#ifdef PREFETCH
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(A1)
#endif

	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	mulsd	%xmm6, %xmm3
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addsd	%xmm2, %xmm4
	movsd	-12 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm5
	movsd	-11 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm4, -14 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm4
	mulsd	%xmm6, %xmm3
	movlpd	%xmm5, -13 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm5

	addsd	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm1
	movsd	 -9 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm0, -12 * SIZE(Y1)
	movsd	 -8 * SIZE(Y1), %xmm0
	mulsd	%xmm6, %xmm3
	movlpd	%xmm1, -11 * SIZE(Y1)
	movsd	 -7 * SIZE(Y1), %xmm1

	addsd	%xmm2, %xmm4
	movsd	 -8 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm5
	movsd	 -7 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm4, -10 * SIZE(Y1)
	movsd	 -6 * SIZE(Y1), %xmm4
	mulsd	%xmm6, %xmm3
	movlpd	%xmm5,  -9 * SIZE(Y1)
	movsd	 -5 * SIZE(Y1), %xmm5

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1

	subl	$1, I
	BRANCH
	jg	.L23
	ALIGN_3

/* Pipeline drain for the single-column path. */
.L24:
	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	mulsd	%xmm6, %xmm3
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addsd	%xmm2, %xmm4
	movsd	-12 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm5
	movsd	-11 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm4, -14 * SIZE(Y1)
	movsd	-10 * SIZE(Y1), %xmm4
	mulsd	%xmm6, %xmm3
	movlpd	%xmm5, -13 * SIZE(Y1)
	movsd	 -9 * SIZE(Y1), %xmm5

	addsd	%xmm2, %xmm0
	movsd	-10 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm1
	movsd	 -9 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm0, -12 * SIZE(Y1)
	mulsd	%xmm6, %xmm3
	movlpd	%xmm1, -11 * SIZE(Y1)

	addsd	%xmm2, %xmm4
	movsd	 -8 * SIZE(Y1), %xmm0
	addsd	%xmm3, %xmm5
	movsd	 -7 * SIZE(Y1), %xmm1

	movlpd	%xmm4, -10 * SIZE(Y1)
	movsd	 -6 * SIZE(Y1), %xmm4
	movlpd	%xmm5,  -9 * SIZE(Y1)
	movsd	 -5 * SIZE(Y1), %xmm5

	subl	$-8 * SIZE, A1
	subl	$-8 * SIZE, Y1
	ALIGN_3

/* Row remainder, 4 rows. */
.L25:
	testl	$4, M
	je	.L26

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3
	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3

	addsd	%xmm2, %xmm0
	movsd	-14 * SIZE(A1), %xmm2
	addsd	%xmm3, %xmm1
	movsd	-13 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-12 * SIZE(Y1), %xmm0
	mulsd	%xmm6, %xmm3
	movlpd	%xmm1, -15 * SIZE(Y1)
	movsd	-11 * SIZE(Y1), %xmm1

	addsd	%xmm2, %xmm4
	addsd	%xmm3, %xmm5

	movlpd	%xmm4, -14 * SIZE(Y1)
	movlpd	%xmm5, -13 * SIZE(Y1)

	addl	$4 * SIZE, A1
	addl	$4 * SIZE, Y1
	ALIGN_3

/* Row remainder, 2 rows. */
.L26:
	testl	$2, M
	je	.L27

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-15 * SIZE(A1), %xmm3

	mulsd	%xmm6, %xmm2
	mulsd	%xmm6, %xmm3
	addsd	%xmm2, %xmm0
	addsd	%xmm3, %xmm1

	movlpd	%xmm0, -16 * SIZE(Y1)
	movsd	-14 * SIZE(Y1), %xmm0
	movlpd	%xmm1, -15 * SIZE(Y1)

	addl	$2 * SIZE, A1
	addl	$2 * SIZE, Y1
	ALIGN_3

/* Row remainder, 1 row. */
.L27:
	testl	$1, M
	je	.L990

	movsd	-16 * SIZE(A1), %xmm2
	movsd	-16 * SIZE(Y1), %xmm0

	mulsd	%xmm6, %xmm2
	addsd	%xmm2, %xmm0

	movsd	%xmm0, -16 * SIZE(Y1)
	ALIGN_3

/* ------------------------------------------------------------------ */
/* y[i * incy] += BUFFER[i].                                           */
/* Y1 walks y for loads, A1 walks y for stores, X walks the buffer.    */
/* ------------------------------------------------------------------ */
.L990:
	movl	Y,   Y1
	movl	BUFFER, X
	movl	Y1,  A1

	/* INCY in bytes. */
	movl	STACK_INCY, INCY
	sall	$BASE_SHIFT, INCY

	movl	M,   %eax
	sarl	$3,  %eax
	jle	.L994
	ALIGN_3

/* 8 elements per iteration. */
.L992:
	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1
	movsd	(Y1), %xmm2
	addl	INCY, Y1
	movsd	(Y1), %xmm3
	addl	INCY, Y1
	movsd	(Y1), %xmm4
	addl	INCY, Y1
	movsd	(Y1), %xmm5
	addl	INCY, Y1
	movsd	(Y1), %xmm6
	addl	INCY, Y1
	movsd	(Y1), %xmm7
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3
	addsd	4 * SIZE(X), %xmm4
	addsd	5 * SIZE(X), %xmm5
	addsd	6 * SIZE(X), %xmm6
	addsd	7 * SIZE(X), %xmm7

	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1
	movlpd	%xmm2, (A1)
	addl	INCY, A1
	movlpd	%xmm3, (A1)
	addl	INCY, A1
	movlpd	%xmm4, (A1)
	addl	INCY, A1
	movlpd	%xmm5, (A1)
	addl	INCY, A1
	movlpd	%xmm6, (A1)
	addl	INCY, A1
	movlpd	%xmm7, (A1)
	addl	INCY, A1

	addl	$8 * SIZE, X
	decl	%eax
	jg	.L992
	ALIGN_3

/* Remainder of the copy-back: 4, then 2, then 1 element. */
.L994:
	testl	$7, M
	jle	.L999

	testl	$4, M
	jle	.L995

	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1
	movsd	(Y1), %xmm2
	addl	INCY, Y1
	movsd	(Y1), %xmm3
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1
	addsd	2 * SIZE(X), %xmm2
	addsd	3 * SIZE(X), %xmm3

	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1
	movlpd	%xmm2, (A1)
	addl	INCY, A1
	movlpd	%xmm3, (A1)
	addl	INCY, A1

	addl	$4 * SIZE, X
	ALIGN_3

.L995:
	testl	$2, M
	jle	.L996

	movsd	(Y1), %xmm0
	addl	INCY, Y1
	movsd	(Y1), %xmm1
	addl	INCY, Y1

	addsd	0 * SIZE(X), %xmm0
	addsd	1 * SIZE(X), %xmm1

	movlpd	%xmm0, (A1)
	addl	INCY, A1
	movlpd	%xmm1, (A1)
	addl	INCY, A1

	addl	$2 * SIZE, X
	ALIGN_3

.L996:
	testl	$1, M
	jle	.L999

	movsd	(Y1), %xmm0

	addsd	0 * SIZE(X), %xmm0

	movlpd	%xmm0, (A1)
	ALIGN_3

/* Restore the saved registers in reverse push order and return. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE