/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Index of the extreme |re| + |im| element of a complex vector.
 *
 * Syntax / target: AT&T syntax, 32-bit x86, SSE2 double-precision
 * instructions, assembled through the C preprocessor.
 *
 * Stack arguments (read after the four register pushes below):
 *     M     element count        (pointer to it when F_INTERFACE is set)
 *     X     address of the first element; each element is two
 *           consecutive SIZE-sized values, real part then imaginary part
 *     INCX  stride in elements   (pointer to it when F_INTERFACE is set)
 *
 * Result in %eax:
 *     0 when M <= 0 or INCX <= 0, otherwise the 1-based position of the
 *     first element whose |re| + |im| equals the maximum over the vector
 *     (the minimum when USE_MIN is defined, which renames maxpd/maxsd).
 *
 * The routine makes two passes over X:
 *     pass 1 computes the extreme value of |re| + |im|,
 *     pass 2 rescans from the start and stops at the first element that
 *            compares equal to that value.
 * Each pass has a contiguous variant (byte stride == 2 * SIZE) and a
 * strided variant.
 *
 * %ebp, %edi, %esi and %ebx are saved on entry and restored on exit.
 * %ecx, %edx, %xmm0-%xmm4 and %xmm7 are overwritten.
 *
 * NOTE(review): SIZE, ZBASE_SHIFT, PROLOGUE, PROFCODE, EPILOGUE, ALIGN_n
 * and the PREFETCH* macros come from the included headers; SIZE is
 * presumably sizeof(double) and 1 << ZBASE_SHIFT presumably the byte
 * size of one complex element -- confirm against common.h / l1param.h.
 */

#define ASSEMBLER
#include "common.h"

/* STACK is the number of bytes pushed by this routine before the */
/* arguments are read (four 32-bit registers).                    */
#define STACK	16
#define ARGS	0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/* Register roles.                                                     */
/*   RET   running 1-based index, final return value                   */
/*   M     element count as passed in                                  */
/*   X     base address of the vector, never advanced                  */
/*   INCX  stride, converted to bytes once the arguments are validated */
/*   I     unrolled-loop trip counter                                  */
/*   MM    working copy of the element count                           */
/*   XX    moving read cursor                                          */
/*   TEMP  scratch; shares %ebx with M, so M is lost the first time    */
/*         TEMP is written (only in pass 2, after M was copied to MM)  */
#define RET	%eax
#define	M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

/* The minimum variant reuses this source with min instructions. */
#ifdef USE_MIN
#define maxpd	minpd
#define maxsd	minsd
#endif

#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M,    M
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* M and INCX are passed by reference in this configuration. */
	movl	(M), M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm7, %xmm7
	xor	RET, RET		/* result is 0 on the early exits below */
	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	sall	$ZBASE_SHIFT, INCX	/* stride: elements -> bytes */
	movl	M, MM
	movl	X, XX

	/* xmm7 = 0x7fffffffffffffff in both lanes (sign-clearing mask). */
	/* xmm7 is zero here, so the self-compare is true in both lanes  */
	/* and yields all ones; the shift then drops the sign bit.       */
	cmpeqpd	%xmm7, %xmm7
	psrlq	$1, %xmm7

	/* Seed the running extreme with element 0: |re| + |im|,         */
	/* duplicated into both lanes of xmm0.                           */
	movsd	0 * SIZE(XX), %xmm0
	movsd	1 * SIZE(XX), %xmm1
	addl	INCX, XX
	decl	MM			/* MM = elements left for pass 1 */
	andpd	%xmm7, %xmm0
	andpd	%xmm7, %xmm1
	addpd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0
	cmpl	$2 * SIZE, INCX		/* contiguous layout? */
	jne	.L60

/* ------------------------------------------------------------------ */
/* Pass 1, contiguous: fold |re| + |im| into xmm0, 8 elements per trip */
/* ------------------------------------------------------------------ */
	movl	MM, I
	sarl	$3, I
	jle	.L25
	ALIGN_4

.L21:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Two elements at a time: real parts gathered into one register, */
	/* imaginary parts into another, then |re| + |im| per lane.       */
	movsd	 0 * SIZE(XX), %xmm1
	movsd	 1 * SIZE(XX), %xmm2
	movhpd	 2 * SIZE(XX), %xmm1
	movhpd	 3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	 4 * SIZE(XX), %xmm3
	movsd	 5 * SIZE(XX), %xmm4
	movhpd	 6 * SIZE(XX), %xmm3
	movhpd	 7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movsd	 8 * SIZE(XX), %xmm1
	movsd	 9 * SIZE(XX), %xmm2
	movhpd	10 * SIZE(XX), %xmm1
	movhpd	11 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	12 * SIZE(XX), %xmm3
	movsd	13 * SIZE(XX), %xmm4
	movhpd	14 * SIZE(XX), %xmm3
	movhpd	15 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	addl	$16 * SIZE, XX
	decl	I
	jg	.L21
	ALIGN_4

.L25:
	/* Remainder of pass 1: up to 7 elements, handled as 4 + 2 + 1. */
	andl	$7, MM
	jle	.L30

	testl	$4, MM
	je	.L26

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0
	addl	$8 * SIZE, XX
	ALIGN_3

.L26:
	testl	$2, MM
	je	.L27

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	addl	$4 * SIZE, XX
	ALIGN_3

.L27:
	testl	$1, MM
	je	.L30

	/* Single trailing element: only the low lane takes part. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Pass 2, contiguous: rescan from X and locate the extreme value      */
/* ------------------------------------------------------------------ */
.L30:
	movl	X, XX
	movl	M, MM			/* last read of M before TEMP reuses %ebx */

	/* Collapse the two lanes of xmm0 into one value and broadcast it. */
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	/* Hint relative to the moving cursor XX; X is never advanced, */
	/* so basing the hint on X would target one fixed address.     */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Screen 4 elements at once for equality with the extreme value. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movhpd	2 * SIZE(XX), %xmm1
	movhpd	3 * SIZE(XX), %xmm2
	movsd	4 * SIZE(XX), %xmm3
	movsd	5 * SIZE(XX), %xmm4
	movhpd	6 * SIZE(XX), %xmm3
	movhpd	7 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3

	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP		/* overwrites M (same register) */
	testl	$3, TEMP
	jne	.L33			/* a match is somewhere in this group */

	addl	$8 * SIZE, XX
	addl	$4, RET
	decl	I
	jg	.L31
	jmp	.L35
	ALIGN_4

.L33:
	/* XX still points at the start of the matching group. Walk its */
	/* 4 elements one by one; RET is bumped before each comparison  */
	/* so that it holds the 1-based index on exit.                  */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	4 * SIZE(XX), %xmm1
	movsd	5 * SIZE(XX), %xmm2
	movsd	6 * SIZE(XX), %xmm3
	movsd	7 * SIZE(XX), %xmm4
	addl	$8 * SIZE, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L35:
	/* Fewer than 4 elements remain: test a pair if bit 1 of MM is set. */
	testl	$2, MM
	je	.L36

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	movsd	2 * SIZE(XX), %xmm3
	movsd	3 * SIZE(XX), %xmm4
	addl	$4 * SIZE, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L36:
	/* Nothing matched so far: the next position is returned */
	/* without being compared.                               */
	incl	RET
	jmp	.L999
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Pass 1, strided: same reduction, stepping XX by INCX bytes          */
/* ------------------------------------------------------------------ */
.L60:
	movl	MM, I
	sarl	$3, I
	jle	.L65
	ALIGN_4

.L61:
#ifdef PREFETCH
	/* Hint relative to the moving cursor XX; X is never advanced, */
	/* so basing the hint on X would target one fixed address.     */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

#if defined(PREFETCH) && !defined(FETCH128)
	PREFETCH (PREFETCHSIZE + 64) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0

	decl	I
	jg	.L61
	ALIGN_4

.L65:
	/* Remainder of pass 1: up to 7 elements, handled as 4 + 2 + 1. */
	andl	$7, MM
	jle	.L70

	testl	$4, MM
	je	.L66

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4
	addpd	%xmm4, %xmm3
	maxpd	%xmm3, %xmm0
	ALIGN_3

.L66:
	testl	$2, MM
	je	.L67

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxpd	%xmm1, %xmm0
	ALIGN_3

.L67:
	testl	$1, MM
	je	.L70

	/* Single trailing element: only the low lane takes part. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	addpd	%xmm2, %xmm1
	maxsd	%xmm1, %xmm0
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Pass 2, strided: rescan from X and locate the extreme value         */
/* ------------------------------------------------------------------ */
.L70:
	movl	X, XX
	movl	M, MM			/* last read of M before TEMP reuses %ebx */

	/* Collapse the two lanes of xmm0 into one value and broadcast it. */
	movapd	%xmm0, %xmm1
	unpckhpd %xmm0, %xmm0
	maxsd	%xmm1, %xmm0
	unpcklpd %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I
	jle	.L75
	ALIGN_4

.L71:
#ifdef PREFETCH
	/* Hint relative to the moving cursor XX; X is never advanced, */
	/* so basing the hint on X would target one fixed address.     */
	PREFETCH (PREFETCHSIZE +  0) - PREOFFSET(XX)
#endif

	/* Screen 4 elements at once; XX ends up past the group. */
	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm1
	movhpd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	movhpd	0 * SIZE(XX), %xmm3
	movhpd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	cmpeqpd	%xmm0, %xmm1
	cmpeqpd	%xmm0, %xmm3

	orpd	%xmm3, %xmm1
	movmskpd %xmm1, TEMP		/* overwrites M (same register) */
	testl	$3, TEMP
	jne	.L73			/* a match is somewhere in this group */

	addl	$4, RET
	decl	I
	jg	.L71
	jmp	.L75
	ALIGN_4

.L73:
	/* Rewind XX by the 4 strides consumed above, then walk the group */
	/* one element at a time; RET is bumped before each comparison.   */
	leal	(, INCX, 4), TEMP
	subl	TEMP, XX

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3

	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L75:
	/* Fewer than 4 elements remain: test a pair if bit 1 of MM is set. */
	testl	$2, MM
	je	.L76

	movsd	0 * SIZE(XX), %xmm1
	movsd	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm3
	movsd	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andpd	%xmm7, %xmm1
	andpd	%xmm7, %xmm2
	andpd	%xmm7, %xmm3
	andpd	%xmm7, %xmm4

	addpd	%xmm2, %xmm1
	addpd	%xmm4, %xmm3
	incl	RET
	comisd	%xmm0, %xmm1
	je	.L999
	incl	RET
	comisd	%xmm0, %xmm3
	je	.L999
	ALIGN_3

.L76:
	/* Nothing matched so far: the next position is returned */
	/* without being compared.                               */
	incl	RET
	ALIGN_4

.L999:
	/* Common exit: restore the saved registers in reverse push order. */
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE