/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Index search over a strided vector whose elements are pairs of
 * single-precision floats (32-bit x86, SSE, AT&T syntax, run through cpp).
 *
 * Element layout: two consecutive floats at 0 * SIZE and 1 * SIZE from the
 * element pointer (presumably the real and imaginary parts of a complex
 * value; SIZE and ZBASE_SHIFT come from common.h - confirm there).
 *
 * Key of an element: (a AND mask) + (b AND mask). With USE_ABS the mask in
 * %xmm7 is 0x7fffffff in every lane, which makes the key |a| + |b|.
 * NOTE(review): without USE_ABS %xmm7 stays zero and every key becomes 0;
 * presumably the build always defines USE_ABS for this kernel - confirm.
 *
 * Arguments (read from the stack after the four register saves):
 *   STACK_M     element count, or a pointer to it when F_INTERFACE is set
 *   STACK_X     pointer to the first element
 *   STACK_INCX  stride in elements, or a pointer to it with F_INTERFACE
 *
 * Result in %eax:
 *   0                    when M <= 0 or INCX <= 0
 *   1-based position     of the first element whose key equals the extreme
 *                        key (maximum, or minimum when USE_MIN is defined)
 *
 * Method: two passes over the vector.
 *   pass 1  reduce all keys into %xmm0 with maxps/maxss
 *   pass 2  rescan from the start, counting elements in RET until a key
 *           compares equal to the broadcast extreme
 * Both passes exist twice: .L30-.L47 for contiguous data and .L70-.L87 for
 * an arbitrary positive stride.
 *
 * Robustness: if the reduced extreme is a NaN no key ever compares equal
 * in the vector loop of pass 2. The final fall-through position is
 * therefore clamped to M so the routine never reports an index beyond the
 * end of the vector.
 */

#define ASSEMBLER
#include "common.h"

/* STACK covers the four pushl instructions executed before the loads. */
#define STACK	16
#define ARGS	 0

#define STACK_M		 4 + STACK + ARGS(%esp)
#define STACK_X		 8 + STACK + ARGS(%esp)
#define STACK_INCX	12 + STACK + ARGS(%esp)

/*
 * Register roles. TEMP aliases M (%ebx): M is copied into MM before any
 * write to TEMP, and M is not read again afterwards.
 */
#define RET	%eax
#define M	%ebx
#define X	%ecx
#define INCX	%edx
#define I	%esi
#define MM	%ebp
#define XX	%edi
#define TEMP	%ebx

/* The minimum-search variant reuses the same body with min instructions. */
#ifdef USE_MIN
#define maxps	minps
#define maxss	minss
#endif

/* SSE1-only substitutes for the two SSE2 mnemonics used below. */
#ifndef HAVE_SSE2
#define pxor	xorps
#define movsd	movlps
#endif

#include "l1param.h"

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	STACK_M,    M
	movl	STACK_X,    X
	movl	STACK_INCX, INCX

#ifdef F_INTERFACE
	/* M and INCX arrive by reference in this configuration. */
	movl	(M),    M
	movl	(INCX), INCX
#endif

	pxor	%xmm0, %xmm0
	pxor	%xmm7, %xmm7
	xor	RET, RET		/* result 0 for the early exits below */
	testl	M, M
	jle	.L999
	testl	INCX, INCX
	jle	.L999

	sall	$ZBASE_SHIFT, INCX	/* stride: elements -> bytes */
	movl	M, MM
	movl	X, XX

#ifdef USE_ABS
#ifndef HAVE_SSE2
	/* Build the 0x7fffffff mask through a stack scratch slot. */
	subl	$8, %esp
	movl	$0x7fffffff, (%esp)
	movss	(%esp), %xmm7
	shufps	$0, %xmm7, %xmm7
	addl	$8, %esp
#else
	/* all-ones (self compare) shifted right by one = 0x7fffffff per lane */
	cmpeqps	%xmm7, %xmm7
	psrld	$1, %xmm7
#endif
#endif

	/* Seed every lane of %xmm0 with the key of the first element. */
	movss	0 * SIZE(XX), %xmm0
	movss	1 * SIZE(XX), %xmm1
	addl	INCX, XX
	decl	MM			/* MM = elements left for pass 1 */
	andps	%xmm7, %xmm0
	andps	%xmm7, %xmm1
	addps	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0
	cmpl	$2 * SIZE, INCX		/* contiguous pairs? */
	jne	.L70

/* ------------------------------------------------------------------ */
/* Contiguous data, pass 1: reduce keys, 8 elements per iteration.     */
/* ------------------------------------------------------------------ */
.L30:
	movl	MM, I
	sarl	$3, I
	jle	.L35
	ALIGN_4

.L31:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	 0 * SIZE(XX), %xmm1
	movhps	 2 * SIZE(XX), %xmm1
	movsd	 4 * SIZE(XX), %xmm2
	movhps	 6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	/* 0x88 gathers the first float of each pair, 0xdd the second. */
	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0

	movsd	 8 * SIZE(XX), %xmm1
	movhps	10 * SIZE(XX), %xmm1
	movsd	12 * SIZE(XX), %xmm2
	movhps	14 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0

	addl	$16 * SIZE, XX
	decl	I
	jg	.L31
	ALIGN_4

/* Pass 1 remainder: 4, then 2, then 1 element(s). */
.L35:
	andl	$7, MM
	jle	.L40

	testl	$4, MM
	je	.L36

	movsd	0 * SIZE(XX), %xmm1
	movhps	2 * SIZE(XX), %xmm1
	movsd	4 * SIZE(XX), %xmm2
	movhps	6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0

	addl	$8 * SIZE, XX
	ALIGN_3

.L36:
	testl	$2, MM
	je	.L37

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3
	maxss	%xmm1, %xmm0
	maxss	%xmm3, %xmm0
	addl	$4 * SIZE, XX
	ALIGN_3

.L37:
	testl	$1, MM
	je	.L40

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	addps	%xmm2, %xmm1
	maxss	%xmm1, %xmm0
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Contiguous data, pass 2: locate the extreme key.                    */
/* ------------------------------------------------------------------ */
.L40:
	movl	X, XX			/* restart at the first element */
	movl	M, MM			/* MM = M for the rest of the routine */

	/* Fold the four lanes into lane 0, then broadcast the result. */
	movaps	%xmm0, %xmm1
	movhlps	%xmm0, %xmm0
	maxps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0
	maxss	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I
	jle	.L45
	ALIGN_4

/* Test 4 keys at once; RET counts the elements already rejected. */
.L41:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	movhps	2 * SIZE(XX), %xmm1
	movsd	4 * SIZE(XX), %xmm2
	movhps	6 * SIZE(XX), %xmm2

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	cmpeqps	%xmm0, %xmm1
	movmskps %xmm1, TEMP		/* overwrites %ebx; M lives in MM now */
	testl	$15, TEMP
	jne	.L43

	addl	$8 * SIZE, XX
	addl	$4, RET
	decl	I
	jg	.L41
	jmp	.L45
	ALIGN_4

/* A lane matched: recheck the same 4 elements one by one. */
.L43:
	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999

	movss	4 * SIZE(XX), %xmm1
	movss	5 * SIZE(XX), %xmm2
	movss	6 * SIZE(XX), %xmm3
	movss	7 * SIZE(XX), %xmm4

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	addl	$8 * SIZE, XX

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/* Pass 2 remainder: a pair of elements if bit 1 of M is set. */
.L45:
	testl	$2, MM
	je	.L47

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	movss	2 * SIZE(XX), %xmm3
	movss	3 * SIZE(XX), %xmm4
	addl	$4 * SIZE, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/*
 * Nothing matched so far: report the next position without testing it.
 * That position is only valid when one element is actually left. If the
 * extreme in %xmm0 is a NaN the vector loop never matches, and for M a
 * multiple of 4 the count would become M + 1, so clamp it to M.
 */
.L47:
	incl	RET
	cmpl	MM, RET
	jle	.L999
	movl	MM, RET
	jmp	.L999
	ALIGN_3

/* ------------------------------------------------------------------ */
/* Strided data, pass 1: same reduction, XX advanced by INCX per       */
/* element instead of using fixed offsets.                             */
/* ------------------------------------------------------------------ */
.L70:
	movl	MM, I
	sarl	$3, I
	jle	.L75
	ALIGN_4

.L71:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0
	decl	I
	jg	.L71
	ALIGN_4

/* Pass 1 remainder: 4, then 2, then 1 element(s). */
.L75:
	andl	$7, MM
	jle	.L80

	testl	$4, MM
	je	.L76

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1
	maxps	%xmm1, %xmm0
	ALIGN_3

.L76:
	testl	$2, MM
	je	.L77

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3
	maxss	%xmm1, %xmm0
	maxss	%xmm3, %xmm0
	ALIGN_3

.L77:
	testl	$1, MM
	je	.L80

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	addps	%xmm2, %xmm1
	maxss	%xmm1, %xmm0
	ALIGN_4

/* ------------------------------------------------------------------ */
/* Strided data, pass 2: locate the extreme key.                       */
/* ------------------------------------------------------------------ */
.L80:
	movl	X, XX			/* restart at the first element */
	movl	M, MM			/* MM = M for the rest of the routine */

	/* Fold the four lanes into lane 0, then broadcast the result. */
	movaps	%xmm0, %xmm1
	movhlps	%xmm0, %xmm0
	maxps	%xmm1, %xmm0
	movaps	%xmm0, %xmm1
	shufps	$1, %xmm0, %xmm0
	maxss	%xmm1, %xmm0
	shufps	$0, %xmm0, %xmm0

	movl	MM, I
	sarl	$2, I
	jle	.L85
	ALIGN_4

/* Test 4 keys at once; RET counts the elements already rejected. */
.L81:
#ifdef PREFETCH
	PREFETCH (PREFETCHSIZE + 0) - PREOFFSET(XX)
#endif

	movsd	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm1
	addl	INCX, XX
	movsd	0 * SIZE(XX), %xmm2
	addl	INCX, XX
	movhps	0 * SIZE(XX), %xmm2
	addl	INCX, XX

	movaps	%xmm1, %xmm3

	shufps	$0x88, %xmm2, %xmm1
	shufps	$0xdd, %xmm2, %xmm3

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm3
	addps	%xmm3, %xmm1

	cmpeqps	%xmm0, %xmm1
	movmskps %xmm1, TEMP		/* overwrites %ebx; M lives in MM now */
	testl	$15, TEMP
	jne	.L83

	addl	$4, RET
	decl	I
	jg	.L81
	jmp	.L85
	ALIGN_4

/* A lane matched: step XX back 4 elements and recheck them one by one. */
.L83:
	leal	(, INCX, 4), TEMP
	subl	TEMP, XX

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4

	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/* Pass 2 remainder: a pair of elements if bit 1 of M is set. */
.L85:
	testl	$2, MM
	je	.L87

	movss	0 * SIZE(XX), %xmm1
	movss	1 * SIZE(XX), %xmm2
	addl	INCX, XX
	movss	0 * SIZE(XX), %xmm3
	movss	1 * SIZE(XX), %xmm4
	addl	INCX, XX

	andps	%xmm7, %xmm1
	andps	%xmm7, %xmm2
	andps	%xmm7, %xmm3
	andps	%xmm7, %xmm4
	addps	%xmm2, %xmm1
	addps	%xmm4, %xmm3

	incl	RET
	comiss	%xmm0, %xmm1
	je	.L999
	incl	RET
	comiss	%xmm0, %xmm3
	je	.L999
	ALIGN_3

/*
 * Nothing matched so far: report the next position without testing it,
 * clamped to M for the same reason as in the contiguous path (a NaN
 * extreme with M a multiple of 4 would otherwise yield M + 1).
 */
.L87:
	incl	RET
	cmpl	MM, RET
	jle	.L999
	movl	MM, RET
	ALIGN_4

/* Common exit: restore the saved registers; the result is in RET. */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE