/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.
*/ 37/*********************************************************************/ 38 39#define ASSEMBLER 40#include "common.h" 41 42#define STACK 16 43#define ARGS 16 44 45#define M 4 + STACK + ARGS(%esp) 46#define N 8 + STACK + ARGS(%esp) 47#define K 12 + STACK + ARGS(%esp) 48#define ALPHA 16 + STACK + ARGS(%esp) 49#define A 24 + STACK + ARGS(%esp) 50#define ARG_B 28 + STACK + ARGS(%esp) 51#define C 32 + STACK + ARGS(%esp) 52#define ARG_LDC 36 + STACK + ARGS(%esp) 53#define OFFSET 40 + STACK + ARGS(%esp) 54 55#define J 0 + STACK(%esp) 56#define BX 4 + STACK(%esp) 57#define KK 8 + STACK(%esp) 58#define KKK 12 + STACK(%esp) 59 60#ifdef PENTIUM4 61#define PREFETCH_R (8 * 4) 62#define PREFETCH prefetcht1 63#define PREFETCHSIZE 84 64#endif 65 66#ifdef PENTIUMM 67#define PREFETCH_R (8 * 4) 68#define PREFETCH prefetcht1 69#define PREFETCHSIZE 84 70#endif 71 72#define AA %edx 73#define BB %ecx 74#define LDC %ebp 75#define B %edi 76 77#define KERNEL1(address) \ 78 mulpd %xmm0, %xmm2; \ 79 PREFETCH (PREFETCHSIZE + 0) * SIZE + (address) * 1 * SIZE(AA); \ 80 addpd %xmm2, %xmm4; \ 81 movddup 1 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 82 mulpd %xmm0, %xmm2; \ 83 addpd %xmm2, %xmm5; \ 84 movddup 2 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 85 mulpd %xmm0, %xmm2; \ 86 addpd %xmm2, %xmm6; \ 87 movddup 3 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 88 mulpd %xmm0, %xmm2; \ 89 movapd 2 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ 90 addpd %xmm2, %xmm7; \ 91 movddup 4 * SIZE + (address) * 2 * SIZE(BB), %xmm2 92 93#define KERNEL2(address) \ 94 mulpd %xmm0, %xmm2; \ 95 addpd %xmm2, %xmm4; \ 96 movddup 5 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 97 mulpd %xmm0, %xmm2; \ 98 addpd %xmm2, %xmm5; \ 99 movddup 6 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 100 mulpd %xmm0, %xmm2; \ 101 addpd %xmm2, %xmm6; \ 102 movddup 7 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 103 mulpd %xmm0, %xmm2; \ 104 movapd 4 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ 105 addpd %xmm2, %xmm7; \ 106 movddup 16 * SIZE 
+ (address) * 2 * SIZE(BB), %xmm2 107 108#define KERNEL3(address) \ 109 mulpd %xmm0, %xmm3; \ 110 addpd %xmm3, %xmm4; \ 111 movddup 9 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 112 mulpd %xmm0, %xmm3; \ 113 addpd %xmm3, %xmm5; \ 114 movddup 10 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 115 mulpd %xmm0, %xmm3; \ 116 addpd %xmm3, %xmm6; \ 117 movddup 11 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 118 mulpd %xmm0, %xmm3; \ 119 movapd 6 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ 120 addpd %xmm3, %xmm7; \ 121 movddup 12 * SIZE + (address) * 2 * SIZE(BB), %xmm3 122 123#define KERNEL4(address) \ 124 mulpd %xmm0, %xmm3; \ 125 addpd %xmm3, %xmm4; \ 126 movddup 13 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 127 mulpd %xmm0, %xmm3; \ 128 addpd %xmm3, %xmm5; \ 129 movddup 14 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 130 mulpd %xmm0, %xmm3; \ 131 addpd %xmm3, %xmm6; \ 132 movddup 15 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 133 mulpd %xmm0, %xmm3; \ 134 movapd 16 * SIZE + (address) * 1 * SIZE(AA), %xmm0; \ 135 addpd %xmm3, %xmm7; \ 136 movddup 24 * SIZE + (address) * 2 * SIZE(BB), %xmm3 137 138#define KERNEL5(address) \ 139 mulpd %xmm1, %xmm2; \ 140 addpd %xmm2, %xmm4; \ 141 movddup 17 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 142 mulpd %xmm1, %xmm2; \ 143 addpd %xmm2, %xmm5; \ 144 movddup 18 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 145 mulpd %xmm1, %xmm2; \ 146 addpd %xmm2, %xmm6; \ 147 movddup 19 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 148 mulpd %xmm1, %xmm2; \ 149 movapd 10 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ 150 addpd %xmm2, %xmm7 151 152#define KERNEL6(address) \ 153 movddup 20 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 154 mulpd %xmm1, %xmm2; \ 155 addpd %xmm2, %xmm4; \ 156 movddup 21 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 157 mulpd %xmm1, %xmm2; \ 158 addpd %xmm2, %xmm5; \ 159 movddup 22 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 160 mulpd %xmm1, %xmm2; \ 161 addpd %xmm2, %xmm6; \ 162 movddup 23 * SIZE + (address) * 2 * SIZE(BB), %xmm2; \ 
163 mulpd %xmm1, %xmm2; \ 164 movapd 12 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ 165 addpd %xmm2, %xmm7; \ 166 movddup 32 * SIZE + (address) * 2 * SIZE(BB), %xmm2 167 168#define KERNEL7(address) \ 169 mulpd %xmm1, %xmm3; \ 170 addpd %xmm3, %xmm4; \ 171 movddup 25 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 172 mulpd %xmm1, %xmm3; \ 173 addpd %xmm3, %xmm5; \ 174 movddup 26 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 175 mulpd %xmm1, %xmm3; \ 176 addpd %xmm3, %xmm6; \ 177 movddup 27 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 178 mulpd %xmm1, %xmm3; \ 179 movapd 14 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ 180 addpd %xmm3, %xmm7; \ 181 movddup 28 * SIZE + (address) * 2 * SIZE(BB), %xmm3 182 183#define KERNEL8(address) \ 184 mulpd %xmm1, %xmm3; \ 185 addpd %xmm3, %xmm4; \ 186 movddup 29 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 187 mulpd %xmm1, %xmm3; \ 188 addpd %xmm3, %xmm5; \ 189 movddup 30 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 190 mulpd %xmm1, %xmm3; \ 191 addpd %xmm3, %xmm6; \ 192 movddup 31 * SIZE + (address) * 2 * SIZE(BB), %xmm3; \ 193 mulpd %xmm1, %xmm3; \ 194 movapd 24 * SIZE + (address) * 1 * SIZE(AA), %xmm1; \ 195 addpd %xmm3, %xmm7; \ 196 movddup 40 * SIZE + (address) * 2 * SIZE(BB), %xmm3 197 198 PROLOGUE 199 200 subl $ARGS, %esp 201 202 pushl %ebp 203 pushl %edi 204 pushl %esi 205 pushl %ebx 206 207 PROFCODE 208 209 movl ARG_B, B 210 movl ARG_LDC, LDC 211 212#ifdef TRMMKERNEL 213 movl OFFSET, %eax 214#ifndef LEFT 215 negl %eax 216#endif 217 movl %eax, KK 218#endif 219 220 leal (, LDC, SIZE), LDC 221 222 movl N, %eax 223 sarl $2, %eax 224 movl %eax, J 225 jle .L30 226 ALIGN_2 227 228.L10: 229#if defined(TRMMKERNEL) && defined(LEFT) 230 movl OFFSET, %eax 231 movl %eax, KK 232#endif 233 234 movl K, %eax 235 sall $BASE_SHIFT + 2, %eax 236 leal (B, %eax), %eax 237 movl %eax, BX 238 239 movl C, %esi # coffset = c 240 movl A, AA # aoffset = a 241 242 movl M, %ebx 243 sarl $1, %ebx # i = (m >> 2) 244 jle .L20 245 ALIGN_4 246 247.L11: 248#if 
!defined(TRMMKERNEL) || \ 249 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 250 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 251 252 movl B, BB 253#else 254 movl KK, %eax 255 leal (, %eax, SIZE), %eax 256 leal (AA, %eax, 2), AA 257 leal (B, %eax, 4), BB 258#endif 259 260 movl BX, %eax 261 prefetcht2 0 * SIZE(%eax) 262 subl $-4 * SIZE, BX 263 264 movapd 0 * SIZE(AA), %xmm0 265 pxor %xmm4, %xmm4 266 movapd 8 * SIZE(AA), %xmm1 267 pxor %xmm5, %xmm5 268 movddup 0 * SIZE(BB), %xmm2 269 pxor %xmm6, %xmm6 270 movddup 8 * SIZE(BB), %xmm3 271 pxor %xmm7, %xmm7 272 273 leal (LDC, LDC, 2), %eax 274 275#ifdef PENTIUM4 276 prefetchnta 3 * SIZE(%esi) 277 prefetchnta 3 * SIZE(%esi, LDC, 1) 278 prefetchnta 3 * SIZE(%esi, LDC, 2) 279 prefetchnta 3 * SIZE(%esi, %eax, 1) 280#endif 281 282#ifndef TRMMKERNEL 283 movl K, %eax 284#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 285 movl K, %eax 286 subl KK, %eax 287 movl %eax, KKK 288#else 289 movl KK, %eax 290#ifdef LEFT 291 addl $2, %eax 292#else 293 addl $4, %eax 294#endif 295 movl %eax, KKK 296#endif 297 298#ifdef CORE_PRESCOTT 299 andl $-8, %eax 300 sall $4, %eax 301 je .L15 302 303.L1X: 304 KERNEL1(16 * 0) 305 KERNEL2(16 * 0) 306 KERNEL3(16 * 0) 307 KERNEL4(16 * 0) 308 KERNEL5(16 * 0) 309 KERNEL6(16 * 0) 310 KERNEL7(16 * 0) 311 KERNEL8(16 * 0) 312 cmpl $128 * 1, %eax 313 jle .L12 314 KERNEL1(16 * 1) 315 KERNEL2(16 * 1) 316 KERNEL3(16 * 1) 317 KERNEL4(16 * 1) 318 KERNEL5(16 * 1) 319 KERNEL6(16 * 1) 320 KERNEL7(16 * 1) 321 KERNEL8(16 * 1) 322 cmpl $128 * 2, %eax 323 jle .L12 324 KERNEL1(16 * 2) 325 KERNEL2(16 * 2) 326 KERNEL3(16 * 2) 327 KERNEL4(16 * 2) 328 KERNEL5(16 * 2) 329 KERNEL6(16 * 2) 330 KERNEL7(16 * 2) 331 KERNEL8(16 * 2) 332 cmpl $128 * 3, %eax 333 jle .L12 334 KERNEL1(16 * 3) 335 KERNEL2(16 * 3) 336 KERNEL3(16 * 3) 337 KERNEL4(16 * 3) 338 KERNEL5(16 * 3) 339 KERNEL6(16 * 3) 340 KERNEL7(16 * 3) 341 KERNEL8(16 * 3) 342 cmpl $128 * 4, %eax 343 jle .L12 344 
KERNEL1(16 * 4) 345 KERNEL2(16 * 4) 346 KERNEL3(16 * 4) 347 KERNEL4(16 * 4) 348 KERNEL5(16 * 4) 349 KERNEL6(16 * 4) 350 KERNEL7(16 * 4) 351 KERNEL8(16 * 4) 352 cmpl $128 * 5, %eax 353 jle .L12 354 KERNEL1(16 * 5) 355 KERNEL2(16 * 5) 356 KERNEL3(16 * 5) 357 KERNEL4(16 * 5) 358 KERNEL5(16 * 5) 359 KERNEL6(16 * 5) 360 KERNEL7(16 * 5) 361 KERNEL8(16 * 5) 362 cmpl $128 * 6, %eax 363 jle .L12 364 KERNEL1(16 * 6) 365 KERNEL2(16 * 6) 366 KERNEL3(16 * 6) 367 KERNEL4(16 * 6) 368 KERNEL5(16 * 6) 369 KERNEL6(16 * 6) 370 KERNEL7(16 * 6) 371 KERNEL8(16 * 6) 372 cmpl $128 * 7, %eax 373 jle .L12 374 KERNEL1(16 * 7) 375 KERNEL2(16 * 7) 376 KERNEL3(16 * 7) 377 KERNEL4(16 * 7) 378 KERNEL5(16 * 7) 379 KERNEL6(16 * 7) 380 KERNEL7(16 * 7) 381 KERNEL8(16 * 7) 382#if 1 383 cmpl $128 * 8, %eax 384 jle .L12 385 KERNEL1(16 * 8) 386 KERNEL2(16 * 8) 387 KERNEL3(16 * 8) 388 KERNEL4(16 * 8) 389 KERNEL5(16 * 8) 390 KERNEL6(16 * 8) 391 KERNEL7(16 * 8) 392 KERNEL8(16 * 8) 393 cmpl $128 * 9, %eax 394 jle .L12 395 KERNEL1(16 * 9) 396 KERNEL2(16 * 9) 397 KERNEL3(16 * 9) 398 KERNEL4(16 * 9) 399 KERNEL5(16 * 9) 400 KERNEL6(16 * 9) 401 KERNEL7(16 * 9) 402 KERNEL8(16 * 9) 403 cmpl $128 * 10, %eax 404 jle .L12 405 KERNEL1(16 * 10) 406 KERNEL2(16 * 10) 407 KERNEL3(16 * 10) 408 KERNEL4(16 * 10) 409 KERNEL5(16 * 10) 410 KERNEL6(16 * 10) 411 KERNEL7(16 * 10) 412 KERNEL8(16 * 10) 413 cmpl $128 * 11, %eax 414 jle .L12 415 KERNEL1(16 * 11) 416 KERNEL2(16 * 11) 417 KERNEL3(16 * 11) 418 KERNEL4(16 * 11) 419 KERNEL5(16 * 11) 420 KERNEL6(16 * 11) 421 KERNEL7(16 * 11) 422 KERNEL8(16 * 11) 423 cmpl $128 * 12, %eax 424 jle .L12 425 KERNEL1(16 * 12) 426 KERNEL2(16 * 12) 427 KERNEL3(16 * 12) 428 KERNEL4(16 * 12) 429 KERNEL5(16 * 12) 430 KERNEL6(16 * 12) 431 KERNEL7(16 * 12) 432 KERNEL8(16 * 12) 433 cmpl $128 * 13, %eax 434 jle .L12 435 KERNEL1(16 * 13) 436 KERNEL2(16 * 13) 437 KERNEL3(16 * 13) 438 KERNEL4(16 * 13) 439 KERNEL5(16 * 13) 440 KERNEL6(16 * 13) 441 KERNEL7(16 * 13) 442 KERNEL8(16 * 13) 443 cmpl $128 * 14, %eax 
444 jle .L12 445 KERNEL1(16 * 14) 446 KERNEL2(16 * 14) 447 KERNEL3(16 * 14) 448 KERNEL4(16 * 14) 449 KERNEL5(16 * 14) 450 KERNEL6(16 * 14) 451 KERNEL7(16 * 14) 452 KERNEL8(16 * 14) 453 cmpl $128 * 15, %eax 454 jle .L12 455 KERNEL1(16 * 15) 456 KERNEL2(16 * 15) 457 KERNEL3(16 * 15) 458 KERNEL4(16 * 15) 459 KERNEL5(16 * 15) 460 KERNEL6(16 * 15) 461 KERNEL7(16 * 15) 462 KERNEL8(16 * 15) 463#else 464 addl $32 * 4 * SIZE, AA 465 addl $32 * 8 * SIZE, BB 466 subl $128 * 8, %eax 467 jg .L1X 468#endif 469 470.L12: 471 leal (AA, %eax, 1), AA # * 16 472 leal (BB, %eax, 2), BB # * 64 473 474#else 475 476 sarl $3, %eax 477 je .L15 478 ALIGN_4 479 480.L12: 481 mulpd %xmm0, %xmm2 482 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 483 addpd %xmm2, %xmm4 484 movddup 1 * SIZE(BB), %xmm2 485 mulpd %xmm0, %xmm2 486 addpd %xmm2, %xmm5 487 movddup 2 * SIZE(BB), %xmm2 488 mulpd %xmm0, %xmm2 489 addpd %xmm2, %xmm6 490 movddup 3 * SIZE(BB), %xmm2 491 mulpd %xmm0, %xmm2 492 movapd 2 * SIZE(AA), %xmm0 493 addpd %xmm2, %xmm7 494 movddup 4 * SIZE(BB), %xmm2 495 mulpd %xmm0, %xmm2 496 addpd %xmm2, %xmm4 497 movddup 5 * SIZE(BB), %xmm2 498 mulpd %xmm0, %xmm2 499 addpd %xmm2, %xmm5 500 movddup 6 * SIZE(BB), %xmm2 501 mulpd %xmm0, %xmm2 502 addpd %xmm2, %xmm6 503 movddup 7 * SIZE(BB), %xmm2 504 mulpd %xmm0, %xmm2 505 movapd 4 * SIZE(AA), %xmm0 506 addpd %xmm2, %xmm7 507 movddup 16 * SIZE(BB), %xmm2 508 mulpd %xmm0, %xmm3 509 addpd %xmm3, %xmm4 510 movddup 9 * SIZE(BB), %xmm3 511 mulpd %xmm0, %xmm3 512 addpd %xmm3, %xmm5 513 movddup 10 * SIZE(BB), %xmm3 514 mulpd %xmm0, %xmm3 515 addpd %xmm3, %xmm6 516 movddup 11 * SIZE(BB), %xmm3 517 mulpd %xmm0, %xmm3 518 movapd 6 * SIZE(AA), %xmm0 519 addpd %xmm3, %xmm7 520 movddup 12 * SIZE(BB), %xmm3 521 mulpd %xmm0, %xmm3 522 addpd %xmm3, %xmm4 523 movddup 13 * SIZE(BB), %xmm3 524 mulpd %xmm0, %xmm3 525 addpd %xmm3, %xmm5 526 movddup 14 * SIZE(BB), %xmm3 527 mulpd %xmm0, %xmm3 528 addpd %xmm3, %xmm6 529 movddup 15 * SIZE(BB), %xmm3 530 mulpd %xmm0, %xmm3 531 movapd 
16 * SIZE(AA), %xmm0 532 addpd %xmm3, %xmm7 533 movddup 24 * SIZE(BB), %xmm3 534 mulpd %xmm1, %xmm2 535 addpd %xmm2, %xmm4 536 movddup 17 * SIZE(BB), %xmm2 537 mulpd %xmm1, %xmm2 538 addpd %xmm2, %xmm5 539 movddup 18 * SIZE(BB), %xmm2 540 mulpd %xmm1, %xmm2 541 addpd %xmm2, %xmm6 542 movddup 19 * SIZE(BB), %xmm2 543 mulpd %xmm1, %xmm2 544 movapd 10 * SIZE(AA), %xmm1 545 addpd %xmm2, %xmm7 546 movddup 20 * SIZE(BB), %xmm2 547 mulpd %xmm1, %xmm2 548 addpd %xmm2, %xmm4 549 movddup 21 * SIZE(BB), %xmm2 550 mulpd %xmm1, %xmm2 551 addpd %xmm2, %xmm5 552 movddup 22 * SIZE(BB), %xmm2 553 mulpd %xmm1, %xmm2 554 addpd %xmm2, %xmm6 555 movddup 23 * SIZE(BB), %xmm2 556 mulpd %xmm1, %xmm2 557 movapd 12 * SIZE(AA), %xmm1 558 addpd %xmm2, %xmm7 559 movddup 32 * SIZE(BB), %xmm2 560 mulpd %xmm1, %xmm3 561 addpd %xmm3, %xmm4 562 movddup 25 * SIZE(BB), %xmm3 563 mulpd %xmm1, %xmm3 564 addpd %xmm3, %xmm5 565 movddup 26 * SIZE(BB), %xmm3 566 mulpd %xmm1, %xmm3 567 addpd %xmm3, %xmm6 568 movddup 27 * SIZE(BB), %xmm3 569 mulpd %xmm1, %xmm3 570 movapd 14 * SIZE(AA), %xmm1 571 addpd %xmm3, %xmm7 572 movddup 28 * SIZE(BB), %xmm3 573 mulpd %xmm1, %xmm3 574 addpd %xmm3, %xmm4 575 movddup 29 * SIZE(BB), %xmm3 576 mulpd %xmm1, %xmm3 577 addpd %xmm3, %xmm5 578 movddup 30 * SIZE(BB), %xmm3 579 mulpd %xmm1, %xmm3 580 addpd %xmm3, %xmm6 581 movddup 31 * SIZE(BB), %xmm3 582 mulpd %xmm1, %xmm3 583 movapd 24 * SIZE(AA), %xmm1 584 addpd %xmm3, %xmm7 585 movddup 40 * SIZE(BB), %xmm3 586 587 addl $32 * SIZE, BB 588 addl $16 * SIZE, AA 589 decl %eax 590 jne .L12 591 ALIGN_4 592#endif 593 594.L15: 595#ifndef TRMMKERNEL 596 movl K, %eax 597#else 598 movl KKK, %eax 599#endif 600 movddup ALPHA, %xmm3 601 andl $7, %eax # if (k & 1) 602 BRANCH 603 je .L18 604 ALIGN_3 605 606.L16: 607 mulpd %xmm0, %xmm2 608 addpd %xmm2, %xmm4 609 movddup 1 * SIZE(BB), %xmm2 610 mulpd %xmm0, %xmm2 611 addpd %xmm2, %xmm5 612 movddup 2 * SIZE(BB), %xmm2 613 mulpd %xmm0, %xmm2 614 addpd %xmm2, %xmm6 615 movddup 3 * SIZE(BB), %xmm2 
616 mulpd %xmm0, %xmm2 617 movapd 2 * SIZE(AA), %xmm0 618 addpd %xmm2, %xmm7 619 movddup 4 * SIZE(BB), %xmm2 620 621 addl $2 * SIZE, AA 622 addl $4 * SIZE, BB 623 decl %eax 624 jg .L16 625 ALIGN_4 626 627.L18: 628 SHUFPD_2 %xmm0, %xmm0 629 SHUFPD_2 %xmm1, %xmm1 630 SHUFPD_2 %xmm2, %xmm2 631 SHUFPD_2 %xmm3, %xmm3 632 633 mulpd %xmm3, %xmm4 634 mulpd %xmm3, %xmm5 635 mulpd %xmm3, %xmm6 636 mulpd %xmm3, %xmm7 637 638 movl %esi, %eax 639 orl LDC, %eax 640 testl $15, %eax 641 NOBRANCH 642 jne .L18x 643 644 leal (LDC, LDC, 2), %eax 645 646#ifndef TRMMKERNEL 647 movapd 0 * SIZE(%esi), %xmm0 648 movapd 0 * SIZE(%esi, LDC, 1), %xmm1 649 movapd 0 * SIZE(%esi, LDC, 2), %xmm2 650 movapd 0 * SIZE(%esi, %eax, 1), %xmm3 651 652 addpd %xmm0, %xmm4 653 addpd %xmm1, %xmm5 654 addpd %xmm2, %xmm6 655 addpd %xmm3, %xmm7 656#endif 657 658 movapd %xmm4, 0 * SIZE(%esi) 659 movapd %xmm5, 0 * SIZE(%esi, LDC, 1) 660 movapd %xmm6, 0 * SIZE(%esi, LDC, 2) 661 movapd %xmm7, 0 * SIZE(%esi, %eax, 1) 662 663#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 664 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 665 movl K, %eax 666 subl KKK, %eax 667 leal (,%eax, SIZE), %eax 668 leal (AA, %eax, 2), AA 669 leal (BB, %eax, 4), BB 670#endif 671 672#if defined(TRMMKERNEL) && defined(LEFT) 673 addl $2, KK 674#endif 675 676 addl $2 * SIZE, %esi # coffset += 2 677 decl %ebx # i -- 678 jg .L11 679 jmp .L20 680 ALIGN_4 681 682.L18x: 683 leal (LDC, LDC, 2), %eax 684 685#ifndef TRMMKERNEL 686 movsd 0 * SIZE(%esi), %xmm0 687 movhpd 1 * SIZE(%esi), %xmm0 688 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 689 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 690 movsd 0 * SIZE(%esi, LDC, 2), %xmm2 691 movhpd 1 * SIZE(%esi, LDC, 2), %xmm2 692 movsd 0 * SIZE(%esi, %eax, 1), %xmm3 693 movhpd 1 * SIZE(%esi, %eax, 1), %xmm3 694 695 addpd %xmm0, %xmm4 696 addpd %xmm1, %xmm5 697 addpd %xmm2, %xmm6 698 addpd %xmm3, %xmm7 699#endif 700 701 movsd %xmm4, 0 * SIZE(%esi) 702 movhpd %xmm4, 1 * SIZE(%esi) 703 movsd %xmm5, 0 * 
SIZE(%esi, LDC, 1) 704 movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) 705 movsd %xmm6, 0 * SIZE(%esi, LDC, 2) 706 movhpd %xmm6, 1 * SIZE(%esi, LDC, 2) 707 movsd %xmm7, 0 * SIZE(%esi, %eax, 1) 708 movhpd %xmm7, 1 * SIZE(%esi, %eax, 1) 709 710#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 711 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 712 movl K, %eax 713 subl KKK, %eax 714 leal (,%eax, SIZE), %eax 715 leal (AA, %eax, 2), AA 716 leal (BB, %eax, 4), BB 717#endif 718 719#if defined(TRMMKERNEL) && defined(LEFT) 720 addl $2, KK 721#endif 722 723 addl $2 * SIZE, %esi # coffset += 2 724 decl %ebx # i -- 725 jg .L11 726 ALIGN_3 727 728.L20: 729 movl M, %ebx 730 testl $1, %ebx # i = (m >> 2) 731 jle .L29 732 733 734#if !defined(TRMMKERNEL) || \ 735 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 736 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 737 738 movl B, BB 739#else 740 movl KK, %eax 741 leal (, %eax, SIZE), %eax 742 leal (AA, %eax, 1), AA 743 leal (B, %eax, 4), BB 744#endif 745 746 movddup 0 * SIZE(AA), %xmm0 747 pxor %xmm4, %xmm4 748 movddup 8 * SIZE(AA), %xmm1 749 pxor %xmm5, %xmm5 750 movapd 0 * SIZE(BB), %xmm2 751 pxor %xmm6, %xmm6 752 movapd 8 * SIZE(BB), %xmm3 753 pxor %xmm7, %xmm7 754 755#ifndef TRMMKERNEL 756 movl K, %eax 757#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 758 movl K, %eax 759 subl KK, %eax 760 movl %eax, KKK 761#else 762 movl KK, %eax 763#ifdef LEFT 764 addl $1, %eax 765#else 766 addl $4, %eax 767#endif 768 movl %eax, KKK 769#endif 770 sarl $4, %eax 771 je .L25 772 ALIGN_4 773 774.L22: 775 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 776 mulpd %xmm0, %xmm2 777 mulpd 2 * SIZE(BB), %xmm0 778 addpd %xmm2, %xmm4 779 movapd 4 * SIZE(BB), %xmm2 780 addpd %xmm0, %xmm5 781 movddup 1 * SIZE(AA), %xmm0 782 mulpd %xmm0, %xmm2 783 mulpd 6 * SIZE(BB), %xmm0 784 addpd %xmm2, %xmm6 785 movapd 16 * SIZE(BB), %xmm2 786 addpd %xmm0, %xmm7 787 movddup 2 * SIZE(AA), %xmm0 788 mulpd 
%xmm0, %xmm3 789 mulpd 10 * SIZE(BB), %xmm0 790 addpd %xmm3, %xmm4 791 movapd 12 * SIZE(BB), %xmm3 792 addpd %xmm0, %xmm5 793 movddup 3 * SIZE(AA), %xmm0 794 mulpd %xmm0, %xmm3 795 mulpd 14 * SIZE(BB), %xmm0 796 addpd %xmm3, %xmm6 797 movapd 24 * SIZE(BB), %xmm3 798 addpd %xmm0, %xmm7 799 movddup 4 * SIZE(AA), %xmm0 800 mulpd %xmm0, %xmm2 801 mulpd 18 * SIZE(BB), %xmm0 802 addpd %xmm2, %xmm4 803 movapd 20 * SIZE(BB), %xmm2 804 addpd %xmm0, %xmm5 805 movddup 5 * SIZE(AA), %xmm0 806 mulpd %xmm0, %xmm2 807 mulpd 22 * SIZE(BB), %xmm0 808 addpd %xmm2, %xmm6 809 movapd 32 * SIZE(BB), %xmm2 810 addpd %xmm0, %xmm7 811 movddup 6 * SIZE(AA), %xmm0 812 mulpd %xmm0, %xmm3 813 mulpd 26 * SIZE(BB), %xmm0 814 addpd %xmm3, %xmm4 815 movapd 28 * SIZE(BB), %xmm3 816 addpd %xmm0, %xmm5 817 movddup 7 * SIZE(AA), %xmm0 818 mulpd %xmm0, %xmm3 819 mulpd 30 * SIZE(BB), %xmm0 820 addpd %xmm3, %xmm6 821 movapd 40 * SIZE(BB), %xmm3 822 addpd %xmm0, %xmm7 823 movddup 16 * SIZE(AA), %xmm0 824 mulpd %xmm1, %xmm2 825 mulpd 34 * SIZE(BB), %xmm1 826 addpd %xmm2, %xmm4 827 movapd 36 * SIZE(BB), %xmm2 828 addpd %xmm1, %xmm5 829 movddup 9 * SIZE(AA), %xmm1 830 mulpd %xmm1, %xmm2 831 mulpd 38 * SIZE(BB), %xmm1 832 addpd %xmm2, %xmm6 833 movapd 48 * SIZE(BB), %xmm2 834 addpd %xmm1, %xmm7 835 movddup 10 * SIZE(AA), %xmm1 836 mulpd %xmm1, %xmm3 837 mulpd 42 * SIZE(BB), %xmm1 838 addpd %xmm3, %xmm4 839 movapd 44 * SIZE(BB), %xmm3 840 addpd %xmm1, %xmm5 841 movddup 11 * SIZE(AA), %xmm1 842 mulpd %xmm1, %xmm3 843 mulpd 46 * SIZE(BB), %xmm1 844 addpd %xmm3, %xmm6 845 movapd 56 * SIZE(BB), %xmm3 846 addpd %xmm1, %xmm7 847 movddup 12 * SIZE(AA), %xmm1 848 mulpd %xmm1, %xmm2 849 mulpd 50 * SIZE(BB), %xmm1 850 addpd %xmm2, %xmm4 851 movapd 52 * SIZE(BB), %xmm2 852 addpd %xmm1, %xmm5 853 movddup 13 * SIZE(AA), %xmm1 854 mulpd %xmm1, %xmm2 855 mulpd 54 * SIZE(BB), %xmm1 856 addpd %xmm2, %xmm6 857 movapd 64 * SIZE(BB), %xmm2 858 addpd %xmm1, %xmm7 859 movddup 14 * SIZE(AA), %xmm1 860 mulpd %xmm1, %xmm3 861 mulpd 58 
* SIZE(BB), %xmm1 862 addpd %xmm3, %xmm4 863 movapd 60 * SIZE(BB), %xmm3 864 addpd %xmm1, %xmm5 865 movddup 15 * SIZE(AA), %xmm1 866 mulpd %xmm1, %xmm3 867 mulpd 62 * SIZE(BB), %xmm1 868 addpd %xmm3, %xmm6 869 movapd 72 * SIZE(BB), %xmm3 870 addpd %xmm1, %xmm7 871 movddup 24 * SIZE(AA), %xmm1 872 873 addl $16 * SIZE, AA 874 addl $64 * SIZE, BB 875 decl %eax 876 jne .L22 877 ALIGN_4 878 879.L25: 880#ifndef TRMMKERNEL 881 movl K, %eax 882#else 883 movl KKK, %eax 884#endif 885 movddup ALPHA, %xmm3 886 andl $15, %eax # if (k & 1) 887 BRANCH 888 je .L28 889 890.L26: 891 mulpd %xmm0, %xmm2 892 mulpd 2 * SIZE(BB), %xmm0 893 addpd %xmm2, %xmm4 894 movapd 4 * SIZE(BB), %xmm2 895 addpd %xmm0, %xmm5 896 movddup 1 * SIZE(AA), %xmm0 897 898 addl $1 * SIZE, AA 899 addl $4 * SIZE, BB 900 901 decl %eax 902 jg .L26 903 ALIGN_4 904 905.L28: 906 leal (%esi, LDC, 1), %eax 907 908 addpd %xmm6, %xmm4 909 addpd %xmm7, %xmm5 910 911 mulpd %xmm3, %xmm4 912 mulpd %xmm3, %xmm5 913 914#ifndef TRMMKERNEL 915 916#ifdef PENTIUM4 917 SHUFPD_2 %xmm0, %xmm0 918 SHUFPD_2 %xmm1, %xmm1 919#endif 920 921 movsd 0 * SIZE(%esi), %xmm0 922 movhpd 0 * SIZE(%eax), %xmm0 923 movsd 0 * SIZE(%esi, LDC, 2), %xmm1 924 movhpd 0 * SIZE(%eax, LDC, 2), %xmm1 925 926 addpd %xmm0, %xmm4 927 addpd %xmm1, %xmm5 928#endif 929 930 movsd %xmm4, 0 * SIZE(%esi) 931 movhpd %xmm4, 0 * SIZE(%eax) 932 movsd %xmm5, 0 * SIZE(%esi, LDC, 2) 933 movhpd %xmm5, 0 * SIZE(%eax, LDC, 2) 934 935#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 936 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 937 movl K, %eax 938 subl KKK, %eax 939 leal (,%eax, SIZE), %eax 940 leal (AA, %eax, 1), AA 941 leal (BB, %eax, 4), BB 942#endif 943 944#if defined(TRMMKERNEL) && defined(LEFT) 945 addl $1, KK 946#endif 947 ALIGN_4 948 949.L29: 950#if defined(TRMMKERNEL) && !defined(LEFT) 951 addl $4, KK 952#endif 953 954 leal (, LDC, 4), %eax 955 movl BB, B 956 addl %eax, C # c += 4 * ldc 957 decl J # j -- 958 jg .L10 959 ALIGN_4 960 
961.L30: 962 testl $2, N 963 je .L60 964 965 movl C, %esi # coffset = c 966 movl A, AA # aoffset = a 967 968#if defined(TRMMKERNEL) && defined(LEFT) 969 movl OFFSET, %eax 970 movl %eax, KK 971#endif 972 973 movl M, %ebx 974 sarl $1, %ebx # i = (m >> 2) 975 jle .L50 976 ALIGN_4 977 978.L41: 979#if !defined(TRMMKERNEL) || \ 980 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 981 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 982 983 movl B, BB 984#else 985 movl KK, %eax 986 leal (, %eax, SIZE), %eax 987 leal (AA, %eax, 2), AA 988 leal (B, %eax, 2), BB 989#endif 990 991 movapd 0 * SIZE(AA), %xmm0 992 pxor %xmm4, %xmm4 993 movapd 8 * SIZE(AA), %xmm1 994 pxor %xmm5, %xmm5 995 movddup 0 * SIZE(BB), %xmm2 996 pxor %xmm6, %xmm6 997 movddup 8 * SIZE(BB), %xmm3 998 pxor %xmm7, %xmm7 999 1000#ifdef HAVE_3DNOW 1001 prefetchw 2 * SIZE(%esi) 1002 prefetchw 2 * SIZE(%esi, LDC) 1003#endif 1004 1005#ifdef PENTIUM4 1006 prefetchnta 3 * SIZE(%esi) 1007 prefetchnta 3 * SIZE(%esi, LDC) 1008#endif 1009 1010#ifndef TRMMKERNEL 1011 movl K, %eax 1012#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1013 movl K, %eax 1014 subl KK, %eax 1015 movl %eax, KKK 1016#else 1017 movl KK, %eax 1018#ifdef LEFT 1019 addl $2, %eax 1020#else 1021 addl $2, %eax 1022#endif 1023 movl %eax, KKK 1024#endif 1025 sarl $3, %eax 1026 je .L45 1027 ALIGN_4 1028 1029.L42: 1030 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1031 mulpd %xmm0, %xmm2 1032 addpd %xmm2, %xmm4 1033 movddup 1 * SIZE(BB), %xmm2 1034 mulpd %xmm0, %xmm2 1035 movapd 2 * SIZE(AA), %xmm0 1036 addpd %xmm2, %xmm5 1037 movddup 2 * SIZE(BB), %xmm2 1038 mulpd %xmm0, %xmm2 1039 addpd %xmm2, %xmm6 1040 movddup 3 * SIZE(BB), %xmm2 1041 mulpd %xmm0, %xmm2 1042 movapd 4 * SIZE(AA), %xmm0 1043 addpd %xmm2, %xmm7 1044 movddup 4 * SIZE(BB), %xmm2 1045 mulpd %xmm0, %xmm2 1046 addpd %xmm2, %xmm4 1047 movddup 5 * SIZE(BB), %xmm2 1048 mulpd %xmm0, %xmm2 1049 movapd 6 * SIZE(AA), %xmm0 1050 addpd %xmm2, %xmm5 1051 
movddup 6 * SIZE(BB), %xmm2 1052 mulpd %xmm0, %xmm2 1053 addpd %xmm2, %xmm6 1054 movddup 7 * SIZE(BB), %xmm2 1055 mulpd %xmm0, %xmm2 1056 movapd 16 * SIZE(AA), %xmm0 1057 addpd %xmm2, %xmm7 1058 movddup 16 * SIZE(BB), %xmm2 1059 mulpd %xmm1, %xmm3 1060 addpd %xmm3, %xmm4 1061 movddup 9 * SIZE(BB), %xmm3 1062 mulpd %xmm1, %xmm3 1063 movapd 10 * SIZE(AA), %xmm1 1064 addpd %xmm3, %xmm5 1065 movddup 10 * SIZE(BB), %xmm3 1066 mulpd %xmm1, %xmm3 1067 addpd %xmm3, %xmm6 1068 movddup 11 * SIZE(BB), %xmm3 1069 mulpd %xmm1, %xmm3 1070 movapd 12 * SIZE(AA), %xmm1 1071 addpd %xmm3, %xmm7 1072 movddup 12 * SIZE(BB), %xmm3 1073 mulpd %xmm1, %xmm3 1074 addpd %xmm3, %xmm4 1075 movddup 13 * SIZE(BB), %xmm3 1076 mulpd %xmm1, %xmm3 1077 movapd 14 * SIZE(AA), %xmm1 1078 addpd %xmm3, %xmm5 1079 movddup 14 * SIZE(BB), %xmm3 1080 mulpd %xmm1, %xmm3 1081 addpd %xmm3, %xmm6 1082 movddup 15 * SIZE(BB), %xmm3 1083 mulpd %xmm1, %xmm3 1084 movapd 24 * SIZE(AA), %xmm1 1085 addpd %xmm3, %xmm7 1086 movddup 24 * SIZE(BB), %xmm3 1087 1088 addl $16 * SIZE, AA 1089 addl $16 * SIZE, BB 1090 decl %eax 1091 jne .L42 1092 ALIGN_4 1093 1094.L45: 1095#ifndef TRMMKERNEL 1096 movl K, %eax 1097#else 1098 movl KKK, %eax 1099#endif 1100 movddup ALPHA, %xmm3 1101 andl $7, %eax # if (k & 1) 1102 BRANCH 1103 je .L48 1104 ALIGN_3 1105 1106.L46: 1107 mulpd %xmm0, %xmm2 1108 addpd %xmm2, %xmm4 1109 movddup 1 * SIZE(BB), %xmm2 1110 mulpd %xmm0, %xmm2 1111 movapd 2 * SIZE(AA), %xmm0 1112 addpd %xmm2, %xmm5 1113 movddup 2 * SIZE(BB), %xmm2 1114 1115 addl $2 * SIZE, AA 1116 addl $2 * SIZE, BB 1117 decl %eax 1118 jg .L46 1119 ALIGN_4 1120 1121.L48: 1122 addpd %xmm6, %xmm4 1123 addpd %xmm7, %xmm5 1124 1125 mulpd %xmm3, %xmm4 1126 mulpd %xmm3, %xmm5 1127 1128#ifndef TRMMKERNEL 1129#ifdef PENTIUM4 1130 SHUFPD_2 %xmm0, %xmm0 1131 SHUFPD_2 %xmm1, %xmm1 1132#endif 1133 1134 movsd 0 * SIZE(%esi), %xmm0 1135 movhpd 1 * SIZE(%esi), %xmm0 1136 movsd 0 * SIZE(%esi, LDC, 1), %xmm1 1137 movhpd 1 * SIZE(%esi, LDC, 1), %xmm1 1138 1139 
addpd %xmm0, %xmm4 1140 addpd %xmm1, %xmm5 1141#endif 1142 1143 movsd %xmm4, 0 * SIZE(%esi) 1144 movhpd %xmm4, 1 * SIZE(%esi) 1145 movsd %xmm5, 0 * SIZE(%esi, LDC, 1) 1146 movhpd %xmm5, 1 * SIZE(%esi, LDC, 1) 1147 1148 1149#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1150 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1151 movl K, %eax 1152 subl KKK, %eax 1153 leal (,%eax, SIZE), %eax 1154 leal (AA, %eax, 2), AA 1155 leal (BB, %eax, 2), BB 1156#endif 1157 1158#if defined(TRMMKERNEL) && defined(LEFT) 1159 addl $2, KK 1160#endif 1161 1162 addl $2 * SIZE, %esi # coffset += 2 1163 decl %ebx # i -- 1164 jg .L41 1165 ALIGN_4 1166 1167.L50: 1168 movl M, %ebx 1169 testl $1, %ebx # i = (m >> 2) 1170 jle .L59 1171 1172#if !defined(TRMMKERNEL) || \ 1173 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1174 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1175 1176 movl B, BB 1177#else 1178 movl KK, %eax 1179 leal (, %eax, SIZE), %eax 1180 leal (AA, %eax, 1), AA 1181 leal (B, %eax, 2), BB 1182#endif 1183 1184 movddup 0 * SIZE(AA), %xmm0 1185 pxor %xmm4, %xmm4 1186 movddup 8 * SIZE(AA), %xmm1 1187 pxor %xmm5, %xmm5 1188 movapd 0 * SIZE(BB), %xmm2 1189 pxor %xmm6, %xmm6 1190 movapd 8 * SIZE(BB), %xmm3 1191 pxor %xmm7, %xmm7 1192 1193#ifndef TRMMKERNEL 1194 movl K, %eax 1195#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1196 movl K, %eax 1197 subl KK, %eax 1198 movl %eax, KKK 1199#else 1200 movl KK, %eax 1201#ifdef LEFT 1202 addl $1, %eax 1203#else 1204 addl $2, %eax 1205#endif 1206 movl %eax, KKK 1207#endif 1208 sarl $4, %eax 1209 je .L55 1210 ALIGN_4 1211 1212.L52: 1213 PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) 1214 mulpd %xmm0, %xmm2 1215 movddup 1 * SIZE(AA), %xmm0 1216 addpd %xmm2, %xmm4 1217 mulpd 2 * SIZE(BB), %xmm0 1218 movapd 16 * SIZE(BB), %xmm2 1219 addpd %xmm0, %xmm5 1220 movddup 2 * SIZE(AA), %xmm0 1221 mulpd 4 * SIZE(BB), %xmm0 1222 addpd %xmm0, %xmm6 1223 movddup 3 * 
SIZE(AA), %xmm0 1224 mulpd 6 * SIZE(BB), %xmm0 1225 addpd %xmm0, %xmm7 1226 movddup 4 * SIZE(AA), %xmm0 1227 mulpd %xmm0, %xmm3 1228 movddup 5 * SIZE(AA), %xmm0 1229 addpd %xmm3, %xmm4 1230 mulpd 10 * SIZE(BB), %xmm0 1231 movapd 24 * SIZE(BB), %xmm3 1232 addpd %xmm0, %xmm5 1233 movddup 6 * SIZE(AA), %xmm0 1234 mulpd 12 * SIZE(BB), %xmm0 1235 addpd %xmm0, %xmm6 1236 movddup 7 * SIZE(AA), %xmm0 1237 mulpd 14 * SIZE(BB), %xmm0 1238 addpd %xmm0, %xmm7 1239 movddup 16 * SIZE(AA), %xmm0 1240 mulpd %xmm1, %xmm2 1241 movddup 9 * SIZE(AA), %xmm1 1242 addpd %xmm2, %xmm4 1243 mulpd 18 * SIZE(BB), %xmm1 1244 movapd 32 * SIZE(BB), %xmm2 1245 addpd %xmm1, %xmm5 1246 movddup 10 * SIZE(AA), %xmm1 1247 mulpd 20 * SIZE(BB), %xmm1 1248 addpd %xmm1, %xmm6 1249 movddup 11 * SIZE(AA), %xmm1 1250 mulpd 22 * SIZE(BB), %xmm1 1251 addpd %xmm1, %xmm7 1252 movddup 12 * SIZE(AA), %xmm1 1253 mulpd %xmm1, %xmm3 1254 movddup 13 * SIZE(AA), %xmm1 1255 addpd %xmm3, %xmm4 1256 mulpd 26 * SIZE(BB), %xmm1 1257 movapd 40 * SIZE(BB), %xmm3 1258 addpd %xmm1, %xmm5 1259 movddup 14 * SIZE(AA), %xmm1 1260 mulpd 28 * SIZE(BB), %xmm1 1261 addpd %xmm1, %xmm6 1262 movddup 15 * SIZE(AA), %xmm1 1263 mulpd 30 * SIZE(BB), %xmm1 1264 addpd %xmm1, %xmm7 1265 movddup 24 * SIZE(AA), %xmm1 1266 1267 addl $16 * SIZE, AA 1268 addl $32 * SIZE, BB 1269 decl %eax 1270 jne .L52 1271 ALIGN_4 1272 1273.L55: 1274#ifndef TRMMKERNEL 1275 movl K, %eax 1276#else 1277 movl KKK, %eax 1278#endif 1279 movddup ALPHA, %xmm3 1280 andl $15, %eax # if (k & 1) 1281 BRANCH 1282 je .L58 1283 1284.L56: 1285 mulpd %xmm0, %xmm2 1286 movddup 1 * SIZE(AA), %xmm0 1287 addpd %xmm2, %xmm4 1288 movapd 2 * SIZE(BB), %xmm2 1289 1290 addl $1 * SIZE, AA 1291 addl $2 * SIZE, BB 1292 decl %eax 1293 jg .L56 1294 ALIGN_4 1295 1296.L58: 1297 addpd %xmm5, %xmm4 1298 addpd %xmm7, %xmm6 1299 addpd %xmm6, %xmm4 1300 1301 mulpd %xmm3, %xmm4 1302 1303#ifndef TRMMKERNEL 1304#ifdef PENTIUM4 1305 SHUFPD_2 %xmm0, %xmm0 1306#endif 1307 1308 movsd 0 * SIZE(%esi), %xmm0 1309 
	movhpd	0 * SIZE(%esi, LDC, 1), %xmm0	# high half <- C[0, col1] (non-TRMM read-back; #ifndef opens above this view)

	addpd	%xmm0, %xmm4			# accumulate existing C into the scaled 1x2 result
#endif

/* Store the 1x2 micro-tile: one double into each of the two C columns. */
	movsd	%xmm4, 0 * SIZE(%esi)		# C[0, col0]
	movhpd	%xmm4, 0 * SIZE(%esi, LDC, 1)	# C[0, col1]

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/* TRMM: skip the K-KKK elements of A (1 row) and B (2 cols) this tile   */
/* did not consume, so AA/BB point at the next panel.  SIZE = element    */
/* size from common.h -- presumably 8 for double; confirm there.         */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(BB, %eax, 2), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK			# LEFT: one more A row consumed
#endif
	ALIGN_4

/* End of the N=2 column block: step C by two columns and let B start at */
/* the position BB reached.                                              */
.L59:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK			# !LEFT: two more B columns consumed
#endif

	leal	(, LDC, 2), %eax
	movl	BB, B
	addl	%eax, C			# c += 2 * ldc
	ALIGN_4

/* ------------------------------------------------------------------- */
/* N remainder: handle the last single column when N is odd.            */
/* ------------------------------------------------------------------- */
.L60:
	testl	$1, N			# any leftover column?
	je	.L999

	movl	C, %esi			# coffset = c
	movl	A, AA			# aoffset = a

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax		# LEFT TRMM restarts KK from OFFSET per column block
	movl	%eax, KK
#endif

	movl	M,  %ebx
	sarl	$1, %ebx		# i = (m >> 1): count of 2x1 tiles
	jle	.L80
	ALIGN_4

/* 2x1 micro-kernel: two rows of A against one column of B.             */
/* xmm4..xmm7 are four partial 2-wide accumulators, merged at .L78.     */
.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB			# B panel starts from the top
#else
	movl	KK, %eax		# TRMM: offset both panels by KK elements
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA	# 2 rows of A per k step
	leal	(B,  %eax, 1), BB	# 1 col of B per k step
#endif

	movapd	0 * SIZE(AA), %xmm0	# a[0:1]
	pxor	%xmm4, %xmm4
	movapd	8 * SIZE(AA), %xmm1	# a[8:9] (second stream of the unroll)
	pxor	%xmm5, %xmm5
	movddup	0 * SIZE(BB), %xmm2	# broadcast b[0]
	pxor	%xmm6, %xmm6
	movddup	4 * SIZE(BB), %xmm3	# broadcast b[4]
	pxor	%xmm7, %xmm7

#ifdef PENTIUM4
	prefetchnta	3 * SIZE(%esi)	# warm the C destination line
#endif

/* Effective trip count: K (plain GEMM) or the TRMM-trimmed KKK.        */
#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$2, %eax		# 2 rows of A in this tile
#else
	addl	$1, %eax		# 1 column of B in this tile
#endif
	movl	%eax, KKK
#endif
	sarl	$3, %eax		# unrolled-by-8 iterations
	je	.L75
	ALIGN_4

/* k-loop unrolled 8x: 16 A elements (2 rows x 8 k) and 8 B elements.   */
.L72:
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	mulpd	%xmm2, %xmm0
	movddup	1 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm4
	movapd	16 * SIZE(AA), %xmm0	# preload for next pass
	mulpd	2 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm5
	movddup	2 * SIZE(BB), %xmm2
	mulpd	4 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm6
	movddup	3 * SIZE(BB), %xmm2
	mulpd	6 * SIZE(AA), %xmm2
	addpd	%xmm2, %xmm7
	movddup	8 * SIZE(BB), %xmm2	# preload for next pass
	mulpd	%xmm3, %xmm1
	movddup	5 * SIZE(BB), %xmm3
	addpd	%xmm1, %xmm4
	movapd	24 * SIZE(AA), %xmm1	# preload for next pass
	mulpd	10 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm5
	movddup	6 * SIZE(BB), %xmm3
	mulpd	12 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm6
	movddup	7 * SIZE(BB), %xmm3
	mulpd	14 * SIZE(AA), %xmm3
	addpd	%xmm3, %xmm7
	movddup	12 * SIZE(BB), %xmm3	# preload for next pass

	addl   $16 * SIZE, AA
	addl   $ 8 * SIZE, BB
	decl   %eax
	jne    .L72
	ALIGN_4

/* k remainder (k & 7), one k step per pass.                            */
.L75:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA, %xmm3		# broadcast alpha for the final scaling
	andl	$7, %eax		# k remainder (k & 7)
	BRANCH				# branch-hint macro from common.h
	je .L78
	ALIGN_3

.L76:
	mulpd	%xmm2, %xmm0
	movddup	1 * SIZE(BB), %xmm2
	addpd	%xmm0, %xmm4
	movapd	2 * SIZE(AA), %xmm0

	addl	$2 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L76
	ALIGN_4

/* Merge the four partial accumulators, scale by alpha, store 2x1 tile. */
.L78:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	mulpd	%xmm3, %xmm4		# alpha * (A*B)

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0		# NOTE(review): P4-only macro; presumably breaks a false dependence on xmm0 before the C load -- confirm in common.h
#endif

	movsd	0 * SIZE(%esi), %xmm0	# read back C[0..1] (unaligned-safe pair)
	movhpd	1 * SIZE(%esi), %xmm0

	addpd	%xmm0, %xmm4		# C += alpha * A*B
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	movhpd	%xmm4, 1 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
/* TRMM: advance AA/BB past the unconsumed K-KKK tail of the panels.    */
	movl	K, %eax
	subl	KKK, %eax
	leal	(,%eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 1), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$2, KK			# LEFT: two more A rows consumed
#endif

	addl	$2 * SIZE, %esi		# coffset += 2
	decl	%ebx			# i --
	jg	.L71
	ALIGN_4

/* ------------------------------------------------------------------- */
/* M remainder: last single row (1x1 tile) of the last column.          */
/* Both operands stream as pairs; the product vectorizes over k.        */
/* ------------------------------------------------------------------- */
.L80:
	movl	M,  %ebx
	testl	$1, %ebx		# m & 1: any leftover row?
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	movl	B, BB
#else
	movl	KK, %eax		# TRMM: offset both panels by KK elements
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 1), AA
	leal	(B,  %eax, 1), BB
#endif

	movapd	0 * SIZE(AA), %xmm0
	pxor	%xmm4, %xmm4
	movapd	8 * SIZE(AA), %xmm1
	pxor	%xmm5, %xmm5
	movapd	0 * SIZE(BB), %xmm2
	pxor	%xmm6, %xmm6
	movapd	8 * SIZE(BB), %xmm3
	pxor	%xmm7, %xmm7

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax		# both branches add 1: the tile is 1 row x 1 col
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax		# unrolled-by-16 iterations
	je	.L85
	ALIGN_4

/* k-loop unrolled 16x: element-wise products folded into xmm4..xmm7.   */
.L82:
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AA)
	mulpd	%xmm0, %xmm2
	movapd	2 * SIZE(AA), %xmm0
	addpd	%xmm2, %xmm4
	mulpd	2 * SIZE(BB), %xmm0
	movapd	16 * SIZE(BB), %xmm2	# preload for next pass
	addpd	%xmm0, %xmm5
	movapd	4 * SIZE(AA), %xmm0
	mulpd	4 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm6
	movapd	6 * SIZE(AA), %xmm0
	mulpd	6 * SIZE(BB), %xmm0
	addpd	%xmm0, %xmm7
	movapd	16 * SIZE(AA), %xmm0	# preload for next pass
	mulpd	%xmm1, %xmm3
	movapd	10 * SIZE(AA), %xmm1
	addpd	%xmm3, %xmm4
	mulpd	10 * SIZE(BB), %xmm1
	movapd	24 * SIZE(BB), %xmm3	# preload for next pass
	addpd	%xmm1, %xmm5
	movapd	12 * SIZE(AA), %xmm1
	mulpd	12 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm6
	movapd	14 * SIZE(AA), %xmm1
	mulpd	14 * SIZE(BB), %xmm1
	addpd	%xmm1, %xmm7
	movapd	24 * SIZE(AA), %xmm1	# preload for next pass

	addl	$16 * SIZE, AA
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L82
	ALIGN_4

/* k remainder (k & 15), scalar one-at-a-time.                          */
.L85:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	movddup	ALPHA, %xmm3		# broadcast alpha for the final scaling
	andl	$15, %eax		# k remainder (k & 15)
	BRANCH				# branch-hint macro from common.h
	je .L88

.L86:
	mulsd	%xmm0, %xmm2
	movsd	1 * SIZE(AA), %xmm0
	addsd	%xmm2, %xmm4
	movsd	1 * SIZE(BB), %xmm2

	addl	$1 * SIZE, AA
	addl	$1 * SIZE, BB
	decl	%eax
	jg	.L86
	ALIGN_4

/* Reduce accumulators to one scalar, scale by alpha, store C[0].       */
.L88:
	addpd	%xmm5, %xmm4
	addpd	%xmm7, %xmm6
	addpd	%xmm6, %xmm4

	haddpd	%xmm4, %xmm4		# horizontal sum of the 2-wide accumulator

	mulsd	%xmm3, %xmm4		# alpha * (A*B)

#ifndef TRMMKERNEL
#ifdef PENTIUM4
	SHUFPD_2 %xmm0, %xmm0		# NOTE(review): P4-only macro; presumably breaks a false dependence on xmm0 -- confirm in common.h
#endif

	movsd	0 * SIZE(%esi), %xmm0

	addsd	%xmm0, %xmm4		# C += alpha * A*B
#endif

	movsd	%xmm4, 0 * SIZE(%esi)
	ALIGN_4

/* Epilogue: restore callee-saved registers pushed by the prologue      */
/* (outside this view), release the local area, and return.             */
.L999:
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp

	addl	$ARGS, %esp
	ret

	EPILOGUE