/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT           */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,           */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF          */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE          */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT          */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,        */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES          */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE         */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR              */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF        */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT         */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT        */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE               */
/* POSSIBILITY OF SUCH DAMAGE.                                       */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

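/*
   This kernel computes C := alpha * op(A) * op(B) + C (or the TRMM
   variant when TRMMKERNEL is defined) for single-precision complex
   operands, using 3DNow! packed-float instructions
   (pfmul/pfadd/pswapd/pfpnacc).  Each (re, im) pair occupies one MMX
   register.  B is first expanded into an on-stack BUFFER with every
   scalar duplicated into both lanes; the J loop then produces two
   columns of C at a time, one complex element per M iteration, with
   the K loop unrolled 16-fold (.L12/.L32) plus a scalar tail
   (.L16/.L36).  The op() variant is selected by the NN/NR/.../CC
   compile-time defines below.
*/
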
#define ASSEMBLER
#include "common.h"

#define STACK	16
#define ARGS	0

#define OLD_M		 4 + STACK + ARGS(%esi)
#define OLD_N		 8 + STACK + ARGS(%esi)
#define OLD_K		12 + STACK + ARGS(%esi)
#define OLD_ALPHA_R	16 + STACK + ARGS(%esi)
#define OLD_ALPHA_I	20 + STACK + ARGS(%esi)
#define OLD_A		24 + STACK + ARGS(%esi)
#define OLD_B		28 + STACK + ARGS(%esi)
#define OLD_C		32 + STACK + ARGS(%esi)
#define OLD_LDC		36 + STACK + ARGS(%esi)
#define OLD_OFFSET	40 + STACK + ARGS(%esi)

#define GAMMA_R	  0(%esp)
#define GAMMA_I	  8(%esp)
#define ALPHA	 16(%esp)
#define K	 24(%esp)
#define N	 28(%esp)
#define M	 32(%esp)
#define A	 36(%esp)
#define C	 40(%esp)
#define J	 44(%esp)
#define OLD_STACK 48(%esp)
#define OFFSET	 52(%esp)
#define KK	 56(%esp)
#define KKK	 60(%esp)
#define BUFFER	128(%esp)

#define AA	%edx
#define BB	%ecx

#define PREFETCHSIZE (16 * 2 + 6)

#define AOFFSET	-32
#define BOFFSET	128

/*

   The scheduling here follows a hint taken from the following URL:

   https://sourceforge.net/mailarchive/forum.php?forum_id=426&max_rows=25&style=flat&viewmonth=200309&viewday=11

*/

	PROLOGUE

	pushl	%ebp
	pushl	%edi
	pushl	%esi
	pushl	%ebx

	PROFCODE

	movl	%esp, %esi	# save old stack
	subl	$128 + LOCAL_BUFFER_SIZE, %esp
	movl	OLD_M, %ebx
	andl	$-1024, %esp	# align stack

	STACK_TOUCHING

	movl	OLD_N, %eax
	movl	OLD_K, %ecx
	movl	OLD_A, %edx

	movl	%ebx, M
	movl	%eax, N
	movl	%ecx, K
	subl	$AOFFSET * SIZE, %edx
	movl	%edx, A
	movl	%esi, OLD_STACK

	testl	%ebx, %ebx
	jle	.L999

	movl	OLD_B, %edi
	movl	OLD_C, %ebx

	EMMS

	movd	OLD_ALPHA_R, %mm0
	movd	OLD_ALPHA_I, %mm1

	movd	%mm0, 0 + ALPHA
	movd	%mm1, 4 + ALPHA

#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
	movl	$0x3f800000, 0 + GAMMA_R
	movl	$0x3f800000, 4 + GAMMA_R
	movl	$0xbf800000, 0 + GAMMA_I
	movl	$0x3f800000, 4 + GAMMA_I
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
	movl	$0x3f800000, 0 + GAMMA_R
	movl	$0x3f800000, 4 + GAMMA_R
	movl	$0x3f800000, 0 + GAMMA_I
	movl	$0xbf800000, 4 + GAMMA_I
#elif defined(RN) || defined(RT) || defined(CN) || defined(CT)
	movl	$0x3f800000, 0 + GAMMA_R
	movl	$0xbf800000, 4 + GAMMA_R
	movl	$0x3f800000, 0 + GAMMA_I
	movl	$0x3f800000, 4 + GAMMA_I
#else
	movl	$0x3f800000, 0 + GAMMA_R
	movl	$0xbf800000, 4 + GAMMA_R
	movl	$0xbf800000, 0 + GAMMA_I
	movl	$0xbf800000, 4 + GAMMA_I
#endif
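
/*
   GAMMA_R and GAMMA_I hold per-lane signs as single-precision floats
   (0x3f800000 = +1.0f, 0xbf800000 = -1.0f).  With GAMMA_R = (g0, g1)
   and GAMMA_I = (h0, h1), the .L18/.L38 epilogues combine the four
   partial sums roughly as

       c_re = g0 * sum(a_re * b_re) + h0 * sum(a_im * b_im)
       c_im = g1 * sum(a_im * b_re) + h1 * sum(a_re * b_im)

   so the sign pattern chosen above selects the conjugation variant
   fixed at compile time.  For the plain NN case, (+1, +1) and
   (-1, +1) give the ordinary complex product
   c = (a_re*b_re - a_im*b_im) + i*(a_im*b_re + a_re*b_im).
*/
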
	movl	%ebx, C
	movl	OLD_LDC, %ebp
	leal	(, %ebp, SIZE * 2), %ebp

#ifdef TRMMKERNEL
	movl	OLD_OFFSET, %eax
	movl	%eax, OFFSET
#ifndef LEFT
	negl	%eax
	movl	%eax, KK
#endif
#endif

	movl	N, %eax
	sarl	$1, %eax
	movl	%eax, J		# j = n / 2
	jle	.L20
	ALIGN_4

.L01:
/* Copying to Sub Buffer */
	leal	BUFFER, BB

#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	K, %eax
	sarl	$2, %eax
	jle	.L03
	ALIGN_4

.L02:
	movd	0 * SIZE(%edi), %mm0
	movd	1 * SIZE(%edi), %mm1
	movd	2 * SIZE(%edi), %mm2
	movd	3 * SIZE(%edi), %mm3
	movd	4 * SIZE(%edi), %mm4
	movd	5 * SIZE(%edi), %mm5
	movd	6 * SIZE(%edi), %mm6
	movd	7 * SIZE(%edi), %mm7

	prefetchnta	72 * SIZE(%edi)

	punpckldq	%mm0, %mm0
	punpckldq	%mm1, %mm1
	punpckldq	%mm2, %mm2
	punpckldq	%mm3, %mm3
	punpckldq	%mm4, %mm4
	punpckldq	%mm5, %mm5
	punpckldq	%mm6, %mm6
	punpckldq	%mm7, %mm7

	movq	%mm0,  0 * SIZE(BB)
	movq	%mm1,  2 * SIZE(BB)
	movq	%mm2,  4 * SIZE(BB)
	movq	%mm3,  6 * SIZE(BB)
	movq	%mm4,  8 * SIZE(BB)
	movq	%mm5, 10 * SIZE(BB)
	movq	%mm6, 12 * SIZE(BB)
	movq	%mm7, 14 * SIZE(BB)

	movd	 8 * SIZE(%edi), %mm0
	movd	 9 * SIZE(%edi), %mm1
	movd	10 * SIZE(%edi), %mm2
	movd	11 * SIZE(%edi), %mm3
	movd	12 * SIZE(%edi), %mm4
	movd	13 * SIZE(%edi), %mm5
	movd	14 * SIZE(%edi), %mm6
	movd	15 * SIZE(%edi), %mm7

	punpckldq	%mm0, %mm0
	punpckldq	%mm1, %mm1
	punpckldq	%mm2, %mm2
	punpckldq	%mm3, %mm3
	punpckldq	%mm4, %mm4
	punpckldq	%mm5, %mm5
	punpckldq	%mm6, %mm6
	punpckldq	%mm7, %mm7

	movq	%mm0, 16 * SIZE(BB)
	movq	%mm1, 18 * SIZE(BB)
	movq	%mm2, 20 * SIZE(BB)
	movq	%mm3, 22 * SIZE(BB)
	movq	%mm4, 24 * SIZE(BB)
	movq	%mm5, 26 * SIZE(BB)
	movq	%mm6, 28 * SIZE(BB)
	movq	%mm7, 30 * SIZE(BB)

	addl	$16 * SIZE, %edi
	addl	$32 * SIZE, BB
	decl	%eax
	jne	.L02
	ALIGN_4

.L03:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movd	0 * SIZE(%edi), %mm0
	movd	1 * SIZE(%edi), %mm1
	movd	2 * SIZE(%edi), %mm2
	movd	3 * SIZE(%edi), %mm3

	punpckldq	%mm0, %mm0
	punpckldq	%mm1, %mm1
	punpckldq	%mm2, %mm2
	punpckldq	%mm3, %mm3

	movq	%mm0, 0 * SIZE(BB)
	movq	%mm1, 2 * SIZE(BB)
	movq	%mm2, 4 * SIZE(BB)
	movq	%mm3, 6 * SIZE(BB)

	addl	$4 * SIZE, %edi
	addl	$8 * SIZE, BB
	decl	%eax
	jne	.L04
	ALIGN_4

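/*
   The copy loops above (.L02/.L04) expand B into BUFFER: punpckldq
   duplicates every real scalar of B into both lanes of an MMX word,
   so the inner kernel can multiply a packed (re, im) pair of A by a
   broadcast B scalar with a single pfmul.
*/
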
.L10:
	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M, %ebx
	ALIGN_4

.L11:
	leal	- BOFFSET * SIZE + BUFFER, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

	movq	(  0 + AOFFSET) * SIZE(AA), %mm0
	pxor	%mm4, %mm4
	movq	( 16 + AOFFSET) * SIZE(AA), %mm1
	pxor	%mm5, %mm5
	PADDING movq	(  0 + BOFFSET) * SIZE(BB), %mm2
	pxor	%mm6, %mm6
	PADDING movq	( 16 + BOFFSET) * SIZE(BB), %mm3
	pxor	%mm7, %mm7

	prefetchw	2 * SIZE(%esi)
	prefetchw	2 * SIZE(%esi, %ebp)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$2, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L15
	ALIGN_4

.L12:
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(  2 + BOFFSET) * SIZE(BB), %mm2

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING prefetch	(PREFETCHSIZE + 0) * SIZE(AA)

	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(  6 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  2 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 10 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 12 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 32 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 14 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  4 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 18 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 20 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 24 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 22 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  6 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 26 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 28 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 48 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 30 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  8 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 34 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 36 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 40 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 38 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 10 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 42 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 44 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 64 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 46 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 12 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 50 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 52 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 56 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 54 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 14 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 58 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 60 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 80 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 62 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 32 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 66 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 68 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 72 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 70 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 18 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 74 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	( 76 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 96 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 78 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 20 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 82 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 84 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 88 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 86 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 22 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 90 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	( 92 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	(112 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 94 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 24 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 98 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	(100 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	(104 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(102 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 26 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(106 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	(108 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	(128 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(110 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 28 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	(114 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	(116 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	(120 + BOFFSET) * SIZE(BB), %mm3
	pfmul	(118 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 30 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	(122 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm5
	PADDING movq	(124 + BOFFSET) * SIZE(BB), %mm3
	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	(144 + BOFFSET) * SIZE(BB), %mm3
	pfmul	(126 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 48 + AOFFSET) * SIZE(AA), %mm1

	subl	$-32 * SIZE, AA
	addl	$128 * SIZE, BB
	decl	%eax
	jne	.L12
	ALIGN_3

.L15:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$15, %eax	# if (k & 15)
	BRANCH
	je	.L18
	ALIGN_3

.L16:
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(  2 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm5
	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(  6 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  2 + AOFFSET) * SIZE(AA), %mm0

	addl	$2 * SIZE, AA
	addl	$8 * SIZE, BB
	decl	%eax
	jg	.L16
	ALIGN_4

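/*
   Epilogue for a 2-column block.  mm4/mm5 hold the partial sums for
   column 0 and mm6/mm7 those for column 1: mm4 = (sum a_re*b_re,
   sum a_im*b_re) and mm5 = (sum a_re*b_im, sum a_im*b_im).  pswapd
   swaps the two lanes, GAMMA_R/GAMMA_I apply the conjugation signs
   (see above), and the pswapd/pfmul/pfpnacc sequence then multiplies
   by alpha = (alpha_r, alpha_i):

       lo = t_re * alpha_r - t_im * alpha_i
       hi = t_im * alpha_r + t_re * alpha_i

   (pfpnacc computes dst = (dst.lo - dst.hi, src.lo + src.hi)).
   Unless TRMMKERNEL is defined, the result is accumulated into C.
*/
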
.L18:
	movq	GAMMA_R, %mm0
	movq	GAMMA_I, %mm1
	movq	ALPHA, %mm2

	pswapd	%mm5, %mm5
	pswapd	%mm7, %mm7

	pfmul	%mm0, %mm4
	pfmul	%mm1, %mm5
	pfmul	%mm0, %mm6
	pfmul	%mm1, %mm7

	pfadd	%mm5, %mm4
	pfadd	%mm7, %mm6

	pswapd	%mm4, %mm5
	pswapd	%mm6, %mm7
	pfmul	%mm2, %mm4
	pfmul	%mm2, %mm6
	pfmul	%mm2, %mm5
	pfmul	%mm2, %mm7

	pfpnacc	%mm5, %mm4
	pfpnacc	%mm7, %mm6

#ifndef TRMMKERNEL
	pfadd	(%esi), %mm4
	pfadd	(%esi, %ebp), %mm6
#endif
	movq	%mm4, (%esi)
	movq	%mm6, (%esi, %ebp)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 8), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi
	decl	%ebx
	jg	.L11
	ALIGN_4

.L19:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addl	$2, KK
#endif

	leal	(, %ebp, 2), %eax
	addl	%eax, C		# c += 2 * ldc
	decl	J		# j --
	jg	.L01
	ALIGN_4

.L20:
	movl	N, %eax
	andl	$1, %eax
	jle	.L999
	ALIGN_4

.L21:
/* Copying to Sub Buffer */
	movl	K, %eax
	leal	BUFFER, BB
	sarl	$2, %eax
	jle	.L25
	ALIGN_4

.L22:
	movd	0 * SIZE(%edi), %mm0
	movd	1 * SIZE(%edi), %mm1
	movd	2 * SIZE(%edi), %mm2
	movd	3 * SIZE(%edi), %mm3
	movd	4 * SIZE(%edi), %mm4
	movd	5 * SIZE(%edi), %mm5
	movd	6 * SIZE(%edi), %mm6
	movd	7 * SIZE(%edi), %mm7

	prefetchnta	72 * SIZE(%edi)

	punpckldq	%mm0, %mm0
	punpckldq	%mm1, %mm1
	punpckldq	%mm2, %mm2
	punpckldq	%mm3, %mm3
	punpckldq	%mm4, %mm4
	punpckldq	%mm5, %mm5
	punpckldq	%mm6, %mm6
	punpckldq	%mm7, %mm7

	movq	%mm0,  0 * SIZE(BB)
	movq	%mm1,  2 * SIZE(BB)
	movq	%mm2,  4 * SIZE(BB)
	movq	%mm3,  6 * SIZE(BB)
	movq	%mm4,  8 * SIZE(BB)
	movq	%mm5, 10 * SIZE(BB)
	movq	%mm6, 12 * SIZE(BB)
	movq	%mm7, 14 * SIZE(BB)

	addl	$ 8 * SIZE, %edi
	addl	$16 * SIZE, BB
	decl	%eax
	jne	.L22
	ALIGN_4

.L25:
	movl	K, %eax
	andl	$3, %eax
	BRANCH
	jle	.L30
	ALIGN_4

.L26:
	movd	0 * SIZE(%edi), %mm0
	movd	1 * SIZE(%edi), %mm1

	movd	%mm0, 0 * SIZE(BB)
	movd	%mm0, 1 * SIZE(BB)
	movd	%mm1, 2 * SIZE(BB)
	movd	%mm1, 3 * SIZE(BB)

	addl	$2 * SIZE, %edi
	addl	$4 * SIZE, BB
	decl	%eax
	jne	.L26
	ALIGN_4

.L30:
#if defined(TRMMKERNEL) && defined(LEFT)
	movl	OFFSET, %eax
	movl	%eax, KK
#endif

	movl	C, %esi		# coffset = c
	movl	A, AA		# aoffset = a
	movl	M, %ebx
	ALIGN_3

.L31:
	leal	- BOFFSET * SIZE + BUFFER, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
#else
	movl	KK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

	movq	(  0 + AOFFSET) * SIZE(AA), %mm0
	pxor	%mm4, %mm4
	movq	( 16 + AOFFSET) * SIZE(AA), %mm1
	pxor	%mm5, %mm5
	PADDING movq	(  0 + BOFFSET) * SIZE(BB), %mm2
	pxor	%mm6, %mm6
	PADDING movq	( 16 + BOFFSET) * SIZE(BB), %mm3
	pxor	%mm7, %mm7

	prefetchw	2 * SIZE(%esi)

#ifndef TRMMKERNEL
	movl	K, %eax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movl	K, %eax
	subl	KK, %eax
	movl	%eax, KKK
#else
	movl	KK, %eax
#ifdef LEFT
	addl	$1, %eax
#else
	addl	$1, %eax
#endif
	movl	%eax, KKK
#endif
	sarl	$4, %eax
	je	.L35
	ALIGN_4

.L32:
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(  2 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm5
	movq	(  2 + AOFFSET) * SIZE(AA), %mm0

	PADDING prefetch	(PREFETCHSIZE + 0) * SIZE(AA)

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	(  8 + BOFFSET) * SIZE(BB), %mm2
	pfmul	(  6 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  4 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 12 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 10 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm5
	movq	(  6 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 32 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 14 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	(  8 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 20 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 18 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm5
	movq	( 10 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 24 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 22 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 12 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 28 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 26 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm5
	movq	( 14 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm0, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 48 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 30 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm7
	movq	( 32 + AOFFSET) * SIZE(AA), %mm0

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 36 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 34 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm5
	movq	( 18 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 40 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 38 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 20 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	( 44 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 42 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm5
	movq	( 22 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm2
	pfadd	%mm2, %mm6
	PADDING movq	( 64 + BOFFSET) * SIZE(BB), %mm2
	pfmul	( 46 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 24 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 52 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 50 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm5
	movq	( 26 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 56 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 54 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 28 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm4
	PADDING movq	( 60 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 58 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm5
	movq	( 30 + AOFFSET) * SIZE(AA), %mm1

	pfmul	%mm1, %mm3
	pfadd	%mm3, %mm6
	PADDING movq	( 80 + BOFFSET) * SIZE(BB), %mm3
	pfmul	( 62 + BOFFSET) * SIZE(BB), %mm1
	pfadd	%mm1, %mm7
	movq	( 48 + AOFFSET) * SIZE(AA), %mm1

	subl	$-32 * SIZE, AA
	addl	$ 64 * SIZE, BB
	decl	%eax
	jne	.L32
	ALIGN_3

.L35:
#ifndef TRMMKERNEL
	movl	K, %eax
#else
	movl	KKK, %eax
#endif
	andl	$15, %eax	# if (k & 15)
	BRANCH
	je	.L38
	ALIGN_3

.L36:
	pfmul	%mm0, %mm2
	pfadd	%mm2, %mm4
	PADDING movq	(  4 + BOFFSET) * SIZE(BB), %mm2

	pfmul	(  2 + BOFFSET) * SIZE(BB), %mm0
	pfadd	%mm0, %mm5
	movq	(  2 + AOFFSET) * SIZE(AA), %mm0

	addl	$2 * SIZE, AA
	addl	$4 * SIZE, BB
	decl	%eax
	jg	.L36
	ALIGN_4

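/*
   Single-column epilogue: the two accumulator pairs are folded first
   (mm4 += mm6, mm5 += mm7), then the same GAMMA / pswapd / pfpnacc
   sequence as in .L18 applies the conjugation signs and alpha before
   the result is stored (and, unless TRMMKERNEL is defined,
   accumulated into C).
*/
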
.L38:
	pfadd	%mm6, %mm4
	pfadd	%mm7, %mm5

	movq	ALPHA, %mm2
	pswapd	%mm5, %mm5

	pfmul	GAMMA_R, %mm4
	pfmul	GAMMA_I, %mm5

	pfadd	%mm5, %mm4

	pswapd	%mm4, %mm5
	pfmul	%mm2, %mm4
	pfmul	%mm2, %mm5
	pfpnacc	%mm5, %mm4

#ifndef TRMMKERNEL
	pfadd	0 * SIZE(%esi), %mm4
#endif
	movq	%mm4, 0 * SIZE(%esi)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movl	K, %eax
	subl	KKK, %eax
	leal	(, %eax, SIZE), %eax
	leal	(AA, %eax, 2), AA
	leal	(BB, %eax, 4), BB
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addl	$1, KK
#endif

	addl	$2 * SIZE, %esi	# coffset += 2 * SIZE
	decl	%ebx		# i --
	jg	.L31
	ALIGN_4

.L999:
	EMMS

	movl	OLD_STACK, %esp
	popl	%ebx
	popl	%esi
	popl	%edi
	popl	%ebp
	ret

	EPILOGUE