/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin.           */
/* All rights reserved.                                              */
/*                                                                   */
/* Redistribution and use in source and binary forms, with or        */
/* without modification, are permitted provided that the following   */
/* conditions are met:                                               */
/*                                                                   */
/*   1. Redistributions of source code must retain the above         */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer.                                                  */
/*                                                                   */
/*   2. Redistributions in binary form must reproduce the above      */
/*      copyright notice, this list of conditions and the following  */
/*      disclaimer in the documentation and/or other materials       */
/*      provided with the distribution.                              */
/*                                                                   */
/*    THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT        */
/*    AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,        */
/*    INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF       */
/*    MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE       */
/*    DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT       */
/*    AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,     */
/*    INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES       */
/*    (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE      */
/*    GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR           */
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF     */
/*    LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT      */
/*    (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT     */
/*    OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE            */
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
/*                                                                   */
/* The views and conclusions contained in the software and           */
/* documentation are those of the authors and should not be          */
/* interpreted as representing official policies, either expressed   */
/* or implied, of The University of Texas at Austin.                 */
/*********************************************************************/

/*
 * Syntax: AT&T (source, destination), assembled through the C
 * preprocessor.  SIZE, PROLOGUE, PROFCODE and the ARGn names are not
 * defined in this file; presumably they come from common.h.
 */

#define ASSEMBLER
#include "common.h"

/*
 * Incoming M and N.  They share %rdi/%rsi with AO/BO below, so the
 * routine copies them into M/N before AO or BO is first written.
 */
#define OLD_M   %rdi
#define OLD_N   %rsi

#define M       %r13
#define N       %r14
#define K       %rdx

/* Matrix pointers, leading dimension of C, and working registers. */
#define A       %rcx
#define B       %r8
#define C       %r9
#define LDC     %r10
#define I       %r11            /* row-block counter (set from M) */
#define AO      %rdi            /* running pointer into A */
#define BO      %rsi            /* running pointer into the B buffer */
#define CO1     %r15            /* C column pointer: c */
#define CO2     %rbp            /* C column pointer: c + ldc */
#define BB      %r12            /* prefetch pointer derived from B */

/*
 * Stack-passed arguments.  These displacements are applied to %rsp as
 * it stands right after "subq $STACKSIZE, %rsp", i.e. before the
 * routine realigns the stack.
 */
#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_LDC          8 + STACKSIZE(%rsp)
#define OLD_OFFSET      16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

#define OLD_A           40 + STACKSIZE(%rsp)
#define OLD_B           48 + STACKSIZE(%rsp)
#define OLD_C           56 + STACKSIZE(%rsp)
#define OLD_LDC         64 + STACKSIZE(%rsp)
#define OLD_OFFSET      72 + STACKSIZE(%rsp)

#endif

/*
 * Local slots.  These are addressed from %rsp after the routine has
 * moved and page-aligned it, so they do not overlap the save area.
 */
#define ALPHA     0(%rsp)       /* alpha, broadcast to four lanes */
#define J        16(%rsp)       /* column-block counter (set from N) */
#define OFFSET   24(%rsp)       /* TRMM builds only */
#define KK       32(%rsp)       /* TRMM builds only */
#define KKK      40(%rsp)       /* TRMM builds only */
#define BUFFER  256(%rsp)       /* expanded copy of the current B panel */

/*
 * On OPTERON every movsd in this file is assembled as movlps.  Code
 * later in the file tests "#ifdef movsd" to clear the destination
 * register first where that matters.
 */
#ifdef OPTERON
#define movsd   movlps
#endif

/*
 * Prefetch flavour and distance per target.  For any other target these
 * three names must already be defined elsewhere (presumably common.h).
 */
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH        prefetch
#define PREFETCHW       prefetchw
#define PREFETCHSIZE    (16 * 9 + 8)
#endif

#if defined(GENERIC) || defined(NANO)
#define PREFETCH        prefetcht0
#define PREFETCHW       prefetcht0
#define PREFETCHSIZE    (16 * 5 + 8)
#endif

#define RPREFETCHSIZE   (8 * 7 + 4)
#define WPREFETCHSIZE   (8 * 8 + 4)

/*
 * KERNEL1 .. KERNEL8: eight straight-line steps of the unrolled inner
 * loop.  Each step multiplies one A vector (xmm0, xmm2, xmm4 or xmm6,
 * loaded from AO) by B vectors that are either already in
 * xmm1/xmm3/xmm5/xmm7 or read directly from BO, adds the four products
 * into the accumulators, and reloads the registers it consumed.
 *
 *   KERNEL1/3/5/7 accumulate into xmm8  .. xmm11
 *   KERNEL2/4/6/8 accumulate into xmm12 .. xmm15
 *
 * xx is an element offset folded into every displacement: it is scaled
 * by 2 * SIZE for BO and by 1 * SIZE for AO.
 *
 * Two variants exist.  The non-GENERIC one addresses memory as
 * (BO, %rax, 8) / (AO, %rax, 4), so %rax is a live index register.
 * The GENERIC one addresses plain (BO) / (AO).
 *
 * Every macro ends on a line without a continuation backslash, so a
 * definition never depends on the blank line that follows it.
 */
#ifndef GENERIC
#define KERNEL1(xx) \
        mulps   %xmm0, %xmm1 ;\
        addps   %xmm1, %xmm8 ;\
        movaps  -32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
        mulps   %xmm0, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  -28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm0, %xmm5 ;\
        PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
        mulps   -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  -24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm0, %xmm11 ;\
        movaps  -16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
        mulps   %xmm2, %xmm1 ;\
        addps   %xmm1, %xmm12 ;\
        movaps  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
        mulps   %xmm2, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm2, %xmm5 ;\
        mulps   -20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm2, %xmm15 ;\
        movaps  -12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
        mulps   %xmm4, %xmm7 ;\
        addps   %xmm7, %xmm8 ;\
        movaps  -16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
        mulps   %xmm4, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  -12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm4, %xmm5 ;\
        mulps   -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm4, %xmm11 ;\
        movaps  -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
        mulps   %xmm6, %xmm7 ;\
        addps   %xmm7, %xmm12 ;\
        movaps  16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
        mulps   %xmm6, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm6, %xmm5 ;\
        mulps   -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
        addps   %xmm6, %xmm15 ;\
        movaps  -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
        mulps   %xmm0, %xmm1 ;\
        addps   %xmm1, %xmm8 ;\
        movaps  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
        mulps   %xmm0, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm0, %xmm5 ;\
        mulps   12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm0, %xmm11 ;\
        movaps  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
        mulps   %xmm2, %xmm1 ;\
        addps   %xmm1, %xmm12 ;\
        movaps  32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
        mulps   %xmm2, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm2, %xmm5 ;\
        mulps   12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm2, %xmm15 ;\
        movaps  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
        mulps   %xmm4, %xmm7 ;\
        addps   %xmm7, %xmm8 ;\
        movaps  16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
        mulps   %xmm4, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm4, %xmm5 ;\
        mulps   28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm4, %xmm11 ;\
        movaps  8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
        mulps   %xmm6, %xmm7 ;\
        addps   %xmm7, %xmm12 ;\
        movaps  48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
        mulps   %xmm6, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
        mulps   %xmm6, %xmm5 ;\
        mulps   28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
        addps   %xmm6, %xmm15 ;\
        movaps  12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else
#define KERNEL1(xx) \
        mulps   %xmm0, %xmm1 ;\
        addps   %xmm1, %xmm8 ;\
        movaps  -32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
        mulps   %xmm0, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  -28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm0, %xmm5 ;\
        PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
        mulps   -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  -24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm0, %xmm11 ;\
        movaps  -16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
        mulps   %xmm2, %xmm1 ;\
        addps   %xmm1, %xmm12 ;\
        movaps  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
        mulps   %xmm2, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm2, %xmm5 ;\
        mulps   -20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm2, %xmm15 ;\
        movaps  -12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
        mulps   %xmm4, %xmm7 ;\
        addps   %xmm7, %xmm8 ;\
        movaps  -16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
        mulps   %xmm4, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  -12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm4, %xmm5 ;\
        mulps   -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm4, %xmm11 ;\
        movaps  -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
        mulps   %xmm6, %xmm7 ;\
        addps   %xmm7, %xmm12 ;\
        movaps  16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
        mulps   %xmm6, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm6, %xmm5 ;\
        mulps   -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm6, %xmm15 ;\
        movaps  -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
        mulps   %xmm0, %xmm1 ;\
        PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
        addps   %xmm1, %xmm8 ;\
        movaps  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
        mulps   %xmm0, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm0, %xmm5 ;\
        mulps   12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm0, %xmm11 ;\
        movaps  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
        mulps   %xmm2, %xmm1 ;\
        addps   %xmm1, %xmm12 ;\
        movaps  32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
        mulps   %xmm2, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm2, %xmm5 ;\
        mulps   12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm2, %xmm15 ;\
        movaps  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
        mulps   %xmm4, %xmm7 ;\
        addps   %xmm7, %xmm8 ;\
        movaps  16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
        mulps   %xmm4, %xmm3 ;\
        addps   %xmm3, %xmm9 ;\
        movaps  20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm4, %xmm5 ;\
        mulps   28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
        addps   %xmm5, %xmm10 ;\
        movaps  24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm4, %xmm11 ;\
        movaps  8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
        mulps   %xmm6, %xmm7 ;\
        addps   %xmm7, %xmm12 ;\
        movaps  48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
        mulps   %xmm6, %xmm3 ;\
        addps   %xmm3, %xmm13 ;\
        movaps  36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
        mulps   %xmm6, %xmm5 ;\
        mulps   28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
        addps   %xmm5, %xmm14 ;\
        movaps  40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
        addps   %xmm6, %xmm15 ;\
        movaps  12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#endif

        PROLOGUE
        PROFCODE

        /* Reserve the register save area. */
        subq    $STACKSIZE, %rsp

        /* Save the callee-saved integer registers this routine uses. */
        movq    %rbx,  0(%rsp)
        movq    %rbp,  8(%rsp)
        movq    %r12, 16(%rsp)
        movq    %r13, 24(%rsp)
        movq    %r14, 32(%rsp)
        movq    %r15, 40(%rsp)

#ifdef WINDOWS_ABI
        /* Win64 also treats %rdi, %rsi and %xmm6-%xmm15 as callee-saved. */
        movq    %rdi, 48(%rsp)
        movq    %rsi, 56(%rsp)
        movups  %xmm6, 64(%rsp)
        /* NOTE(review): the destination operand of the next instruction
           is on the following source line. */
        movups  %xmm7,
80(%rsp) 354 movups %xmm8, 96(%rsp) 355 movups %xmm9, 112(%rsp) 356 movups %xmm10, 128(%rsp) 357 movups %xmm11, 144(%rsp) 358 movups %xmm12, 160(%rsp) 359 movups %xmm13, 176(%rsp) 360 movups %xmm14, 192(%rsp) 361 movups %xmm15, 208(%rsp) 362 363 movq ARG1, OLD_M 364 movq ARG2, OLD_N 365 movq ARG3, K 366 movq OLD_A, A 367 movq OLD_B, B 368 movq OLD_C, C 369 movq OLD_LDC, LDC 370#ifdef TRMMKERNEL 371 movsd OLD_OFFSET, %xmm4 372#endif 373 movaps %xmm3, %xmm0 374 375#else 376 movq OLD_LDC, LDC 377#ifdef TRMMKERNEL 378 movsd OLD_OFFSET, %xmm4 379#endif 380 381#endif 382 383 EMMS 384 385 movq %rsp, %rbx # save old stack 386 subq $256 + LOCAL_BUFFER_SIZE, %rsp 387 andq $-4096, %rsp # align stack 388 389 STACK_TOUCHING 390 391 movq OLD_M, M 392 movq OLD_N, N 393 394 shufps $0, %xmm0, %xmm0 395 movaps %xmm0, ALPHA 396 397#ifdef TRMMKERNEL 398 movsd %xmm4, OFFSET 399 movsd %xmm4, KK 400#ifndef LEFT 401 negq KK 402#endif 403#endif 404 405 subq $-32 * SIZE, A 406 407 leaq (, LDC, SIZE), LDC 408 409 movq N, J 410 sarq $2, J # j = (n >> 2) 411 jle .L50 412 413.L01: 414#if defined(TRMMKERNEL) && defined(LEFT) 415 movq OFFSET, %rax 416 movq %rax, KK 417#endif 418 419/* Copying to Sub Buffer */ 420 leaq BUFFER, BO 421 422 movd 0 * SIZE(B), %mm0 423 424 movq K, %rax 425 sarq $2, %rax 426 jle .L03 427 428 addq %rax, %rax 429 ALIGN_4 430 431.L02: 432 PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) 433 434 movd 1 * SIZE(B), %mm1 435 movd 2 * SIZE(B), %mm2 436 movd 3 * SIZE(B), %mm3 437 movd 4 * SIZE(B), %mm4 438 movd 5 * SIZE(B), %mm5 439 movd 6 * SIZE(B), %mm6 440 movd 7 * SIZE(B), %mm7 441 442 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) 443 444 punpckldq %mm0, %mm0 445 movq %mm0, 0 * SIZE(BO) 446 movq %mm0, 2 * SIZE(BO) 447 punpckldq %mm1, %mm1 448 movd 8 * SIZE(B), %mm0 449 movq %mm1, 4 * SIZE(BO) 450 movq %mm1, 6 * SIZE(BO) 451 punpckldq %mm2, %mm2 452 movq %mm2, 8 * SIZE(BO) 453 movq %mm2, 10 * SIZE(BO) 454 punpckldq %mm3, %mm3 455 movq %mm3, 12 * SIZE(BO) 456 movq %mm3, 14 * SIZE(BO) 457 458 
PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) 459 460 punpckldq %mm4, %mm4 461 movq %mm4, 16 * SIZE(BO) 462 movq %mm4, 18 * SIZE(BO) 463 punpckldq %mm5, %mm5 464 movq %mm5, 20 * SIZE(BO) 465 movq %mm5, 22 * SIZE(BO) 466 punpckldq %mm6, %mm6 467 movq %mm6, 24 * SIZE(BO) 468 movq %mm6, 26 * SIZE(BO) 469 punpckldq %mm7, %mm7 470 movq %mm7, 28 * SIZE(BO) 471 movq %mm7, 30 * SIZE(BO) 472 473 474 addq $ 8 * SIZE, B 475 addq $32 * SIZE, BO 476 477 decq %rax 478 jne .L02 479 ALIGN_4 480 481.L03: 482 movq K, %rax 483 andq $3, %rax 484 BRANCH 485 jle .L10 486 ALIGN_4 487 488.L04: 489 movd 0 * SIZE(B), %mm0 490 movd 1 * SIZE(B), %mm1 491 movd 2 * SIZE(B), %mm2 492 movd 3 * SIZE(B), %mm3 493 494 punpckldq %mm0, %mm0 495 punpckldq %mm1, %mm1 496 punpckldq %mm2, %mm2 497 punpckldq %mm3, %mm3 498 499 movq %mm0, 0 * SIZE(BO) 500 movq %mm0, 2 * SIZE(BO) 501 movq %mm1, 4 * SIZE(BO) 502 movq %mm1, 6 * SIZE(BO) 503 movq %mm2, 8 * SIZE(BO) 504 movq %mm2, 10 * SIZE(BO) 505 movq %mm3, 12 * SIZE(BO) 506 movq %mm3, 14 * SIZE(BO) 507 508 addq $ 4 * SIZE, B 509 addq $16 * SIZE, BO 510 decq %rax 511 jne .L04 512 ALIGN_4 513 514.L10: 515 movq C, CO1 # coffset1 = c 516 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 517 movq A, AO # aoffset = a 518 519 leaq (RPREFETCHSIZE + 0) * SIZE(B), BB 520 521 movq M, I 522 sarq $3, I # i = (m >> 3) 523 jle .L20 524 ALIGN_4 525 526.L11: 527#if !defined(TRMMKERNEL) || \ 528 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 529 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 530 531 leaq 32 * SIZE + BUFFER, BO 532#else 533 leaq 32 * SIZE + BUFFER, BO 534 movq KK, %rax 535 leaq (, %rax, 8), %rax 536 leaq (AO, %rax, 4), AO 537 leaq (BO, %rax, 8), BO 538#endif 539 540 movaps -32 * SIZE(AO), %xmm0 541 movaps -32 * SIZE(BO), %xmm1 542 xorps %xmm8, %xmm8 543 movaps -28 * SIZE(AO), %xmm2 544 movaps -28 * SIZE(BO), %xmm3 545 xorps %xmm9, %xmm9 546 movaps -24 * SIZE(AO), %xmm4 547 movaps -24 * SIZE(BO), %xmm5 548 xorps %xmm10, %xmm10 549 movaps -20 * 
SIZE(AO), %xmm6 550 movaps -16 * SIZE(BO), %xmm7 551 xorps %xmm11, %xmm11 552 553 PREFETCHW 7 * SIZE(CO1) 554 xorps %xmm12, %xmm12 555 PREFETCHW 15 * SIZE(CO2) 556 xorps %xmm13, %xmm13 557 PREFETCHW 7 * SIZE(CO1, LDC, 2) 558 xorps %xmm14, %xmm14 559 PREFETCHW 15 * SIZE(CO2, LDC, 2) 560 xorps %xmm15, %xmm15 561 PREFETCH -32 * SIZE(BB) 562 563#ifndef TRMMKERNEL 564 movq K, %rax 565#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 566 movq K, %rax 567 subq KK, %rax 568 movq %rax, KKK 569#else 570 movq KK, %rax 571#ifdef LEFT 572 addq $8, %rax 573#else 574 addq $4, %rax 575#endif 576 movq %rax, KKK 577#endif 578#ifndef GENERIC 579 andq $-8, %rax 580 581 leaq (, %rax, 8), %rax 582 leaq (AO, %rax, 4), AO 583 leaq (BO, %rax, 8), BO 584 negq %rax 585 NOBRANCH 586 je .L15 587 ALIGN_3 588 589.L12: 590 KERNEL1(16 * 0) 591 KERNEL2(16 * 0) 592 KERNEL3(16 * 0) 593 KERNEL4(16 * 0) 594 KERNEL5(16 * 0) 595 KERNEL6(16 * 0) 596 KERNEL7(16 * 0) 597 KERNEL8(16 * 0) 598 599 KERNEL1(16 * 2) 600 KERNEL2(16 * 2) 601 KERNEL3(16 * 2) 602 KERNEL4(16 * 2) 603 KERNEL5(16 * 2) 604 KERNEL6(16 * 2) 605 KERNEL7(16 * 2) 606 KERNEL8(16 * 2) 607 608 addq $16 * SIZE, %rax 609 NOBRANCH 610 je .L15 611 KERNEL1(16 * 0) 612 KERNEL2(16 * 0) 613 KERNEL3(16 * 0) 614 KERNEL4(16 * 0) 615 KERNEL5(16 * 0) 616 KERNEL6(16 * 0) 617 KERNEL7(16 * 0) 618 KERNEL8(16 * 0) 619 620 KERNEL1(16 * 2) 621 KERNEL2(16 * 2) 622 KERNEL3(16 * 2) 623 KERNEL4(16 * 2) 624 KERNEL5(16 * 2) 625 KERNEL6(16 * 2) 626 KERNEL7(16 * 2) 627 KERNEL8(16 * 2) 628 629 addq $16 * SIZE, %rax 630 NOBRANCH 631 je .L15 632 KERNEL1(16 * 0) 633 KERNEL2(16 * 0) 634 KERNEL3(16 * 0) 635 KERNEL4(16 * 0) 636 KERNEL5(16 * 0) 637 KERNEL6(16 * 0) 638 KERNEL7(16 * 0) 639 KERNEL8(16 * 0) 640 641 KERNEL1(16 * 2) 642 KERNEL2(16 * 2) 643 KERNEL3(16 * 2) 644 KERNEL4(16 * 2) 645 KERNEL5(16 * 2) 646 KERNEL6(16 * 2) 647 KERNEL7(16 * 2) 648 KERNEL8(16 * 2) 649 650 addq $16 * SIZE, %rax 651 NOBRANCH 652 je .L15 653 KERNEL1(16 * 0) 654 
KERNEL2(16 * 0) 655 KERNEL3(16 * 0) 656 KERNEL4(16 * 0) 657 KERNEL5(16 * 0) 658 KERNEL6(16 * 0) 659 KERNEL7(16 * 0) 660 KERNEL8(16 * 0) 661 662 KERNEL1(16 * 2) 663 KERNEL2(16 * 2) 664 KERNEL3(16 * 2) 665 KERNEL4(16 * 2) 666 KERNEL5(16 * 2) 667 KERNEL6(16 * 2) 668 KERNEL7(16 * 2) 669 KERNEL8(16 * 2) 670 671 addq $16 * SIZE, %rax 672 NOBRANCH 673 je .L15 674 KERNEL1(16 * 0) 675 KERNEL2(16 * 0) 676 KERNEL3(16 * 0) 677 KERNEL4(16 * 0) 678 KERNEL5(16 * 0) 679 KERNEL6(16 * 0) 680 KERNEL7(16 * 0) 681 KERNEL8(16 * 0) 682 683 KERNEL1(16 * 2) 684 KERNEL2(16 * 2) 685 KERNEL3(16 * 2) 686 KERNEL4(16 * 2) 687 KERNEL5(16 * 2) 688 KERNEL6(16 * 2) 689 KERNEL7(16 * 2) 690 KERNEL8(16 * 2) 691 692 addq $16 * SIZE, %rax 693 NOBRANCH 694 je .L15 695 KERNEL1(16 * 0) 696 KERNEL2(16 * 0) 697 KERNEL3(16 * 0) 698 KERNEL4(16 * 0) 699 KERNEL5(16 * 0) 700 KERNEL6(16 * 0) 701 KERNEL7(16 * 0) 702 KERNEL8(16 * 0) 703 704 KERNEL1(16 * 2) 705 KERNEL2(16 * 2) 706 KERNEL3(16 * 2) 707 KERNEL4(16 * 2) 708 KERNEL5(16 * 2) 709 KERNEL6(16 * 2) 710 KERNEL7(16 * 2) 711 KERNEL8(16 * 2) 712 713 addq $16 * SIZE, %rax 714 NOBRANCH 715 je .L15 716 KERNEL1(16 * 0) 717 KERNEL2(16 * 0) 718 KERNEL3(16 * 0) 719 KERNEL4(16 * 0) 720 KERNEL5(16 * 0) 721 KERNEL6(16 * 0) 722 KERNEL7(16 * 0) 723 KERNEL8(16 * 0) 724 725 KERNEL1(16 * 2) 726 KERNEL2(16 * 2) 727 KERNEL3(16 * 2) 728 KERNEL4(16 * 2) 729 KERNEL5(16 * 2) 730 KERNEL6(16 * 2) 731 KERNEL7(16 * 2) 732 KERNEL8(16 * 2) 733 734 addq $16 * SIZE, %rax 735 NOBRANCH 736 je .L15 737 KERNEL1(16 * 0) 738 KERNEL2(16 * 0) 739 KERNEL3(16 * 0) 740 KERNEL4(16 * 0) 741 KERNEL5(16 * 0) 742 KERNEL6(16 * 0) 743 KERNEL7(16 * 0) 744 KERNEL8(16 * 0) 745 746 KERNEL1(16 * 2) 747 KERNEL2(16 * 2) 748 KERNEL3(16 * 2) 749 KERNEL4(16 * 2) 750 KERNEL5(16 * 2) 751 KERNEL6(16 * 2) 752 KERNEL7(16 * 2) 753 KERNEL8(16 * 2) 754 755 addq $16 * SIZE, %rax 756 BRANCH 757 jl .L12 758 ALIGN_3 759 760.L15: 761 PREFETCH -16 * SIZE(BB) 762 subq $-16 * SIZE, BB 763 764#ifndef TRMMKERNEL 765 movq K, %rax 766#else 
767 movq KKK, %rax 768#endif 769 testq $4, %rax 770 je .L16 771 xorq %rax, %rax 772 ALIGN_3 773 774 KERNEL1(16 * 0) 775 KERNEL2(16 * 0) 776 KERNEL3(16 * 0) 777 KERNEL4(16 * 0) 778 KERNEL5(16 * 0) 779 KERNEL6(16 * 0) 780 KERNEL7(16 * 0) 781 KERNEL8(16 * 0) 782 783 addq $64 * SIZE, BO 784 addq $32 * SIZE, AO 785 ALIGN_3 786#else 787 sarq $2, %rax 788 NOBRANCH 789 jle .L16 790 ALIGN_3 791 792.L12: 793 KERNEL1(16 * 0) 794 KERNEL2(16 * 0) 795 KERNEL3(16 * 0) 796 KERNEL4(16 * 0) 797 KERNEL5(16 * 0) 798 KERNEL6(16 * 0) 799 KERNEL7(16 * 0) 800 KERNEL8(16 * 0) 801 802 addq $ 64 * SIZE, BO 803 subq $-32 * SIZE, AO 804 decq %rax 805 BRANCH 806 jg .L12 807#endif 808 809.L16: 810 movaps ALPHA, %xmm7 811 812#ifndef TRMMKERNEL 813 movq K, %rax 814#else 815 movq KKK, %rax 816#endif 817 andq $3, %rax # if (k & 1) 818 je .L18 819 820 leaq (, %rax, 8), %rax 821 leaq (AO, %rax, 4), AO 822 leaq (BO, %rax, 8), BO 823 negq %rax 824 ALIGN_4 825 826.L17: 827 mulps %xmm0, %xmm1 828 addps %xmm1, %xmm8 829 movaps -28 * SIZE(BO, %rax, 8), %xmm1 830 mulps %xmm0, %xmm1 831 addps %xmm1, %xmm9 832 movaps -24 * SIZE(BO, %rax, 8), %xmm1 833 mulps %xmm0, %xmm1 834 mulps -20 * SIZE(BO, %rax, 8), %xmm0 835 addps %xmm1, %xmm10 836 movaps -32 * SIZE(BO, %rax, 8), %xmm1 837 addps %xmm0, %xmm11 838 movaps -24 * SIZE(AO, %rax, 4), %xmm0 839 mulps %xmm2, %xmm1 840 addps %xmm1, %xmm12 841 movaps -28 * SIZE(BO, %rax, 8), %xmm1 842 mulps %xmm2, %xmm1 843 addps %xmm1, %xmm13 844 movaps -24 * SIZE(BO, %rax, 8), %xmm1 845 mulps %xmm2, %xmm1 846 mulps -20 * SIZE(BO, %rax, 8), %xmm2 847 addps %xmm1, %xmm14 848 movaps -16 * SIZE(BO, %rax, 8), %xmm1 849 addps %xmm2, %xmm15 850 movaps -20 * SIZE(AO, %rax, 4), %xmm2 851 852 addq $SIZE * 2, %rax 853 jl .L17 854 ALIGN_4 855 856.L18: 857#ifndef TRMMKERNEL 858 movsd 0 * SIZE(CO1), %xmm0 859 movhps 2 * SIZE(CO1), %xmm0 860 movsd 4 * SIZE(CO1), %xmm1 861 movhps 6 * SIZE(CO1), %xmm1 862 863 movsd 0 * SIZE(CO2), %xmm2 864 movhps 2 * SIZE(CO2), %xmm2 865 movsd 4 * SIZE(CO2), 
%xmm3 866 movhps 6 * SIZE(CO2), %xmm3 867#endif 868 869 mulps %xmm7, %xmm8 870 mulps %xmm7, %xmm9 871 mulps %xmm7, %xmm10 872 mulps %xmm7, %xmm11 873 874 mulps %xmm7, %xmm12 875 mulps %xmm7, %xmm13 876 mulps %xmm7, %xmm14 877 mulps %xmm7, %xmm15 878 879#ifndef TRMMKERNEL 880 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 881 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 882 movsd 4 * SIZE(CO1, LDC, 2), %xmm5 883 movhps 6 * SIZE(CO1, LDC, 2), %xmm5 884 885 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 886 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 887 movsd 4 * SIZE(CO2, LDC, 2), %xmm7 888 movhps 6 * SIZE(CO2, LDC, 2), %xmm7 889 890 addps %xmm0, %xmm8 891 addps %xmm1, %xmm12 892 addps %xmm2, %xmm9 893 addps %xmm3, %xmm13 894#endif 895 896 movlps %xmm8, 0 * SIZE(CO1) 897 movhps %xmm8, 2 * SIZE(CO1) 898 movlps %xmm12, 4 * SIZE(CO1) 899 movhps %xmm12, 6 * SIZE(CO1) 900 901 movlps %xmm9, 0 * SIZE(CO2) 902 movhps %xmm9, 2 * SIZE(CO2) 903 movlps %xmm13, 4 * SIZE(CO2) 904 movhps %xmm13, 6 * SIZE(CO2) 905 906#ifndef TRMMKERNEL 907 addps %xmm4, %xmm10 908 addps %xmm5, %xmm14 909 addps %xmm6, %xmm11 910 addps %xmm7, %xmm15 911#endif 912 913 movlps %xmm10, 0 * SIZE(CO1, LDC, 2) 914 movhps %xmm10, 2 * SIZE(CO1, LDC, 2) 915 movlps %xmm14, 4 * SIZE(CO1, LDC, 2) 916 movhps %xmm14, 6 * SIZE(CO1, LDC, 2) 917 918 movlps %xmm11, 0 * SIZE(CO2, LDC, 2) 919 movhps %xmm11, 2 * SIZE(CO2, LDC, 2) 920 movlps %xmm15, 4 * SIZE(CO2, LDC, 2) 921 movhps %xmm15, 6 * SIZE(CO2, LDC, 2) 922 923#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 924 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 925 movq K, %rax 926 subq KKK, %rax 927 leaq (,%rax, 8), %rax 928 leaq (AO, %rax, 4), AO 929 leaq (BO, %rax, 8), BO 930#endif 931 932#if defined(TRMMKERNEL) && defined(LEFT) 933 addq $8, KK 934#endif 935 936 addq $8 * SIZE, CO1 # coffset += 4 937 addq $8 * SIZE, CO2 # coffset += 4 938 decq I # i -- 939 jg .L11 940 ALIGN_4 941 942.L20: 943 testq $4, M 944 je .L30 945 946#if !defined(TRMMKERNEL) || \ 947 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 948 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 949 950 leaq BUFFER, BO 951#else 952 leaq BUFFER, BO 953 movq KK, %rax 954 leaq (, %rax, 8), %rax 955 leaq (AO, %rax, 2), AO 956 leaq (BO, %rax, 8), BO 957#endif 958 959 movaps -32 * SIZE(AO), %xmm8 960 movaps -16 * SIZE(AO), %xmm10 961 962 movaps 0 * SIZE(BO), %xmm9 963 movaps 16 * SIZE(BO), %xmm11 964 movaps 32 * SIZE(BO), %xmm13 965 movaps 48 * SIZE(BO), %xmm15 966 967 xorps %xmm0, %xmm0 968 xorps %xmm1, %xmm1 969 xorps %xmm2, %xmm2 970 xorps %xmm3, %xmm3 971 972#ifndef TRMMKERNEL 973 movq K, %rax 974#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 975 movq K, %rax 976 subq KK, %rax 977 movq %rax, KKK 978#else 979 movq KK, %rax 980#ifdef LEFT 981 addq $4, %rax 982#else 983 addq $4, %rax 984#endif 985 movq %rax, KKK 986#endif 987 sarq $3, %rax 988 je .L25 989 ALIGN_4 990 991.L22: 992 mulps %xmm8, %xmm9 993 addps %xmm9, %xmm0 994#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 995 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 996#endif 997 movaps 4 * SIZE(BO), %xmm9 998 mulps %xmm8, %xmm9 999 addps %xmm9, %xmm1 1000 movaps 8 * SIZE(BO), %xmm9 1001 mulps %xmm8, %xmm9 1002 mulps 12 * SIZE(BO), %xmm8 1003 addps %xmm9, %xmm2 1004 movaps 64 * SIZE(BO), %xmm9 1005 addps %xmm8, %xmm3 1006 movaps -28 * SIZE(AO), %xmm8 1007 1008 mulps %xmm8, %xmm11 1009 addps %xmm11, %xmm0 1010 movaps 20 * SIZE(BO), %xmm11 1011 mulps %xmm8, %xmm11 1012 addps %xmm11, %xmm1 1013 movaps 24 * SIZE(BO), %xmm11 1014 mulps %xmm8, %xmm11 1015 mulps 28 * SIZE(BO), %xmm8 1016 addps %xmm11, %xmm2 1017 movaps 80 * SIZE(BO), %xmm11 1018 addps %xmm8, %xmm3 1019 movaps -24 * SIZE(AO), %xmm8 1020 1021 mulps %xmm8, %xmm13 1022 addps %xmm13, %xmm0 1023 movaps 36 * SIZE(BO), %xmm13 1024 mulps %xmm8, %xmm13 1025 addps %xmm13, %xmm1 1026 movaps 40 * SIZE(BO), %xmm13 1027 mulps %xmm8, %xmm13 1028 mulps 44 * SIZE(BO), %xmm8 1029 addps %xmm13, %xmm2 
1030 movaps 96 * SIZE(BO), %xmm13 1031 addps %xmm8, %xmm3 1032 movaps -20 * SIZE(AO), %xmm8 1033 1034 mulps %xmm8, %xmm15 1035 addps %xmm15, %xmm0 1036 movaps 52 * SIZE(BO), %xmm15 1037 mulps %xmm8, %xmm15 1038 addps %xmm15, %xmm1 1039 movaps 56 * SIZE(BO), %xmm15 1040 mulps %xmm8, %xmm15 1041 mulps 60 * SIZE(BO), %xmm8 1042 addps %xmm15, %xmm2 1043 movaps 112 * SIZE(BO), %xmm15 1044 addps %xmm8, %xmm3 1045 movaps 0 * SIZE(AO), %xmm8 1046 1047#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1048 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1049#endif 1050 mulps %xmm10, %xmm9 1051 addps %xmm9, %xmm0 1052 movaps 68 * SIZE(BO), %xmm9 1053 mulps %xmm10, %xmm9 1054 addps %xmm9, %xmm1 1055 movaps 72 * SIZE(BO), %xmm9 1056 mulps %xmm10, %xmm9 1057 mulps 76 * SIZE(BO), %xmm10 1058 addps %xmm9, %xmm2 1059 movaps 128 * SIZE(BO), %xmm9 1060 addps %xmm10, %xmm3 1061 movaps -12 * SIZE(AO), %xmm10 1062 1063 mulps %xmm10, %xmm11 1064 addps %xmm11, %xmm0 1065 movaps 84 * SIZE(BO), %xmm11 1066 mulps %xmm10, %xmm11 1067 addps %xmm11, %xmm1 1068 movaps 88 * SIZE(BO), %xmm11 1069 mulps %xmm10, %xmm11 1070 mulps 92 * SIZE(BO), %xmm10 1071 addps %xmm11, %xmm2 1072 movaps 144 * SIZE(BO), %xmm11 1073 addps %xmm10, %xmm3 1074 movaps -8 * SIZE(AO), %xmm10 1075 1076 mulps %xmm10, %xmm13 1077 addps %xmm13, %xmm0 1078 movaps 100 * SIZE(BO), %xmm13 1079 mulps %xmm10, %xmm13 1080 addps %xmm13, %xmm1 1081 movaps 104 * SIZE(BO), %xmm13 1082 mulps %xmm10, %xmm13 1083 mulps 108 * SIZE(BO), %xmm10 1084 addps %xmm13, %xmm2 1085 movaps 160 * SIZE(BO), %xmm13 1086 addps %xmm10, %xmm3 1087 movaps -4 * SIZE(AO), %xmm10 1088 1089 mulps %xmm10, %xmm15 1090 addps %xmm15, %xmm0 1091 movaps 116 * SIZE(BO), %xmm15 1092 mulps %xmm10, %xmm15 1093 addps %xmm15, %xmm1 1094 movaps 120 * SIZE(BO), %xmm15 1095 mulps %xmm10, %xmm15 1096 mulps 124 * SIZE(BO), %xmm10 1097 addps %xmm15, %xmm2 1098 movaps 176 * SIZE(BO), %xmm15 1099 addps %xmm10, %xmm3 1100 movaps 16 * SIZE(AO), %xmm10 1101 1102 addq $ 32 * SIZE, 
AO 1103 addq $128 * SIZE, BO 1104 decq %rax 1105 jne .L22 1106 ALIGN_4 1107 1108.L25: 1109#ifndef TRMMKERNEL 1110 movq K, %rax 1111#else 1112 movq KKK, %rax 1113#endif 1114 movaps ALPHA, %xmm15 1115 andq $7, %rax # if (k & 1) 1116 BRANCH 1117 je .L28 1118 ALIGN_4 1119 1120.L26: 1121 mulps %xmm8, %xmm9 1122 addps %xmm9, %xmm0 1123 movaps 4 * SIZE(BO), %xmm9 1124 mulps %xmm8, %xmm9 1125 addps %xmm9, %xmm1 1126 movaps 8 * SIZE(BO), %xmm9 1127 mulps %xmm8, %xmm9 1128 mulps 12 * SIZE(BO), %xmm8 1129 addps %xmm9, %xmm2 1130 movaps 16 * SIZE(BO), %xmm9 1131 addps %xmm8, %xmm3 1132 movaps -28 * SIZE(AO), %xmm8 1133 1134 addq $ 4 * SIZE, AO # aoffset += 4 1135 addq $16 * SIZE, BO # boffset1 += 8 1136 decq %rax 1137 jg .L26 1138 ALIGN_4 1139 1140.L28: 1141 mulps %xmm15, %xmm0 1142 mulps %xmm15, %xmm1 1143 mulps %xmm15, %xmm2 1144 mulps %xmm15, %xmm3 1145 1146#ifndef TRMMKERNEL 1147 movsd 0 * SIZE(CO1), %xmm8 1148 movhps 2 * SIZE(CO1), %xmm8 1149 movsd 0 * SIZE(CO2), %xmm10 1150 movhps 2 * SIZE(CO2), %xmm10 1151 1152 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1153 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 1154 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1155 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 1156 1157 addps %xmm8, %xmm0 1158 addps %xmm10, %xmm1 1159 addps %xmm12, %xmm2 1160 addps %xmm14, %xmm3 1161#endif 1162 1163 movlps %xmm0, 0 * SIZE(CO1) 1164 movhps %xmm0, 2 * SIZE(CO1) 1165 movlps %xmm1, 0 * SIZE(CO2) 1166 movhps %xmm1, 2 * SIZE(CO2) 1167 1168 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1169 movhps %xmm2, 2 * SIZE(CO1, LDC, 2) 1170 movlps %xmm3, 0 * SIZE(CO2, LDC, 2) 1171 movhps %xmm3, 2 * SIZE(CO2, LDC, 2) 1172 1173#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1174 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1175 movq K, %rax 1176 subq KKK, %rax 1177 leaq (,%rax, 8), %rax 1178 leaq (AO, %rax, 2), AO 1179 leaq (BO, %rax, 8), BO 1180#endif 1181 1182#if defined(TRMMKERNEL) && defined(LEFT) 1183 addq $4, KK 1184#endif 1185 1186 addq $4 * SIZE, CO1 # coffset += 4 
1187 addq $4 * SIZE, CO2 # coffset += 4 1188 ALIGN_4 1189 1190.L30: 1191 testq $2, M 1192 je .L40 1193 1194#if !defined(TRMMKERNEL) || \ 1195 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1196 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1197 1198 leaq BUFFER, BO 1199#else 1200 leaq BUFFER, BO 1201 movq KK, %rax 1202 leaq (, %rax, 8), %rax 1203 leaq (AO, %rax, 1), AO 1204 leaq (BO, %rax, 8), BO 1205#endif 1206 1207 movaps -32 * SIZE(AO), %xmm8 1208 movaps -24 * SIZE(AO), %xmm10 1209 1210 movaps 0 * SIZE(BO), %xmm9 1211 movaps 16 * SIZE(BO), %xmm11 1212 movaps 32 * SIZE(BO), %xmm13 1213 movaps 48 * SIZE(BO), %xmm15 1214 1215 xorps %xmm0, %xmm0 1216 xorps %xmm1, %xmm1 1217 xorps %xmm2, %xmm2 1218 xorps %xmm3, %xmm3 1219 1220#ifndef TRMMKERNEL 1221 movq K, %rax 1222#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1223 movq K, %rax 1224 subq KK, %rax 1225 movq %rax, KKK 1226#else 1227 movq KK, %rax 1228#ifdef LEFT 1229 addq $2, %rax 1230#else 1231 addq $4, %rax 1232#endif 1233 movq %rax, KKK 1234#endif 1235 sarq $3, %rax 1236 je .L35 1237 ALIGN_4 1238 1239.L32: 1240 mulps %xmm8, %xmm9 1241 addps %xmm9, %xmm0 1242#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1243 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1244#endif 1245 movaps 4 * SIZE(BO), %xmm9 1246 mulps %xmm8, %xmm9 1247 addps %xmm9, %xmm1 1248 movaps 8 * SIZE(BO), %xmm9 1249 mulps %xmm8, %xmm9 1250 addps %xmm9, %xmm2 1251 movaps 12 * SIZE(BO), %xmm9 1252 mulps %xmm8, %xmm9 1253 movsd -30 * SIZE(AO), %xmm8 1254 addps %xmm9, %xmm3 1255 movaps 64 * SIZE(BO), %xmm9 1256 1257 mulps %xmm8, %xmm11 1258 addps %xmm11, %xmm0 1259 movaps 20 * SIZE(BO), %xmm11 1260 mulps %xmm8, %xmm11 1261 addps %xmm11, %xmm1 1262 movaps 24 * SIZE(BO), %xmm11 1263 mulps %xmm8, %xmm11 1264 addps %xmm11, %xmm2 1265 movaps 28 * SIZE(BO), %xmm11 1266 mulps %xmm8, %xmm11 1267 movsd -28 * SIZE(AO), %xmm8 1268 addps %xmm11, %xmm3 1269 movaps 80 * SIZE(BO), %xmm11 1270 1271 
mulps %xmm8, %xmm13 1272 addps %xmm13, %xmm0 1273 movaps 36 * SIZE(BO), %xmm13 1274 mulps %xmm8, %xmm13 1275 addps %xmm13, %xmm1 1276 movaps 40 * SIZE(BO), %xmm13 1277 mulps %xmm8, %xmm13 1278 addps %xmm13, %xmm2 1279 movaps 44 * SIZE(BO), %xmm13 1280 mulps %xmm8, %xmm13 1281 movsd -26 * SIZE(AO), %xmm8 1282 addps %xmm13, %xmm3 1283 movaps 96 * SIZE(BO), %xmm13 1284 1285 mulps %xmm8, %xmm15 1286 addps %xmm15, %xmm0 1287 movaps 52 * SIZE(BO), %xmm15 1288 mulps %xmm8, %xmm15 1289 addps %xmm15, %xmm1 1290 movaps 56 * SIZE(BO), %xmm15 1291 mulps %xmm8, %xmm15 1292 addps %xmm15, %xmm2 1293 movaps 60 * SIZE(BO), %xmm15 1294 mulps %xmm8, %xmm15 1295 movsd -16 * SIZE(AO), %xmm8 1296 addps %xmm15, %xmm3 1297 movaps 112 * SIZE(BO), %xmm15 1298 1299 mulps %xmm10, %xmm9 1300 addps %xmm9, %xmm0 1301 movaps 68 * SIZE(BO), %xmm9 1302 mulps %xmm10, %xmm9 1303 addps %xmm9, %xmm1 1304 movaps 72 * SIZE(BO), %xmm9 1305 mulps %xmm10, %xmm9 1306 addps %xmm9, %xmm2 1307 movaps 76 * SIZE(BO), %xmm9 1308 mulps %xmm10, %xmm9 1309 movsd -22 * SIZE(AO), %xmm10 1310 addps %xmm9, %xmm3 1311 movaps 128 * SIZE(BO), %xmm9 1312 1313 mulps %xmm10, %xmm11 1314 addps %xmm11, %xmm0 1315 movaps 84 * SIZE(BO), %xmm11 1316 mulps %xmm10, %xmm11 1317 addps %xmm11, %xmm1 1318 movaps 88 * SIZE(BO), %xmm11 1319 mulps %xmm10, %xmm11 1320 addps %xmm11, %xmm2 1321 movaps 92 * SIZE(BO), %xmm11 1322 mulps %xmm10, %xmm11 1323 movsd -20 * SIZE(AO), %xmm10 1324 addps %xmm11, %xmm3 1325 movaps 144 * SIZE(BO), %xmm11 1326 1327 mulps %xmm10, %xmm13 1328 addps %xmm13, %xmm0 1329 movaps 100 * SIZE(BO), %xmm13 1330 mulps %xmm10, %xmm13 1331 addps %xmm13, %xmm1 1332 movaps 104 * SIZE(BO), %xmm13 1333 mulps %xmm10, %xmm13 1334 addps %xmm13, %xmm2 1335 movaps 108 * SIZE(BO), %xmm13 1336 mulps %xmm10, %xmm13 1337 movsd -18 * SIZE(AO), %xmm10 1338 addps %xmm13, %xmm3 1339 movaps 160 * SIZE(BO), %xmm13 1340 1341 mulps %xmm10, %xmm15 1342 addps %xmm15, %xmm0 1343 movaps 116 * SIZE(BO), %xmm15 1344 mulps %xmm10, %xmm15 1345 addps 
%xmm15, %xmm1 1346 movaps 120 * SIZE(BO), %xmm15 1347 mulps %xmm10, %xmm15 1348 addps %xmm15, %xmm2 1349 movaps 124 * SIZE(BO), %xmm15 1350 mulps %xmm10, %xmm15 1351 movsd -8 * SIZE(AO), %xmm10 1352 addps %xmm15, %xmm3 1353 movaps 176 * SIZE(BO), %xmm15 1354 1355 addq $ 16 * SIZE, AO 1356 addq $128 * SIZE, BO 1357 decq %rax 1358 jne .L32 1359 ALIGN_4 1360 1361.L35: /* reload the trip count; only the steps not covered by the 8-step unrolled loop above remain */ 1362#ifndef TRMMKERNEL 1363 movq K, %rax 1364#else 1365 movq KKK, %rax 1366#endif 1367 movaps ALPHA, %xmm15 1368 andq $7, %rax # leftover iterations = count & 7 1369 BRANCH 1370 je .L38 1371 ALIGN_4 1372 1373.L36: /* one step per pass: AO advances by 2 elements, BO by 16 */ 1374 mulps %xmm8, %xmm9 1375 addps %xmm9, %xmm0 1376 movaps 4 * SIZE(BO), %xmm9 1377 mulps %xmm8, %xmm9 1378 addps %xmm9, %xmm1 1379 movaps 8 * SIZE(BO), %xmm9 1380 mulps %xmm8, %xmm9 1381 addps %xmm9, %xmm2 1382 movaps 12 * SIZE(BO), %xmm9 1383 mulps %xmm8, %xmm9 1384 movsd -30 * SIZE(AO), %xmm8 1385 addps %xmm9, %xmm3 1386 movaps 16 * SIZE(BO), %xmm9 1387 1388 addq $ 2 * SIZE, AO # aoffset += 2 1389 addq $16 * SIZE, BO # boffset1 += 16 1390 decq %rax 1391 jg .L36 1392 ALIGN_4 1393 1394.L38: /* scale the four accumulators by ALPHA (xmm15), add the current C values unless TRMMKERNEL, then store the low 64 bits of each to CO1, CO2, CO1 + 2*LDC, CO2 + 2*LDC */ 1395 mulps %xmm15, %xmm0 1396 mulps %xmm15, %xmm1 1397 mulps %xmm15, %xmm2 1398 mulps %xmm15, %xmm3 1399 1400#ifndef TRMMKERNEL 1401#ifdef movsd 1402 xorps %xmm8, %xmm8 1403#endif 1404 movsd 0 * SIZE(CO1), %xmm8 1405#ifdef movsd 1406 xorps %xmm10, %xmm10 1407#endif 1408 movsd 0 * SIZE(CO2), %xmm10 1409#ifdef movsd 1410 xorps %xmm12, %xmm12 1411#endif 1412 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1413#ifdef movsd 1414 xorps %xmm14, %xmm14 1415#endif 1416 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1417 1418 addps %xmm8, %xmm0 1419 addps %xmm10, %xmm1 1420 addps %xmm12, %xmm2 1421 addps %xmm14, %xmm3 1422#endif 1423 1424 movlps %xmm0, 0 * SIZE(CO1) 1425 movlps %xmm1, 0 * SIZE(CO2) 1426 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1427 movlps %xmm3, 0 * SIZE(CO2, LDC, 2) 1428 1429#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1430 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1431 movq K, %rax 1432 subq KKK, %rax 1433 
leaq (,%rax, 8), %rax 1434 leaq (AO, %rax, 1), AO 1435 leaq (BO, %rax, 8), BO 1436#endif 1437 1438#if defined(TRMMKERNEL) && defined(LEFT) 1439 addq $2, KK 1440#endif 1441 1442 addq $2 * SIZE, CO1 # coffset += 2 1443 addq $2 * SIZE, CO2 # coffset += 2 1444 ALIGN_4 1445 1446.L40: /* runs only when bit 0 of M is set; otherwise jump to .L49 */ 1447 testq $1, M 1448 je .L49 1449 1450#if !defined(TRMMKERNEL) || \ 1451 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1452 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1453 1454 leaq BUFFER, BO 1455#else 1456 leaq BUFFER, BO 1457 movq KK, %rax 1458 leaq (, %rax, 4), %rax 1459 leaq (AO, %rax, 1), AO 1460 leaq (BO, %rax, 8), BO 1461 leaq (BO, %rax, 8), BO 1462#endif 1463 1464 movss -32 * SIZE(AO), %xmm8 1465 movss -28 * SIZE(AO), %xmm10 1466 1467 movss 0 * SIZE(BO), %xmm9 1468 movss 16 * SIZE(BO), %xmm11 1469 movss 32 * SIZE(BO), %xmm13 1470 movss 48 * SIZE(BO), %xmm15 1471 1472 xorps %xmm0, %xmm0 1473 xorps %xmm1, %xmm1 1474 xorps %xmm2, %xmm2 1475 xorps %xmm3, %xmm3 1476 1477#ifndef TRMMKERNEL 1478 movq K, %rax 1479#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1480 movq K, %rax 1481 subq KK, %rax 1482 movq %rax, KKK 1483#else 1484 movq KK, %rax 1485#ifdef LEFT 1486 addq $1, %rax 1487#else 1488 addq $4, %rax 1489#endif 1490 movq %rax, KKK 1491#endif 1492 sarq $3, %rax 1493 je .L45 1494 ALIGN_4 1495 1496.L42: /* unrolled scalar multiply-accumulate into xmm0..xmm3 */ 1497 mulss %xmm8, %xmm9 1498 addss %xmm9, %xmm0 1499#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1500 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1501#endif 1502 movss 4 * SIZE(BO), %xmm9 1503 mulss %xmm8, %xmm9 1504 addss %xmm9, %xmm1 1505 movss 8 * SIZE(BO), %xmm9 1506 mulss %xmm8, %xmm9 1507 addss %xmm9, %xmm2 1508 movss 12 * SIZE(BO), %xmm9 1509 mulss %xmm8, %xmm9 1510 movss -31 * SIZE(AO), %xmm8 1511 addss %xmm9, %xmm3 1512 movss 64 * SIZE(BO), %xmm9 1513 1514 mulss %xmm8, %xmm11 1515 addss %xmm11, %xmm0 1516 movss 20 * SIZE(BO), %xmm11 1517 mulss %xmm8, %xmm11 1518 addss %xmm11, %xmm1 1519 movss 24 * 
SIZE(BO), %xmm11 1520 mulss %xmm8, %xmm11 1521 addss %xmm11, %xmm2 1522 movss 28 * SIZE(BO), %xmm11 1523 mulss %xmm8, %xmm11 1524 movss -30 * SIZE(AO), %xmm8 1525 addss %xmm11, %xmm3 1526 movss 80 * SIZE(BO), %xmm11 1527 1528 mulss %xmm8, %xmm13 1529 addss %xmm13, %xmm0 1530 movss 36 * SIZE(BO), %xmm13 1531 mulss %xmm8, %xmm13 1532 addss %xmm13, %xmm1 1533 movss 40 * SIZE(BO), %xmm13 1534 mulss %xmm8, %xmm13 1535 addss %xmm13, %xmm2 1536 movss 44 * SIZE(BO), %xmm13 1537 mulss %xmm8, %xmm13 1538 movss -29 * SIZE(AO), %xmm8 1539 addss %xmm13, %xmm3 1540 movss 96 * SIZE(BO), %xmm13 1541 1542 mulss %xmm8, %xmm15 1543 addss %xmm15, %xmm0 1544 movss 52 * SIZE(BO), %xmm15 1545 mulss %xmm8, %xmm15 1546 addss %xmm15, %xmm1 1547 movss 56 * SIZE(BO), %xmm15 1548 mulss %xmm8, %xmm15 1549 addss %xmm15, %xmm2 1550 movss 60 * SIZE(BO), %xmm15 1551 mulss %xmm8, %xmm15 1552 movss -24 * SIZE(AO), %xmm8 1553 addss %xmm15, %xmm3 1554 movss 112 * SIZE(BO), %xmm15 1555 1556 mulss %xmm10, %xmm9 1557 addss %xmm9, %xmm0 1558 movss 68 * SIZE(BO), %xmm9 1559 mulss %xmm10, %xmm9 1560 addss %xmm9, %xmm1 1561 movss 72 * SIZE(BO), %xmm9 1562 mulss %xmm10, %xmm9 1563 addss %xmm9, %xmm2 1564 movss 76 * SIZE(BO), %xmm9 1565 mulss %xmm10, %xmm9 1566 movss -27 * SIZE(AO), %xmm10 1567 addss %xmm9, %xmm3 1568 movss 128 * SIZE(BO), %xmm9 1569 1570 mulss %xmm10, %xmm11 1571 addss %xmm11, %xmm0 1572 movss 84 * SIZE(BO), %xmm11 1573 mulss %xmm10, %xmm11 1574 addss %xmm11, %xmm1 1575 movss 88 * SIZE(BO), %xmm11 1576 mulss %xmm10, %xmm11 1577 addss %xmm11, %xmm2 1578 movss 92 * SIZE(BO), %xmm11 1579 mulss %xmm10, %xmm11 1580 movss -26 * SIZE(AO), %xmm10 1581 addss %xmm11, %xmm3 1582 movss 144 * SIZE(BO), %xmm11 1583 1584 mulss %xmm10, %xmm13 1585 addss %xmm13, %xmm0 1586 movss 100 * SIZE(BO), %xmm13 1587 mulss %xmm10, %xmm13 1588 addss %xmm13, %xmm1 1589 movss 104 * SIZE(BO), %xmm13 1590 mulss %xmm10, %xmm13 1591 addss %xmm13, %xmm2 1592 movss 108 * SIZE(BO), %xmm13 1593 mulss %xmm10, %xmm13 1594 movss -25 * 
SIZE(AO), %xmm10 1595 addss %xmm13, %xmm3 1596 movss 160 * SIZE(BO), %xmm13 1597 1598 mulss %xmm10, %xmm15 1599 addss %xmm15, %xmm0 1600 movss 116 * SIZE(BO), %xmm15 1601 mulss %xmm10, %xmm15 1602 addss %xmm15, %xmm1 1603 movss 120 * SIZE(BO), %xmm15 1604 mulss %xmm10, %xmm15 1605 addss %xmm15, %xmm2 1606 movss 124 * SIZE(BO), %xmm15 1607 mulss %xmm10, %xmm15 1608 movss -20 * SIZE(AO), %xmm10 1609 addss %xmm15, %xmm3 1610 movss 176 * SIZE(BO), %xmm15 1611 1612 addq $ 8 * SIZE, AO 1613 addq $128 * SIZE, BO 1614 decq %rax 1615 jne .L42 1616 ALIGN_4 1617 1618.L45: /* reload the trip count; only the steps not covered by the 8-step unrolled loop above remain */ 1619#ifndef TRMMKERNEL 1620 movq K, %rax 1621#else 1622 movq KKK, %rax 1623#endif 1624 movaps ALPHA, %xmm15 1625 andq $7, %rax # leftover iterations = count & 7 1626 BRANCH 1627 je .L48 1628 ALIGN_4 1629 1630.L46: /* one step per pass: AO advances by 1 element, BO by 16; packed ops are used here but only lane 0 is stored at .L48 */ 1631 mulps %xmm8, %xmm9 1632 addps %xmm9, %xmm0 1633 movss 4 * SIZE(BO), %xmm9 1634 mulps %xmm8, %xmm9 1635 addps %xmm9, %xmm1 1636 movss 8 * SIZE(BO), %xmm9 1637 mulps %xmm8, %xmm9 1638 addps %xmm9, %xmm2 1639 movss 12 * SIZE(BO), %xmm9 1640 mulps %xmm8, %xmm9 1641 movss -31 * SIZE(AO), %xmm8 1642 addps %xmm9, %xmm3 1643 movss 16 * SIZE(BO), %xmm9 1644 1645 addq $ 1 * SIZE, AO # aoffset += 1 1646 addq $16 * SIZE, BO # boffset1 += 16 1647 decq %rax 1648 jg .L46 1649 ALIGN_4 1650 1651.L48: /* scale the four scalar sums by ALPHA (xmm15), add the current C values unless TRMMKERNEL, store one element to each of CO1, CO2, CO1 + 2*LDC, CO2 + 2*LDC */ 1652 mulss %xmm15, %xmm0 1653 mulss %xmm15, %xmm1 1654 mulss %xmm15, %xmm2 1655 mulss %xmm15, %xmm3 1656 1657#ifndef TRMMKERNEL 1658 movss 0 * SIZE(CO1), %xmm8 1659 movss 0 * SIZE(CO2), %xmm10 1660 movss 0 * SIZE(CO1, LDC, 2), %xmm12 1661 movss 0 * SIZE(CO2, LDC, 2), %xmm14 1662 1663 addss %xmm8, %xmm0 1664 addss %xmm10, %xmm1 1665 addss %xmm12, %xmm2 1666 addss %xmm14, %xmm3 1667#endif 1668 1669 movss %xmm0, 0 * SIZE(CO1) 1670 movss %xmm1, 0 * SIZE(CO2) 1671 movss %xmm2, 0 * SIZE(CO1, LDC, 2) 1672 movss %xmm3, 0 * SIZE(CO2, LDC, 2) 1673 1674#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1675 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1676 movq K, %rax 1677 subq KKK, %rax 1678 leaq (,%rax, 4), %rax 
1679 leaq (AO, %rax, 1), AO 1680 leaq (BO, %rax, 8), BO 1681 leaq (BO, %rax, 8), BO 1682#endif 1683 1684#if defined(TRMMKERNEL) && defined(LEFT) 1685 addq $1, KK 1686#endif 1687 ALIGN_4 1688 1689.L49: /* end of one four-column pass: advance KK (right-side TRMM only) and C, then loop on J */ 1690#if defined(TRMMKERNEL) && !defined(LEFT) 1691 addq $4, KK /* KK is an 8-byte stack slot read and written with movq/addq elsewhere; a 32-bit addl would lose the carry into its upper half */ 1692#endif 1693 leaq (C, LDC, 4), C # c += 4 * ldc 1694 decq J # j -- 1695 jg .L01 1696 1697.L50: /* the two-column pass runs only when bit 1 of N is set; otherwise jump to .L100 */ 1698 testq $2, N 1699 je .L100 1700 1701.L51: 1702#if defined(TRMMKERNEL) && defined(LEFT) 1703 movq OFFSET, %rax 1704 movq %rax, KK 1705#endif 1706 1707/* Copying to Sub Buffer */ 1708 leaq BUFFER, BO 1709 1710 movq K, %rax 1711 sarq $2, %rax 1712 jle .L53 1713 ALIGN_4 1714 1715.L52: /* each pass reads 8 elements from B and writes every one replicated four times to BO (B += 8, BO += 32) */ 1716#if defined(PENTIUM4) || defined(GENERIC) 1717 movss 0 * SIZE(B), %xmm0 1718 movss 1 * SIZE(B), %xmm1 1719 movss 2 * SIZE(B), %xmm2 1720 movss 3 * SIZE(B), %xmm3 1721 movss 4 * SIZE(B), %xmm4 1722 movss 5 * SIZE(B), %xmm5 1723 movss 6 * SIZE(B), %xmm6 1724 movss 7 * SIZE(B), %xmm7 1725 1726 PREFETCH 32 * SIZE(B) 1727 1728 shufps $0, %xmm0, %xmm0 1729 shufps $0, %xmm1, %xmm1 1730 shufps $0, %xmm2, %xmm2 1731 shufps $0, %xmm3, %xmm3 1732 shufps $0, %xmm4, %xmm4 1733 shufps $0, %xmm5, %xmm5 1734 shufps $0, %xmm6, %xmm6 1735 shufps $0, %xmm7, %xmm7 1736 1737 movaps %xmm0, 0 * SIZE(BO) 1738 movaps %xmm1, 4 * SIZE(BO) 1739 movaps %xmm2, 8 * SIZE(BO) 1740 movaps %xmm3, 12 * SIZE(BO) 1741 movaps %xmm4, 16 * SIZE(BO) 1742 movaps %xmm5, 20 * SIZE(BO) 1743 movaps %xmm6, 24 * SIZE(BO) 1744 movaps %xmm7, 28 * SIZE(BO) 1745 1746 addq $ 8 * SIZE, B 1747 addq $32 * SIZE, BO 1748#endif 1749 1750#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1751 PREFETCH 32 * SIZE(B) 1752 1753 movd 0 * SIZE(B), %mm0 1754 movd 1 * SIZE(B), %mm1 1755 movd 2 * SIZE(B), %mm2 1756 movd 3 * SIZE(B), %mm3 1757 movd 4 * SIZE(B), %mm4 1758 movd 5 * SIZE(B), %mm5 1759 movd 6 * SIZE(B), %mm6 1760 movd 7 * SIZE(B), %mm7 1761 1762 punpckldq %mm0, %mm0 1763 punpckldq %mm1, %mm1 1764 punpckldq %mm2, %mm2 1765 punpckldq %mm3, %mm3 1766 punpckldq %mm4, %mm4 
1767 punpckldq %mm5, %mm5 1768 punpckldq %mm6, %mm6 1769 punpckldq %mm7, %mm7 1770 1771 movq %mm0, 0 * SIZE(BO) 1772 movq %mm0, 2 * SIZE(BO) 1773 movq %mm1, 4 * SIZE(BO) 1774 movq %mm1, 6 * SIZE(BO) 1775 movq %mm2, 8 * SIZE(BO) 1776 movq %mm2, 10 * SIZE(BO) 1777 movq %mm3, 12 * SIZE(BO) 1778 movq %mm3, 14 * SIZE(BO) 1779 movq %mm4, 16 * SIZE(BO) 1780 movq %mm4, 18 * SIZE(BO) 1781 movq %mm5, 20 * SIZE(BO) 1782 movq %mm5, 22 * SIZE(BO) 1783 movq %mm6, 24 * SIZE(BO) 1784 movq %mm6, 26 * SIZE(BO) 1785 movq %mm7, 28 * SIZE(BO) 1786 movq %mm7, 30 * SIZE(BO) 1787 1788 addq $ 8 * SIZE, B 1789 addq $32 * SIZE, BO 1790#endif 1791 1792 decq %rax 1793 jne .L52 1794 ALIGN_4 1795 1796.L53: 1797 movq K, %rax 1798 andq $3, %rax 1799 BRANCH 1800 jle .L60 1801 ALIGN_4 1802 1803.L54: 1804#if defined(PENTIUM4) || defined(GENERIC) 1805 movss 0 * SIZE(B), %xmm0 1806 movss 1 * SIZE(B), %xmm1 1807 1808 shufps $0, %xmm0, %xmm0 1809 shufps $0, %xmm1, %xmm1 1810 1811 movaps %xmm0, 0 * SIZE(BO) 1812 movaps %xmm1, 4 * SIZE(BO) 1813#endif 1814 1815#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1816 movd 0 * SIZE(B), %mm0 1817 movd 1 * SIZE(B), %mm1 1818 1819 punpckldq %mm0, %mm0 1820 punpckldq %mm1, %mm1 1821 1822 movq %mm0, 0 * SIZE(BO) 1823 movq %mm0, 2 * SIZE(BO) 1824 movq %mm1, 4 * SIZE(BO) 1825 movq %mm1, 6 * SIZE(BO) 1826#endif 1827 1828 addq $ 2 * SIZE, B 1829 addq $ 8 * SIZE, BO 1830 decq %rax 1831 jne .L54 1832 ALIGN_4 1833 1834.L60: 1835 movq C, CO1 # coffset1 = c 1836 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 1837 movq A, AO # aoffset = a 1838 1839 movq M, I 1840 sarq $3, I # i = (m >> 3) 1841 jle .L70 1842 ALIGN_4 1843 1844.L61: 1845#if !defined(TRMMKERNEL) || \ 1846 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1847 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1848 1849 leaq BUFFER, BO 1850#else 1851 leaq BUFFER, BO 1852 movq KK, %rax 1853 leaq (, %rax, 8), %rax 1854 leaq (AO, %rax, 4), AO 1855 leaq (BO, %rax, 4), BO 1856#endif 1857 
1858 movaps -32 * SIZE(AO), %xmm8 1859 movaps -16 * SIZE(AO), %xmm10 1860 movaps 0 * SIZE(AO), %xmm12 1861 movaps 16 * SIZE(AO), %xmm14 1862 1863 movaps 0 * SIZE(BO), %xmm9 1864 movaps 16 * SIZE(BO), %xmm11 1865 movaps 32 * SIZE(BO), %xmm13 1866 movaps 48 * SIZE(BO), %xmm15 1867 1868 xorps %xmm0, %xmm0 1869 xorps %xmm1, %xmm1 1870 1871 PREFETCHW 7 * SIZE(CO1) 1872 xorps %xmm4, %xmm4 1873 PREFETCHW 7 * SIZE(CO2) 1874 xorps %xmm5, %xmm5 1875 1876#ifndef TRMMKERNEL 1877 movq K, %rax 1878#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1879 movq K, %rax 1880 subq KK, %rax 1881 movq %rax, KKK 1882#else 1883 movq KK, %rax 1884#ifdef LEFT 1885 addq $8, %rax 1886#else 1887 addq $2, %rax 1888#endif 1889 movq %rax, KKK 1890#endif 1891 sarq $3, %rax 1892 je .L65 1893 ALIGN_4 1894 1895.L62: 1896 mulps %xmm8, %xmm9 1897#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1898 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1899#endif 1900 mulps 4 * SIZE(BO), %xmm8 1901 addps %xmm9, %xmm0 1902 movaps 0 * SIZE(BO), %xmm9 1903 addps %xmm8, %xmm1 1904 movaps -28 * SIZE(AO), %xmm8 1905 mulps %xmm8, %xmm9 1906 mulps 4 * SIZE(BO), %xmm8 1907 addps %xmm9, %xmm4 1908 movaps 8 * SIZE(BO), %xmm9 1909 addps %xmm8, %xmm5 1910 movaps -24 * SIZE(AO), %xmm8 1911 1912 mulps %xmm8, %xmm9 1913 mulps 12 * SIZE(BO), %xmm8 1914 addps %xmm9, %xmm0 1915 movaps 8 * SIZE(BO), %xmm9 1916 addps %xmm8, %xmm1 1917 movaps -20 * SIZE(AO), %xmm8 1918 mulps %xmm8, %xmm9 1919 mulps 12 * SIZE(BO), %xmm8 1920 addps %xmm9, %xmm4 1921 movaps 64 * SIZE(BO), %xmm9 1922 addps %xmm8, %xmm5 1923 movaps 32 * SIZE(AO), %xmm8 1924 1925#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1926 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1927#endif 1928 mulps %xmm10, %xmm11 1929 mulps 20 * SIZE(BO), %xmm10 1930 addps %xmm11, %xmm0 1931 movaps 16 * SIZE(BO), %xmm11 1932 addps %xmm10, %xmm1 1933 movaps -12 * SIZE(AO), %xmm10 1934 mulps %xmm10, %xmm11 1935 mulps 20 * SIZE(BO), %xmm10 1936 
addps %xmm11, %xmm4 1937 movaps 24 * SIZE(BO), %xmm11 1938 addps %xmm10, %xmm5 1939 movaps -8 * SIZE(AO), %xmm10 1940 1941 mulps %xmm10, %xmm11 1942 mulps 28 * SIZE(BO), %xmm10 1943 addps %xmm11, %xmm0 1944 movaps 24 * SIZE(BO), %xmm11 1945 addps %xmm10, %xmm1 1946 movaps -4 * SIZE(AO), %xmm10 1947 mulps %xmm10, %xmm11 1948 mulps 28 * SIZE(BO), %xmm10 1949 addps %xmm11, %xmm4 1950 movaps 80 * SIZE(BO), %xmm11 1951 addps %xmm10, %xmm5 1952 movaps 48 * SIZE(AO), %xmm10 1953 1954#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1955 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) 1956#endif 1957 mulps %xmm12, %xmm13 1958 mulps 36 * SIZE(BO), %xmm12 1959 addps %xmm13, %xmm0 1960 movaps 32 * SIZE(BO), %xmm13 1961 addps %xmm12, %xmm1 1962 movaps 4 * SIZE(AO), %xmm12 1963 mulps %xmm12, %xmm13 1964 mulps 36 * SIZE(BO), %xmm12 1965 addps %xmm13, %xmm4 1966 movaps 40 * SIZE(BO), %xmm13 1967 addps %xmm12, %xmm5 1968 movaps 8 * SIZE(AO), %xmm12 1969 1970 mulps %xmm12, %xmm13 1971 mulps 44 * SIZE(BO), %xmm12 1972 addps %xmm13, %xmm0 1973 movaps 40 * SIZE(BO), %xmm13 1974 addps %xmm12, %xmm1 1975 movaps 12 * SIZE(AO), %xmm12 1976 mulps %xmm12, %xmm13 1977 mulps 44 * SIZE(BO), %xmm12 1978 addps %xmm13, %xmm4 1979 movaps 96 * SIZE(BO), %xmm13 1980 addps %xmm12, %xmm5 1981 movaps 64 * SIZE(AO), %xmm12 1982 1983#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1984 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) 1985#endif 1986 mulps %xmm14, %xmm15 1987 mulps 52 * SIZE(BO), %xmm14 1988 addps %xmm15, %xmm0 1989 movaps 48 * SIZE(BO), %xmm15 1990 addps %xmm14, %xmm1 1991 movaps 20 * SIZE(AO), %xmm14 1992 mulps %xmm14, %xmm15 1993 mulps 52 * SIZE(BO), %xmm14 1994 addps %xmm15, %xmm4 1995 movaps 56 * SIZE(BO), %xmm15 1996 addps %xmm14, %xmm5 1997 movaps 24 * SIZE(AO), %xmm14 1998 1999 mulps %xmm14, %xmm15 2000 mulps 60 * SIZE(BO), %xmm14 2001 addps %xmm15, %xmm0 2002 movaps 56 * SIZE(BO), %xmm15 2003 addps %xmm14, %xmm1 2004 movaps 28 * SIZE(AO), %xmm14 2005 mulps %xmm14, 
%xmm15 2006 mulps 60 * SIZE(BO), %xmm14 2007 addps %xmm15, %xmm4 2008 movaps 112 * SIZE(BO), %xmm15 2009 addps %xmm14, %xmm5 2010 movaps 80 * SIZE(AO), %xmm14 2011 2012 addq $64 * SIZE, AO 2013 addq $64 * SIZE, BO 2014 decq %rax 2015 jne .L62 2016 ALIGN_4 2017 2018.L65: /* reload the trip count; only the steps not covered by the 8-step unrolled loop above remain */ 2019#ifndef TRMMKERNEL 2020 movq K, %rax 2021#else 2022 movq KKK, %rax 2023#endif 2024 movaps ALPHA, %xmm15 2025 andq $7, %rax # leftover iterations = count & 7 2026 BRANCH 2027 je .L68 2028 ALIGN_4 2029 2030.L66: /* one step per pass: AO and BO each advance by 8 elements */ 2031 mulps %xmm8, %xmm9 2032 mulps 4 * SIZE(BO), %xmm8 2033 addps %xmm9, %xmm0 2034 movaps 0 * SIZE(BO), %xmm9 2035 addps %xmm8, %xmm1 2036 movaps -28 * SIZE(AO), %xmm8 2037 mulps %xmm8, %xmm9 2038 mulps 4 * SIZE(BO), %xmm8 2039 addps %xmm9, %xmm4 2040 movaps 8 * SIZE(BO), %xmm9 2041 addps %xmm8, %xmm5 2042 movaps -24 * SIZE(AO), %xmm8 2043 2044 addq $8 * SIZE, AO # aoffset += 8 2045 addq $8 * SIZE, BO # boffset1 += 8 2046 decq %rax 2047 jg .L66 2048 ALIGN_4 2049 2050.L68: /* unless TRMMKERNEL, load 8 elements from each of CO1 and CO2; scale the sums by ALPHA (xmm15), add, and store 8 elements back to each */ 2051#ifndef TRMMKERNEL 2052 movsd 0 * SIZE(CO1), %xmm8 2053 movhps 2 * SIZE(CO1), %xmm8 2054 movsd 4 * SIZE(CO1), %xmm9 2055 movhps 6 * SIZE(CO1), %xmm9 2056 2057 movsd 0 * SIZE(CO2), %xmm10 2058 movhps 2 * SIZE(CO2), %xmm10 2059 movsd 4 * SIZE(CO2), %xmm11 2060 movhps 6 * SIZE(CO2), %xmm11 2061#endif 2062 2063 mulps %xmm15, %xmm0 2064 mulps %xmm15, %xmm4 2065 mulps %xmm15, %xmm1 2066 mulps %xmm15, %xmm5 2067 2068#ifndef TRMMKERNEL 2069 addps %xmm8, %xmm0 2070 addps %xmm9, %xmm4 2071 addps %xmm10, %xmm1 2072 addps %xmm11, %xmm5 2073#endif 2074 2075 movlps %xmm0, 0 * SIZE(CO1) 2076 movhps %xmm0, 2 * SIZE(CO1) 2077 movlps %xmm4, 4 * SIZE(CO1) 2078 movhps %xmm4, 6 * SIZE(CO1) 2079 2080 movlps %xmm1, 0 * SIZE(CO2) 2081 movhps %xmm1, 2 * SIZE(CO2) 2082 movlps %xmm5, 4 * SIZE(CO2) 2083 movhps %xmm5, 6 * SIZE(CO2) 2084 2085#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2086 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2087 movq K, %rax 2088 subq KKK, %rax 2089 leaq (,%rax, 8), %rax 2090 leaq (AO, %rax, 4), AO 2091 
leaq (BO, %rax, 4), BO 2092#endif 2093 2094#if defined(TRMMKERNEL) && defined(LEFT) 2095 addq $8, KK 2096#endif 2097 2098 addq $8 * SIZE, CO1 # coffset += 8 2099 addq $8 * SIZE, CO2 # coffset += 8 2100 decq I # i -- 2101 jg .L61 2102 ALIGN_4 2103 2104.L70: /* runs only when bit 2 of M is set; otherwise jump to .L80 */ 2105 testq $4, M 2106 je .L80 2107 2108 2109#if !defined(TRMMKERNEL) || \ 2110 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2111 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2112 2113 leaq BUFFER, BO 2114#else 2115 leaq BUFFER, BO 2116 movq KK, %rax 2117 leaq (, %rax, 8), %rax 2118 leaq (AO, %rax, 2), AO 2119 leaq (BO, %rax, 4), BO 2120#endif 2121 2122 movaps -32 * SIZE(AO), %xmm8 2123 movaps -16 * SIZE(AO), %xmm10 2124 2125 movaps 0 * SIZE(BO), %xmm9 2126 movaps 16 * SIZE(BO), %xmm11 2127 movaps 32 * SIZE(BO), %xmm13 2128 movaps 48 * SIZE(BO), %xmm15 2129 2130 xorps %xmm0, %xmm0 2131 xorps %xmm1, %xmm1 2132 xorps %xmm2, %xmm2 2133 xorps %xmm3, %xmm3 2134 2135#ifndef TRMMKERNEL 2136 movq K, %rax 2137#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2138 movq K, %rax 2139 subq KK, %rax 2140 movq %rax, KKK 2141#else 2142 movq KK, %rax 2143#ifdef LEFT 2144 addq $4, %rax 2145#else 2146 addq $2, %rax 2147#endif 2148 movq %rax, KKK 2149#endif 2150 sarq $3, %rax 2151 je .L75 2152 ALIGN_4 2153 2154.L72: /* unrolled packed multiply-accumulate into xmm0..xmm3 */ 2155 mulps %xmm8, %xmm9 2156#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2157 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2158#endif 2159 2160 mulps 4 * SIZE(BO), %xmm8 2161 addps %xmm9, %xmm0 2162 movaps 8 * SIZE(BO), %xmm9 2163 addps %xmm8, %xmm1 2164 movaps -28 * SIZE(AO), %xmm8 2165 2166 mulps %xmm8, %xmm9 2167 mulps 12 * SIZE(BO), %xmm8 2168 addps %xmm9, %xmm2 2169 movaps 64 * SIZE(BO), %xmm9 2170 addps %xmm8, %xmm3 2171 movaps -24 * SIZE(AO), %xmm8 2172 2173 mulps %xmm8, %xmm11 2174 mulps 20 * SIZE(BO), %xmm8 2175 addps %xmm11, %xmm0 2176 movaps 24 * SIZE(BO), %xmm11 2177 addps %xmm8, %xmm1 2178 movaps -20 * SIZE(AO), %xmm8 2179 2180 mulps 
%xmm8, %xmm11 2181 mulps 28 * SIZE(BO), %xmm8 2182 addps %xmm11, %xmm2 2183 movaps 80 * SIZE(BO), %xmm11 2184 addps %xmm8, %xmm3 2185 movaps 0 * SIZE(AO), %xmm8 2186 2187 mulps %xmm10, %xmm13 2188 mulps 36 * SIZE(BO), %xmm10 2189 addps %xmm13, %xmm0 2190 movaps 40 * SIZE(BO), %xmm13 2191 addps %xmm10, %xmm1 2192 movaps -12 * SIZE(AO), %xmm10 2193 2194 mulps %xmm10, %xmm13 2195 mulps 44 * SIZE(BO), %xmm10 2196 addps %xmm13, %xmm2 2197 movaps 96 * SIZE(BO), %xmm13 2198 addps %xmm10, %xmm3 2199 movaps -8 * SIZE(AO), %xmm10 2200 2201 mulps %xmm10, %xmm15 2202 mulps 52 * SIZE(BO), %xmm10 2203 addps %xmm15, %xmm0 2204 movaps 56 * SIZE(BO), %xmm15 2205 addps %xmm10, %xmm1 2206 movaps -4 * SIZE(AO), %xmm10 2207 2208 mulps %xmm10, %xmm15 2209 mulps 60 * SIZE(BO), %xmm10 2210 addps %xmm15, %xmm2 2211 movaps 112 * SIZE(BO), %xmm15 2212 addps %xmm10, %xmm3 2213 movaps 16 * SIZE(AO), %xmm10 2214 2215 addq $32 * SIZE, AO 2216 addq $64 * SIZE, BO 2217 decq %rax 2218 jne .L72 2219 ALIGN_4 2220 2221.L75: /* reload the trip count; only the steps not covered by the 8-step unrolled loop above remain */ 2222#ifndef TRMMKERNEL 2223 movq K, %rax 2224#else 2225 movq KKK, %rax 2226#endif 2227 movaps ALPHA, %xmm15 2228 andq $7, %rax # leftover iterations = count & 7 2229 BRANCH 2230 je .L78 2231 ALIGN_4 2232 2233.L76: /* one step per pass: AO advances by 4 elements, BO by 8 */ 2234 mulps %xmm8, %xmm9 2235 mulps 4 * SIZE(BO), %xmm8 2236 addps %xmm9, %xmm0 2237 movaps 8 * SIZE(BO), %xmm9 2238 addps %xmm8, %xmm1 2239 movaps -28 * SIZE(AO), %xmm8 2240 2241 addq $4 * SIZE, AO # aoffset += 4 2242 addq $8 * SIZE, BO # boffset1 += 8 2243 decq %rax 2244 jg .L76 2245 ALIGN_4 2246 2247.L78: /* fold xmm2/xmm3 into xmm0/xmm1, scale by ALPHA (xmm15), add the current C values unless TRMMKERNEL, store 4 elements to each of CO1 and CO2 */ 2248#ifndef TRMMKERNEL 2249 movsd 0 * SIZE(CO1), %xmm8 2250 movhps 2 * SIZE(CO1), %xmm8 2251 movsd 0 * SIZE(CO2), %xmm10 2252 movhps 2 * SIZE(CO2), %xmm10 2253#endif 2254 2255 addps %xmm2, %xmm0 2256 addps %xmm3, %xmm1 2257 2258 mulps %xmm15, %xmm0 2259 mulps %xmm15, %xmm1 2260 2261#ifndef TRMMKERNEL 2262 addps %xmm8, %xmm0 2263 addps %xmm10, %xmm1 2264#endif 2265 2266 movlps %xmm0, 0 * SIZE(CO1) 2267 movhps %xmm0, 2 * SIZE(CO1) 2268 movlps %xmm1, 0 * SIZE(CO2) 2269 movhps %xmm1, 2 * 
SIZE(CO2) 2270 2271#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2272 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2273 movq K, %rax 2274 subq KKK, %rax 2275 leaq (,%rax, 8), %rax 2276 leaq (AO, %rax, 2), AO 2277 leaq (BO, %rax, 4), BO 2278#endif 2279 2280#if defined(TRMMKERNEL) && defined(LEFT) 2281 addq $4, KK 2282#endif 2283 2284 addq $4 * SIZE, CO1 # coffset += 4 2285 addq $4 * SIZE, CO2 # coffset += 4 2286 ALIGN_4 2287 2288.L80: 2289 testq $2, M 2290 je .L90 2291 2292#if !defined(TRMMKERNEL) || \ 2293 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2294 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2295 2296 leaq BUFFER, BO 2297#else 2298 leaq BUFFER, BO 2299 movq KK, %rax 2300 leaq (, %rax, 8), %rax 2301 leaq (AO, %rax, 1), AO 2302 leaq (BO, %rax, 4), BO 2303#endif 2304 2305 movaps -32 * SIZE(AO), %xmm8 2306 movaps -24 * SIZE(AO), %xmm10 2307 2308 movaps 0 * SIZE(BO), %xmm9 2309 movaps 16 * SIZE(BO), %xmm11 2310 movaps 32 * SIZE(BO), %xmm13 2311 movaps 48 * SIZE(BO), %xmm15 2312 2313 xorps %xmm0, %xmm0 2314 xorps %xmm1, %xmm1 2315 xorps %xmm2, %xmm2 2316 xorps %xmm3, %xmm3 2317 2318#ifndef TRMMKERNEL 2319 movq K, %rax 2320#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2321 movq K, %rax 2322 subq KK, %rax 2323 movq %rax, KKK 2324#else 2325 movq KK, %rax 2326#ifdef LEFT 2327 addq $2, %rax 2328#else 2329 addq $2, %rax 2330#endif 2331 movq %rax, KKK 2332#endif 2333 sarq $3, %rax 2334 je .L85 2335 ALIGN_4 2336 2337.L82: 2338 mulps %xmm8, %xmm9 2339 addps %xmm9, %xmm0 2340#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2341 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2342#endif 2343 movaps 4 * SIZE(BO), %xmm9 2344 mulps %xmm8, %xmm9 2345 movsd -30 * SIZE(AO), %xmm8 2346 addps %xmm9, %xmm1 2347 movaps 8 * SIZE(BO), %xmm9 2348 2349 mulps %xmm8, %xmm9 2350 addps %xmm9, %xmm2 2351 movaps 12 * SIZE(BO), %xmm9 2352 mulps %xmm8, %xmm9 2353 movsd -28 * SIZE(AO), 
%xmm8 2354 addps %xmm9, %xmm3 2355 movaps 64 * SIZE(BO), %xmm9 2356 2357 mulps %xmm8, %xmm11 2358 addps %xmm11, %xmm0 2359 movaps 20 * SIZE(BO), %xmm11 2360 mulps %xmm8, %xmm11 2361 movsd -26 * SIZE(AO), %xmm8 2362 addps %xmm11, %xmm1 2363 movaps 24 * SIZE(BO), %xmm11 2364 2365 mulps %xmm8, %xmm11 2366 addps %xmm11, %xmm2 2367 movaps 28 * SIZE(BO), %xmm11 2368 mulps %xmm8, %xmm11 2369 movsd -16 * SIZE(AO), %xmm8 2370 addps %xmm11, %xmm3 2371 movaps 80 * SIZE(BO), %xmm11 2372 2373 mulps %xmm10, %xmm13 2374 addps %xmm13, %xmm0 2375 movaps 36 * SIZE(BO), %xmm13 2376 mulps %xmm10, %xmm13 2377 movsd -22 * SIZE(AO), %xmm10 2378 addps %xmm13, %xmm1 2379 movaps 40 * SIZE(BO), %xmm13 2380 2381 mulps %xmm10, %xmm13 2382 addps %xmm13, %xmm2 2383 movaps 44 * SIZE(BO), %xmm13 2384 mulps %xmm10, %xmm13 2385 movsd -20 * SIZE(AO), %xmm10 2386 addps %xmm13, %xmm3 2387 movaps 96 * SIZE(BO), %xmm13 2388 2389 mulps %xmm10, %xmm15 2390 addps %xmm15, %xmm0 2391 movaps 52 * SIZE(BO), %xmm15 2392 mulps %xmm10, %xmm15 2393 movsd -18 * SIZE(AO), %xmm10 2394 addps %xmm15, %xmm1 2395 movaps 56 * SIZE(BO), %xmm15 2396 2397 mulps %xmm10, %xmm15 2398 addps %xmm15, %xmm2 2399 movaps 60 * SIZE(BO), %xmm15 2400 mulps %xmm10, %xmm15 2401 movsd -8 * SIZE(AO), %xmm10 2402 addps %xmm15, %xmm3 2403 movaps 112 * SIZE(BO), %xmm15 2404 2405 addq $16 * SIZE, AO 2406 addq $64 * SIZE, BO 2407 decq %rax 2408 jne .L82 2409 ALIGN_4 2410 2411.L85: /* reload the trip count; only the steps not covered by the 8-step unrolled loop above remain */ 2412#ifndef TRMMKERNEL 2413 movq K, %rax 2414#else 2415 movq KKK, %rax 2416#endif 2417 movaps ALPHA, %xmm15 2418 andq $7, %rax # leftover iterations = count & 7 2419 BRANCH 2420 je .L88 2421 ALIGN_4 2422 2423.L86: /* one step per pass: AO advances by 2 elements, BO by 8 */ 2424 mulps %xmm8, %xmm9 2425 addps %xmm9, %xmm0 2426 movaps 4 * SIZE(BO), %xmm9 2427 mulps %xmm8, %xmm9 2428 movsd -30 * SIZE(AO), %xmm8 2429 addps %xmm9, %xmm1 2430 movaps 8 * SIZE(BO), %xmm9 2431 2432 addq $2 * SIZE, AO # aoffset += 2 2433 addq $8 * SIZE, BO # boffset1 += 8 2434 decq %rax 2435 jg .L86 2436 ALIGN_4 2437 2438.L88: 2439#ifndef TRMMKERNEL 2440#ifdef movsd 2441 xorps 
%xmm8, %xmm8 2442#endif 2443 movsd 0 * SIZE(CO1), %xmm8 2444#ifdef movsd 2445 xorps %xmm10, %xmm10 2446#endif 2447 movsd 0 * SIZE(CO2), %xmm10 2448#endif 2449 2450 addps %xmm2, %xmm0 2451 addps %xmm3, %xmm1 2452 2453 mulps %xmm15, %xmm0 2454 mulps %xmm15, %xmm1 2455 2456#ifndef TRMMKERNEL 2457 addps %xmm8, %xmm0 2458 addps %xmm10, %xmm1 2459#endif 2460 2461 movlps %xmm0, 0 * SIZE(CO1) 2462 movlps %xmm1, 0 * SIZE(CO2) 2463 2464#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2465 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2466 movq K, %rax 2467 subq KKK, %rax 2468 leaq (,%rax, 8), %rax 2469 leaq (AO, %rax, 1), AO 2470 leaq (BO, %rax, 4), BO 2471#endif 2472 2473#if defined(TRMMKERNEL) && defined(LEFT) 2474 addq $2, KK 2475#endif 2476 2477 addq $2 * SIZE, CO1 # coffset += 2 2478 addq $2 * SIZE, CO2 # coffset += 2 2479 ALIGN_4 2480 2481.L90: /* runs only when bit 0 of M is set; otherwise jump to .L99 */ 2482 testq $1, M 2483 je .L99 2484 2485#if !defined(TRMMKERNEL) || \ 2486 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2487 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2488 2489 leaq BUFFER, BO 2490#else 2491 leaq BUFFER, BO 2492 movq KK, %rax 2493 leaq (, %rax, 4), %rax 2494 leaq (AO, %rax, 1), AO 2495 leaq (BO, %rax, 8), BO 2496#endif 2497 2498 movss -32 * SIZE(AO), %xmm8 2499 movss -28 * SIZE(AO), %xmm10 2500 2501 movss 0 * SIZE(BO), %xmm9 2502 movss 16 * SIZE(BO), %xmm11 2503 movss 32 * SIZE(BO), %xmm13 2504 movss 48 * SIZE(BO), %xmm15 2505 2506 xorps %xmm0, %xmm0 2507 xorps %xmm1, %xmm1 2508 xorps %xmm2, %xmm2 2509 xorps %xmm3, %xmm3 2510 2511#ifndef TRMMKERNEL 2512 movq K, %rax 2513#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2514 movq K, %rax 2515 subq KK, %rax 2516 movq %rax, KKK 2517#else 2518 movq KK, %rax 2519#ifdef LEFT 2520 addq $1, %rax 2521#else 2522 addq $2, %rax 2523#endif 2524 movq %rax, KKK 2525#endif 2526 sarq $3, %rax 2527 je .L95 2528 ALIGN_4 2529 2530.L92: /* unrolled multiply-accumulate into xmm0..xmm3 on movss-loaded operands */ 2531 mulps %xmm8, %xmm9 2532 addps %xmm9, %xmm0 
2533#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2534 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2535#endif 2536 movss 4 * SIZE(BO), %xmm9 2537 mulps %xmm8, %xmm9 2538 movss -31 * SIZE(AO), %xmm8 2539 addps %xmm9, %xmm1 2540 movss 8 * SIZE(BO), %xmm9 2541 2542 mulps %xmm8, %xmm9 2543 addps %xmm9, %xmm2 2544 movss 12 * SIZE(BO), %xmm9 2545 mulps %xmm8, %xmm9 2546 movss -30 * SIZE(AO), %xmm8 2547 addps %xmm9, %xmm3 2548 movss 64 * SIZE(BO), %xmm9 2549 2550 mulps %xmm8, %xmm11 2551 addps %xmm11, %xmm0 2552 movss 20 * SIZE(BO), %xmm11 2553 mulps %xmm8, %xmm11 2554 movss -29 * SIZE(AO), %xmm8 2555 addps %xmm11, %xmm1 2556 movss 24 * SIZE(BO), %xmm11 2557 2558 mulps %xmm8, %xmm11 2559 addps %xmm11, %xmm2 2560 movss 28 * SIZE(BO), %xmm11 2561 mulps %xmm8, %xmm11 2562 movss -24 * SIZE(AO), %xmm8 2563 addps %xmm11, %xmm3 2564 movss 80 * SIZE(BO), %xmm11 2565 2566 mulps %xmm10, %xmm13 2567 addps %xmm13, %xmm0 2568 movss 36 * SIZE(BO), %xmm13 2569 mulps %xmm10, %xmm13 2570 movss -27 * SIZE(AO), %xmm10 2571 addps %xmm13, %xmm1 2572 movss 40 * SIZE(BO), %xmm13 2573 2574 mulps %xmm10, %xmm13 2575 addps %xmm13, %xmm2 2576 movss 44 * SIZE(BO), %xmm13 2577 mulps %xmm10, %xmm13 2578 movss -26 * SIZE(AO), %xmm10 2579 addps %xmm13, %xmm3 2580 movss 96 * SIZE(BO), %xmm13 2581 2582 mulps %xmm10, %xmm15 2583 addps %xmm15, %xmm0 2584 movss 52 * SIZE(BO), %xmm15 2585 mulps %xmm10, %xmm15 2586 movss -25 * SIZE(AO), %xmm10 2587 addps %xmm15, %xmm1 2588 movss 56 * SIZE(BO), %xmm15 2589 2590 mulps %xmm10, %xmm15 2591 addps %xmm15, %xmm2 2592 movss 60 * SIZE(BO), %xmm15 2593 mulps %xmm10, %xmm15 2594 movss -20 * SIZE(AO), %xmm10 2595 addps %xmm15, %xmm3 2596 movss 112 * SIZE(BO), %xmm15 2597 2598 addq $ 8 * SIZE, AO 2599 addq $64 * SIZE, BO 2600 decq %rax 2601 jne .L92 2602 ALIGN_4 2603 2604.L95: /* reload the trip count and keep only its low three bits for the single-step loop that follows; ALPHA is loaded into xmm15 here */ 2605#ifndef TRMMKERNEL 2606 movq K, %rax 2607#else 2608 movq KKK, %rax 2609#endif 2610 movaps ALPHA, %xmm15 2611 andq $7, %rax # leftover iterations = count & 7 2612 BRANCH 2613 je .L98 2614 ALIGN_4 2615 
2616.L96: /* one step per pass: AO advances by 1 element, BO by 8 */ 2617 mulps %xmm8, %xmm9 2618 addps %xmm9, %xmm0 2619 movss 4 * SIZE(BO), %xmm9 2620 mulps %xmm8, %xmm9 2621 movss -31 * SIZE(AO), %xmm8 2622 addps %xmm9, %xmm1 2623 movss 8 * SIZE(BO), %xmm9 2624 2625 addq $1 * SIZE, AO # aoffset += 1 2626 addq $8 * SIZE, BO # boffset1 += 8 2627 decq %rax 2628 jg .L96 2629 ALIGN_4 2630 2631.L98: /* fold xmm2/xmm3 into xmm0/xmm1, scale by xmm15, add the current C values unless TRMMKERNEL, store one element to each of CO1 and CO2 */ 2632#ifndef TRMMKERNEL 2633 movss 0 * SIZE(CO1), %xmm8 2634 movss 0 * SIZE(CO2), %xmm10 2635#endif 2636 2637 addss %xmm2, %xmm0 2638 addss %xmm3, %xmm1 2639 mulss %xmm15, %xmm0 2640 mulss %xmm15, %xmm1 2641 2642#ifndef TRMMKERNEL 2643 addss %xmm8, %xmm0 2644 addss %xmm10, %xmm1 2645#endif 2646 2647 movss %xmm0, 0 * SIZE(CO1) 2648 movss %xmm1, 0 * SIZE(CO2) 2649 2650#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2651 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2652 movq K, %rax 2653 subq KKK, %rax 2654 leaq (,%rax, 4), %rax 2655 leaq (AO, %rax, 1), AO 2656 leaq (BO, %rax, 8), BO 2657#endif 2658 2659#if defined(TRMMKERNEL) && defined(LEFT) 2660 addq $1, KK 2661#endif 2662 ALIGN_4 2663 2664.L99: /* end of the two-column pass: advance KK (right-side TRMM only) and C */ 2665#if defined(TRMMKERNEL) && !defined(LEFT) 2666 addq $2, KK /* KK is an 8-byte stack slot read and written with movq/addq elsewhere; a 32-bit addl would lose the carry into its upper half */ 2667#endif 2668 leaq (C, LDC, 2), C # c += 2 * ldc 2669 ALIGN_4 2670 2671 2672.L100: /* the one-column pass runs only when bit 0 of N is set; otherwise jump to .L999 */ 2673 testq $1, N 2674 je .L999 2675 2676.L101: 2677#if defined(TRMMKERNEL) && defined(LEFT) 2678 movq OFFSET, %rax 2679 movq %rax, KK 2680#endif 2681 2682/* Copying to Sub Buffer */ 2683 leaq BUFFER, BO 2684 2685 movq K, %rax 2686 sarq $3, %rax 2687 jle .L103 2688 ALIGN_4 2689 2690 2691.L102: /* each pass reads 8 elements from B and replicates them into the sub buffer */ 2692#if defined(PENTIUM4) || defined(GENERIC) 2693 movss 0 * SIZE(B), %xmm0 2694 movss 1 * SIZE(B), %xmm1 2695 movss 2 * SIZE(B), %xmm2 2696 movss 3 * SIZE(B), %xmm3 2697 movss 4 * SIZE(B), %xmm4 2698 movss 5 * SIZE(B), %xmm5 2699 movss 6 * SIZE(B), %xmm6 2700 movss 7 * SIZE(B), %xmm7 2701 2702 PREFETCH 32 * SIZE(B) 2703 2704 shufps $0, %xmm0, %xmm0 2705 shufps $0, %xmm1, %xmm1 2706 shufps $0, %xmm2, %xmm2 2707 shufps $0, %xmm3, %xmm3 2708 shufps $0, %xmm4, %xmm4 2709 
shufps $0, %xmm5, %xmm5 2710 shufps $0, %xmm6, %xmm6 2711 shufps $0, %xmm7, %xmm7 2712 2713 movaps %xmm0, 0 * SIZE(BO) 2714 movaps %xmm1, 4 * SIZE(BO) 2715 movaps %xmm2, 8 * SIZE(BO) 2716 movaps %xmm3, 12 * SIZE(BO) 2717 movaps %xmm4, 16 * SIZE(BO) 2718 movaps %xmm5, 20 * SIZE(BO) 2719 movaps %xmm6, 24 * SIZE(BO) 2720 movaps %xmm7, 28 * SIZE(BO) 2721 2722 addq $ 8 * SIZE, B 2723 addq $32 * SIZE, BO 2724#endif 2725 2726#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2727 PREFETCH 32 * SIZE(B) 2728 2729 movd 0 * SIZE(B), %mm0 2730 movd 1 * SIZE(B), %mm1 2731 movd 2 * SIZE(B), %mm2 2732 movd 3 * SIZE(B), %mm3 2733 movd 4 * SIZE(B), %mm4 2734 movd 5 * SIZE(B), %mm5 2735 movd 6 * SIZE(B), %mm6 2736 movd 7 * SIZE(B), %mm7 2737 2738 punpckldq %mm0, %mm0 2739 punpckldq %mm1, %mm1 2740 punpckldq %mm2, %mm2 2741 punpckldq %mm3, %mm3 2742 punpckldq %mm4, %mm4 2743 punpckldq %mm5, %mm5 2744 punpckldq %mm6, %mm6 2745 punpckldq %mm7, %mm7 2746 2747 movq %mm0, 0 * SIZE(BO) 2748 movq %mm0, 2 * SIZE(BO) 2749 movq %mm1, 4 * SIZE(BO) 2750 movq %mm1, 6 * SIZE(BO) 2751 movq %mm2, 8 * SIZE(BO) 2752 movq %mm2, 10 * SIZE(BO) 2753 movq %mm3, 12 * SIZE(BO) 2754 movq %mm3, 14 * SIZE(BO) 2755 movq %mm4, 16 * SIZE(BO) 2756 movq %mm4, 18 * SIZE(BO) 2757 movq %mm5, 20 * SIZE(BO) 2758 movq %mm5, 22 * SIZE(BO) 2759 movq %mm6, 24 * SIZE(BO) 2760 movq %mm6, 26 * SIZE(BO) 2761 movq %mm7, 28 * SIZE(BO) 2762 movq %mm7, 30 * SIZE(BO) 2763 2764 addq $ 8 * SIZE, B 2765 addq $32 * SIZE, BO 2766#endif 2767 2768 decq %rax 2769 jne .L102 2770 ALIGN_4 2771 2772.L103: 2773 movq K, %rax 2774 andq $7, %rax 2775 BRANCH 2776 jle .L110 2777 ALIGN_4 2778 2779.L104: 2780#if defined(PENTIUM4) || defined(GENERIC) 2781 movss 0 * SIZE(B), %xmm0 2782 shufps $0, %xmm0, %xmm0 2783 movaps %xmm0, 0 * SIZE(BO) 2784#endif 2785 2786#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2787 movd 0 * SIZE(B), %mm0 2788 punpckldq %mm0, %mm0 2789 movq %mm0, 0 * SIZE(BO) 2790 movq %mm0, 2 * SIZE(BO) 
2791#endif 2792 2793 addq $ 1 * SIZE, B 2794 addq $ 4 * SIZE, BO 2795 decq %rax 2796 jne .L104 2797 ALIGN_4 2798 2799.L110: 2800 movq C, CO1 # coffset1 = c 2801 movq A, AO # aoffset = a 2802 2803 movq M, I 2804 sarq $3, I # i = (m >> 3) 2805 jle .L120 2806 ALIGN_4 2807 2808.L111: 2809#if !defined(TRMMKERNEL) || \ 2810 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2811 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2812 2813 leaq BUFFER, BO 2814#else 2815 leaq BUFFER, BO 2816 movq KK, %rax 2817 leaq (, %rax, 8), %rax 2818 leaq (AO, %rax, 4), AO 2819 leaq (BO, %rax, 2), BO 2820#endif 2821 2822 movaps -32 * SIZE(AO), %xmm8 2823 movaps -16 * SIZE(AO), %xmm10 2824 movaps 0 * SIZE(AO), %xmm12 2825 movaps 16 * SIZE(AO), %xmm14 2826 2827 movaps 0 * SIZE(BO), %xmm9 2828 movaps 16 * SIZE(BO), %xmm11 2829 movaps 32 * SIZE(BO), %xmm13 2830 movaps 48 * SIZE(BO), %xmm15 2831 2832 xorps %xmm0, %xmm0 2833 xorps %xmm1, %xmm1 2834 2835 PREFETCHW 7 * SIZE(CO1) 2836 xorps %xmm4, %xmm4 2837 xorps %xmm5, %xmm5 2838 2839#ifndef TRMMKERNEL 2840 movq K, %rax 2841#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2842 movq K, %rax 2843 subq KK, %rax 2844 movq %rax, KKK 2845#else 2846 movq KK, %rax 2847#ifdef LEFT 2848 addq $8, %rax 2849#else 2850 addq $1, %rax 2851#endif 2852 movq %rax, KKK 2853#endif 2854 sarq $3, %rax 2855 je .L115 2856 ALIGN_4 2857 2858.L112: 2859 mulps %xmm9, %xmm8 2860#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2861 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2862#endif 2863 2864 mulps -28 * SIZE(AO), %xmm9 2865 addps %xmm8, %xmm0 2866 movaps -24 * SIZE(AO), %xmm8 2867 addps %xmm9, %xmm4 2868 movaps 4 * SIZE(BO), %xmm9 2869 2870 mulps %xmm9, %xmm8 2871 mulps -20 * SIZE(AO), %xmm9 2872 addps %xmm8, %xmm0 2873 movaps 32 * SIZE(AO), %xmm8 2874 addps %xmm9, %xmm4 2875 movaps 8 * SIZE(BO), %xmm9 2876 2877#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2878 PREFETCH (PREFETCHSIZE + 16) 
* SIZE(AO) 2879#endif 2880 mulps %xmm9, %xmm10 2881 mulps -12 * SIZE(AO), %xmm9 2882 addps %xmm10, %xmm0 2883 movaps -8 * SIZE(AO), %xmm10 2884 addps %xmm9, %xmm4 2885 movaps 12 * SIZE(BO), %xmm9 2886 2887 mulps %xmm9, %xmm10 2888 mulps -4 * SIZE(AO), %xmm9 2889 addps %xmm10, %xmm0 2890 movaps 48 * SIZE(AO), %xmm10 2891 addps %xmm9, %xmm4 2892 movaps 32 * SIZE(BO), %xmm9 2893 2894#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2895 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) 2896#endif 2897 mulps %xmm11, %xmm12 2898 mulps 4 * SIZE(AO), %xmm11 2899 addps %xmm12, %xmm0 2900 movaps 8 * SIZE(AO), %xmm12 2901 addps %xmm11, %xmm4 2902 movaps 20 * SIZE(BO), %xmm11 2903 2904 mulps %xmm11, %xmm12 2905 mulps 12 * SIZE(AO), %xmm11 2906 addps %xmm12, %xmm0 2907 movaps 64 * SIZE(AO), %xmm12 2908 addps %xmm11, %xmm4 2909 movaps 24 * SIZE(BO), %xmm11 2910 2911#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2912 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) 2913#endif 2914 mulps %xmm11, %xmm14 2915 mulps 20 * SIZE(AO), %xmm11 2916 addps %xmm14, %xmm0 2917 movaps 24 * SIZE(AO), %xmm14 2918 addps %xmm11, %xmm4 2919 movaps 28 * SIZE(BO), %xmm11 2920 2921 mulps %xmm11, %xmm14 2922 mulps 28 * SIZE(AO), %xmm11 2923 addps %xmm14, %xmm0 2924 movaps 80 * SIZE(AO), %xmm14 2925 addps %xmm11, %xmm4 2926 movaps 48 * SIZE(BO), %xmm11 2927 2928 addq $64 * SIZE, AO 2929 addq $32 * SIZE, BO 2930 decq %rax 2931 jne .L112 2932 ALIGN_4 2933 2934.L115: 2935#ifndef TRMMKERNEL 2936 movq K, %rax 2937#else 2938 movq KKK, %rax 2939#endif 2940 movaps ALPHA, %xmm15 2941 andq $7, %rax # if (k & 1) 2942 BRANCH 2943 je .L118 2944 ALIGN_4 2945 2946.L116: 2947 mulps %xmm9, %xmm8 2948 mulps -28 * SIZE(AO), %xmm9 2949 addps %xmm8, %xmm0 2950 movaps -24 * SIZE(AO), %xmm8 2951 addps %xmm9, %xmm4 2952 movaps 4 * SIZE(BO), %xmm9 2953 2954 addq $8 * SIZE, AO # aoffset += 4 2955 addq $4 * SIZE, BO # boffset1 += 8 2956 decq %rax 2957 jg .L116 2958 ALIGN_4 2959 2960.L118: 2961#ifndef TRMMKERNEL 2962 
movsd 0 * SIZE(CO1), %xmm8 2963 movhps 2 * SIZE(CO1), %xmm8 2964 movsd 4 * SIZE(CO1), %xmm9 2965 movhps 6 * SIZE(CO1), %xmm9 2966#endif 2967 2968 mulps %xmm15, %xmm0 2969 mulps %xmm15, %xmm4 2970#ifndef TRMMKERNEL 2971 addps %xmm8, %xmm0 2972 addps %xmm9, %xmm4 2973#endif 2974 2975 movlps %xmm0, 0 * SIZE(CO1) 2976 movhps %xmm0, 2 * SIZE(CO1) 2977 movlps %xmm4, 4 * SIZE(CO1) 2978 movhps %xmm4, 6 * SIZE(CO1) 2979 2980#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2981 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2982 movq K, %rax 2983 subq KKK, %rax 2984 leaq (,%rax, 8), %rax 2985 leaq (AO, %rax, 4), AO 2986 leaq (BO, %rax, 2), BO 2987#endif 2988 2989#if defined(TRMMKERNEL) && defined(LEFT) 2990 addq $8, KK 2991#endif 2992 2993 addq $8 * SIZE, CO1 # coffset += 4 2994 decq I # i -- 2995 jg .L111 2996 ALIGN_4 2997 2998.L120: 2999 testq $4, M 3000 je .L130 3001 3002#if !defined(TRMMKERNEL) || \ 3003 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3004 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3005 3006 leaq BUFFER, BO 3007#else 3008 leaq BUFFER, BO 3009 movq KK, %rax 3010 leaq (, %rax, 8), %rax 3011 leaq (AO, %rax, 2), AO 3012 leaq (BO, %rax, 2), BO 3013#endif 3014 3015 movaps -32 * SIZE(AO), %xmm8 3016 movaps -16 * SIZE(AO), %xmm10 3017 3018 movaps 0 * SIZE(BO), %xmm9 3019 movaps 16 * SIZE(BO), %xmm11 3020 3021 xorps %xmm0, %xmm0 3022 xorps %xmm1, %xmm1 3023 xorps %xmm2, %xmm2 3024 xorps %xmm3, %xmm3 3025 3026#ifndef TRMMKERNEL 3027 movq K, %rax 3028#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3029 movq K, %rax 3030 subq KK, %rax 3031 movq %rax, KKK 3032#else 3033 movq KK, %rax 3034#ifdef LEFT 3035 addq $4, %rax 3036#else 3037 addq $1, %rax 3038#endif 3039 movq %rax, KKK 3040#endif 3041 sarq $3, %rax 3042 je .L125 3043 ALIGN_4 3044 3045.L122: 3046 mulps %xmm8, %xmm9 3047#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3048 PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) 3049#endif 3050 movaps -28 * SIZE(AO), %xmm8 3051 mulps 4 * SIZE(BO), %xmm8 3052 addps %xmm9, %xmm0 3053 movaps 32 * SIZE(BO), %xmm9 3054 addps %xmm8, %xmm1 3055 movaps -24 * SIZE(AO), %xmm8 3056 mulps 8 * SIZE(BO), %xmm8 3057 addps %xmm8, %xmm2 3058 movaps -20 * SIZE(AO), %xmm8 3059 mulps 12 * SIZE(BO), %xmm8 3060 addps %xmm8, %xmm3 3061 movaps 0 * SIZE(AO), %xmm8 3062 3063#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3064 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 3065#endif 3066 mulps %xmm10, %xmm11 3067 movaps -12 * SIZE(AO), %xmm10 3068 mulps 20 * SIZE(BO), %xmm10 3069 addps %xmm11, %xmm0 3070 movaps 48 * SIZE(BO), %xmm11 3071 addps %xmm10, %xmm1 3072 movaps -8 * SIZE(AO), %xmm10 3073 mulps 24 * SIZE(BO), %xmm10 3074 addps %xmm10, %xmm2 3075 movaps -4 * SIZE(AO), %xmm10 3076 mulps 28 * SIZE(BO), %xmm10 3077 addps %xmm10, %xmm3 3078 movaps 16 * SIZE(AO), %xmm10 3079 3080 addq $32 * SIZE, AO 3081 addq $32 * SIZE, BO 3082 decq %rax 3083 jne .L122 3084 ALIGN_4 3085 3086.L125: 3087#ifndef TRMMKERNEL 3088 movq K, %rax 3089#else 3090 movq KKK, %rax 3091#endif 3092 movaps ALPHA, %xmm15 3093 andq $7, %rax # if (k & 1) 3094 BRANCH 3095 je .L128 3096 ALIGN_4 3097 3098.L126: 3099 mulps %xmm8, %xmm9 3100 movaps -28 * SIZE(AO), %xmm8 3101 addps %xmm9, %xmm0 3102 movaps 4 * SIZE(BO), %xmm9 3103 3104 addq $4 * SIZE, AO # aoffset += 4 3105 addq $4 * SIZE, BO # boffset1 += 8 3106 decq %rax 3107 jg .L126 3108 ALIGN_4 3109 3110.L128: 3111#ifndef TRMMKERNEL 3112 movsd 0 * SIZE(CO1), %xmm8 3113 movhps 2 * SIZE(CO1), %xmm8 3114#endif 3115 3116 addps %xmm1, %xmm0 3117 addps %xmm3, %xmm2 3118 addps %xmm2, %xmm0 3119 3120 mulps %xmm15, %xmm0 3121#ifndef TRMMKERNEL 3122 addps %xmm8, %xmm0 3123#endif 3124 3125 movlps %xmm0, 0 * SIZE(CO1) 3126 movhps %xmm0, 2 * SIZE(CO1) 3127 3128#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3129 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3130 movq K, %rax 3131 subq KKK, %rax 3132 leaq (,%rax, 
8), %rax 3133 leaq (AO, %rax, 2), AO 3134 leaq (BO, %rax, 2), BO 3135#endif 3136 3137#if defined(TRMMKERNEL) && defined(LEFT) 3138 addq $4, KK 3139#endif 3140 3141 addq $4 * SIZE, CO1 # coffset += 4 3142 ALIGN_4 3143 3144.L130: 3145 testq $2, M 3146 je .L140 3147 3148#if !defined(TRMMKERNEL) || \ 3149 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3150 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3151 3152 leaq BUFFER, BO 3153#else 3154 leaq BUFFER, BO 3155 movq KK, %rax 3156 leaq (, %rax, 8), %rax 3157 leaq (AO, %rax, 1), AO 3158 leaq (BO, %rax, 2), BO 3159#endif 3160 3161 movaps -32 * SIZE(AO), %xmm8 3162 movaps -24 * SIZE(AO), %xmm10 3163 3164 movaps 0 * SIZE(BO), %xmm9 3165 movaps 16 * SIZE(BO), %xmm11 3166 3167 xorps %xmm0, %xmm0 3168 xorps %xmm1, %xmm1 3169 xorps %xmm2, %xmm2 3170 xorps %xmm3, %xmm3 3171 3172#ifndef TRMMKERNEL 3173 movq K, %rax 3174#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3175 movq K, %rax 3176 subq KK, %rax 3177 movq %rax, KKK 3178#else 3179 movq KK, %rax 3180#ifdef LEFT 3181 addq $2, %rax 3182#else 3183 addq $1, %rax 3184#endif 3185 movq %rax, KKK 3186#endif 3187 sarq $3, %rax 3188 je .L135 3189 ALIGN_4 3190 3191.L132: 3192 mulps %xmm8, %xmm9 3193#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3194 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3195#endif 3196 movsd -30 * SIZE(AO), %xmm8 3197 addps %xmm9, %xmm0 3198 movaps 4 * SIZE(BO), %xmm9 3199 mulps %xmm8, %xmm9 3200 movsd -28 * SIZE(AO), %xmm8 3201 addps %xmm9, %xmm1 3202 movaps 8 * SIZE(BO), %xmm9 3203 3204 mulps %xmm8, %xmm9 3205 movsd -26 * SIZE(AO), %xmm8 3206 addps %xmm9, %xmm0 3207 movaps 12 * SIZE(BO), %xmm9 3208 3209 mulps %xmm8, %xmm9 3210 movsd -16 * SIZE(AO), %xmm8 3211 addps %xmm9, %xmm1 3212 movaps 32 * SIZE(BO), %xmm9 3213 3214 mulps %xmm10, %xmm11 3215 movsd -22 * SIZE(AO), %xmm10 3216 addps %xmm11, %xmm0 3217 movaps 20 * SIZE(BO), %xmm11 3218 3219 mulps %xmm10, %xmm11 3220 movsd -20 * 
SIZE(AO), %xmm10 3221 addps %xmm11, %xmm1 3222 movaps 24 * SIZE(BO), %xmm11 3223 3224 mulps %xmm10, %xmm11 3225 movsd -18 * SIZE(AO), %xmm10 3226 addps %xmm11, %xmm0 3227 movaps 28 * SIZE(BO), %xmm11 3228 3229 mulps %xmm10, %xmm11 3230 movsd -8 * SIZE(AO), %xmm10 3231 addps %xmm11, %xmm1 3232 movaps 48 * SIZE(BO), %xmm11 3233 3234 addq $16 * SIZE, AO 3235 addq $32 * SIZE, BO 3236 decq %rax 3237 jne .L132 3238 ALIGN_4 3239 3240.L135: 3241#ifndef TRMMKERNEL 3242 movq K, %rax 3243#else 3244 movq KKK, %rax 3245#endif 3246 movaps ALPHA, %xmm15 3247 andq $7, %rax # if (k & 1) 3248 BRANCH 3249 je .L138 3250 ALIGN_4 3251 3252.L136: 3253 mulps %xmm8, %xmm9 3254 movsd -30 * SIZE(AO), %xmm8 3255 addps %xmm9, %xmm0 3256 movaps 4 * SIZE(BO), %xmm9 3257 3258 addq $2 * SIZE, AO # aoffset += 4 3259 addq $4 * SIZE, BO # boffset1 += 8 3260 decq %rax 3261 jg .L136 3262 ALIGN_4 3263 3264.L138: 3265 addps %xmm1, %xmm0 3266 mulps %xmm15, %xmm0 3267 3268#ifndef TRMMKERNEL 3269#ifdef movsd 3270 xorps %xmm8, %xmm8 3271#endif 3272 movsd 0 * SIZE(CO1), %xmm8 3273 addps %xmm8, %xmm0 3274#endif 3275 3276 movlps %xmm0, 0 * SIZE(CO1) 3277 3278#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3279 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3280 movq K, %rax 3281 subq KKK, %rax 3282 leaq (,%rax, 8), %rax 3283 leaq (AO, %rax, 1), AO 3284 leaq (BO, %rax, 2), BO 3285#endif 3286 3287#if defined(TRMMKERNEL) && defined(LEFT) 3288 addq $2, KK 3289#endif 3290 3291 addq $2 * SIZE, CO1 # coffset += 4 3292 ALIGN_4 3293 3294.L140: 3295 testq $1, M 3296 je .L999 3297 3298#if !defined(TRMMKERNEL) || \ 3299 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3300 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3301 3302 leaq BUFFER, BO 3303#else 3304 leaq BUFFER, BO 3305 movq KK, %rax 3306 leaq (, %rax, 4), %rax 3307 leaq (AO, %rax, 1), AO 3308 leaq (BO, %rax, 4), BO 3309#endif 3310 3311 movss -32 * SIZE(AO), %xmm8 3312 movss -28 * SIZE(AO), %xmm10 3313 
3314 movss 0 * SIZE(BO), %xmm9 3315 movss 16 * SIZE(BO), %xmm11 3316 3317 xorps %xmm0, %xmm0 3318 xorps %xmm1, %xmm1 3319 xorps %xmm2, %xmm2 3320 xorps %xmm3, %xmm3 3321 3322#ifndef TRMMKERNEL 3323 movq K, %rax 3324#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3325 movq K, %rax 3326 subq KK, %rax 3327 movq %rax, KKK 3328#else 3329 movq KK, %rax 3330#ifdef LEFT 3331 addq $1, %rax 3332#else 3333 addq $1, %rax 3334#endif 3335 movq %rax, KKK 3336#endif 3337 sarq $3, %rax 3338 je .L145 3339 ALIGN_4 3340 3341.L142: 3342 mulss %xmm8, %xmm9 3343#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3344 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3345#endif 3346 movss -31 * SIZE(AO), %xmm8 3347 mulss 4 * SIZE(BO), %xmm8 3348 addss %xmm9, %xmm0 3349 movss 32 * SIZE(BO), %xmm9 3350 addss %xmm8, %xmm1 3351 movss -30 * SIZE(AO), %xmm8 3352 mulss 8 * SIZE(BO), %xmm8 3353 addss %xmm8, %xmm2 3354 movss -29 * SIZE(AO), %xmm8 3355 mulss 12 * SIZE(BO), %xmm8 3356 addss %xmm8, %xmm3 3357 movss -24 * SIZE(AO), %xmm8 3358 mulss %xmm10, %xmm11 3359 movss -27 * SIZE(AO), %xmm10 3360 mulss 20 * SIZE(BO), %xmm10 3361 addss %xmm11, %xmm0 3362 movss 48 * SIZE(BO), %xmm11 3363 addss %xmm10, %xmm1 3364 movss -26 * SIZE(AO), %xmm10 3365 mulss 24 * SIZE(BO), %xmm10 3366 addss %xmm10, %xmm2 3367 movss -25 * SIZE(AO), %xmm10 3368 mulss 28 * SIZE(BO), %xmm10 3369 addss %xmm10, %xmm3 3370 movss -20 * SIZE(AO), %xmm10 3371 3372 addq $ 8 * SIZE, AO 3373 addq $32 * SIZE, BO 3374 decq %rax 3375 jne .L142 3376 ALIGN_4 3377 3378.L145: 3379#ifndef TRMMKERNEL 3380 movq K, %rax 3381#else 3382 movq KKK, %rax 3383#endif 3384 movss ALPHA, %xmm15 3385 andq $7, %rax # if (k & 1) 3386 BRANCH 3387 je .L148 3388 ALIGN_4 3389 3390.L146: 3391 mulss %xmm8, %xmm9 3392 movss -31 * SIZE(AO), %xmm8 3393 addss %xmm9, %xmm0 3394 movss 4 * SIZE(BO), %xmm9 3395 3396 addq $1 * SIZE, AO 3397 addq $4 * SIZE, BO 3398 decq %rax 3399 jg .L146 3400 ALIGN_4 3401 3402.L148: 3403 addss %xmm1, %xmm0 
3404 addss %xmm3, %xmm2 3405 addss %xmm2, %xmm0 3406 3407 mulss %xmm15, %xmm0 3408 3409#ifndef TRMMKERNEL 3410 movss 0 * SIZE(CO1), %xmm8 3411 addss %xmm8, %xmm0 3412#endif 3413 movss %xmm0, 0 * SIZE(CO1) 3414 ALIGN_4 3415 3416.L999: 3417 movq %rbx, %rsp 3418 3419 EMMS 3420 3421 movq 0(%rsp), %rbx 3422 movq 8(%rsp), %rbp 3423 movq 16(%rsp), %r12 3424 movq 24(%rsp), %r13 3425 movq 32(%rsp), %r14 3426 movq 40(%rsp), %r15 3427 3428#ifdef WINDOWS_ABI 3429 movq 48(%rsp), %rdi 3430 movq 56(%rsp), %rsi 3431 movups 64(%rsp), %xmm6 3432 movups 80(%rsp), %xmm7 3433 movups 96(%rsp), %xmm8 3434 movups 112(%rsp), %xmm9 3435 movups 128(%rsp), %xmm10 3436 movups 144(%rsp), %xmm11 3437 movups 160(%rsp), %xmm12 3438 movups 176(%rsp), %xmm13 3439 movups 192(%rsp), %xmm14 3440 movups 208(%rsp), %xmm15 3441#endif 3442 3443 addq $STACKSIZE, %rsp 3444 ret 3445 3446 EPILOGUE 3447