/*********************************************************************/
/*                                                                   */
/*             Optimized BLAS libraries                              */
/*                     By Kazushige Goto <kgoto@tacc.utexas.edu>     */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/*
 * Packed single-precision (mulps/addps) GEMM / TRMM inner kernel for
 * x86-64, AT&T syntax, run through the C preprocessor.
 *
 * The code that follows walks C in strips of 4 columns (j = n >> 2) and,
 * inside a strip, in tiles of 8 rows (i = m >> 3) with 4-, 2- and 1-row
 * remainders.  Before each strip the B panel is copied into BUFFER with
 * every element duplicated four times, so one aligned 16-byte load yields
 * a broadcast B value.
 *
 * SIZE, PROLOGUE, PROFCODE, ARGn, EMMS, BRANCH/NOBRANCH, ALIGN_n,
 * STACK_TOUCHING and LOCAL_BUFFER_SIZE come from common.h.
 */

#define ASSEMBLER
#include "common.h"

/*
 * Incoming M and N.  They live in %rdi/%rsi, which are later reused as the
 * AO/BO work pointers, so the body copies them to M/N (%r13/%r14) first.
 */
#define OLD_M	%rdi
#define OLD_N	%rsi

#define M	%r13
#define N	%r14
#define K	%rdx

/* Matrix base pointers and the leading dimension of C
   (LDC is scaled by SIZE, i.e. turned into a byte stride, by the body). */
#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Working registers. */
#define I	%r11		/* row-tile counter                        */
#define AO	%rdi		/* running pointer into A                  */
#define BO	%rsi		/* running pointer into the expanded BUFFER */
#define CO1	%r15		/* first C column of the current strip     */
#define CO2	%rbp		/* CO1 + LDC                               */
#define BB	%r12		/* prefetch cursor running ahead in B      */

/*
 * STACKSIZE is the register save area reserved at entry.  The OLD_*
 * operands are relative to %rsp as it stands right after
 * "subq $STACKSIZE, %rsp"; they are read before the stack is realigned.
 * On non-Windows builds only LDC and OFFSET are fetched from the caller's
 * stack; on Windows A, B and C are fetched from there as well.
 */
#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/*
 * Local slots.  Unlike OLD_*, these are relative to the *realigned* %rsp
 * that the body establishes after saving the old stack pointer in %rbx.
 */
#define ALPHA	  0(%rsp)	/* alpha broadcast to all four lanes        */
#define J	 16(%rsp)	/* column-strip counter                     */
#define OFFSET	 24(%rsp)	/* TRMM only: offset argument               */
#define KK	 32(%rsp)	/* TRMM only: running offset                */
#define KKK	 40(%rsp)	/* TRMM only: effective k count for a tile  */
#define BUFFER	256(%rsp)	/* expanded copy of the current B panel     */

/*
 * On OPTERON every "movsd" in this file is assembled as "movlps".
 * A movlps load leaves the upper half of the destination untouched, so
 * places that need it cleared test "#ifdef movsd" and xorps the register
 * beforehand.
 */
#ifdef OPTERON
#define movsd	movlps
#endif

/* Prefetch flavour and distance (in elements) per target family. */
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (16 * 9 + 8)
#endif

#if defined(GENERIC) || defined(NANO)
#define PREFETCH      prefetcht0
#define PREFETCHW     prefetcht0
#define PREFETCHSIZE (16 * 5 + 8)
#endif

/* Read/write prefetch distances used while copying B into BUFFER. */
#define RPREFETCHSIZE (8 *  7 + 4)
#define WPREFETCHSIZE (8 *  8 + 4)

/*
 * KERNEL1 .. KERNEL8 -- software-pipelined body of the 8x4 tile.
 *
 * Register contract shared by all eight macros:
 *   xmm0, xmm2, xmm4, xmm6   A vectors (four rows each), loaded ahead
 *   xmm1, xmm3, xmm5, xmm7   B vectors from the expanded buffer
 *   xmm8  .. xmm11           accumulators fed by xmm0 / xmm4
 *                            (KERNEL1/3/5/7)
 *   xmm12 .. xmm15           accumulators fed by xmm2 / xmm6
 *                            (KERNEL2/4/6/8)
 *
 * Every macro multiplies one A vector by four B vectors, adds the products
 * into its four accumulators and reloads the registers it consumed for a
 * later step.  Invoked in order 1..8 with the same (xx), the sequence
 * leaves each of xmm0..xmm7 holding the operand located 32 A elements /
 * 64 BO elements further on, which is exactly the amount by which the
 * callers advance AO and BO per pass.
 *
 * (xx) is an extra element offset: it is applied once to AO displacements
 * and doubled for BO displacements.
 *
 * Non-GENERIC variant: operands are addressed as disp(BO,%rax,8) and
 * disp(AO,%rax,4).  AO/BO are pre-advanced to the end of the range and
 * %rax is a negative index that the unrolled loop steps up towards zero.
 */
#ifndef GENERIC
#define KERNEL1(xx) \
	mulps	%xmm0, %xmm1 ;\
	addps	%xmm1, %xmm8 ;\
	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm0, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm0, %xmm5 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm0, %xmm11 ;\
	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulps	%xmm2, %xmm1 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm2, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm2, %xmm5 ;\
	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm2, %xmm15 ;\
	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulps	%xmm4, %xmm7 ;\
	addps	%xmm7, %xmm8 ;\
	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulps	%xmm4, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm4, %xmm5 ;\
	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm4, %xmm11 ;\
	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulps	%xmm6, %xmm7 ;\
	addps	%xmm7, %xmm12 ;\
	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulps	%xmm6, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm6, %xmm5 ;\
	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addps	%xmm6, %xmm15 ;\
	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulps	%xmm0, %xmm1 ;\
	addps	%xmm1, %xmm8 ;\
	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm0, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm0, %xmm5 ;\
	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm0, %xmm11 ;\
	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulps	%xmm2, %xmm1 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulps	%xmm2, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm2, %xmm5 ;\
	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm2, %xmm15 ;\
	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulps	%xmm4, %xmm7 ;\
	addps	%xmm7, %xmm8 ;\
	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulps	%xmm4, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm4, %xmm5 ;\
	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm4, %xmm11 ;\
	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulps	%xmm6, %xmm7 ;\
	addps	%xmm7, %xmm12 ;\
	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulps	%xmm6, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulps	%xmm6, %xmm5 ;\
	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addps	%xmm6, %xmm15 ;\
	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else

/*
 * GENERIC variant: same arithmetic and register contract, but operands are
 * addressed with plain displacements off AO/BO (no %rax index); the loop
 * that uses these macros bumps AO and BO itself after each pass.
 * The second prefetch sits in KERNEL5 here rather than in KERNEL4.
 */
#define KERNEL1(xx) \
	mulps	%xmm0, %xmm1 ;\
	addps	%xmm1, %xmm8 ;\
	movaps	-32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulps	%xmm0, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	-28 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm0, %xmm5 ;\
	PREFETCH (PREFETCHSIZE +  0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	-24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm0, %xmm11 ;\
	movaps	-16 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

/* The last line deliberately carries no ";\": a trailing continuation
   here would splice whatever follows into the macro body. */
#define KERNEL2(xx) \
	mulps	%xmm2, %xmm1 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulps	%xmm2, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm2, %xmm5 ;\
	mulps	-20 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm2, %xmm15 ;\
	movaps	-12 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulps	%xmm4, %xmm7 ;\
	addps	%xmm7, %xmm8 ;\
	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulps	%xmm4, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm4, %xmm5 ;\
	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm4, %xmm11 ;\
	movaps	 -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulps	%xmm6, %xmm7 ;\
	addps	%xmm7, %xmm12 ;\
	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulps	%xmm6, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm6, %xmm5 ;\
	mulps	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm6, %xmm15 ;\
	movaps	 -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulps	%xmm0, %xmm1 ;\
	PREFETCH (PREFETCHSIZE + 16) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addps	%xmm1, %xmm8 ;\
	movaps	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulps	%xmm0, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm0, %xmm5 ;\
	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm0, %xmm11 ;\
	movaps	  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulps	%xmm2, %xmm1 ;\
	addps	%xmm1, %xmm12 ;\
	movaps	 32 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulps	%xmm2, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm2, %xmm5 ;\
	mulps	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm2, %xmm15 ;\
	movaps	  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulps	%xmm4, %xmm7 ;\
	addps	%xmm7, %xmm8 ;\
	movaps	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulps	%xmm4, %xmm3 ;\
	addps	%xmm3, %xmm9 ;\
	movaps	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm4, %xmm5 ;\
	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addps	%xmm5, %xmm10 ;\
	movaps	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm4, %xmm11 ;\
	movaps	  8 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulps	%xmm6, %xmm7 ;\
	addps	%xmm7, %xmm12 ;\
	movaps	 48 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulps	%xmm6, %xmm3 ;\
	addps	%xmm3, %xmm13 ;\
	movaps	 36 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulps	%xmm6, %xmm5 ;\
	mulps	 28 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addps	%xmm5, %xmm14 ;\
	movaps	 40 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addps	%xmm6, %xmm15 ;\
	movaps	 12 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#endif

/*
 * Kernel entry.  Reserve the STACKSIZE-byte save area and spill the
 * general-purpose registers this kernel uses as M, N, I, CO1, CO2 and BB
 * (plus %rbx, which later holds the pre-alignment stack pointer).  The
 * Windows build additionally spills %rdi, %rsi and %xmm6-%xmm15.
 */
	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,
80(%rsp) 337 movups %xmm8, 96(%rsp) 338 movups %xmm9, 112(%rsp) 339 movups %xmm10, 128(%rsp) 340 movups %xmm11, 144(%rsp) 341 movups %xmm12, 160(%rsp) 342 movups %xmm13, 176(%rsp) 343 movups %xmm14, 192(%rsp) 344 movups %xmm15, 208(%rsp) 345 346 movq ARG1, OLD_M 347 movq ARG2, OLD_N 348 movq ARG3, K 349 movq OLD_A, A 350 movq OLD_B, B 351 movq OLD_C, C 352 movq OLD_LDC, LDC 353#ifdef TRMMKERNEL 354 movsd OLD_OFFSET, %xmm4 355#endif 356 movaps %xmm3, %xmm0 357 358#else 359 movq OLD_LDC, LDC 360#ifdef TRMMKERNEL 361 movsd OLD_OFFSET, %xmm4 362#endif 363 364#endif 365 366 EMMS 367 368 movq %rsp, %rbx # save old stack 369 subq $128 + LOCAL_BUFFER_SIZE, %rsp 370 andq $-4096, %rsp # align stack 371 372 STACK_TOUCHING 373 374 movq OLD_M, M 375 movq OLD_N, N 376 377 shufps $0, %xmm0, %xmm0 378 movaps %xmm0, ALPHA 379 380#ifdef TRMMKERNEL 381 movsd %xmm4, OFFSET 382 movsd %xmm4, KK 383#ifndef LEFT 384 negq KK 385#endif 386#endif 387 388 subq $-32 * SIZE, A 389 390 leaq (, LDC, SIZE), LDC 391 392 movq N, J 393 sarq $2, J # j = (n >> 2) 394 jle .L50 395 396.L01: 397#if defined(TRMMKERNEL) && defined(LEFT) 398 movq OFFSET, %rax 399 movq %rax, KK 400#endif 401 402/* Copying to Sub Buffer */ 403 leaq BUFFER, BO 404 405 movd 0 * SIZE(B), %mm0 406 407 movq K, %rax 408 sarq $2, %rax 409 jle .L03 410 411 addq %rax, %rax 412 ALIGN_4 413 414.L02: 415 PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) 416 417 movd 1 * SIZE(B), %mm1 418 movd 2 * SIZE(B), %mm2 419 movd 3 * SIZE(B), %mm3 420 movd 4 * SIZE(B), %mm4 421 movd 5 * SIZE(B), %mm5 422 movd 6 * SIZE(B), %mm6 423 movd 7 * SIZE(B), %mm7 424 425 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) 426 427 punpckldq %mm0, %mm0 428 movq %mm0, 0 * SIZE(BO) 429 movq %mm0, 2 * SIZE(BO) 430 punpckldq %mm1, %mm1 431 movd 8 * SIZE(B), %mm0 432 movq %mm1, 4 * SIZE(BO) 433 movq %mm1, 6 * SIZE(BO) 434 punpckldq %mm2, %mm2 435 movq %mm2, 8 * SIZE(BO) 436 movq %mm2, 10 * SIZE(BO) 437 punpckldq %mm3, %mm3 438 movq %mm3, 12 * SIZE(BO) 439 movq %mm3, 14 * SIZE(BO) 440 441 
PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) 442 443 punpckldq %mm4, %mm4 444 movq %mm4, 16 * SIZE(BO) 445 movq %mm4, 18 * SIZE(BO) 446 punpckldq %mm5, %mm5 447 movq %mm5, 20 * SIZE(BO) 448 movq %mm5, 22 * SIZE(BO) 449 punpckldq %mm6, %mm6 450 movq %mm6, 24 * SIZE(BO) 451 movq %mm6, 26 * SIZE(BO) 452 punpckldq %mm7, %mm7 453 movq %mm7, 28 * SIZE(BO) 454 movq %mm7, 30 * SIZE(BO) 455 456 457 addq $ 8 * SIZE, B 458 addq $32 * SIZE, BO 459 460 decq %rax 461 jne .L02 462 ALIGN_4 463 464.L03: 465 movq K, %rax 466 andq $3, %rax 467 BRANCH 468 jle .L10 469 ALIGN_4 470 471.L04: 472 movd 0 * SIZE(B), %mm0 473 movd 1 * SIZE(B), %mm1 474 movd 2 * SIZE(B), %mm2 475 movd 3 * SIZE(B), %mm3 476 477 punpckldq %mm0, %mm0 478 punpckldq %mm1, %mm1 479 punpckldq %mm2, %mm2 480 punpckldq %mm3, %mm3 481 482 movq %mm0, 0 * SIZE(BO) 483 movq %mm0, 2 * SIZE(BO) 484 movq %mm1, 4 * SIZE(BO) 485 movq %mm1, 6 * SIZE(BO) 486 movq %mm2, 8 * SIZE(BO) 487 movq %mm2, 10 * SIZE(BO) 488 movq %mm3, 12 * SIZE(BO) 489 movq %mm3, 14 * SIZE(BO) 490 491 addq $ 4 * SIZE, B 492 addq $16 * SIZE, BO 493 decq %rax 494 jne .L04 495 ALIGN_4 496 497.L10: 498 movq C, CO1 # coffset1 = c 499 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 500 movq A, AO # aoffset = a 501 502 leaq (RPREFETCHSIZE + 0) * SIZE(B), BB 503 504 movq M, I 505 sarq $3, I # i = (m >> 3) 506 jle .L20 507 ALIGN_4 508 509.L11: 510#if !defined(TRMMKERNEL) || \ 511 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 512 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 513 514 leaq 32 * SIZE + BUFFER, BO 515#else 516 leaq 32 * SIZE + BUFFER, BO 517 movq KK, %rax 518 leaq (, %rax, 8), %rax 519 leaq (AO, %rax, 4), AO 520 leaq (BO, %rax, 8), BO 521#endif 522 523 movaps -32 * SIZE(AO), %xmm0 524 movaps -32 * SIZE(BO), %xmm1 525 xorps %xmm8, %xmm8 526 movaps -28 * SIZE(AO), %xmm2 527 movaps -28 * SIZE(BO), %xmm3 528 xorps %xmm9, %xmm9 529 movaps -24 * SIZE(AO), %xmm4 530 movaps -24 * SIZE(BO), %xmm5 531 xorps %xmm10, %xmm10 532 movaps -20 * 
SIZE(AO), %xmm6 533 movaps -16 * SIZE(BO), %xmm7 534 xorps %xmm11, %xmm11 535 536 PREFETCHW 7 * SIZE(CO1) 537 xorps %xmm12, %xmm12 538 PREFETCHW 15 * SIZE(CO2) 539 xorps %xmm13, %xmm13 540 PREFETCHW 7 * SIZE(CO1, LDC, 2) 541 xorps %xmm14, %xmm14 542 PREFETCHW 15 * SIZE(CO2, LDC, 2) 543 xorps %xmm15, %xmm15 544 PREFETCH -32 * SIZE(BB) 545 546#ifndef TRMMKERNEL 547 movq K, %rax 548#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 549 movq K, %rax 550 subq KK, %rax 551 movq %rax, KKK 552#else 553 movq KK, %rax 554#ifdef LEFT 555 addq $8, %rax 556#else 557 addq $4, %rax 558#endif 559 movq %rax, KKK 560#endif 561#ifndef GENERIC 562 andq $-8, %rax 563 564 leaq (, %rax, 8), %rax 565 leaq (AO, %rax, 4), AO 566 leaq (BO, %rax, 8), BO 567 negq %rax 568 NOBRANCH 569 je .L15 570 ALIGN_3 571 572.L12: 573 KERNEL1(16 * 0) 574 KERNEL2(16 * 0) 575 KERNEL3(16 * 0) 576 KERNEL4(16 * 0) 577 KERNEL5(16 * 0) 578 KERNEL6(16 * 0) 579 KERNEL7(16 * 0) 580 KERNEL8(16 * 0) 581 582 KERNEL1(16 * 2) 583 KERNEL2(16 * 2) 584 KERNEL3(16 * 2) 585 KERNEL4(16 * 2) 586 KERNEL5(16 * 2) 587 KERNEL6(16 * 2) 588 KERNEL7(16 * 2) 589 KERNEL8(16 * 2) 590 591 addq $16 * SIZE, %rax 592 NOBRANCH 593 je .L15 594 KERNEL1(16 * 0) 595 KERNEL2(16 * 0) 596 KERNEL3(16 * 0) 597 KERNEL4(16 * 0) 598 KERNEL5(16 * 0) 599 KERNEL6(16 * 0) 600 KERNEL7(16 * 0) 601 KERNEL8(16 * 0) 602 603 KERNEL1(16 * 2) 604 KERNEL2(16 * 2) 605 KERNEL3(16 * 2) 606 KERNEL4(16 * 2) 607 KERNEL5(16 * 2) 608 KERNEL6(16 * 2) 609 KERNEL7(16 * 2) 610 KERNEL8(16 * 2) 611 612 addq $16 * SIZE, %rax 613 NOBRANCH 614 je .L15 615 KERNEL1(16 * 0) 616 KERNEL2(16 * 0) 617 KERNEL3(16 * 0) 618 KERNEL4(16 * 0) 619 KERNEL5(16 * 0) 620 KERNEL6(16 * 0) 621 KERNEL7(16 * 0) 622 KERNEL8(16 * 0) 623 624 KERNEL1(16 * 2) 625 KERNEL2(16 * 2) 626 KERNEL3(16 * 2) 627 KERNEL4(16 * 2) 628 KERNEL5(16 * 2) 629 KERNEL6(16 * 2) 630 KERNEL7(16 * 2) 631 KERNEL8(16 * 2) 632 633 addq $16 * SIZE, %rax 634 NOBRANCH 635 je .L15 636 KERNEL1(16 * 0) 637 
KERNEL2(16 * 0) 638 KERNEL3(16 * 0) 639 KERNEL4(16 * 0) 640 KERNEL5(16 * 0) 641 KERNEL6(16 * 0) 642 KERNEL7(16 * 0) 643 KERNEL8(16 * 0) 644 645 KERNEL1(16 * 2) 646 KERNEL2(16 * 2) 647 KERNEL3(16 * 2) 648 KERNEL4(16 * 2) 649 KERNEL5(16 * 2) 650 KERNEL6(16 * 2) 651 KERNEL7(16 * 2) 652 KERNEL8(16 * 2) 653 654 addq $16 * SIZE, %rax 655 NOBRANCH 656 je .L15 657 KERNEL1(16 * 0) 658 KERNEL2(16 * 0) 659 KERNEL3(16 * 0) 660 KERNEL4(16 * 0) 661 KERNEL5(16 * 0) 662 KERNEL6(16 * 0) 663 KERNEL7(16 * 0) 664 KERNEL8(16 * 0) 665 666 KERNEL1(16 * 2) 667 KERNEL2(16 * 2) 668 KERNEL3(16 * 2) 669 KERNEL4(16 * 2) 670 KERNEL5(16 * 2) 671 KERNEL6(16 * 2) 672 KERNEL7(16 * 2) 673 KERNEL8(16 * 2) 674 675 addq $16 * SIZE, %rax 676 NOBRANCH 677 je .L15 678 KERNEL1(16 * 0) 679 KERNEL2(16 * 0) 680 KERNEL3(16 * 0) 681 KERNEL4(16 * 0) 682 KERNEL5(16 * 0) 683 KERNEL6(16 * 0) 684 KERNEL7(16 * 0) 685 KERNEL8(16 * 0) 686 687 KERNEL1(16 * 2) 688 KERNEL2(16 * 2) 689 KERNEL3(16 * 2) 690 KERNEL4(16 * 2) 691 KERNEL5(16 * 2) 692 KERNEL6(16 * 2) 693 KERNEL7(16 * 2) 694 KERNEL8(16 * 2) 695 696 addq $16 * SIZE, %rax 697 NOBRANCH 698 je .L15 699 KERNEL1(16 * 0) 700 KERNEL2(16 * 0) 701 KERNEL3(16 * 0) 702 KERNEL4(16 * 0) 703 KERNEL5(16 * 0) 704 KERNEL6(16 * 0) 705 KERNEL7(16 * 0) 706 KERNEL8(16 * 0) 707 708 KERNEL1(16 * 2) 709 KERNEL2(16 * 2) 710 KERNEL3(16 * 2) 711 KERNEL4(16 * 2) 712 KERNEL5(16 * 2) 713 KERNEL6(16 * 2) 714 KERNEL7(16 * 2) 715 KERNEL8(16 * 2) 716 717 addq $16 * SIZE, %rax 718 NOBRANCH 719 je .L15 720 KERNEL1(16 * 0) 721 KERNEL2(16 * 0) 722 KERNEL3(16 * 0) 723 KERNEL4(16 * 0) 724 KERNEL5(16 * 0) 725 KERNEL6(16 * 0) 726 KERNEL7(16 * 0) 727 KERNEL8(16 * 0) 728 729 KERNEL1(16 * 2) 730 KERNEL2(16 * 2) 731 KERNEL3(16 * 2) 732 KERNEL4(16 * 2) 733 KERNEL5(16 * 2) 734 KERNEL6(16 * 2) 735 KERNEL7(16 * 2) 736 KERNEL8(16 * 2) 737 738 addq $16 * SIZE, %rax 739 BRANCH 740 jl .L12 741 ALIGN_3 742 743.L15: 744 PREFETCH -16 * SIZE(BB) 745 subq $-16 * SIZE, BB 746 747#ifndef TRMMKERNEL 748 movq K, %rax 749#else 
750 movq KKK, %rax 751#endif 752 testq $4, %rax 753 je .L16 754 xorq %rax, %rax 755 ALIGN_3 756 757 KERNEL1(16 * 0) 758 KERNEL2(16 * 0) 759 KERNEL3(16 * 0) 760 KERNEL4(16 * 0) 761 KERNEL5(16 * 0) 762 KERNEL6(16 * 0) 763 KERNEL7(16 * 0) 764 KERNEL8(16 * 0) 765 766 addq $64 * SIZE, BO 767 addq $32 * SIZE, AO 768 ALIGN_3 769#else 770 sarq $2, %rax 771 NOBRANCH 772 jle .L16 773 ALIGN_3 774 775.L12: 776 KERNEL1(16 * 0) 777 KERNEL2(16 * 0) 778 KERNEL3(16 * 0) 779 KERNEL4(16 * 0) 780 KERNEL5(16 * 0) 781 KERNEL6(16 * 0) 782 KERNEL7(16 * 0) 783 KERNEL8(16 * 0) 784 785 addq $ 64 * SIZE, BO 786 subq $-32 * SIZE, AO 787 decq %rax 788 BRANCH 789 jg .L12 790#endif 791 792.L16: 793 movaps ALPHA, %xmm7 794 795#ifndef TRMMKERNEL 796 movq K, %rax 797#else 798 movq KKK, %rax 799#endif 800 andq $3, %rax # if (k & 1) 801 je .L18 802 803 leaq (, %rax, 8), %rax 804 leaq (AO, %rax, 4), AO 805 leaq (BO, %rax, 8), BO 806 negq %rax 807 ALIGN_4 808 809.L17: 810 mulps %xmm0, %xmm1 811 addps %xmm1, %xmm8 812 movaps -28 * SIZE(BO, %rax, 8), %xmm1 813 mulps %xmm0, %xmm1 814 addps %xmm1, %xmm9 815 movaps -24 * SIZE(BO, %rax, 8), %xmm1 816 mulps %xmm0, %xmm1 817 mulps -20 * SIZE(BO, %rax, 8), %xmm0 818 addps %xmm1, %xmm10 819 movaps -32 * SIZE(BO, %rax, 8), %xmm1 820 addps %xmm0, %xmm11 821 movaps -24 * SIZE(AO, %rax, 4), %xmm0 822 mulps %xmm2, %xmm1 823 addps %xmm1, %xmm12 824 movaps -28 * SIZE(BO, %rax, 8), %xmm1 825 mulps %xmm2, %xmm1 826 addps %xmm1, %xmm13 827 movaps -24 * SIZE(BO, %rax, 8), %xmm1 828 mulps %xmm2, %xmm1 829 mulps -20 * SIZE(BO, %rax, 8), %xmm2 830 addps %xmm1, %xmm14 831 movaps -16 * SIZE(BO, %rax, 8), %xmm1 832 addps %xmm2, %xmm15 833 movaps -20 * SIZE(AO, %rax, 4), %xmm2 834 835 addq $SIZE * 2, %rax 836 jl .L17 837 ALIGN_4 838 839.L18: 840#ifndef TRMMKERNEL 841 movsd 0 * SIZE(CO1), %xmm0 842 movhps 2 * SIZE(CO1), %xmm0 843 movsd 4 * SIZE(CO1), %xmm1 844 movhps 6 * SIZE(CO1), %xmm1 845 846 movsd 0 * SIZE(CO2), %xmm2 847 movhps 2 * SIZE(CO2), %xmm2 848 movsd 4 * SIZE(CO2), 
%xmm3 849 movhps 6 * SIZE(CO2), %xmm3 850#endif 851 852 mulps %xmm7, %xmm8 853 mulps %xmm7, %xmm9 854 mulps %xmm7, %xmm10 855 mulps %xmm7, %xmm11 856 857 mulps %xmm7, %xmm12 858 mulps %xmm7, %xmm13 859 mulps %xmm7, %xmm14 860 mulps %xmm7, %xmm15 861 862#ifndef TRMMKERNEL 863 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 864 movhps 2 * SIZE(CO1, LDC, 2), %xmm4 865 movsd 4 * SIZE(CO1, LDC, 2), %xmm5 866 movhps 6 * SIZE(CO1, LDC, 2), %xmm5 867 868 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 869 movhps 2 * SIZE(CO2, LDC, 2), %xmm6 870 movsd 4 * SIZE(CO2, LDC, 2), %xmm7 871 movhps 6 * SIZE(CO2, LDC, 2), %xmm7 872 873 addps %xmm0, %xmm8 874 addps %xmm1, %xmm12 875 addps %xmm2, %xmm9 876 addps %xmm3, %xmm13 877#endif 878 879 movlps %xmm8, 0 * SIZE(CO1) 880 movhps %xmm8, 2 * SIZE(CO1) 881 movlps %xmm12, 4 * SIZE(CO1) 882 movhps %xmm12, 6 * SIZE(CO1) 883 884 movlps %xmm9, 0 * SIZE(CO2) 885 movhps %xmm9, 2 * SIZE(CO2) 886 movlps %xmm13, 4 * SIZE(CO2) 887 movhps %xmm13, 6 * SIZE(CO2) 888 889#ifndef TRMMKERNEL 890 addps %xmm4, %xmm10 891 addps %xmm5, %xmm14 892 addps %xmm6, %xmm11 893 addps %xmm7, %xmm15 894#endif 895 896 movlps %xmm10, 0 * SIZE(CO1, LDC, 2) 897 movhps %xmm10, 2 * SIZE(CO1, LDC, 2) 898 movlps %xmm14, 4 * SIZE(CO1, LDC, 2) 899 movhps %xmm14, 6 * SIZE(CO1, LDC, 2) 900 901 movlps %xmm11, 0 * SIZE(CO2, LDC, 2) 902 movhps %xmm11, 2 * SIZE(CO2, LDC, 2) 903 movlps %xmm15, 4 * SIZE(CO2, LDC, 2) 904 movhps %xmm15, 6 * SIZE(CO2, LDC, 2) 905 906#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 907 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 908 movq K, %rax 909 subq KKK, %rax 910 leaq (,%rax, 8), %rax 911 leaq (AO, %rax, 4), AO 912 leaq (BO, %rax, 8), BO 913#endif 914 915#if defined(TRMMKERNEL) && defined(LEFT) 916 addq $8, KK 917#endif 918 919 addq $8 * SIZE, CO1 # coffset += 4 920 addq $8 * SIZE, CO2 # coffset += 4 921 decq I # i -- 922 jg .L11 923 ALIGN_4 924 925.L20: 926 testq $4, M 927 je .L30 928 929#if !defined(TRMMKERNEL) || \ 930 
(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 931 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 932 933 leaq BUFFER, BO 934#else 935 leaq BUFFER, BO 936 movq KK, %rax 937 leaq (, %rax, 8), %rax 938 leaq (AO, %rax, 2), AO 939 leaq (BO, %rax, 8), BO 940#endif 941 942 movaps -32 * SIZE(AO), %xmm8 943 movaps -16 * SIZE(AO), %xmm10 944 945 movaps 0 * SIZE(BO), %xmm9 946 movaps 16 * SIZE(BO), %xmm11 947 movaps 32 * SIZE(BO), %xmm13 948 movaps 48 * SIZE(BO), %xmm15 949 950 xorps %xmm0, %xmm0 951 xorps %xmm1, %xmm1 952 xorps %xmm2, %xmm2 953 xorps %xmm3, %xmm3 954 955#ifndef TRMMKERNEL 956 movq K, %rax 957#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 958 movq K, %rax 959 subq KK, %rax 960 movq %rax, KKK 961#else 962 movq KK, %rax 963#ifdef LEFT 964 addq $4, %rax 965#else 966 addq $4, %rax 967#endif 968 movq %rax, KKK 969#endif 970 sarq $3, %rax 971 je .L25 972 ALIGN_4 973 974.L22: 975 mulps %xmm8, %xmm9 976 addps %xmm9, %xmm0 977#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 978 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 979#endif 980 movaps 4 * SIZE(BO), %xmm9 981 mulps %xmm8, %xmm9 982 addps %xmm9, %xmm1 983 movaps 8 * SIZE(BO), %xmm9 984 mulps %xmm8, %xmm9 985 mulps 12 * SIZE(BO), %xmm8 986 addps %xmm9, %xmm2 987 movaps 64 * SIZE(BO), %xmm9 988 addps %xmm8, %xmm3 989 movaps -28 * SIZE(AO), %xmm8 990 991 mulps %xmm8, %xmm11 992 addps %xmm11, %xmm0 993 movaps 20 * SIZE(BO), %xmm11 994 mulps %xmm8, %xmm11 995 addps %xmm11, %xmm1 996 movaps 24 * SIZE(BO), %xmm11 997 mulps %xmm8, %xmm11 998 mulps 28 * SIZE(BO), %xmm8 999 addps %xmm11, %xmm2 1000 movaps 80 * SIZE(BO), %xmm11 1001 addps %xmm8, %xmm3 1002 movaps -24 * SIZE(AO), %xmm8 1003 1004 mulps %xmm8, %xmm13 1005 addps %xmm13, %xmm0 1006 movaps 36 * SIZE(BO), %xmm13 1007 mulps %xmm8, %xmm13 1008 addps %xmm13, %xmm1 1009 movaps 40 * SIZE(BO), %xmm13 1010 mulps %xmm8, %xmm13 1011 mulps 44 * SIZE(BO), %xmm8 1012 addps %xmm13, %xmm2 1013 movaps 96 * 
SIZE(BO), %xmm13 1014 addps %xmm8, %xmm3 1015 movaps -20 * SIZE(AO), %xmm8 1016 1017 mulps %xmm8, %xmm15 1018 addps %xmm15, %xmm0 1019 movaps 52 * SIZE(BO), %xmm15 1020 mulps %xmm8, %xmm15 1021 addps %xmm15, %xmm1 1022 movaps 56 * SIZE(BO), %xmm15 1023 mulps %xmm8, %xmm15 1024 mulps 60 * SIZE(BO), %xmm8 1025 addps %xmm15, %xmm2 1026 movaps 112 * SIZE(BO), %xmm15 1027 addps %xmm8, %xmm3 1028 movaps 0 * SIZE(AO), %xmm8 1029 1030#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1031 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1032#endif 1033 mulps %xmm10, %xmm9 1034 addps %xmm9, %xmm0 1035 movaps 68 * SIZE(BO), %xmm9 1036 mulps %xmm10, %xmm9 1037 addps %xmm9, %xmm1 1038 movaps 72 * SIZE(BO), %xmm9 1039 mulps %xmm10, %xmm9 1040 mulps 76 * SIZE(BO), %xmm10 1041 addps %xmm9, %xmm2 1042 movaps 128 * SIZE(BO), %xmm9 1043 addps %xmm10, %xmm3 1044 movaps -12 * SIZE(AO), %xmm10 1045 1046 mulps %xmm10, %xmm11 1047 addps %xmm11, %xmm0 1048 movaps 84 * SIZE(BO), %xmm11 1049 mulps %xmm10, %xmm11 1050 addps %xmm11, %xmm1 1051 movaps 88 * SIZE(BO), %xmm11 1052 mulps %xmm10, %xmm11 1053 mulps 92 * SIZE(BO), %xmm10 1054 addps %xmm11, %xmm2 1055 movaps 144 * SIZE(BO), %xmm11 1056 addps %xmm10, %xmm3 1057 movaps -8 * SIZE(AO), %xmm10 1058 1059 mulps %xmm10, %xmm13 1060 addps %xmm13, %xmm0 1061 movaps 100 * SIZE(BO), %xmm13 1062 mulps %xmm10, %xmm13 1063 addps %xmm13, %xmm1 1064 movaps 104 * SIZE(BO), %xmm13 1065 mulps %xmm10, %xmm13 1066 mulps 108 * SIZE(BO), %xmm10 1067 addps %xmm13, %xmm2 1068 movaps 160 * SIZE(BO), %xmm13 1069 addps %xmm10, %xmm3 1070 movaps -4 * SIZE(AO), %xmm10 1071 1072 mulps %xmm10, %xmm15 1073 addps %xmm15, %xmm0 1074 movaps 116 * SIZE(BO), %xmm15 1075 mulps %xmm10, %xmm15 1076 addps %xmm15, %xmm1 1077 movaps 120 * SIZE(BO), %xmm15 1078 mulps %xmm10, %xmm15 1079 mulps 124 * SIZE(BO), %xmm10 1080 addps %xmm15, %xmm2 1081 movaps 176 * SIZE(BO), %xmm15 1082 addps %xmm10, %xmm3 1083 movaps 16 * SIZE(AO), %xmm10 1084 1085 addq $ 32 * SIZE, AO 1086 addq 
$128 * SIZE, BO 1087 decq %rax 1088 jne .L22 1089 ALIGN_4 1090 1091.L25: 1092#ifndef TRMMKERNEL 1093 movq K, %rax 1094#else 1095 movq KKK, %rax 1096#endif 1097 movaps ALPHA, %xmm15 1098 andq $7, %rax # if (k & 1) 1099 BRANCH 1100 je .L28 1101 ALIGN_4 1102 1103.L26: 1104 mulps %xmm8, %xmm9 1105 addps %xmm9, %xmm0 1106 movaps 4 * SIZE(BO), %xmm9 1107 mulps %xmm8, %xmm9 1108 addps %xmm9, %xmm1 1109 movaps 8 * SIZE(BO), %xmm9 1110 mulps %xmm8, %xmm9 1111 mulps 12 * SIZE(BO), %xmm8 1112 addps %xmm9, %xmm2 1113 movaps 16 * SIZE(BO), %xmm9 1114 addps %xmm8, %xmm3 1115 movaps -28 * SIZE(AO), %xmm8 1116 1117 addq $ 4 * SIZE, AO # aoffset += 4 1118 addq $16 * SIZE, BO # boffset1 += 8 1119 decq %rax 1120 jg .L26 1121 ALIGN_4 1122 1123.L28: 1124 mulps %xmm15, %xmm0 1125 mulps %xmm15, %xmm1 1126 mulps %xmm15, %xmm2 1127 mulps %xmm15, %xmm3 1128 1129#ifndef TRMMKERNEL 1130 movsd 0 * SIZE(CO1), %xmm8 1131 movhps 2 * SIZE(CO1), %xmm8 1132 movsd 0 * SIZE(CO2), %xmm10 1133 movhps 2 * SIZE(CO2), %xmm10 1134 1135 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1136 movhps 2 * SIZE(CO1, LDC, 2), %xmm12 1137 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1138 movhps 2 * SIZE(CO2, LDC, 2), %xmm14 1139 1140 addps %xmm8, %xmm0 1141 addps %xmm10, %xmm1 1142 addps %xmm12, %xmm2 1143 addps %xmm14, %xmm3 1144#endif 1145 1146 movlps %xmm0, 0 * SIZE(CO1) 1147 movhps %xmm0, 2 * SIZE(CO1) 1148 movlps %xmm1, 0 * SIZE(CO2) 1149 movhps %xmm1, 2 * SIZE(CO2) 1150 1151 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1152 movhps %xmm2, 2 * SIZE(CO1, LDC, 2) 1153 movlps %xmm3, 0 * SIZE(CO2, LDC, 2) 1154 movhps %xmm3, 2 * SIZE(CO2, LDC, 2) 1155 1156#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1157 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1158 movq K, %rax 1159 subq KKK, %rax 1160 leaq (,%rax, 8), %rax 1161 leaq (AO, %rax, 2), AO 1162 leaq (BO, %rax, 8), BO 1163#endif 1164 1165#if defined(TRMMKERNEL) && defined(LEFT) 1166 addq $4, KK 1167#endif 1168 1169 addq $4 * SIZE, CO1 # coffset += 4 1170 addq $4 
* SIZE, CO2 # coffset += 4 1171 ALIGN_4 1172 1173.L30: 1174 testq $2, M 1175 je .L40 1176 1177#if !defined(TRMMKERNEL) || \ 1178 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1179 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1180 1181 leaq BUFFER, BO 1182#else 1183 leaq BUFFER, BO 1184 movq KK, %rax 1185 leaq (, %rax, 8), %rax 1186 leaq (AO, %rax, 1), AO 1187 leaq (BO, %rax, 8), BO 1188#endif 1189 1190 movaps -32 * SIZE(AO), %xmm8 1191 movaps -24 * SIZE(AO), %xmm10 1192 1193 movaps 0 * SIZE(BO), %xmm9 1194 movaps 16 * SIZE(BO), %xmm11 1195 movaps 32 * SIZE(BO), %xmm13 1196 movaps 48 * SIZE(BO), %xmm15 1197 1198 xorps %xmm0, %xmm0 1199 xorps %xmm1, %xmm1 1200 xorps %xmm2, %xmm2 1201 xorps %xmm3, %xmm3 1202 1203#ifndef TRMMKERNEL 1204 movq K, %rax 1205#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1206 movq K, %rax 1207 subq KK, %rax 1208 movq %rax, KKK 1209#else 1210 movq KK, %rax 1211#ifdef LEFT 1212 addq $2, %rax 1213#else 1214 addq $4, %rax 1215#endif 1216 movq %rax, KKK 1217#endif 1218 sarq $3, %rax 1219 je .L35 1220 ALIGN_4 1221 1222.L32: 1223 mulps %xmm8, %xmm9 1224 addps %xmm9, %xmm0 1225#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1226 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1227#endif 1228 movaps 4 * SIZE(BO), %xmm9 1229 mulps %xmm8, %xmm9 1230 addps %xmm9, %xmm1 1231 movaps 8 * SIZE(BO), %xmm9 1232 mulps %xmm8, %xmm9 1233 addps %xmm9, %xmm2 1234 movaps 12 * SIZE(BO), %xmm9 1235 mulps %xmm8, %xmm9 1236 movsd -30 * SIZE(AO), %xmm8 1237 addps %xmm9, %xmm3 1238 movaps 64 * SIZE(BO), %xmm9 1239 1240 mulps %xmm8, %xmm11 1241 addps %xmm11, %xmm0 1242 movaps 20 * SIZE(BO), %xmm11 1243 mulps %xmm8, %xmm11 1244 addps %xmm11, %xmm1 1245 movaps 24 * SIZE(BO), %xmm11 1246 mulps %xmm8, %xmm11 1247 addps %xmm11, %xmm2 1248 movaps 28 * SIZE(BO), %xmm11 1249 mulps %xmm8, %xmm11 1250 movsd -28 * SIZE(AO), %xmm8 1251 addps %xmm11, %xmm3 1252 movaps 80 * SIZE(BO), %xmm11 1253 1254 mulps %xmm8, 
%xmm13 1255 addps %xmm13, %xmm0 1256 movaps 36 * SIZE(BO), %xmm13 1257 mulps %xmm8, %xmm13 1258 addps %xmm13, %xmm1 1259 movaps 40 * SIZE(BO), %xmm13 1260 mulps %xmm8, %xmm13 1261 addps %xmm13, %xmm2 1262 movaps 44 * SIZE(BO), %xmm13 1263 mulps %xmm8, %xmm13 1264 movsd -26 * SIZE(AO), %xmm8 1265 addps %xmm13, %xmm3 1266 movaps 96 * SIZE(BO), %xmm13 1267 1268 mulps %xmm8, %xmm15 1269 addps %xmm15, %xmm0 1270 movaps 52 * SIZE(BO), %xmm15 1271 mulps %xmm8, %xmm15 1272 addps %xmm15, %xmm1 1273 movaps 56 * SIZE(BO), %xmm15 1274 mulps %xmm8, %xmm15 1275 addps %xmm15, %xmm2 1276 movaps 60 * SIZE(BO), %xmm15 1277 mulps %xmm8, %xmm15 1278 movsd -16 * SIZE(AO), %xmm8 1279 addps %xmm15, %xmm3 1280 movaps 112 * SIZE(BO), %xmm15 1281 1282 mulps %xmm10, %xmm9 1283 addps %xmm9, %xmm0 1284 movaps 68 * SIZE(BO), %xmm9 1285 mulps %xmm10, %xmm9 1286 addps %xmm9, %xmm1 1287 movaps 72 * SIZE(BO), %xmm9 1288 mulps %xmm10, %xmm9 1289 addps %xmm9, %xmm2 1290 movaps 76 * SIZE(BO), %xmm9 1291 mulps %xmm10, %xmm9 1292 movsd -22 * SIZE(AO), %xmm10 1293 addps %xmm9, %xmm3 1294 movaps 128 * SIZE(BO), %xmm9 1295 1296 mulps %xmm10, %xmm11 1297 addps %xmm11, %xmm0 1298 movaps 84 * SIZE(BO), %xmm11 1299 mulps %xmm10, %xmm11 1300 addps %xmm11, %xmm1 1301 movaps 88 * SIZE(BO), %xmm11 1302 mulps %xmm10, %xmm11 1303 addps %xmm11, %xmm2 1304 movaps 92 * SIZE(BO), %xmm11 1305 mulps %xmm10, %xmm11 1306 movsd -20 * SIZE(AO), %xmm10 1307 addps %xmm11, %xmm3 1308 movaps 144 * SIZE(BO), %xmm11 1309 1310 mulps %xmm10, %xmm13 1311 addps %xmm13, %xmm0 1312 movaps 100 * SIZE(BO), %xmm13 1313 mulps %xmm10, %xmm13 1314 addps %xmm13, %xmm1 1315 movaps 104 * SIZE(BO), %xmm13 1316 mulps %xmm10, %xmm13 1317 addps %xmm13, %xmm2 1318 movaps 108 * SIZE(BO), %xmm13 1319 mulps %xmm10, %xmm13 1320 movsd -18 * SIZE(AO), %xmm10 1321 addps %xmm13, %xmm3 1322 movaps 160 * SIZE(BO), %xmm13 1323 1324 mulps %xmm10, %xmm15 1325 addps %xmm15, %xmm0 1326 movaps 116 * SIZE(BO), %xmm15 1327 mulps %xmm10, %xmm15 1328 addps %xmm15, %xmm1 
1329 movaps 120 * SIZE(BO), %xmm15 1330 mulps %xmm10, %xmm15 1331 addps %xmm15, %xmm2 1332 movaps 124 * SIZE(BO), %xmm15 1333 mulps %xmm10, %xmm15 1334 movsd -8 * SIZE(AO), %xmm10 1335 addps %xmm15, %xmm3 1336 movaps 176 * SIZE(BO), %xmm15 1337 1338 addq $ 16 * SIZE, AO 1339 addq $128 * SIZE, BO 1340 decq %rax 1341 jne .L32 1342 ALIGN_4 1343 1344.L35: 1345#ifndef TRMMKERNEL 1346 movq K, %rax 1347#else 1348 movq KKK, %rax 1349#endif 1350 movaps ALPHA, %xmm15 1351 andq $7, %rax # if (k & 1) 1352 BRANCH 1353 je .L38 1354 ALIGN_4 1355 1356.L36: 1357 mulps %xmm8, %xmm9 1358 addps %xmm9, %xmm0 1359 movaps 4 * SIZE(BO), %xmm9 1360 mulps %xmm8, %xmm9 1361 addps %xmm9, %xmm1 1362 movaps 8 * SIZE(BO), %xmm9 1363 mulps %xmm8, %xmm9 1364 addps %xmm9, %xmm2 1365 movaps 12 * SIZE(BO), %xmm9 1366 mulps %xmm8, %xmm9 1367 movsd -30 * SIZE(AO), %xmm8 1368 addps %xmm9, %xmm3 1369 movaps 16 * SIZE(BO), %xmm9 1370 1371 addq $ 2 * SIZE, AO # aoffset += 4 1372 addq $16 * SIZE, BO # boffset1 += 8 1373 decq %rax 1374 jg .L36 1375 ALIGN_4 1376 1377.L38: 1378 mulps %xmm15, %xmm0 1379 mulps %xmm15, %xmm1 1380 mulps %xmm15, %xmm2 1381 mulps %xmm15, %xmm3 1382 1383#ifndef TRMMKERNEL 1384#ifdef movsd 1385 xorps %xmm8, %xmm8 1386#endif 1387 movsd 0 * SIZE(CO1), %xmm8 1388#ifdef movsd 1389 xorps %xmm10, %xmm10 1390#endif 1391 movsd 0 * SIZE(CO2), %xmm10 1392#ifdef movsd 1393 xorps %xmm12, %xmm12 1394#endif 1395 movsd 0 * SIZE(CO1, LDC, 2), %xmm12 1396#ifdef movsd 1397 xorps %xmm14, %xmm14 1398#endif 1399 movsd 0 * SIZE(CO2, LDC, 2), %xmm14 1400 1401 addps %xmm8, %xmm0 1402 addps %xmm10, %xmm1 1403 addps %xmm12, %xmm2 1404 addps %xmm14, %xmm3 1405#endif 1406 1407 movlps %xmm0, 0 * SIZE(CO1) 1408 movlps %xmm1, 0 * SIZE(CO2) 1409 movlps %xmm2, 0 * SIZE(CO1, LDC, 2) 1410 movlps %xmm3, 0 * SIZE(CO2, LDC, 2) 1411 1412#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1413 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1414 movq K, %rax 1415 subq KKK, %rax 1416 leaq (,%rax, 
8), %rax 1417 leaq (AO, %rax, 1), AO 1418 leaq (BO, %rax, 8), BO 1419#endif 1420 1421#if defined(TRMMKERNEL) && defined(LEFT) 1422 addq $2, KK 1423#endif 1424 1425 addq $2 * SIZE, CO1 # coffset += 4 1426 addq $2 * SIZE, CO2 # coffset += 4 1427 ALIGN_4 1428 1429.L40: 1430 testq $1, M 1431 je .L49 1432 1433#if !defined(TRMMKERNEL) || \ 1434 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1435 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1436 1437 leaq BUFFER, BO 1438#else 1439 leaq BUFFER, BO 1440 movq KK, %rax 1441 leaq (, %rax, 4), %rax 1442 leaq (AO, %rax, 1), AO 1443 leaq (BO, %rax, 8), BO 1444 leaq (BO, %rax, 8), BO 1445#endif 1446 1447 movss -32 * SIZE(AO), %xmm8 1448 movss -28 * SIZE(AO), %xmm10 1449 1450 movss 0 * SIZE(BO), %xmm9 1451 movss 16 * SIZE(BO), %xmm11 1452 movss 32 * SIZE(BO), %xmm13 1453 movss 48 * SIZE(BO), %xmm15 1454 1455 xorps %xmm0, %xmm0 1456 xorps %xmm1, %xmm1 1457 xorps %xmm2, %xmm2 1458 xorps %xmm3, %xmm3 1459 1460#ifndef TRMMKERNEL 1461 movq K, %rax 1462#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1463 movq K, %rax 1464 subq KK, %rax 1465 movq %rax, KKK 1466#else 1467 movq KK, %rax 1468#ifdef LEFT 1469 addq $1, %rax 1470#else 1471 addq $4, %rax 1472#endif 1473 movq %rax, KKK 1474#endif 1475 sarq $3, %rax 1476 je .L45 1477 ALIGN_4 1478 1479.L42: 1480 mulss %xmm8, %xmm9 1481 addss %xmm9, %xmm0 1482#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1483 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1484#endif 1485 movss 4 * SIZE(BO), %xmm9 1486 mulss %xmm8, %xmm9 1487 addss %xmm9, %xmm1 1488 movss 8 * SIZE(BO), %xmm9 1489 mulss %xmm8, %xmm9 1490 addss %xmm9, %xmm2 1491 movss 12 * SIZE(BO), %xmm9 1492 mulss %xmm8, %xmm9 1493 movss -31 * SIZE(AO), %xmm8 1494 addss %xmm9, %xmm3 1495 movss 64 * SIZE(BO), %xmm9 1496 1497 mulss %xmm8, %xmm11 1498 addss %xmm11, %xmm0 1499 movss 20 * SIZE(BO), %xmm11 1500 mulss %xmm8, %xmm11 1501 addss %xmm11, %xmm1 1502 movss 24 * SIZE(BO), 
%xmm11 1503 mulss %xmm8, %xmm11 1504 addss %xmm11, %xmm2 1505 movss 28 * SIZE(BO), %xmm11 1506 mulss %xmm8, %xmm11 1507 movss -30 * SIZE(AO), %xmm8 1508 addss %xmm11, %xmm3 1509 movss 80 * SIZE(BO), %xmm11 1510 1511 mulss %xmm8, %xmm13 1512 addss %xmm13, %xmm0 1513 movss 36 * SIZE(BO), %xmm13 1514 mulss %xmm8, %xmm13 1515 addss %xmm13, %xmm1 1516 movss 40 * SIZE(BO), %xmm13 1517 mulss %xmm8, %xmm13 1518 addss %xmm13, %xmm2 1519 movss 44 * SIZE(BO), %xmm13 1520 mulss %xmm8, %xmm13 1521 movss -29 * SIZE(AO), %xmm8 1522 addss %xmm13, %xmm3 1523 movss 96 * SIZE(BO), %xmm13 1524 1525 mulss %xmm8, %xmm15 1526 addss %xmm15, %xmm0 1527 movss 52 * SIZE(BO), %xmm15 1528 mulss %xmm8, %xmm15 1529 addss %xmm15, %xmm1 1530 movss 56 * SIZE(BO), %xmm15 1531 mulss %xmm8, %xmm15 1532 addss %xmm15, %xmm2 1533 movss 60 * SIZE(BO), %xmm15 1534 mulss %xmm8, %xmm15 1535 movss -24 * SIZE(AO), %xmm8 1536 addss %xmm15, %xmm3 1537 movss 112 * SIZE(BO), %xmm15 1538 1539 mulss %xmm10, %xmm9 1540 addss %xmm9, %xmm0 1541 movss 68 * SIZE(BO), %xmm9 1542 mulss %xmm10, %xmm9 1543 addss %xmm9, %xmm1 1544 movss 72 * SIZE(BO), %xmm9 1545 mulss %xmm10, %xmm9 1546 addss %xmm9, %xmm2 1547 movss 76 * SIZE(BO), %xmm9 1548 mulss %xmm10, %xmm9 1549 movss -27 * SIZE(AO), %xmm10 1550 addss %xmm9, %xmm3 1551 movss 128 * SIZE(BO), %xmm9 1552 1553 mulss %xmm10, %xmm11 1554 addss %xmm11, %xmm0 1555 movss 84 * SIZE(BO), %xmm11 1556 mulss %xmm10, %xmm11 1557 addss %xmm11, %xmm1 1558 movss 88 * SIZE(BO), %xmm11 1559 mulss %xmm10, %xmm11 1560 addss %xmm11, %xmm2 1561 movss 92 * SIZE(BO), %xmm11 1562 mulss %xmm10, %xmm11 1563 movss -26 * SIZE(AO), %xmm10 1564 addss %xmm11, %xmm3 1565 movss 144 * SIZE(BO), %xmm11 1566 1567 mulss %xmm10, %xmm13 1568 addss %xmm13, %xmm0 1569 movss 100 * SIZE(BO), %xmm13 1570 mulss %xmm10, %xmm13 1571 addss %xmm13, %xmm1 1572 movss 104 * SIZE(BO), %xmm13 1573 mulss %xmm10, %xmm13 1574 addss %xmm13, %xmm2 1575 movss 108 * SIZE(BO), %xmm13 1576 mulss %xmm10, %xmm13 1577 movss -25 * SIZE(AO), 
%xmm10 1578 addss %xmm13, %xmm3 1579 movss 160 * SIZE(BO), %xmm13 1580 1581 mulss %xmm10, %xmm15 1582 addss %xmm15, %xmm0 1583 movss 116 * SIZE(BO), %xmm15 1584 mulss %xmm10, %xmm15 1585 addss %xmm15, %xmm1 1586 movss 120 * SIZE(BO), %xmm15 1587 mulss %xmm10, %xmm15 1588 addss %xmm15, %xmm2 1589 movss 124 * SIZE(BO), %xmm15 1590 mulss %xmm10, %xmm15 1591 movss -20 * SIZE(AO), %xmm10 1592 addss %xmm15, %xmm3 1593 movss 176 * SIZE(BO), %xmm15 1594 1595 addq $ 8 * SIZE, AO 1596 addq $128 * SIZE, BO 1597 decq %rax 1598 jne .L42 1599 ALIGN_4 1600 1601.L45: 1602#ifndef TRMMKERNEL 1603 movq K, %rax 1604#else 1605 movq KKK, %rax 1606#endif 1607 movaps ALPHA, %xmm15 1608 andq $7, %rax # if (k & 1) 1609 BRANCH 1610 je .L48 1611 ALIGN_4 1612 1613.L46: 1614 mulps %xmm8, %xmm9 1615 addps %xmm9, %xmm0 1616 movss 4 * SIZE(BO), %xmm9 1617 mulps %xmm8, %xmm9 1618 addps %xmm9, %xmm1 1619 movss 8 * SIZE(BO), %xmm9 1620 mulps %xmm8, %xmm9 1621 addps %xmm9, %xmm2 1622 movss 12 * SIZE(BO), %xmm9 1623 mulps %xmm8, %xmm9 1624 movss -31 * SIZE(AO), %xmm8 1625 addps %xmm9, %xmm3 1626 movss 16 * SIZE(BO), %xmm9 1627 1628 addq $ 1 * SIZE, AO # aoffset += 4 1629 addq $16 * SIZE, BO # boffset1 += 8 1630 decq %rax 1631 jg .L46 1632 ALIGN_4 1633 1634.L48: 1635 mulss %xmm15, %xmm0 1636 mulss %xmm15, %xmm1 1637 mulss %xmm15, %xmm2 1638 mulss %xmm15, %xmm3 1639 1640#ifndef TRMMKERNEL 1641 movss 0 * SIZE(CO1), %xmm8 1642 movss 0 * SIZE(CO2), %xmm10 1643 movss 0 * SIZE(CO1, LDC, 2), %xmm12 1644 movss 0 * SIZE(CO2, LDC, 2), %xmm14 1645 1646 addss %xmm8, %xmm0 1647 addss %xmm10, %xmm1 1648 addss %xmm12, %xmm2 1649 addss %xmm14, %xmm3 1650#endif 1651 1652 movss %xmm0, 0 * SIZE(CO1) 1653 movss %xmm1, 0 * SIZE(CO2) 1654 movss %xmm2, 0 * SIZE(CO1, LDC, 2) 1655 movss %xmm3, 0 * SIZE(CO2, LDC, 2) 1656 1657#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1658 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1659 movq K, %rax 1660 subq KKK, %rax 1661 leaq (,%rax, 4), %rax 1662 leaq 
(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

/* .L49: end of one pass over the current column panel.  Advances C by
   4 * LDC, decrements J and jumps back to .L01 while J is positive. */
.L49:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/* KK is a stack slot that is read with movq and updated with addq
	   in the neighbouring blocks; use the same 64-bit operand size
	   here so a carry out of the low dword is never dropped. */
	addq	$4, KK
#endif
	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01

/* .L50: the (N & 2) column pair; jump to .L100 when that bit is clear. */
.L50:
	testq	$2, N
	je	.L100

.L51:
#if defined(TRMMKERNEL) && defined(LEFT)
	/* Restart KK from OFFSET for this panel. */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax		# rax = k >> 2, 8 values of B per pass
	jle	.L53
	ALIGN_4

/* .L52: read 8 scalars from B and write each one replicated into BO
   (32 * SIZE written per pass), then advance B by 8 * SIZE and BO by
   32 * SIZE.  Exactly one of the two variants below is expected to be
   compiled in, depending on the target macro. */
.L52:
#if defined(PENTIUM4) || defined(GENERIC)
	movss	 0 * SIZE(B), %xmm0
	movss	 1 * SIZE(B), %xmm1
	movss	 2 * SIZE(B), %xmm2
	movss	 3 * SIZE(B), %xmm3
	movss	 4 * SIZE(B), %xmm4
	movss	 5 * SIZE(B), %xmm5
	movss	 6 * SIZE(B), %xmm6
	movss	 7 * SIZE(B), %xmm7

	PREFETCH	 32 * SIZE(B)

	/* shufps $0 broadcasts the low element to all four lanes. */
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
	shufps	$0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	addq	$ 8 * SIZE, B
	addq	$32 * SIZE, BO
#endif

#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
	PREFETCH	 32 * SIZE(B)

	movd	 0 * SIZE(B), %mm0
	movd	 1 * SIZE(B), %mm1
	movd	 2 * SIZE(B), %mm2
	movd	 3 * SIZE(B), %mm3
	movd	 4 * SIZE(B), %mm4
	movd	 5 * SIZE(B), %mm5
	movd	 6 * SIZE(B), %mm6
	movd	 7 * SIZE(B), %mm7

	/* punpckldq with itself duplicates the low dword into both halves. */
	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1
	punpckldq %mm2, %mm2
	punpckldq %mm3, %mm3
	punpckldq %mm4, %mm4
punpckldq %mm5, %mm5
	punpckldq %mm6, %mm6
	punpckldq %mm7, %mm7

	/* Every MMX register is stored twice, 2 * SIZE apart, so each B
	   value occupies 4 * SIZE of BO (four copies if SIZE is 4 bytes;
	   TODO confirm SIZE against common.h). */
	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  2 * SIZE(BO)
	movq	%mm1,  4 * SIZE(BO)
	movq	%mm1,  6 * SIZE(BO)
	movq	%mm2,  8 * SIZE(BO)
	movq	%mm2, 10 * SIZE(BO)
	movq	%mm3, 12 * SIZE(BO)
	movq	%mm3, 14 * SIZE(BO)
	movq	%mm4, 16 * SIZE(BO)
	movq	%mm4, 18 * SIZE(BO)
	movq	%mm5, 20 * SIZE(BO)
	movq	%mm5, 22 * SIZE(BO)
	movq	%mm6, 24 * SIZE(BO)
	movq	%mm6, 26 * SIZE(BO)
	movq	%mm7, 28 * SIZE(BO)
	movq	%mm7, 30 * SIZE(BO)

	addq	$ 8 * SIZE, B
	addq	$32 * SIZE, BO
#endif

	decq	%rax
	jne	.L52
	ALIGN_4

/* .L53: leftover of the B copy.  rax = K & 3; each pass of .L54 expands
   2 values of B into 8 * SIZE of BO.  Falls to .L60 when rax is zero. */
.L53:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L60
	ALIGN_4

.L54:
#if defined(PENTIUM4) || defined(GENERIC)
	movss	 0 * SIZE(B), %xmm0
	movss	 1 * SIZE(B), %xmm1

	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
#endif

#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
	movd	 0 * SIZE(B), %mm0
	movd	 1 * SIZE(B), %mm1

	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  2 * SIZE(BO)
	movq	%mm1,  4 * SIZE(BO)
	movq	%mm1,  6 * SIZE(BO)
#endif

	addq	$ 2 * SIZE, B
	addq	$ 8 * SIZE, BO
	decq	%rax
	jne	.L54
	ALIGN_4

/* .L60: set up the output pointers for two columns of C and restart A.
   I counts blocks of 8 rows; jump to .L70 when M >> 3 is zero. */
.L60:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO			# aoffset = a

	movq	M, I
	sarq	$3, I			# i = (m >> 3)
	jle	.L70
	ALIGN_4

/* .L61: point BO at the start of BUFFER.  In the remaining TRMM
   configurations (the #else branch) AO and BO are additionally
   advanced by KK * 32 bytes each (rax = KK * 8, scaled by 4). */
.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax,   8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

movaps -32 * SIZE(AO), %xmm8 1842 movaps -16 * SIZE(AO), %xmm10 1843 movaps 0 * SIZE(AO), %xmm12 1844 movaps 16 * SIZE(AO), %xmm14 1845 1846 movaps 0 * SIZE(BO), %xmm9 1847 movaps 16 * SIZE(BO), %xmm11 1848 movaps 32 * SIZE(BO), %xmm13 1849 movaps 48 * SIZE(BO), %xmm15 1850 1851 xorps %xmm0, %xmm0 1852 xorps %xmm1, %xmm1 1853 1854 PREFETCHW 7 * SIZE(CO1) 1855 xorps %xmm4, %xmm4 1856 PREFETCHW 7 * SIZE(CO2) 1857 xorps %xmm5, %xmm5 1858 1859#ifndef TRMMKERNEL 1860 movq K, %rax 1861#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1862 movq K, %rax 1863 subq KK, %rax 1864 movq %rax, KKK 1865#else 1866 movq KK, %rax 1867#ifdef LEFT 1868 addq $8, %rax 1869#else 1870 addq $2, %rax 1871#endif 1872 movq %rax, KKK 1873#endif 1874 sarq $3, %rax 1875 je .L65 1876 ALIGN_4 1877 1878.L62: 1879 mulps %xmm8, %xmm9 1880#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1881 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1882#endif 1883 mulps 4 * SIZE(BO), %xmm8 1884 addps %xmm9, %xmm0 1885 movaps 0 * SIZE(BO), %xmm9 1886 addps %xmm8, %xmm1 1887 movaps -28 * SIZE(AO), %xmm8 1888 mulps %xmm8, %xmm9 1889 mulps 4 * SIZE(BO), %xmm8 1890 addps %xmm9, %xmm4 1891 movaps 8 * SIZE(BO), %xmm9 1892 addps %xmm8, %xmm5 1893 movaps -24 * SIZE(AO), %xmm8 1894 1895 mulps %xmm8, %xmm9 1896 mulps 12 * SIZE(BO), %xmm8 1897 addps %xmm9, %xmm0 1898 movaps 8 * SIZE(BO), %xmm9 1899 addps %xmm8, %xmm1 1900 movaps -20 * SIZE(AO), %xmm8 1901 mulps %xmm8, %xmm9 1902 mulps 12 * SIZE(BO), %xmm8 1903 addps %xmm9, %xmm4 1904 movaps 64 * SIZE(BO), %xmm9 1905 addps %xmm8, %xmm5 1906 movaps 32 * SIZE(AO), %xmm8 1907 1908#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1909 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 1910#endif 1911 mulps %xmm10, %xmm11 1912 mulps 20 * SIZE(BO), %xmm10 1913 addps %xmm11, %xmm0 1914 movaps 16 * SIZE(BO), %xmm11 1915 addps %xmm10, %xmm1 1916 movaps -12 * SIZE(AO), %xmm10 1917 mulps %xmm10, %xmm11 1918 mulps 20 * SIZE(BO), %xmm10 1919 addps 
%xmm11, %xmm4 1920 movaps 24 * SIZE(BO), %xmm11 1921 addps %xmm10, %xmm5 1922 movaps -8 * SIZE(AO), %xmm10 1923 1924 mulps %xmm10, %xmm11 1925 mulps 28 * SIZE(BO), %xmm10 1926 addps %xmm11, %xmm0 1927 movaps 24 * SIZE(BO), %xmm11 1928 addps %xmm10, %xmm1 1929 movaps -4 * SIZE(AO), %xmm10 1930 mulps %xmm10, %xmm11 1931 mulps 28 * SIZE(BO), %xmm10 1932 addps %xmm11, %xmm4 1933 movaps 80 * SIZE(BO), %xmm11 1934 addps %xmm10, %xmm5 1935 movaps 48 * SIZE(AO), %xmm10 1936 1937#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1938 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) 1939#endif 1940 mulps %xmm12, %xmm13 1941 mulps 36 * SIZE(BO), %xmm12 1942 addps %xmm13, %xmm0 1943 movaps 32 * SIZE(BO), %xmm13 1944 addps %xmm12, %xmm1 1945 movaps 4 * SIZE(AO), %xmm12 1946 mulps %xmm12, %xmm13 1947 mulps 36 * SIZE(BO), %xmm12 1948 addps %xmm13, %xmm4 1949 movaps 40 * SIZE(BO), %xmm13 1950 addps %xmm12, %xmm5 1951 movaps 8 * SIZE(AO), %xmm12 1952 1953 mulps %xmm12, %xmm13 1954 mulps 44 * SIZE(BO), %xmm12 1955 addps %xmm13, %xmm0 1956 movaps 40 * SIZE(BO), %xmm13 1957 addps %xmm12, %xmm1 1958 movaps 12 * SIZE(AO), %xmm12 1959 mulps %xmm12, %xmm13 1960 mulps 44 * SIZE(BO), %xmm12 1961 addps %xmm13, %xmm4 1962 movaps 96 * SIZE(BO), %xmm13 1963 addps %xmm12, %xmm5 1964 movaps 64 * SIZE(AO), %xmm12 1965 1966#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 1967 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) 1968#endif 1969 mulps %xmm14, %xmm15 1970 mulps 52 * SIZE(BO), %xmm14 1971 addps %xmm15, %xmm0 1972 movaps 48 * SIZE(BO), %xmm15 1973 addps %xmm14, %xmm1 1974 movaps 20 * SIZE(AO), %xmm14 1975 mulps %xmm14, %xmm15 1976 mulps 52 * SIZE(BO), %xmm14 1977 addps %xmm15, %xmm4 1978 movaps 56 * SIZE(BO), %xmm15 1979 addps %xmm14, %xmm5 1980 movaps 24 * SIZE(AO), %xmm14 1981 1982 mulps %xmm14, %xmm15 1983 mulps 60 * SIZE(BO), %xmm14 1984 addps %xmm15, %xmm0 1985 movaps 56 * SIZE(BO), %xmm15 1986 addps %xmm14, %xmm1 1987 movaps 28 * SIZE(AO), %xmm14 1988 mulps %xmm14, %xmm15 
1989 mulps 60 * SIZE(BO), %xmm14 1990 addps %xmm15, %xmm4 1991 movaps 112 * SIZE(BO), %xmm15 1992 addps %xmm14, %xmm5 1993 movaps 80 * SIZE(AO), %xmm14 1994 1995 addq $64 * SIZE, AO 1996 addq $64 * SIZE, BO 1997 decq %rax 1998 jne .L62 1999 ALIGN_4 2000 2001.L65: 2002#ifndef TRMMKERNEL 2003 movq K, %rax 2004#else 2005 movq KKK, %rax 2006#endif 2007 movaps ALPHA, %xmm15 2008 andq $7, %rax # if (k & 1) 2009 BRANCH 2010 je .L68 2011 ALIGN_4 2012 2013.L66: 2014 mulps %xmm8, %xmm9 2015 mulps 4 * SIZE(BO), %xmm8 2016 addps %xmm9, %xmm0 2017 movaps 0 * SIZE(BO), %xmm9 2018 addps %xmm8, %xmm1 2019 movaps -28 * SIZE(AO), %xmm8 2020 mulps %xmm8, %xmm9 2021 mulps 4 * SIZE(BO), %xmm8 2022 addps %xmm9, %xmm4 2023 movaps 8 * SIZE(BO), %xmm9 2024 addps %xmm8, %xmm5 2025 movaps -24 * SIZE(AO), %xmm8 2026 2027 addq $8 * SIZE, AO # aoffset += 4 2028 addq $8 * SIZE, BO # boffset1 += 8 2029 decq %rax 2030 jg .L66 2031 ALIGN_4 2032 2033.L68: 2034#ifndef TRMMKERNEL 2035 movsd 0 * SIZE(CO1), %xmm8 2036 movhps 2 * SIZE(CO1), %xmm8 2037 movsd 4 * SIZE(CO1), %xmm9 2038 movhps 6 * SIZE(CO1), %xmm9 2039 2040 movsd 0 * SIZE(CO2), %xmm10 2041 movhps 2 * SIZE(CO2), %xmm10 2042 movsd 4 * SIZE(CO2), %xmm11 2043 movhps 6 * SIZE(CO2), %xmm11 2044#endif 2045 2046 mulps %xmm15, %xmm0 2047 mulps %xmm15, %xmm4 2048 mulps %xmm15, %xmm1 2049 mulps %xmm15, %xmm5 2050 2051#ifndef TRMMKERNEL 2052 addps %xmm8, %xmm0 2053 addps %xmm9, %xmm4 2054 addps %xmm10, %xmm1 2055 addps %xmm11, %xmm5 2056#endif 2057 2058 movlps %xmm0, 0 * SIZE(CO1) 2059 movhps %xmm0, 2 * SIZE(CO1) 2060 movlps %xmm4, 4 * SIZE(CO1) 2061 movhps %xmm4, 6 * SIZE(CO1) 2062 2063 movlps %xmm1, 0 * SIZE(CO2) 2064 movhps %xmm1, 2 * SIZE(CO2) 2065 movlps %xmm5, 4 * SIZE(CO2) 2066 movhps %xmm5, 6 * SIZE(CO2) 2067 2068#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2069 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2070 movq K, %rax 2071 subq KKK, %rax 2072 leaq (,%rax, 8), %rax 2073 leaq (AO, %rax, 4), AO 2074 leaq 
(BO, %rax, 4), BO 2075#endif 2076 2077#if defined(TRMMKERNEL) && defined(LEFT) 2078 addq $8, KK 2079#endif 2080 2081 addq $8 * SIZE, CO1 # coffset += 4 2082 addq $8 * SIZE, CO2 # coffset += 4 2083 decq I # i -- 2084 jg .L61 2085 ALIGN_4 2086 2087.L70: 2088 testq $4, M 2089 je .L80 2090 2091 2092#if !defined(TRMMKERNEL) || \ 2093 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2094 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2095 2096 leaq BUFFER, BO 2097#else 2098 leaq BUFFER, BO 2099 movq KK, %rax 2100 leaq (, %rax, 8), %rax 2101 leaq (AO, %rax, 2), AO 2102 leaq (BO, %rax, 4), BO 2103#endif 2104 2105 movaps -32 * SIZE(AO), %xmm8 2106 movaps -16 * SIZE(AO), %xmm10 2107 2108 movaps 0 * SIZE(BO), %xmm9 2109 movaps 16 * SIZE(BO), %xmm11 2110 movaps 32 * SIZE(BO), %xmm13 2111 movaps 48 * SIZE(BO), %xmm15 2112 2113 xorps %xmm0, %xmm0 2114 xorps %xmm1, %xmm1 2115 xorps %xmm2, %xmm2 2116 xorps %xmm3, %xmm3 2117 2118#ifndef TRMMKERNEL 2119 movq K, %rax 2120#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2121 movq K, %rax 2122 subq KK, %rax 2123 movq %rax, KKK 2124#else 2125 movq KK, %rax 2126#ifdef LEFT 2127 addq $4, %rax 2128#else 2129 addq $2, %rax 2130#endif 2131 movq %rax, KKK 2132#endif 2133 sarq $3, %rax 2134 je .L75 2135 ALIGN_4 2136 2137.L72: 2138 mulps %xmm8, %xmm9 2139#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2140 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2141#endif 2142 2143 mulps 4 * SIZE(BO), %xmm8 2144 addps %xmm9, %xmm0 2145 movaps 8 * SIZE(BO), %xmm9 2146 addps %xmm8, %xmm1 2147 movaps -28 * SIZE(AO), %xmm8 2148 2149 mulps %xmm8, %xmm9 2150 mulps 12 * SIZE(BO), %xmm8 2151 addps %xmm9, %xmm2 2152 movaps 64 * SIZE(BO), %xmm9 2153 addps %xmm8, %xmm3 2154 movaps -24 * SIZE(AO), %xmm8 2155 2156 mulps %xmm8, %xmm11 2157 mulps 20 * SIZE(BO), %xmm8 2158 addps %xmm11, %xmm0 2159 movaps 24 * SIZE(BO), %xmm11 2160 addps %xmm8, %xmm1 2161 movaps -20 * SIZE(AO), %xmm8 2162 2163 mulps 
%xmm8, %xmm11 2164 mulps 28 * SIZE(BO), %xmm8 2165 addps %xmm11, %xmm2 2166 movaps 80 * SIZE(BO), %xmm11 2167 addps %xmm8, %xmm3 2168 movaps 0 * SIZE(AO), %xmm8 2169 2170 mulps %xmm10, %xmm13 2171 mulps 36 * SIZE(BO), %xmm10 2172 addps %xmm13, %xmm0 2173 movaps 40 * SIZE(BO), %xmm13 2174 addps %xmm10, %xmm1 2175 movaps -12 * SIZE(AO), %xmm10 2176 2177 mulps %xmm10, %xmm13 2178 mulps 44 * SIZE(BO), %xmm10 2179 addps %xmm13, %xmm2 2180 movaps 96 * SIZE(BO), %xmm13 2181 addps %xmm10, %xmm3 2182 movaps -8 * SIZE(AO), %xmm10 2183 2184 mulps %xmm10, %xmm15 2185 mulps 52 * SIZE(BO), %xmm10 2186 addps %xmm15, %xmm0 2187 movaps 56 * SIZE(BO), %xmm15 2188 addps %xmm10, %xmm1 2189 movaps -4 * SIZE(AO), %xmm10 2190 2191 mulps %xmm10, %xmm15 2192 mulps 60 * SIZE(BO), %xmm10 2193 addps %xmm15, %xmm2 2194 movaps 112 * SIZE(BO), %xmm15 2195 addps %xmm10, %xmm3 2196 movaps 16 * SIZE(AO), %xmm10 2197 2198 addq $32 * SIZE, AO 2199 addq $64 * SIZE, BO 2200 decq %rax 2201 jne .L72 2202 ALIGN_4 2203 2204.L75: 2205#ifndef TRMMKERNEL 2206 movq K, %rax 2207#else 2208 movq KKK, %rax 2209#endif 2210 movaps ALPHA, %xmm15 2211 andq $7, %rax # if (k & 1) 2212 BRANCH 2213 je .L78 2214 ALIGN_4 2215 2216.L76: 2217 mulps %xmm8, %xmm9 2218 mulps 4 * SIZE(BO), %xmm8 2219 addps %xmm9, %xmm0 2220 movaps 8 * SIZE(BO), %xmm9 2221 addps %xmm8, %xmm1 2222 movaps -28 * SIZE(AO), %xmm8 2223 2224 addq $4 * SIZE, AO # aoffset += 4 2225 addq $8 * SIZE, BO # boffset1 += 8 2226 decq %rax 2227 jg .L76 2228 ALIGN_4 2229 2230.L78: 2231#ifndef TRMMKERNEL 2232 movsd 0 * SIZE(CO1), %xmm8 2233 movhps 2 * SIZE(CO1), %xmm8 2234 movsd 0 * SIZE(CO2), %xmm10 2235 movhps 2 * SIZE(CO2), %xmm10 2236#endif 2237 2238 addps %xmm2, %xmm0 2239 addps %xmm3, %xmm1 2240 2241 mulps %xmm15, %xmm0 2242 mulps %xmm15, %xmm1 2243 2244#ifndef TRMMKERNEL 2245 addps %xmm8, %xmm0 2246 addps %xmm10, %xmm1 2247#endif 2248 2249 movlps %xmm0, 0 * SIZE(CO1) 2250 movhps %xmm0, 2 * SIZE(CO1) 2251 movlps %xmm1, 0 * SIZE(CO2) 2252 movhps %xmm1, 2 * 
SIZE(CO2) 2253 2254#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2255 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2256 movq K, %rax 2257 subq KKK, %rax 2258 leaq (,%rax, 8), %rax 2259 leaq (AO, %rax, 2), AO 2260 leaq (BO, %rax, 4), BO 2261#endif 2262 2263#if defined(TRMMKERNEL) && defined(LEFT) 2264 addq $4, KK 2265#endif 2266 2267 addq $4 * SIZE, CO1 # coffset += 4 2268 addq $4 * SIZE, CO2 # coffset += 4 2269 ALIGN_4 2270 2271.L80: 2272 testq $2, M 2273 je .L90 2274 2275#if !defined(TRMMKERNEL) || \ 2276 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2277 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2278 2279 leaq BUFFER, BO 2280#else 2281 leaq BUFFER, BO 2282 movq KK, %rax 2283 leaq (, %rax, 8), %rax 2284 leaq (AO, %rax, 1), AO 2285 leaq (BO, %rax, 4), BO 2286#endif 2287 2288 movaps -32 * SIZE(AO), %xmm8 2289 movaps -24 * SIZE(AO), %xmm10 2290 2291 movaps 0 * SIZE(BO), %xmm9 2292 movaps 16 * SIZE(BO), %xmm11 2293 movaps 32 * SIZE(BO), %xmm13 2294 movaps 48 * SIZE(BO), %xmm15 2295 2296 xorps %xmm0, %xmm0 2297 xorps %xmm1, %xmm1 2298 xorps %xmm2, %xmm2 2299 xorps %xmm3, %xmm3 2300 2301#ifndef TRMMKERNEL 2302 movq K, %rax 2303#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2304 movq K, %rax 2305 subq KK, %rax 2306 movq %rax, KKK 2307#else 2308 movq KK, %rax 2309#ifdef LEFT 2310 addq $2, %rax 2311#else 2312 addq $2, %rax 2313#endif 2314 movq %rax, KKK 2315#endif 2316 sarq $3, %rax 2317 je .L85 2318 ALIGN_4 2319 2320.L82: 2321 mulps %xmm8, %xmm9 2322 addps %xmm9, %xmm0 2323#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2324 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2325#endif 2326 movaps 4 * SIZE(BO), %xmm9 2327 mulps %xmm8, %xmm9 2328 movsd -30 * SIZE(AO), %xmm8 2329 addps %xmm9, %xmm1 2330 movaps 8 * SIZE(BO), %xmm9 2331 2332 mulps %xmm8, %xmm9 2333 addps %xmm9, %xmm2 2334 movaps 12 * SIZE(BO), %xmm9 2335 mulps %xmm8, %xmm9 2336 movsd -28 * SIZE(AO), 
%xmm8 2337 addps %xmm9, %xmm3 2338 movaps 64 * SIZE(BO), %xmm9 2339 2340 mulps %xmm8, %xmm11 2341 addps %xmm11, %xmm0 2342 movaps 20 * SIZE(BO), %xmm11 2343 mulps %xmm8, %xmm11 2344 movsd -26 * SIZE(AO), %xmm8 2345 addps %xmm11, %xmm1 2346 movaps 24 * SIZE(BO), %xmm11 2347 2348 mulps %xmm8, %xmm11 2349 addps %xmm11, %xmm2 2350 movaps 28 * SIZE(BO), %xmm11 2351 mulps %xmm8, %xmm11 2352 movsd -16 * SIZE(AO), %xmm8 2353 addps %xmm11, %xmm3 2354 movaps 80 * SIZE(BO), %xmm11 2355 2356 mulps %xmm10, %xmm13 2357 addps %xmm13, %xmm0 2358 movaps 36 * SIZE(BO), %xmm13 2359 mulps %xmm10, %xmm13 2360 movsd -22 * SIZE(AO), %xmm10 2361 addps %xmm13, %xmm1 2362 movaps 40 * SIZE(BO), %xmm13 2363 2364 mulps %xmm10, %xmm13 2365 addps %xmm13, %xmm2 2366 movaps 44 * SIZE(BO), %xmm13 2367 mulps %xmm10, %xmm13 2368 movsd -20 * SIZE(AO), %xmm10 2369 addps %xmm13, %xmm3 2370 movaps 96 * SIZE(BO), %xmm13 2371 2372 mulps %xmm10, %xmm15 2373 addps %xmm15, %xmm0 2374 movaps 52 * SIZE(BO), %xmm15 2375 mulps %xmm10, %xmm15 2376 movsd -18 * SIZE(AO), %xmm10 2377 addps %xmm15, %xmm1 2378 movaps 56 * SIZE(BO), %xmm15 2379 2380 mulps %xmm10, %xmm15 2381 addps %xmm15, %xmm2 2382 movaps 60 * SIZE(BO), %xmm15 2383 mulps %xmm10, %xmm15 2384 movsd -8 * SIZE(AO), %xmm10 2385 addps %xmm15, %xmm3 2386 movaps 112 * SIZE(BO), %xmm15 2387 2388 addq $16 * SIZE, AO 2389 addq $64 * SIZE, BO 2390 decq %rax 2391 jne .L82 2392 ALIGN_4 2393 2394.L85: 2395#ifndef TRMMKERNEL 2396 movq K, %rax 2397#else 2398 movq KKK, %rax 2399#endif 2400 movaps ALPHA, %xmm15 2401 andq $7, %rax # if (k & 1) 2402 BRANCH 2403 je .L88 2404 ALIGN_4 2405 2406.L86: 2407 mulps %xmm8, %xmm9 2408 addps %xmm9, %xmm0 2409 movaps 4 * SIZE(BO), %xmm9 2410 mulps %xmm8, %xmm9 2411 movsd -30 * SIZE(AO), %xmm8 2412 addps %xmm9, %xmm1 2413 movaps 8 * SIZE(BO), %xmm9 2414 2415 addq $2 * SIZE, AO # aoffset += 4 2416 addq $8 * SIZE, BO # boffset1 += 8 2417 decq %rax 2418 jg .L86 2419 ALIGN_4 2420 2421.L88: 2422#ifndef TRMMKERNEL 2423#ifdef movsd 2424 xorps 
%xmm8, %xmm8 2425#endif 2426 movsd 0 * SIZE(CO1), %xmm8 2427#ifdef movsd 2428 xorps %xmm10, %xmm10 2429#endif 2430 movsd 0 * SIZE(CO2), %xmm10 2431#endif 2432 2433 addps %xmm2, %xmm0 2434 addps %xmm3, %xmm1 2435 2436 mulps %xmm15, %xmm0 2437 mulps %xmm15, %xmm1 2438 2439#ifndef TRMMKERNEL 2440 addps %xmm8, %xmm0 2441 addps %xmm10, %xmm1 2442#endif 2443 2444 movlps %xmm0, 0 * SIZE(CO1) 2445 movlps %xmm1, 0 * SIZE(CO2) 2446 2447#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2448 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2449 movq K, %rax 2450 subq KKK, %rax 2451 leaq (,%rax, 8), %rax 2452 leaq (AO, %rax, 1), AO 2453 leaq (BO, %rax, 4), BO 2454#endif 2455 2456#if defined(TRMMKERNEL) && defined(LEFT) 2457 addq $2, KK 2458#endif 2459 2460 addq $2 * SIZE, CO1 # coffset += 4 2461 addq $2 * SIZE, CO2 # coffset += 4 2462 ALIGN_4 2463 2464.L90: 2465 testq $1, M 2466 je .L99 2467 2468#if !defined(TRMMKERNEL) || \ 2469 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2470 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2471 2472 leaq BUFFER, BO 2473#else 2474 leaq BUFFER, BO 2475 movq KK, %rax 2476 leaq (, %rax, 4), %rax 2477 leaq (AO, %rax, 1), AO 2478 leaq (BO, %rax, 8), BO 2479#endif 2480 2481 movss -32 * SIZE(AO), %xmm8 2482 movss -28 * SIZE(AO), %xmm10 2483 2484 movss 0 * SIZE(BO), %xmm9 2485 movss 16 * SIZE(BO), %xmm11 2486 movss 32 * SIZE(BO), %xmm13 2487 movss 48 * SIZE(BO), %xmm15 2488 2489 xorps %xmm0, %xmm0 2490 xorps %xmm1, %xmm1 2491 xorps %xmm2, %xmm2 2492 xorps %xmm3, %xmm3 2493 2494#ifndef TRMMKERNEL 2495 movq K, %rax 2496#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2497 movq K, %rax 2498 subq KK, %rax 2499 movq %rax, KKK 2500#else 2501 movq KK, %rax 2502#ifdef LEFT 2503 addq $1, %rax 2504#else 2505 addq $2, %rax 2506#endif 2507 movq %rax, KKK 2508#endif 2509 sarq $3, %rax 2510 je .L95 2511 ALIGN_4 2512 2513.L92: 2514 mulps %xmm8, %xmm9 2515 addps %xmm9, %xmm0 
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
	PREFETCH	(PREFETCHSIZE +  0) * SIZE(AO)
#endif
	/* Pattern for every group in this unrolled loop body: multiply a
	   value loaded from AO (xmm8 or xmm10) by a value loaded from BO
	   (xmm9/xmm11/xmm13/xmm15), accumulate into one of xmm0..xmm3,
	   and reload the operand registers for a later group.  The last
	   load of each operand register reaches past this pass and is
	   consumed after AO/BO are advanced below. */
	movss	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movss	12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-30 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm3
	movss	64 * SIZE(BO), %xmm9

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movss	20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movss	-29 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movss	24 * SIZE(BO), %xmm11

	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movss	28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movss	-24 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm3
	movss	80 * SIZE(BO), %xmm11

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movss	36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movss	-27 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movss	40 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movss	44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movss	-26 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm3
	movss	96 * SIZE(BO), %xmm13

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movss	52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movss	-25 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movss	56 * SIZE(BO), %xmm15

	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movss	60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movss	-20 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm3
	movss	112 * SIZE(BO), %xmm15

	/* One pass consumes 8 * SIZE of AO and 64 * SIZE of BO. */
	addq	$ 8 * SIZE, AO
	addq	$64 * SIZE, BO
	decq	%rax
	jne	.L92
	ALIGN_4

/* .L95: reload the iteration count (K, or KKK for TRMM), keep only its
   low three bits, and load ALPHA into xmm15 (overwriting the B operand
   it held in the loop above).  Jump to .L98 when nothing is left. */
.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA, %xmm15
	andq	$7, %rax		# rax = k & 7 (leftover iterations)
	BRANCH
	je	.L98
	ALIGN_4

/* .L96: leftover loop, one k step per pass.  Multiplies the A scalar
   in xmm8 by the two B values at 0 and 4 * SIZE of BO and accumulates
   into xmm0 and xmm1. */
.L96:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movss	 4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movss	-31 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movss	 8 * SIZE(BO), %xmm9

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$8 * SIZE, BO		# boffset1 += 8
	decq	%rax
	jg	.L96
	ALIGN_4

/* .L98: fold the partial sums (xmm2 into xmm0, xmm3 into xmm1), scale
   by ALPHA (xmm15), add the existing C values unless building the TRMM
   kernel, and store one element to each of CO1 and CO2. */
.L98:
#ifndef TRMMKERNEL
	movss	0 * SIZE(CO1), %xmm8
	movss	0 * SIZE(CO2), %xmm10
#endif

	addss	%xmm2, %xmm0
	addss	%xmm3, %xmm1
	mulss	%xmm15, %xmm0
	mulss	%xmm15, %xmm1

#ifndef TRMMKERNEL
	addss	%xmm8, %xmm0
	addss	%xmm10, %xmm1
#endif

	movss	%xmm0, 0 * SIZE(CO1)
	movss	%xmm1, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Skip the K - KKK steps that were not computed: AO moves by
	   4 bytes and BO by 32 bytes per skipped step. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax,    4), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

/* .L99: end of the two-column panel; advance C by 2 * LDC. */
.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/* KK is a stack slot that is read with movq and updated with addq
	   in the neighbouring blocks; use the same 64-bit operand size
	   here so a carry out of the low dword is never dropped. */
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C		# c += 2 * ldc
	ALIGN_4


/* .L100: the (N & 1) column; jump to .L999 when that bit is clear. */
.L100:
	testq	$1, N
	je	.L999

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	/* Restart KK from OFFSET for this panel. */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$3, %rax		# rax = k >> 3, 8 values of B per pass
	jle	.L103
	ALIGN_4


/* .L102: read 8 scalars from B and write each one replicated into BO
   (32 * SIZE written per pass). */
.L102:
#if defined(PENTIUM4) || defined(GENERIC)
	movss	 0 * SIZE(B), %xmm0
	movss	 1 * SIZE(B), %xmm1
	movss	 2 * SIZE(B), %xmm2
	movss	 3 * SIZE(B), %xmm3
	movss	 4 * SIZE(B), %xmm4
	movss	 5 * SIZE(B), %xmm5
	movss	 6 * SIZE(B), %xmm6
	movss	 7 * SIZE(B), %xmm7

	PREFETCH	 32 * SIZE(B)

	/* shufps $0 broadcasts the low element to all four lanes. */
	shufps	$0, %xmm0, %xmm0
	shufps	$0, %xmm1, %xmm1
	shufps	$0, %xmm2, %xmm2
	shufps	$0, %xmm3, %xmm3
	shufps	$0, %xmm4, %xmm4
shufps $0, %xmm5, %xmm5
	shufps	$0, %xmm6, %xmm6
	shufps	$0, %xmm7, %xmm7

	/* Each broadcast register fills 4 * SIZE of BO. */
	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	addq	$ 8 * SIZE, B
	addq	$32 * SIZE, BO
#endif

#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
	/* MMX variant of the same expansion: movd loads one value,
	   punpckldq duplicates it, and two movq stores 2 * SIZE apart
	   write it out. */
	PREFETCH	 32 * SIZE(B)

	movd	 0 * SIZE(B), %mm0
	movd	 1 * SIZE(B), %mm1
	movd	 2 * SIZE(B), %mm2
	movd	 3 * SIZE(B), %mm3
	movd	 4 * SIZE(B), %mm4
	movd	 5 * SIZE(B), %mm5
	movd	 6 * SIZE(B), %mm6
	movd	 7 * SIZE(B), %mm7

	punpckldq %mm0, %mm0
	punpckldq %mm1, %mm1
	punpckldq %mm2, %mm2
	punpckldq %mm3, %mm3
	punpckldq %mm4, %mm4
	punpckldq %mm5, %mm5
	punpckldq %mm6, %mm6
	punpckldq %mm7, %mm7

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  2 * SIZE(BO)
	movq	%mm1,  4 * SIZE(BO)
	movq	%mm1,  6 * SIZE(BO)
	movq	%mm2,  8 * SIZE(BO)
	movq	%mm2, 10 * SIZE(BO)
	movq	%mm3, 12 * SIZE(BO)
	movq	%mm3, 14 * SIZE(BO)
	movq	%mm4, 16 * SIZE(BO)
	movq	%mm4, 18 * SIZE(BO)
	movq	%mm5, 20 * SIZE(BO)
	movq	%mm5, 22 * SIZE(BO)
	movq	%mm6, 24 * SIZE(BO)
	movq	%mm6, 26 * SIZE(BO)
	movq	%mm7, 28 * SIZE(BO)
	movq	%mm7, 30 * SIZE(BO)

	addq	$ 8 * SIZE, B
	addq	$32 * SIZE, BO
#endif

	decq	%rax
	jne	.L102
	ALIGN_4

/* .L103: leftover of the B copy.  rax = K & 7; each pass of .L104
   expands a single value of B.  Falls to .L110 when rax is zero. */
.L103:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L110
	ALIGN_4

.L104:
#if defined(PENTIUM4) || defined(GENERIC)
	movss	 0 * SIZE(B), %xmm0
	shufps	$0, %xmm0, %xmm0
	movaps	%xmm0,  0 * SIZE(BO)
#endif

#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI)
	movd	 0 * SIZE(B), %mm0
	punpckldq %mm0, %mm0
	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  2 * SIZE(BO)
2774#endif 2775 2776 addq $ 1 * SIZE, B 2777 addq $ 4 * SIZE, BO 2778 decq %rax 2779 jne .L104 2780 ALIGN_4 2781 2782.L110: 2783 movq C, CO1 # coffset1 = c 2784 movq A, AO # aoffset = a 2785 2786 movq M, I 2787 sarq $3, I # i = (m >> 3) 2788 jle .L120 2789 ALIGN_4 2790 2791.L111: 2792#if !defined(TRMMKERNEL) || \ 2793 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2794 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2795 2796 leaq BUFFER, BO 2797#else 2798 leaq BUFFER, BO 2799 movq KK, %rax 2800 leaq (, %rax, 8), %rax 2801 leaq (AO, %rax, 4), AO 2802 leaq (BO, %rax, 2), BO 2803#endif 2804 2805 movaps -32 * SIZE(AO), %xmm8 2806 movaps -16 * SIZE(AO), %xmm10 2807 movaps 0 * SIZE(AO), %xmm12 2808 movaps 16 * SIZE(AO), %xmm14 2809 2810 movaps 0 * SIZE(BO), %xmm9 2811 movaps 16 * SIZE(BO), %xmm11 2812 movaps 32 * SIZE(BO), %xmm13 2813 movaps 48 * SIZE(BO), %xmm15 2814 2815 xorps %xmm0, %xmm0 2816 xorps %xmm1, %xmm1 2817 2818 PREFETCHW 7 * SIZE(CO1) 2819 xorps %xmm4, %xmm4 2820 xorps %xmm5, %xmm5 2821 2822#ifndef TRMMKERNEL 2823 movq K, %rax 2824#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2825 movq K, %rax 2826 subq KK, %rax 2827 movq %rax, KKK 2828#else 2829 movq KK, %rax 2830#ifdef LEFT 2831 addq $8, %rax 2832#else 2833 addq $1, %rax 2834#endif 2835 movq %rax, KKK 2836#endif 2837 sarq $3, %rax 2838 je .L115 2839 ALIGN_4 2840 2841.L112: 2842 mulps %xmm9, %xmm8 2843#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2844 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 2845#endif 2846 2847 mulps -28 * SIZE(AO), %xmm9 2848 addps %xmm8, %xmm0 2849 movaps -24 * SIZE(AO), %xmm8 2850 addps %xmm9, %xmm4 2851 movaps 4 * SIZE(BO), %xmm9 2852 2853 mulps %xmm9, %xmm8 2854 mulps -20 * SIZE(AO), %xmm9 2855 addps %xmm8, %xmm0 2856 movaps 32 * SIZE(AO), %xmm8 2857 addps %xmm9, %xmm4 2858 movaps 8 * SIZE(BO), %xmm9 2859 2860#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2861 PREFETCH (PREFETCHSIZE + 16) 
* SIZE(AO) 2862#endif 2863 mulps %xmm9, %xmm10 2864 mulps -12 * SIZE(AO), %xmm9 2865 addps %xmm10, %xmm0 2866 movaps -8 * SIZE(AO), %xmm10 2867 addps %xmm9, %xmm4 2868 movaps 12 * SIZE(BO), %xmm9 2869 2870 mulps %xmm9, %xmm10 2871 mulps -4 * SIZE(AO), %xmm9 2872 addps %xmm10, %xmm0 2873 movaps 48 * SIZE(AO), %xmm10 2874 addps %xmm9, %xmm4 2875 movaps 32 * SIZE(BO), %xmm9 2876 2877#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2878 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) 2879#endif 2880 mulps %xmm11, %xmm12 2881 mulps 4 * SIZE(AO), %xmm11 2882 addps %xmm12, %xmm0 2883 movaps 8 * SIZE(AO), %xmm12 2884 addps %xmm11, %xmm4 2885 movaps 20 * SIZE(BO), %xmm11 2886 2887 mulps %xmm11, %xmm12 2888 mulps 12 * SIZE(AO), %xmm11 2889 addps %xmm12, %xmm0 2890 movaps 64 * SIZE(AO), %xmm12 2891 addps %xmm11, %xmm4 2892 movaps 24 * SIZE(BO), %xmm11 2893 2894#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 2895 PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) 2896#endif 2897 mulps %xmm11, %xmm14 2898 mulps 20 * SIZE(AO), %xmm11 2899 addps %xmm14, %xmm0 2900 movaps 24 * SIZE(AO), %xmm14 2901 addps %xmm11, %xmm4 2902 movaps 28 * SIZE(BO), %xmm11 2903 2904 mulps %xmm11, %xmm14 2905 mulps 28 * SIZE(AO), %xmm11 2906 addps %xmm14, %xmm0 2907 movaps 80 * SIZE(AO), %xmm14 2908 addps %xmm11, %xmm4 2909 movaps 48 * SIZE(BO), %xmm11 2910 2911 addq $64 * SIZE, AO 2912 addq $32 * SIZE, BO 2913 decq %rax 2914 jne .L112 2915 ALIGN_4 2916 2917.L115: 2918#ifndef TRMMKERNEL 2919 movq K, %rax 2920#else 2921 movq KKK, %rax 2922#endif 2923 movaps ALPHA, %xmm15 2924 andq $7, %rax # if (k & 1) 2925 BRANCH 2926 je .L118 2927 ALIGN_4 2928 2929.L116: 2930 mulps %xmm9, %xmm8 2931 mulps -28 * SIZE(AO), %xmm9 2932 addps %xmm8, %xmm0 2933 movaps -24 * SIZE(AO), %xmm8 2934 addps %xmm9, %xmm4 2935 movaps 4 * SIZE(BO), %xmm9 2936 2937 addq $8 * SIZE, AO # aoffset += 4 2938 addq $4 * SIZE, BO # boffset1 += 8 2939 decq %rax 2940 jg .L116 2941 ALIGN_4 2942 2943.L118: 2944#ifndef TRMMKERNEL 2945 
movsd 0 * SIZE(CO1), %xmm8 2946 movhps 2 * SIZE(CO1), %xmm8 2947 movsd 4 * SIZE(CO1), %xmm9 2948 movhps 6 * SIZE(CO1), %xmm9 2949#endif 2950 2951 mulps %xmm15, %xmm0 2952 mulps %xmm15, %xmm4 2953#ifndef TRMMKERNEL 2954 addps %xmm8, %xmm0 2955 addps %xmm9, %xmm4 2956#endif 2957 2958 movlps %xmm0, 0 * SIZE(CO1) 2959 movhps %xmm0, 2 * SIZE(CO1) 2960 movlps %xmm4, 4 * SIZE(CO1) 2961 movhps %xmm4, 6 * SIZE(CO1) 2962 2963#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2964 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2965 movq K, %rax 2966 subq KKK, %rax 2967 leaq (,%rax, 8), %rax 2968 leaq (AO, %rax, 4), AO 2969 leaq (BO, %rax, 2), BO 2970#endif 2971 2972#if defined(TRMMKERNEL) && defined(LEFT) 2973 addq $8, KK 2974#endif 2975 2976 addq $8 * SIZE, CO1 # coffset += 4 2977 decq I # i -- 2978 jg .L111 2979 ALIGN_4 2980 2981.L120: 2982 testq $4, M 2983 je .L130 2984 2985#if !defined(TRMMKERNEL) || \ 2986 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2987 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2988 2989 leaq BUFFER, BO 2990#else 2991 leaq BUFFER, BO 2992 movq KK, %rax 2993 leaq (, %rax, 8), %rax 2994 leaq (AO, %rax, 2), AO 2995 leaq (BO, %rax, 2), BO 2996#endif 2997 2998 movaps -32 * SIZE(AO), %xmm8 2999 movaps -16 * SIZE(AO), %xmm10 3000 3001 movaps 0 * SIZE(BO), %xmm9 3002 movaps 16 * SIZE(BO), %xmm11 3003 3004 xorps %xmm0, %xmm0 3005 xorps %xmm1, %xmm1 3006 xorps %xmm2, %xmm2 3007 xorps %xmm3, %xmm3 3008 3009#ifndef TRMMKERNEL 3010 movq K, %rax 3011#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3012 movq K, %rax 3013 subq KK, %rax 3014 movq %rax, KKK 3015#else 3016 movq KK, %rax 3017#ifdef LEFT 3018 addq $4, %rax 3019#else 3020 addq $1, %rax 3021#endif 3022 movq %rax, KKK 3023#endif 3024 sarq $3, %rax 3025 je .L125 3026 ALIGN_4 3027 3028.L122: 3029 mulps %xmm8, %xmm9 3030#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3031 PREFETCH (PREFETCHSIZE + 0) * 
SIZE(AO) 3032#endif 3033 movaps -28 * SIZE(AO), %xmm8 3034 mulps 4 * SIZE(BO), %xmm8 3035 addps %xmm9, %xmm0 3036 movaps 32 * SIZE(BO), %xmm9 3037 addps %xmm8, %xmm1 3038 movaps -24 * SIZE(AO), %xmm8 3039 mulps 8 * SIZE(BO), %xmm8 3040 addps %xmm8, %xmm2 3041 movaps -20 * SIZE(AO), %xmm8 3042 mulps 12 * SIZE(BO), %xmm8 3043 addps %xmm8, %xmm3 3044 movaps 0 * SIZE(AO), %xmm8 3045 3046#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3047 PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) 3048#endif 3049 mulps %xmm10, %xmm11 3050 movaps -12 * SIZE(AO), %xmm10 3051 mulps 20 * SIZE(BO), %xmm10 3052 addps %xmm11, %xmm0 3053 movaps 48 * SIZE(BO), %xmm11 3054 addps %xmm10, %xmm1 3055 movaps -8 * SIZE(AO), %xmm10 3056 mulps 24 * SIZE(BO), %xmm10 3057 addps %xmm10, %xmm2 3058 movaps -4 * SIZE(AO), %xmm10 3059 mulps 28 * SIZE(BO), %xmm10 3060 addps %xmm10, %xmm3 3061 movaps 16 * SIZE(AO), %xmm10 3062 3063 addq $32 * SIZE, AO 3064 addq $32 * SIZE, BO 3065 decq %rax 3066 jne .L122 3067 ALIGN_4 3068 3069.L125: 3070#ifndef TRMMKERNEL 3071 movq K, %rax 3072#else 3073 movq KKK, %rax 3074#endif 3075 movaps ALPHA, %xmm15 3076 andq $7, %rax # if (k & 1) 3077 BRANCH 3078 je .L128 3079 ALIGN_4 3080 3081.L126: 3082 mulps %xmm8, %xmm9 3083 movaps -28 * SIZE(AO), %xmm8 3084 addps %xmm9, %xmm0 3085 movaps 4 * SIZE(BO), %xmm9 3086 3087 addq $4 * SIZE, AO # aoffset += 4 3088 addq $4 * SIZE, BO # boffset1 += 8 3089 decq %rax 3090 jg .L126 3091 ALIGN_4 3092 3093.L128: 3094#ifndef TRMMKERNEL 3095 movsd 0 * SIZE(CO1), %xmm8 3096 movhps 2 * SIZE(CO1), %xmm8 3097#endif 3098 3099 addps %xmm1, %xmm0 3100 addps %xmm3, %xmm2 3101 addps %xmm2, %xmm0 3102 3103 mulps %xmm15, %xmm0 3104#ifndef TRMMKERNEL 3105 addps %xmm8, %xmm0 3106#endif 3107 3108 movlps %xmm0, 0 * SIZE(CO1) 3109 movhps %xmm0, 2 * SIZE(CO1) 3110 3111#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3112 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3113 movq K, %rax 3114 subq KKK, %rax 3115 leaq (,%rax, 
8), %rax 3116 leaq (AO, %rax, 2), AO 3117 leaq (BO, %rax, 2), BO 3118#endif 3119 3120#if defined(TRMMKERNEL) && defined(LEFT) 3121 addq $4, KK 3122#endif 3123 3124 addq $4 * SIZE, CO1 # coffset += 4 3125 ALIGN_4 3126 3127.L130: 3128 testq $2, M 3129 je .L140 3130 3131#if !defined(TRMMKERNEL) || \ 3132 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3133 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3134 3135 leaq BUFFER, BO 3136#else 3137 leaq BUFFER, BO 3138 movq KK, %rax 3139 leaq (, %rax, 8), %rax 3140 leaq (AO, %rax, 1), AO 3141 leaq (BO, %rax, 2), BO 3142#endif 3143 3144 movaps -32 * SIZE(AO), %xmm8 3145 movaps -24 * SIZE(AO), %xmm10 3146 3147 movaps 0 * SIZE(BO), %xmm9 3148 movaps 16 * SIZE(BO), %xmm11 3149 3150 xorps %xmm0, %xmm0 3151 xorps %xmm1, %xmm1 3152 xorps %xmm2, %xmm2 3153 xorps %xmm3, %xmm3 3154 3155#ifndef TRMMKERNEL 3156 movq K, %rax 3157#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3158 movq K, %rax 3159 subq KK, %rax 3160 movq %rax, KKK 3161#else 3162 movq KK, %rax 3163#ifdef LEFT 3164 addq $2, %rax 3165#else 3166 addq $1, %rax 3167#endif 3168 movq %rax, KKK 3169#endif 3170 sarq $3, %rax 3171 je .L135 3172 ALIGN_4 3173 3174.L132: 3175 mulps %xmm8, %xmm9 3176#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3177 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3178#endif 3179 movsd -30 * SIZE(AO), %xmm8 3180 addps %xmm9, %xmm0 3181 movaps 4 * SIZE(BO), %xmm9 3182 mulps %xmm8, %xmm9 3183 movsd -28 * SIZE(AO), %xmm8 3184 addps %xmm9, %xmm1 3185 movaps 8 * SIZE(BO), %xmm9 3186 3187 mulps %xmm8, %xmm9 3188 movsd -26 * SIZE(AO), %xmm8 3189 addps %xmm9, %xmm0 3190 movaps 12 * SIZE(BO), %xmm9 3191 3192 mulps %xmm8, %xmm9 3193 movsd -16 * SIZE(AO), %xmm8 3194 addps %xmm9, %xmm1 3195 movaps 32 * SIZE(BO), %xmm9 3196 3197 mulps %xmm10, %xmm11 3198 movsd -22 * SIZE(AO), %xmm10 3199 addps %xmm11, %xmm0 3200 movaps 20 * SIZE(BO), %xmm11 3201 3202 mulps %xmm10, %xmm11 3203 movsd -20 * 
SIZE(AO), %xmm10 3204 addps %xmm11, %xmm1 3205 movaps 24 * SIZE(BO), %xmm11 3206 3207 mulps %xmm10, %xmm11 3208 movsd -18 * SIZE(AO), %xmm10 3209 addps %xmm11, %xmm0 3210 movaps 28 * SIZE(BO), %xmm11 3211 3212 mulps %xmm10, %xmm11 3213 movsd -8 * SIZE(AO), %xmm10 3214 addps %xmm11, %xmm1 3215 movaps 48 * SIZE(BO), %xmm11 3216 3217 addq $16 * SIZE, AO 3218 addq $32 * SIZE, BO 3219 decq %rax 3220 jne .L132 3221 ALIGN_4 3222 3223.L135: 3224#ifndef TRMMKERNEL 3225 movq K, %rax 3226#else 3227 movq KKK, %rax 3228#endif 3229 movaps ALPHA, %xmm15 3230 andq $7, %rax # if (k & 1) 3231 BRANCH 3232 je .L138 3233 ALIGN_4 3234 3235.L136: 3236 mulps %xmm8, %xmm9 3237 movsd -30 * SIZE(AO), %xmm8 3238 addps %xmm9, %xmm0 3239 movaps 4 * SIZE(BO), %xmm9 3240 3241 addq $2 * SIZE, AO # aoffset += 4 3242 addq $4 * SIZE, BO # boffset1 += 8 3243 decq %rax 3244 jg .L136 3245 ALIGN_4 3246 3247.L138: 3248 addps %xmm1, %xmm0 3249 mulps %xmm15, %xmm0 3250 3251#ifndef TRMMKERNEL 3252#ifdef movsd 3253 xorps %xmm8, %xmm8 3254#endif 3255 movsd 0 * SIZE(CO1), %xmm8 3256 addps %xmm8, %xmm0 3257#endif 3258 3259 movlps %xmm0, 0 * SIZE(CO1) 3260 3261#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3262 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3263 movq K, %rax 3264 subq KKK, %rax 3265 leaq (,%rax, 8), %rax 3266 leaq (AO, %rax, 1), AO 3267 leaq (BO, %rax, 2), BO 3268#endif 3269 3270#if defined(TRMMKERNEL) && defined(LEFT) 3271 addq $2, KK 3272#endif 3273 3274 addq $2 * SIZE, CO1 # coffset += 4 3275 ALIGN_4 3276 3277.L140: 3278 testq $1, M 3279 je .L999 3280 3281#if !defined(TRMMKERNEL) || \ 3282 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3283 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3284 3285 leaq BUFFER, BO 3286#else 3287 leaq BUFFER, BO 3288 movq KK, %rax 3289 leaq (, %rax, 4), %rax 3290 leaq (AO, %rax, 1), AO 3291 leaq (BO, %rax, 4), BO 3292#endif 3293 3294 movss -32 * SIZE(AO), %xmm8 3295 movss -28 * SIZE(AO), %xmm10 3296 
3297 movss 0 * SIZE(BO), %xmm9 3298 movss 16 * SIZE(BO), %xmm11 3299 3300 xorps %xmm0, %xmm0 3301 xorps %xmm1, %xmm1 3302 xorps %xmm2, %xmm2 3303 xorps %xmm3, %xmm3 3304 3305#ifndef TRMMKERNEL 3306 movq K, %rax 3307#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3308 movq K, %rax 3309 subq KK, %rax 3310 movq %rax, KKK 3311#else 3312 movq KK, %rax 3313#ifdef LEFT 3314 addq $1, %rax 3315#else 3316 addq $1, %rax 3317#endif 3318 movq %rax, KKK 3319#endif 3320 sarq $3, %rax 3321 je .L145 3322 ALIGN_4 3323 3324.L142: 3325 mulss %xmm8, %xmm9 3326#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) 3327 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 3328#endif 3329 movss -31 * SIZE(AO), %xmm8 3330 mulss 4 * SIZE(BO), %xmm8 3331 addss %xmm9, %xmm0 3332 movss 32 * SIZE(BO), %xmm9 3333 addss %xmm8, %xmm1 3334 movss -30 * SIZE(AO), %xmm8 3335 mulss 8 * SIZE(BO), %xmm8 3336 addss %xmm8, %xmm2 3337 movss -29 * SIZE(AO), %xmm8 3338 mulss 12 * SIZE(BO), %xmm8 3339 addss %xmm8, %xmm3 3340 movss -24 * SIZE(AO), %xmm8 3341 mulss %xmm10, %xmm11 3342 movss -27 * SIZE(AO), %xmm10 3343 mulss 20 * SIZE(BO), %xmm10 3344 addss %xmm11, %xmm0 3345 movss 48 * SIZE(BO), %xmm11 3346 addss %xmm10, %xmm1 3347 movss -26 * SIZE(AO), %xmm10 3348 mulss 24 * SIZE(BO), %xmm10 3349 addss %xmm10, %xmm2 3350 movss -25 * SIZE(AO), %xmm10 3351 mulss 28 * SIZE(BO), %xmm10 3352 addss %xmm10, %xmm3 3353 movss -20 * SIZE(AO), %xmm10 3354 3355 addq $ 8 * SIZE, AO 3356 addq $32 * SIZE, BO 3357 decq %rax 3358 jne .L142 3359 ALIGN_4 3360 3361.L145: 3362#ifndef TRMMKERNEL 3363 movq K, %rax 3364#else 3365 movq KKK, %rax 3366#endif 3367 movss ALPHA, %xmm15 3368 andq $7, %rax # if (k & 1) 3369 BRANCH 3370 je .L148 3371 ALIGN_4 3372 3373.L146: 3374 mulss %xmm8, %xmm9 3375 movss -31 * SIZE(AO), %xmm8 3376 addss %xmm9, %xmm0 3377 movss 4 * SIZE(BO), %xmm9 3378 3379 addq $1 * SIZE, AO 3380 addq $4 * SIZE, BO 3381 decq %rax 3382 jg .L146 3383 ALIGN_4 3384 3385.L148: 3386 addss %xmm1, %xmm0 
3387 addss %xmm3, %xmm2 3388 addss %xmm2, %xmm0 3389 3390 mulss %xmm15, %xmm0 3391 3392#ifndef TRMMKERNEL 3393 movss 0 * SIZE(CO1), %xmm8 3394 addss %xmm8, %xmm0 3395#endif 3396 movss %xmm0, 0 * SIZE(CO1) 3397 ALIGN_4 3398 3399.L999: 3400 movq %rbx, %rsp 3401 3402 EMMS 3403 3404 movq 0(%rsp), %rbx 3405 movq 8(%rsp), %rbp 3406 movq 16(%rsp), %r12 3407 movq 24(%rsp), %r13 3408 movq 32(%rsp), %r14 3409 movq 40(%rsp), %r15 3410 3411#ifdef WINDOWS_ABI 3412 movq 48(%rsp), %rdi 3413 movq 56(%rsp), %rsi 3414 movups 64(%rsp), %xmm6 3415 movups 80(%rsp), %xmm7 3416 movups 96(%rsp), %xmm8 3417 movups 112(%rsp), %xmm9 3418 movups 128(%rsp), %xmm10 3419 movups 144(%rsp), %xmm11 3420 movups 160(%rsp), %xmm12 3421 movups 176(%rsp), %xmm13 3422 movups 192(%rsp), %xmm14 3423 movups 208(%rsp), %xmm15 3424#endif 3425 3426 addq $STACKSIZE, %rsp 3427 ret 3428 3429 EPILOGUE 3430