1/*********************************************************************/ 2/* */ 3/* Optimized BLAS libraries */ 4/* By Kazushige Goto <kgoto@tacc.utexas.edu> */ 5/* */ 6/* Copyright (c) The University of Texas, 2009. All rights reserved. */ 7/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING */ 8/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF */ 9/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE, */ 10/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY */ 11/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF */ 12/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO */ 13/* THE USE OF THE SOFTWARE OR DOCUMENTATION. */ 14/* Under no circumstances shall University be liable for incidental, */ 15/* special, indirect, direct or consequential damages or loss of */ 16/* profits, interruption of business, or related expenses which may */ 17/* arise from use of Software or Documentation, including but not */ 18/* limited to those resulting from defects in Software and/or */ 19/* Documentation, or loss or inaccuracy of data of any kind. 
*/
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

/* Register assignment.  OLD_M/OLD_N and AO/BO are defined on the same */
/* two registers (%rdi/%rsi); M and N are the separate copies in       */
/* %r13/%r14.                                                          */
#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define CO1	%r15
#define CO2	%rbp
#define BB	%r12

/* STACKSIZE is the number of bytes subtracted from %rsp at entry; the */
/* OLD_* names address arguments above that area.                      */
#ifndef WINDOWS_ABI

#define STACKSIZE 64

#define OLD_LDC		 8 + STACKSIZE(%rsp)
#define OLD_OFFSET	16 + STACKSIZE(%rsp)

#else

#define STACKSIZE 256

#define OLD_A		40 + STACKSIZE(%rsp)
#define OLD_B		48 + STACKSIZE(%rsp)
#define OLD_C		56 + STACKSIZE(%rsp)
#define OLD_LDC		64 + STACKSIZE(%rsp)
#define OLD_OFFSET	72 + STACKSIZE(%rsp)

#endif

/* Local slots and the start of the B copy buffer, all relative to %rsp. */
#define ALPHA	  0(%rsp)
#define J	 16(%rsp)
#define OFFSET	 24(%rsp)
#define KK	 32(%rsp)
#define KKK	 40(%rsp)
#define BUFFER	256(%rsp)

/* Per-target prefetch mnemonics and distance.  The movsd/movapd       */
/* defines textually replace those mnemonics in everything below.      */
#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)
#define movsd	movlps
#define movapd	movaps
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 13 + 4)
#define movapd	movaps
#endif

/* KERNELn(xx), %rax-indexed variant (used when GENERIC is not defined; */
/* the matching #else/#endif follows KERNEL8 further down).             */
/* Each macro multiplies one A register by the B registers and by one   */
/* B memory operand, adds the four products into an accumulator group,  */
/* then reloads the B registers and finally its own A register:         */
/*   KERNEL1: A = xmm0, accumulators xmm8..xmm11                        */
/*   KERNEL2: A = xmm2, accumulators xmm12..xmm15                       */
/*   KERNEL3: A = xmm4, accumulators xmm8..xmm11                        */
/*   KERNEL4: A = xmm6, accumulators xmm12..xmm15                       */
/*   KERNEL5: A = xmm0, accumulators xmm8..xmm11                        */
/* xx is added to every displacement, counted twice for BO and once for */
/* AO.  BO is indexed by %rax scaled by 8 and AO by %rax scaled by 4.   */
/* The instruction order is deliberate interleaving; do not reorder.    */
#ifndef GENERIC
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movaps	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

/* KERNEL6..KERNEL8(xx), %rax-indexed variant (still inside the        */
/* #ifndef GENERIC branch).  Same multiply/accumulate/reload shape:    */
/*   KERNEL6: A = xmm2, accumulators xmm12..xmm15                      */
/*   KERNEL7: A = xmm4, accumulators xmm8..xmm11                       */
/*   KERNEL8: A = xmm6, accumulators xmm12..xmm15                      */
#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else

/* GENERIC variant of KERNEL1..KERNEL8: identical register usage and   */
/* displacements, but AO and BO are addressed directly with no %rax    */
/* index register.                                                     */
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH	(PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	-8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	-6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	-8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	-4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	-2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH	(PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	-2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6
#endif

/* Kernel entry.  PROLOGUE and PROFCODE are not defined in this file   */
/* (presumably they come from common.h -- confirm there).              */
	PROLOGUE
	PROFCODE

/* Reserve STACKSIZE bytes and store the registers this routine        */
/* overwrites: %rbx, %rbp and %r12..%r15 on every ABI.                 */
	subq	$STACKSIZE, %rsp
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
/* Windows build additionally stores %rdi, %rsi and %xmm6 onward.      */
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
338 movups %xmm14, 192(%rsp) 339 movups %xmm15, 208(%rsp) 340 341 movq ARG1, OLD_M 342 movq ARG2, OLD_N 343 movq ARG3, K 344 movq OLD_A, A 345 movq OLD_B, B 346 movq OLD_C, C 347 movq OLD_LDC, LDC 348#ifdef TRMMKERNEL 349 movsd OLD_OFFSET, %xmm12 350#endif 351 movaps %xmm3, %xmm0 352 353#else 354 movq OLD_LDC, LDC 355#ifdef TRMMKERNEL 356 movsd OLD_OFFSET, %xmm12 357#endif 358 359#endif 360 361 EMMS 362 363 movq %rsp, %rbx # save old stack 364 subq $256 + LOCAL_BUFFER_SIZE, %rsp 365 andq $-4096, %rsp # align stack 366 367 STACK_TOUCHING 368 369 movq OLD_M, M 370 movq OLD_N, N 371 372 subq $-16 * SIZE, A 373 374 unpcklpd %xmm0, %xmm0 375 movapd %xmm0, ALPHA 376 377 leaq (, LDC, SIZE), LDC 378 379#ifdef TRMMKERNEL 380 movsd %xmm12, OFFSET 381 movsd %xmm12, KK 382#ifndef LEFT 383 negq KK 384#endif 385#endif 386 movq N, J 387 sarq $2, J # j = (n >> 2) 388 jle .L40 389 ALIGN_3 390 391.L01: 392/* Copying to Sub Buffer */ 393 leaq 16 * SIZE + BUFFER, BO 394 movq C, CO1 # coffset1 = c 395 leaq (C, LDC, 1), CO2 # coffset2 = c + ldc 396 397#if defined(TRMMKERNEL) && defined(LEFT) 398 movq OFFSET, %rax 399 movq %rax, KK 400#endif 401 402 movq K, %rax 403 sarq $2, %rax 404 jle .L03 405 ALIGN_3 406 407 408#define RPREFETCHSIZE (8 * 7 + 4) 409#define WPREFETCHSIZE (8 * 8 + 4) 410 411.L02: 412 PREFETCH (RPREFETCHSIZE + 0) * SIZE(B) 413 414 movq 0 * SIZE(B), %mm0 415 movq %mm0, -16 * SIZE(BO) 416 movq %mm0, -15 * SIZE(BO) 417 movq 1 * SIZE(B), %mm1 418 movq %mm1, -14 * SIZE(BO) 419 movq %mm1, -13 * SIZE(BO) 420 421 movq 2 * SIZE(B), %mm2 422 movq %mm2, -12 * SIZE(BO) 423 movq %mm2, -11 * SIZE(BO) 424 movq 3 * SIZE(B), %mm3 425 movq %mm3, -10 * SIZE(BO) 426 movq %mm3, -9 * SIZE(BO) 427 428 PREFETCHW (WPREFETCHSIZE + 0) * SIZE(BO) 429 430 movq 4 * SIZE(B), %mm4 431 movq %mm4, -8 * SIZE(BO) 432 movq %mm4, -7 * SIZE(BO) 433 movq 5 * SIZE(B), %mm5 434 movq %mm5, -6 * SIZE(BO) 435 movq %mm5, -5 * SIZE(BO) 436 437 PREFETCHW (WPREFETCHSIZE + 8) * SIZE(BO) 438 439 movq 6 * SIZE(B), %mm6 
440 movq %mm6, -4 * SIZE(BO) 441 movq %mm6, -3 * SIZE(BO) 442 movq 7 * SIZE(B), %mm7 443 movq %mm7, -2 * SIZE(BO) 444 movq %mm7, -1 * SIZE(BO) 445 446 PREFETCH (RPREFETCHSIZE + 8) * SIZE(B) 447 448 movq 8 * SIZE(B), %mm0 449 movq %mm0, 0 * SIZE(BO) 450 movq %mm0, 1 * SIZE(BO) 451 movq 9 * SIZE(B), %mm1 452 movq %mm1, 2 * SIZE(BO) 453 movq %mm1, 3 * SIZE(BO) 454 455 movq 10 * SIZE(B), %mm2 456 movq %mm2, 4 * SIZE(BO) 457 movq %mm2, 5 * SIZE(BO) 458 movq 11 * SIZE(B), %mm3 459 movq %mm3, 6 * SIZE(BO) 460 movq %mm3, 7 * SIZE(BO) 461 462 PREFETCHW (WPREFETCHSIZE + 16) * SIZE(BO) 463 464 movq 12 * SIZE(B), %mm4 465 movq %mm4, 8 * SIZE(BO) 466 movq %mm4, 9 * SIZE(BO) 467 movq 13 * SIZE(B), %mm5 468 movq %mm5, 10 * SIZE(BO) 469 movq %mm5, 11 * SIZE(BO) 470 471 PREFETCHW (WPREFETCHSIZE + 24) * SIZE(BO) 472 473 movq 14 * SIZE(B), %mm6 474 movq %mm6, 12 * SIZE(BO) 475 movq %mm6, 13 * SIZE(BO) 476 movq 15 * SIZE(B), %mm7 477 movq %mm7, 14 * SIZE(BO) 478 movq %mm7, 15 * SIZE(BO) 479 480 addq $ 32 * SIZE, BO 481 subq $-16 * SIZE, B 482 483 subq $1, %rax 484 jne .L02 485 ALIGN_3 486 487.L03: 488 movq K, %rax 489 andq $3, %rax 490 BRANCH 491 jle .L10 492 ALIGN_3 493 494.L04: 495 movq 0 * SIZE(B), %mm0 496 movq %mm0, -16 * SIZE(BO) 497 movq %mm0, -15 * SIZE(BO) 498 movq 1 * SIZE(B), %mm1 499 movq %mm1, -14 * SIZE(BO) 500 movq %mm1, -13 * SIZE(BO) 501 502 movq 2 * SIZE(B), %mm2 503 movq %mm2, -12 * SIZE(BO) 504 movq %mm2, -11 * SIZE(BO) 505 movq 3 * SIZE(B), %mm3 506 movq %mm3, -10 * SIZE(BO) 507 movq %mm3, -9 * SIZE(BO) 508 509 addq $4 * SIZE, B 510 addq $8 * SIZE, BO 511 subq $1, %rax 512 jne .L04 513 ALIGN_3 514 515.L10: 516 movq A, AO # aoffset = a 517 518 leaq (RPREFETCHSIZE + 0) * SIZE(B), BB 519 520 movq M, I 521 sarq $2, I # i = (m >> 2) 522 jle .L20 523 ALIGN_3 524 525.L11: 526#if !defined(TRMMKERNEL) || \ 527 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 528 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 529 530 leaq 16 * SIZE + BUFFER, BO 
531#else 532 leaq 16 * SIZE + BUFFER, BO 533 movq KK, %rax 534 leaq (, %rax, SIZE), %rax 535 leaq (AO, %rax, 4), AO 536 leaq (BO, %rax, 8), BO 537#endif 538 539 movapd -16 * SIZE(AO), %xmm0 540 movapd -16 * SIZE(BO), %xmm1 541 pxor %xmm8, %xmm8 542 movapd -14 * SIZE(AO), %xmm2 543 movapd -14 * SIZE(BO), %xmm3 544 pxor %xmm9, %xmm9 545 movapd -12 * SIZE(AO), %xmm4 546 movapd -12 * SIZE(BO), %xmm5 547 pxor %xmm10, %xmm10 548 movapd -10 * SIZE(AO), %xmm6 549 movapd -8 * SIZE(BO), %xmm7 550 pxor %xmm11, %xmm11 551 552 PREFETCHW 3 * SIZE(CO1) 553 pxor %xmm12, %xmm12 554 PREFETCHW 7 * SIZE(CO2) 555 pxor %xmm13, %xmm13 556 PREFETCHW 3 * SIZE(CO1, LDC, 2) 557 pxor %xmm14, %xmm14 558 PREFETCHW 7 * SIZE(CO2, LDC, 2) 559 pxor %xmm15, %xmm15 560 561 PREFETCH 0 * SIZE(BB) 562 563#ifndef TRMMKERNEL 564 movq K, %rax 565#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 566 movq K, %rax 567 subq KK, %rax 568 movq %rax, KKK 569#else 570 movq KK, %rax 571#ifdef LEFT 572 addq $4, %rax 573#else 574 addq $4, %rax 575#endif 576 movq %rax, KKK 577#endif 578 579#ifndef GENERIC 580 andq $-8, %rax 581 582 leaq (, %rax, SIZE), %rax 583 leaq (AO, %rax, 4), AO 584 leaq (BO, %rax, 8), BO 585 negq %rax 586 NOBRANCH 587 je .L15 588 ALIGN_3 589 590.L12: 591 KERNEL1(16 * 0) 592 KERNEL2(16 * 0) 593 KERNEL3(16 * 0) 594 KERNEL4(16 * 0) 595 KERNEL5(16 * 0) 596 KERNEL6(16 * 0) 597 KERNEL7(16 * 0) 598 KERNEL8(16 * 0) 599 600 KERNEL1(16 * 1) 601 KERNEL2(16 * 1) 602 KERNEL3(16 * 1) 603 KERNEL4(16 * 1) 604 KERNEL5(16 * 1) 605 KERNEL6(16 * 1) 606 KERNEL7(16 * 1) 607 KERNEL8(16 * 1) 608 609 addq $8 * SIZE, %rax 610 NOBRANCH 611 je .L15 612 613 KERNEL1(16 * 0) 614 KERNEL2(16 * 0) 615 KERNEL3(16 * 0) 616 KERNEL4(16 * 0) 617 KERNEL5(16 * 0) 618 KERNEL6(16 * 0) 619 KERNEL7(16 * 0) 620 KERNEL8(16 * 0) 621 622 KERNEL1(16 * 1) 623 KERNEL2(16 * 1) 624 KERNEL3(16 * 1) 625 KERNEL4(16 * 1) 626 KERNEL5(16 * 1) 627 KERNEL6(16 * 1) 628 KERNEL7(16 * 1) 629 KERNEL8(16 * 1) 630 631 addq $8 * 
SIZE, %rax 632 NOBRANCH 633 je .L15 634 635 KERNEL1(16 * 0) 636 KERNEL2(16 * 0) 637 KERNEL3(16 * 0) 638 KERNEL4(16 * 0) 639 KERNEL5(16 * 0) 640 KERNEL6(16 * 0) 641 KERNEL7(16 * 0) 642 KERNEL8(16 * 0) 643 644 KERNEL1(16 * 1) 645 KERNEL2(16 * 1) 646 KERNEL3(16 * 1) 647 KERNEL4(16 * 1) 648 KERNEL5(16 * 1) 649 KERNEL6(16 * 1) 650 KERNEL7(16 * 1) 651 KERNEL8(16 * 1) 652 653 addq $8 * SIZE, %rax 654 NOBRANCH 655 je .L15 656 657 KERNEL1(16 * 0) 658 KERNEL2(16 * 0) 659 KERNEL3(16 * 0) 660 KERNEL4(16 * 0) 661 KERNEL5(16 * 0) 662 KERNEL6(16 * 0) 663 KERNEL7(16 * 0) 664 KERNEL8(16 * 0) 665 666 KERNEL1(16 * 1) 667 KERNEL2(16 * 1) 668 KERNEL3(16 * 1) 669 KERNEL4(16 * 1) 670 KERNEL5(16 * 1) 671 KERNEL6(16 * 1) 672 KERNEL7(16 * 1) 673 KERNEL8(16 * 1) 674 675 addq $8 * SIZE, %rax 676 NOBRANCH 677 je .L15 678 679 KERNEL1(16 * 0) 680 KERNEL2(16 * 0) 681 KERNEL3(16 * 0) 682 KERNEL4(16 * 0) 683 KERNEL5(16 * 0) 684 KERNEL6(16 * 0) 685 KERNEL7(16 * 0) 686 KERNEL8(16 * 0) 687 688 KERNEL1(16 * 1) 689 KERNEL2(16 * 1) 690 KERNEL3(16 * 1) 691 KERNEL4(16 * 1) 692 KERNEL5(16 * 1) 693 KERNEL6(16 * 1) 694 KERNEL7(16 * 1) 695 KERNEL8(16 * 1) 696 697 addq $8 * SIZE, %rax 698 NOBRANCH 699 je .L15 700 701 KERNEL1(16 * 0) 702 KERNEL2(16 * 0) 703 KERNEL3(16 * 0) 704 KERNEL4(16 * 0) 705 KERNEL5(16 * 0) 706 KERNEL6(16 * 0) 707 KERNEL7(16 * 0) 708 KERNEL8(16 * 0) 709 710 KERNEL1(16 * 1) 711 KERNEL2(16 * 1) 712 KERNEL3(16 * 1) 713 KERNEL4(16 * 1) 714 KERNEL5(16 * 1) 715 KERNEL6(16 * 1) 716 KERNEL7(16 * 1) 717 KERNEL8(16 * 1) 718 719 addq $8 * SIZE, %rax 720 NOBRANCH 721 je .L15 722 723 KERNEL1(16 * 0) 724 KERNEL2(16 * 0) 725 KERNEL3(16 * 0) 726 KERNEL4(16 * 0) 727 KERNEL5(16 * 0) 728 KERNEL6(16 * 0) 729 KERNEL7(16 * 0) 730 KERNEL8(16 * 0) 731 732 KERNEL1(16 * 1) 733 KERNEL2(16 * 1) 734 KERNEL3(16 * 1) 735 KERNEL4(16 * 1) 736 KERNEL5(16 * 1) 737 KERNEL6(16 * 1) 738 KERNEL7(16 * 1) 739 KERNEL8(16 * 1) 740 741 addq $8 * SIZE, %rax 742 NOBRANCH 743 je .L15 744 745 KERNEL1(16 * 0) 746 KERNEL2(16 * 0) 747 
KERNEL3(16 * 0) 748 KERNEL4(16 * 0) 749 KERNEL5(16 * 0) 750 KERNEL6(16 * 0) 751 KERNEL7(16 * 0) 752 KERNEL8(16 * 0) 753 754 KERNEL1(16 * 1) 755 KERNEL2(16 * 1) 756 KERNEL3(16 * 1) 757 KERNEL4(16 * 1) 758 KERNEL5(16 * 1) 759 KERNEL6(16 * 1) 760 KERNEL7(16 * 1) 761 KERNEL8(16 * 1) 762 763 addq $8 * SIZE, %rax 764 BRANCH 765 jl .L12 766 ALIGN_3 767 768.L15: 769#ifndef TRMMKERNEL 770 movq K, %rax 771#else 772 movq KKK, %rax 773#endif 774 testq $4, %rax 775 je .L16 776 xorq %rax, %rax 777 ALIGN_3 778 779 KERNEL1(16 * 0) 780 KERNEL2(16 * 0) 781 KERNEL3(16 * 0) 782 KERNEL4(16 * 0) 783 KERNEL5(16 * 0) 784 KERNEL6(16 * 0) 785 KERNEL7(16 * 0) 786 KERNEL8(16 * 0) 787 788 addq $32 * SIZE, BO 789 addq $16 * SIZE, AO 790 ALIGN_3 791 792#else 793 sarq $2, %rax 794 NOBRANCH 795 jle .L16 796 ALIGN_3 797 798.L12: 799 KERNEL1(16 * 0) 800 KERNEL2(16 * 0) 801 KERNEL3(16 * 0) 802 KERNEL4(16 * 0) 803 KERNEL5(16 * 0) 804 KERNEL6(16 * 0) 805 KERNEL7(16 * 0) 806 KERNEL8(16 * 0) 807 808 addq $ 32 * SIZE, BO 809 subq $-16 * SIZE, AO 810 decq %rax 811 BRANCH 812 jg .L12 813#endif 814 815.L16: 816 movapd ALPHA, %xmm7 817 818#ifndef TRMMKERNEL 819 movq K, %rax 820#else 821 movq KKK, %rax 822#endif 823 andq $3, %rax # if (k & 1) 824 je .L19 825 826 leaq (, %rax, SIZE), %rax 827 leaq (AO, %rax, 4), AO 828 leaq (BO, %rax, 8), BO 829 negq %rax 830 ALIGN_3 831 832.L17: 833 mulpd %xmm0, %xmm1 834 addpd %xmm1, %xmm8 835 movapd -14 * SIZE(BO, %rax, 8), %xmm1 836 mulpd %xmm0, %xmm1 837 addpd %xmm1, %xmm9 838 movapd -12 * SIZE(BO, %rax, 8), %xmm1 839 mulpd %xmm0, %xmm1 840 mulpd -10 * SIZE(BO, %rax, 8), %xmm0 841 addpd %xmm1, %xmm10 842 movapd -16 * SIZE(BO, %rax, 8), %xmm1 843 addpd %xmm0, %xmm11 844 movapd -12 * SIZE(AO, %rax, 4), %xmm0 845 mulpd %xmm2, %xmm1 846 addpd %xmm1, %xmm12 847 movapd -14 * SIZE(BO, %rax, 8), %xmm1 848 mulpd %xmm2, %xmm1 849 addpd %xmm1, %xmm13 850 movapd -12 * SIZE(BO, %rax, 8), %xmm1 851 mulpd %xmm2, %xmm1 852 mulpd -10 * SIZE(BO, %rax, 8), %xmm2 853 addpd %xmm1, %xmm14 854 
movapd -8 * SIZE(BO, %rax, 8), %xmm1 855 addpd %xmm2, %xmm15 856 movapd -10 * SIZE(AO, %rax, 4), %xmm2 857 858 addq $SIZE, %rax 859 jl .L17 860 ALIGN_3 861 862.L19: 863 PREFETCH 8 * SIZE(BB) 864 subq $-12 * SIZE, BB 865 866#ifndef TRMMKERNEL 867 movsd 0 * SIZE(CO1), %xmm0 868 movhpd 1 * SIZE(CO1), %xmm0 869 movsd 2 * SIZE(CO1), %xmm1 870 movhpd 3 * SIZE(CO1), %xmm1 871 872 movsd 0 * SIZE(CO2), %xmm2 873 movhpd 1 * SIZE(CO2), %xmm2 874 movsd 2 * SIZE(CO2), %xmm3 875 movhpd 3 * SIZE(CO2), %xmm3 876#endif 877 878 mulpd %xmm7, %xmm8 879 mulpd %xmm7, %xmm9 880 mulpd %xmm7, %xmm10 881 mulpd %xmm7, %xmm11 882 883 mulpd %xmm7, %xmm12 884 mulpd %xmm7, %xmm13 885 mulpd %xmm7, %xmm14 886 mulpd %xmm7, %xmm15 887 888#ifndef TRMMKERNEL 889 movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 890 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 891 movlpd 2 * SIZE(CO1, LDC, 2), %xmm5 892 movhpd 3 * SIZE(CO1, LDC, 2), %xmm5 893 894 movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 895 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 896 movlpd 2 * SIZE(CO2, LDC, 2), %xmm7 897 movhpd 3 * SIZE(CO2, LDC, 2), %xmm7 898 899 addpd %xmm0, %xmm8 900 addpd %xmm1, %xmm12 901 addpd %xmm2, %xmm9 902 addpd %xmm3, %xmm13 903#endif 904 905 movlpd %xmm8, 0 * SIZE(CO1) 906 movhpd %xmm8, 1 * SIZE(CO1) 907 movlpd %xmm12, 2 * SIZE(CO1) 908 movhpd %xmm12, 3 * SIZE(CO1) 909 910 movlpd %xmm9, 0 * SIZE(CO2) 911 movhpd %xmm9, 1 * SIZE(CO2) 912 movlpd %xmm13, 2 * SIZE(CO2) 913 movhpd %xmm13, 3 * SIZE(CO2) 914 915#ifndef TRMMKERNEL 916 addpd %xmm4, %xmm10 917 addpd %xmm5, %xmm14 918 addpd %xmm6, %xmm11 919 addpd %xmm7, %xmm15 920#endif 921 922 movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) 923 movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) 924 movlpd %xmm14, 2 * SIZE(CO1, LDC, 2) 925 movhpd %xmm14, 3 * SIZE(CO1, LDC, 2) 926 927 movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) 928 movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) 929 movlpd %xmm15, 2 * SIZE(CO2, LDC, 2) 930 movhpd %xmm15, 3 * SIZE(CO2, LDC, 2) 931 932#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 933 (defined(TRMMKERNEL) 
&& !defined(LEFT) && !defined(TRANSA)) 934 movq K, %rax 935 subq KKK, %rax 936 leaq (,%rax, SIZE), %rax 937 leaq (AO, %rax, 4), AO 938 leaq (BO, %rax, 8), BO 939#endif 940 941#if defined(TRMMKERNEL) && defined(LEFT) 942 addq $4, KK 943#endif 944 945 addq $4 * SIZE, CO1 # coffset += 4 946 addq $4 * SIZE, CO2 # coffset += 4 947 decq I # i -- 948 BRANCH 949 jg .L11 950 ALIGN_3 951 952.L20: 953 testq $3, M 954 je .L39 955 956 testq $2, M 957 je .L30 958 ALIGN_3 959 960.L21: 961#if !defined(TRMMKERNEL) || \ 962 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 963 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 964 965 leaq BUFFER, BO 966#else 967 leaq BUFFER, BO 968 movq KK, %rax 969 leaq (, %rax, SIZE), %rax 970 leaq (AO, %rax, 2), AO 971 leaq (BO, %rax, 8), BO 972#endif 973 974 movapd -16 * SIZE(AO), %xmm0 975 pxor %xmm8, %xmm8 976 movapd 0 * SIZE(BO), %xmm1 977 pxor %xmm9, %xmm9 978 movapd -8 * SIZE(AO), %xmm2 979 pxor %xmm10, %xmm10 980 movapd 8 * SIZE(BO), %xmm3 981 pxor %xmm11, %xmm11 982 983 movapd 16 * SIZE(BO), %xmm5 984 movapd 24 * SIZE(BO), %xmm7 985 986#ifndef TRMMKERNEL 987 movq K, %rax 988#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 989 movq K, %rax 990 subq KK, %rax 991 movq %rax, KKK 992#else 993 movq KK, %rax 994#ifdef LEFT 995 addq $2, %rax 996#else 997 addq $4, %rax 998#endif 999 movq %rax, KKK 1000#endif 1001 sarq $3, %rax 1002 je .L25 1003 ALIGN_3 1004 1005.L22: 1006 mulpd %xmm0, %xmm1 1007 addpd %xmm1, %xmm8 1008 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1009 movapd 2 * SIZE(BO), %xmm1 1010 mulpd %xmm0, %xmm1 1011 addpd %xmm1, %xmm9 1012 movapd 4 * SIZE(BO), %xmm1 1013 mulpd %xmm0, %xmm1 1014 mulpd 6 * SIZE(BO), %xmm0 1015 addpd %xmm1, %xmm10 1016 movapd 32 * SIZE(BO), %xmm1 1017 addpd %xmm0, %xmm11 1018 movapd -14 * SIZE(AO), %xmm0 1019 1020 mulpd %xmm0, %xmm3 1021 addpd %xmm3, %xmm8 1022 movapd 10 * SIZE(BO), %xmm3 1023 mulpd %xmm0, %xmm3 1024 addpd %xmm3, %xmm9 1025 movapd 12 * SIZE(BO), 
%xmm3 1026 mulpd %xmm0, %xmm3 1027 mulpd 14 * SIZE(BO), %xmm0 1028 addpd %xmm3, %xmm10 1029 movapd 40 * SIZE(BO), %xmm3 1030 addpd %xmm0, %xmm11 1031 movapd -12 * SIZE(AO), %xmm0 1032 1033 mulpd %xmm0, %xmm5 1034 addpd %xmm5, %xmm8 1035 movapd 18 * SIZE(BO), %xmm5 1036 mulpd %xmm0, %xmm5 1037 addpd %xmm5, %xmm9 1038 movapd 20 * SIZE(BO), %xmm5 1039 mulpd %xmm0, %xmm5 1040 mulpd 22 * SIZE(BO), %xmm0 1041 addpd %xmm5, %xmm10 1042 movapd 48 * SIZE(BO), %xmm5 1043 addpd %xmm0, %xmm11 1044 movapd -10 * SIZE(AO), %xmm0 1045 1046 mulpd %xmm0, %xmm7 1047 addpd %xmm7, %xmm8 1048 movapd 26 * SIZE(BO), %xmm7 1049 mulpd %xmm0, %xmm7 1050 addpd %xmm7, %xmm9 1051 movapd 28 * SIZE(BO), %xmm7 1052 mulpd %xmm0, %xmm7 1053 mulpd 30 * SIZE(BO), %xmm0 1054 addpd %xmm7, %xmm10 1055 movapd 56 * SIZE(BO), %xmm7 1056 addpd %xmm0, %xmm11 1057 movapd 0 * SIZE(AO), %xmm0 1058 1059 PREFETCH (PREFETCHSIZE + 8) * SIZE(AO) 1060 mulpd %xmm2, %xmm1 1061 addpd %xmm1, %xmm8 1062 movapd 34 * SIZE(BO), %xmm1 1063 mulpd %xmm2, %xmm1 1064 addpd %xmm1, %xmm9 1065 movapd 36 * SIZE(BO), %xmm1 1066 mulpd %xmm2, %xmm1 1067 mulpd 38 * SIZE(BO), %xmm2 1068 addpd %xmm1, %xmm10 1069 movapd 64 * SIZE(BO), %xmm1 1070 addpd %xmm2, %xmm11 1071 movapd -6 * SIZE(AO), %xmm2 1072 1073 mulpd %xmm2, %xmm3 1074 addpd %xmm3, %xmm8 1075 movapd 42 * SIZE(BO), %xmm3 1076 mulpd %xmm2, %xmm3 1077 addpd %xmm3, %xmm9 1078 movapd 44 * SIZE(BO), %xmm3 1079 mulpd %xmm2, %xmm3 1080 mulpd 46 * SIZE(BO), %xmm2 1081 addpd %xmm3, %xmm10 1082 movapd 72 * SIZE(BO), %xmm3 1083 addpd %xmm2, %xmm11 1084 movapd -4 * SIZE(AO), %xmm2 1085 1086 mulpd %xmm2, %xmm5 1087 addpd %xmm5, %xmm8 1088 movapd 50 * SIZE(BO), %xmm5 1089 mulpd %xmm2, %xmm5 1090 addpd %xmm5, %xmm9 1091 movapd 52 * SIZE(BO), %xmm5 1092 mulpd %xmm2, %xmm5 1093 mulpd 54 * SIZE(BO), %xmm2 1094 addpd %xmm5, %xmm10 1095 movapd 80 * SIZE(BO), %xmm5 1096 addpd %xmm2, %xmm11 1097 movapd -2 * SIZE(AO), %xmm2 1098 1099 mulpd %xmm2, %xmm7 1100 addpd %xmm7, %xmm8 1101 movapd 58 * SIZE(BO), 
%xmm7 1102 mulpd %xmm2, %xmm7 1103 addpd %xmm7, %xmm9 1104 movapd 60 * SIZE(BO), %xmm7 1105 mulpd %xmm2, %xmm7 1106 mulpd 62 * SIZE(BO), %xmm2 1107 addpd %xmm7, %xmm10 1108 movapd 88 * SIZE(BO), %xmm7 1109 addpd %xmm2, %xmm11 1110 movapd 8 * SIZE(AO), %xmm2 1111 1112 addq $16 * SIZE, AO 1113 addq $64 * SIZE, BO 1114 decq %rax 1115 jne .L22 1116 ALIGN_3 1117 1118.L25: 1119#ifndef TRMMKERNEL 1120 movq K, %rax 1121#else 1122 movq KKK, %rax 1123#endif 1124 movapd ALPHA, %xmm7 1125 andq $7, %rax # if (k & 1) 1126 BRANCH 1127 je .L29 1128 ALIGN_3 1129 1130.L26: 1131 mulpd %xmm0, %xmm1 1132 addpd %xmm1, %xmm8 1133 movapd 2 * SIZE(BO), %xmm1 1134 mulpd %xmm0, %xmm1 1135 addpd %xmm1, %xmm9 1136 movapd 4 * SIZE(BO), %xmm1 1137 mulpd %xmm0, %xmm1 1138 mulpd 6 * SIZE(BO), %xmm0 1139 addpd %xmm1, %xmm10 1140 movapd 8 * SIZE(BO), %xmm1 1141 addpd %xmm0, %xmm11 1142 movapd -14 * SIZE(AO), %xmm0 1143 1144 addq $2 * SIZE, AO # aoffset += 4 1145 addq $8 * SIZE, BO # boffset1 += 8 1146 decq %rax 1147 jg .L26 1148 ALIGN_3 1149 1150.L29: 1151#ifndef TRMMKERNEL 1152 movlpd 0 * SIZE(CO1), %xmm0 1153 movhpd 1 * SIZE(CO1), %xmm0 1154 movlpd 0 * SIZE(CO2), %xmm2 1155 movhpd 1 * SIZE(CO2), %xmm2 1156 1157 movlpd 0 * SIZE(CO1, LDC, 2), %xmm4 1158 movhpd 1 * SIZE(CO1, LDC, 2), %xmm4 1159 movlpd 0 * SIZE(CO2, LDC, 2), %xmm6 1160 movhpd 1 * SIZE(CO2, LDC, 2), %xmm6 1161#endif 1162 mulpd %xmm7, %xmm8 1163 mulpd %xmm7, %xmm9 1164 mulpd %xmm7, %xmm10 1165 mulpd %xmm7, %xmm11 1166 1167#ifndef TRMMKERNEL 1168 addpd %xmm0, %xmm8 1169 addpd %xmm2, %xmm9 1170 addpd %xmm4, %xmm10 1171 addpd %xmm6, %xmm11 1172#endif 1173 1174 movlpd %xmm8, 0 * SIZE(CO1) 1175 movhpd %xmm8, 1 * SIZE(CO1) 1176 movlpd %xmm9, 0 * SIZE(CO2) 1177 movhpd %xmm9, 1 * SIZE(CO2) 1178 movlpd %xmm10, 0 * SIZE(CO1, LDC, 2) 1179 movhpd %xmm10, 1 * SIZE(CO1, LDC, 2) 1180 movlpd %xmm11, 0 * SIZE(CO2, LDC, 2) 1181 movhpd %xmm11, 1 * SIZE(CO2, LDC, 2) 1182 1183#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1184 
(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1185 movq K, %rax 1186 subq KKK, %rax 1187 leaq (,%rax, SIZE), %rax 1188 leaq (AO, %rax, 2), AO 1189 leaq (BO, %rax, 8), BO 1190#endif 1191 1192#if defined(TRMMKERNEL) && defined(LEFT) 1193 addq $2, KK 1194#endif 1195 1196 addq $2 * SIZE, CO1 # coffset += 4 1197 addq $2 * SIZE, CO2 # coffset += 4 1198 ALIGN_3 1199 1200.L30: 1201 testq $1, M 1202 je .L39 1203 ALIGN_3 1204 1205.L31: 1206#if !defined(TRMMKERNEL) || \ 1207 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1208 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1209 1210 leaq BUFFER, BO 1211#else 1212 leaq BUFFER, BO 1213 movq KK, %rax 1214 leaq (, %rax, SIZE), %rax 1215 leaq (AO, %rax, 1), AO 1216 leaq (BO, %rax, 8), BO 1217#endif 1218 1219 movsd -16 * SIZE(AO), %xmm0 1220 pxor %xmm8, %xmm8 1221 movsd 0 * SIZE(BO), %xmm1 1222 pxor %xmm9, %xmm9 1223 movsd -8 * SIZE(AO), %xmm2 1224 pxor %xmm10, %xmm10 1225 movsd 8 * SIZE(BO), %xmm3 1226 pxor %xmm11, %xmm11 1227 1228 movsd 16 * SIZE(BO), %xmm5 1229 movsd 24 * SIZE(BO), %xmm7 1230 1231#ifndef TRMMKERNEL 1232 movq K, %rax 1233#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1234 movq K, %rax 1235 subq KK, %rax 1236 movq %rax, KKK 1237#else 1238 movq KK, %rax 1239#ifdef LEFT 1240 addq $1, %rax 1241#else 1242 addq $4, %rax 1243#endif 1244 movq %rax, KKK 1245#endif 1246 sarq $3, %rax 1247 je .L35 1248 ALIGN_3 1249 1250.L32: 1251 mulsd %xmm0, %xmm1 1252 addsd %xmm1, %xmm8 1253 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1254 movsd 2 * SIZE(BO), %xmm1 1255 mulsd %xmm0, %xmm1 1256 addsd %xmm1, %xmm9 1257 movsd 4 * SIZE(BO), %xmm1 1258 mulsd %xmm0, %xmm1 1259 mulsd 6 * SIZE(BO), %xmm0 1260 addsd %xmm1, %xmm10 1261 movsd 32 * SIZE(BO), %xmm1 1262 addsd %xmm0, %xmm11 1263 movsd -15 * SIZE(AO), %xmm0 1264 1265 mulsd %xmm0, %xmm3 1266 addsd %xmm3, %xmm8 1267 movsd 10 * SIZE(BO), %xmm3 1268 mulsd %xmm0, %xmm3 1269 addsd %xmm3, %xmm9 1270 movsd 12 * SIZE(BO), 
%xmm3 1271 mulsd %xmm0, %xmm3 1272 mulsd 14 * SIZE(BO), %xmm0 1273 addsd %xmm3, %xmm10 1274 movsd 40 * SIZE(BO), %xmm3 1275 addsd %xmm0, %xmm11 1276 movsd -14 * SIZE(AO), %xmm0 1277 1278 mulsd %xmm0, %xmm5 1279 addsd %xmm5, %xmm8 1280 movsd 18 * SIZE(BO), %xmm5 1281 mulsd %xmm0, %xmm5 1282 addsd %xmm5, %xmm9 1283 movsd 20 * SIZE(BO), %xmm5 1284 mulsd %xmm0, %xmm5 1285 mulsd 22 * SIZE(BO), %xmm0 1286 addsd %xmm5, %xmm10 1287 movsd 48 * SIZE(BO), %xmm5 1288 addsd %xmm0, %xmm11 1289 movsd -13 * SIZE(AO), %xmm0 1290 1291 mulsd %xmm0, %xmm7 1292 addsd %xmm7, %xmm8 1293 movsd 26 * SIZE(BO), %xmm7 1294 mulsd %xmm0, %xmm7 1295 addsd %xmm7, %xmm9 1296 movsd 28 * SIZE(BO), %xmm7 1297 mulsd %xmm0, %xmm7 1298 mulsd 30 * SIZE(BO), %xmm0 1299 addsd %xmm7, %xmm10 1300 movsd 56 * SIZE(BO), %xmm7 1301 addsd %xmm0, %xmm11 1302 movsd -12 * SIZE(AO), %xmm0 1303 1304 mulsd %xmm0, %xmm1 1305 addsd %xmm1, %xmm8 1306 movsd 34 * SIZE(BO), %xmm1 1307 mulsd %xmm0, %xmm1 1308 addsd %xmm1, %xmm9 1309 movsd 36 * SIZE(BO), %xmm1 1310 mulsd %xmm0, %xmm1 1311 mulsd 38 * SIZE(BO), %xmm0 1312 addsd %xmm1, %xmm10 1313 movsd 64 * SIZE(BO), %xmm1 1314 addsd %xmm0, %xmm11 1315 movsd -11 * SIZE(AO), %xmm0 1316 1317 mulsd %xmm0, %xmm3 1318 addsd %xmm3, %xmm8 1319 movsd 42 * SIZE(BO), %xmm3 1320 mulsd %xmm0, %xmm3 1321 addsd %xmm3, %xmm9 1322 movsd 44 * SIZE(BO), %xmm3 1323 mulsd %xmm0, %xmm3 1324 mulsd 46 * SIZE(BO), %xmm0 1325 addsd %xmm3, %xmm10 1326 movsd 72 * SIZE(BO), %xmm3 1327 addsd %xmm0, %xmm11 1328 movsd -10 * SIZE(AO), %xmm0 1329 1330 mulsd %xmm0, %xmm5 1331 addsd %xmm5, %xmm8 1332 movsd 50 * SIZE(BO), %xmm5 1333 mulsd %xmm0, %xmm5 1334 addsd %xmm5, %xmm9 1335 movsd 52 * SIZE(BO), %xmm5 1336 mulsd %xmm0, %xmm5 1337 mulsd 54 * SIZE(BO), %xmm0 1338 addsd %xmm5, %xmm10 1339 movsd 80 * SIZE(BO), %xmm5 1340 addsd %xmm0, %xmm11 1341 movsd -9 * SIZE(AO), %xmm0 1342 1343 mulsd %xmm0, %xmm7 1344 addsd %xmm7, %xmm8 1345 movsd 58 * SIZE(BO), %xmm7 1346 mulsd %xmm0, %xmm7 1347 addsd %xmm7, %xmm9 1348 movsd 
60 * SIZE(BO), %xmm7 1349 mulsd %xmm0, %xmm7 1350 mulsd 62 * SIZE(BO), %xmm0 1351 addsd %xmm7, %xmm10 1352 movsd 88 * SIZE(BO), %xmm7 1353 addsd %xmm0, %xmm11 1354 movsd -8 * SIZE(AO), %xmm0 1355 1356 addq $ 8 * SIZE, AO 1357 addq $64 * SIZE, BO 1358 decq %rax 1359 jne .L32 1360 ALIGN_3 1361 1362.L35: 1363#ifndef TRMMKERNEL 1364 movq K, %rax 1365#else 1366 movq KKK, %rax 1367#endif 1368 movsd ALPHA, %xmm7 1369 andq $7, %rax # if (k & 1) 1370 BRANCH 1371 je .L38 1372 ALIGN_3 1373 1374.L36: 1375 mulsd %xmm0, %xmm1 1376 addsd %xmm1, %xmm8 1377 movsd 2 * SIZE(BO), %xmm1 1378 mulsd %xmm0, %xmm1 1379 addsd %xmm1, %xmm9 1380 movsd 4 * SIZE(BO), %xmm1 1381 mulsd %xmm0, %xmm1 1382 mulsd 6 * SIZE(BO), %xmm0 1383 addsd %xmm1, %xmm10 1384 movsd 8 * SIZE(BO), %xmm1 1385 addsd %xmm0, %xmm11 1386 movsd -15 * SIZE(AO), %xmm0 1387 1388 addq $1 * SIZE, AO # aoffset += 4 1389 addq $8 * SIZE, BO # boffset1 += 8 1390 decq %rax 1391 jg .L36 1392 ALIGN_3 1393 1394.L38: 1395#ifndef TRMMKERNEL 1396 movsd 0 * SIZE(CO1), %xmm0 1397 movsd 0 * SIZE(CO2), %xmm2 1398 movsd 0 * SIZE(CO1, LDC, 2), %xmm4 1399 movsd 0 * SIZE(CO2, LDC, 2), %xmm6 1400#endif 1401 1402 mulsd %xmm7, %xmm8 1403 mulsd %xmm7, %xmm9 1404 mulsd %xmm7, %xmm10 1405 mulsd %xmm7, %xmm11 1406 1407#ifndef TRMMKERNEL 1408 addsd %xmm0, %xmm8 1409 addsd %xmm2, %xmm9 1410 addsd %xmm4, %xmm10 1411 addsd %xmm6, %xmm11 1412#endif 1413 1414 movsd %xmm8, 0 * SIZE(CO1) 1415 movsd %xmm9, 0 * SIZE(CO2) 1416 movsd %xmm10, 0 * SIZE(CO1, LDC, 2) 1417 movsd %xmm11, 0 * SIZE(CO2, LDC, 2) 1418 1419#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1420 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1421 movq K, %rax 1422 subq KKK, %rax 1423 leaq (,%rax, SIZE), %rax 1424 leaq (AO, %rax, 1), AO 1425 leaq (BO, %rax, 8), BO 1426#endif 1427 1428#if defined(TRMMKERNEL) && defined(LEFT) 1429 addq $1, KK 1430#endif 1431 ALIGN_3 1432 1433.L39: 1434#if defined(TRMMKERNEL) && !defined(LEFT) 1435 addl $4, KK 1436#endif 1437 1438 
/*
 * Fragment: close of the 4-column j loop, then the N & 2 section.
 * .L41-.L44 pack two columns of B into BUFFER, .L50/.L51 set up a
 * 4-row x 2-column tile, and .L52 begins its 8x-unrolled k loop.
 * This fragment ends inside the .L52 body; the statements that follow
 * it in the file complete that loop pass.
 */
	leaq	(C, LDC, 4), C		# c += 4 * ldc
	decq	J			# j --
	jg	.L01
	ALIGN_3

.L40:
	/* No columns remain when the low two bits of N are clear. */
	testq	$3, N
	je	.L999

	/* Bit 1 of N clear: jump over the 2-column section. */
	testq	$2, N
	je	.L80
	ALIGN_4

.L41:
/* Copying to Sub Buffer */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	/* Reload KK from OFFSET. */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* rax = K >> 2 passes of the 8-value copy loop below. */
	movq	K, %rax
	sarq	$2, %rax
	jle	.L43
	ALIGN_3

.L42:
	/* Read 8 values from B, then store each one twice, side by side. */
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

	/* BO was already advanced, hence the negative offsets. */
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L42
	ALIGN_3

.L43:
	/* Remaining K & 3 steps: two values from B per step. */
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L50
	ALIGN_3

.L44:
	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)
	movq	%mm1,  2 * SIZE(BO)
	movq	%mm1,  3 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq	%rax
	jne	.L44
	ALIGN_3

.L50:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO			# aoffset = a

	movq	M,  I
	sarq	$2, I			# i = (m >> 2)
	jle	.L60
	ALIGN_3

.L51:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 4 A values and 4 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	/*
	 * Clear the four accumulators and preload the first A and B
	 * operands.  A is addressed from -16 * SIZE(AO) -- presumably AO
	 * carries a 16 * SIZE bias applied earlier in the file; confirm.
	 */
	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	  8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13

	movapd	  0 * SIZE(AO), %xmm4
	movapd	 16 * SIZE(BO), %xmm5
	movapd	  8 * SIZE(AO), %xmm6
	movapd	 24 * SIZE(BO), %xmm7

	PREFETCHW      4 * SIZE(CO1)
	PREFETCHW      4 * SIZE(CO2)

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	/* Unrolled-loop pass count = k-steps >> 3; zero goes to the remainder code. */
	sarq	$3, %rax
	je	.L55
	ALIGN_3

.L52:
	/*
	 * xmm0/xmm2/xmm4/xmm6 hold A pairs and xmm1/xmm3/xmm5/xmm7 hold
	 * BUFFER operands; products are summed into xmm8, xmm9, xmm12 and
	 * xmm13.  Loads for later groups are interleaved with the
	 * arithmetic.
	 */
	mulpd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	16 * SIZE(AO), %xmm0

	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
/*
 * Fragment: interior of the 8x-unrolled .L52 loop body (4-row x
 * 2-column tile).  It starts and stops mid-pass: the loop label comes
 * before this fragment, the pointer updates and loop branch after it.
 * Each group multiplies an A pair by two BUFFER operands, adds the
 * products into xmm8/xmm9 or xmm12/xmm13, and reloads registers for a
 * later group.  Loads are interleaved with the arithmetic.
 */
	mulpd	%xmm2, %xmm3
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	 -4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	24 * SIZE(AO), %xmm2

	/* Groups fed by xmm4 (A) and xmm5 (BUFFER). */
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	16 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	  2 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	18 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	  4 * SIZE(AO), %xmm4

	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm9
	movapd	  6 * SIZE(AO), %xmm4
	mulpd	%xmm4, %xmm5
	mulpd	22 * SIZE(BO), %xmm4
	addpd	%xmm5, %xmm12
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm4, %xmm13
	movapd	 32 * SIZE(AO), %xmm4

	/* Groups fed by xmm6 (A) and xmm7 (BUFFER). */
	PREFETCH  (PREFETCHSIZE + 24) * SIZE(AO)
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	24 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	 10 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	26 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm13
	movapd	 12 * SIZE(AO), %xmm6

	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm9
	movapd	 14 * SIZE(AO), %xmm6
	mulpd	%xmm6, %xmm7
	mulpd	30 * SIZE(BO), %xmm6
	addpd	%xmm7, %xmm12
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm6, %xmm13
	movapd	 40 * SIZE(AO), %xmm6
/*
 * Fragment: bottom of the .L52 loop, then the remainder loop and
 * write-back of the 4-row x 2-column tile (.L55-.L59), the complete
 * 2-row x 2-column tile (.L60-.L69) and the start of the 1-row x
 * 2-column tile (.L70-.L72).  The fragment ends inside the .L72 body.
 */
	addq   $32 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L52
	ALIGN_3

.L55:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	/* 16-byte load of ALPHA -- presumably the scalar stored in both lanes; confirm. */
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L59
	ALIGN_3

.L56:
	/* One k-step: 4 A values against the 2 duplicated B values. */
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	movapd	 0 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm12
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	jg	.L56
	ALIGN_3

.L59:
	/*
	 * Write-back: scale the accumulators by xmm7 (ALPHA); the non-TRMM
	 * build also adds the values already stored at CO1 / CO2.
	 */
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
	movsd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm7, %xmm12
	mulpd	%xmm7, %xmm13

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm12
	addpd	%xmm2,  %xmm9
	addpd	%xmm3,  %xmm13
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)
	movsd	%xmm13, 2 * SIZE(CO2)
	movhpd	%xmm13, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO over the K - KKK k-steps this tile did not use. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L51
	ALIGN_3

.L60:
	/* 2-row x 2-column tile, taken when bit 1 of M is set. */
	testq	$2, M
	je	.L70
	ALIGN_3

.L61:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 2 A values and 4 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	 16 * SIZE(BO), %xmm5
	movapd	 24 * SIZE(BO), %xmm7

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
	/* Both branches add 2 for this tile. */
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L65
	ALIGN_3

.L62:
	/*
	 * 8 k-steps per pass.  Sums alternate between xmm8/xmm9 and
	 * xmm10/xmm11; the two pairs are added together at .L69.
	 */
	mulpd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm1
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	32 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	10 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	12 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0

	mulpd	%xmm0, %xmm3
	mulpd	14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	40 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0

	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm5
	mulpd	18 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm8
	movapd	20 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm5
	mulpd	22 * SIZE(BO), %xmm2
	addpd	%xmm5, %xmm10
	movapd	48 * SIZE(BO), %xmm5
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	26 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm8
	movapd	28 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2

	mulpd	%xmm2, %xmm7
	mulpd	30 * SIZE(BO), %xmm2
	addpd	%xmm7, %xmm10
	movapd	56 * SIZE(BO), %xmm7
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L62
	ALIGN_3

.L65:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L69
	ALIGN_3

.L66:
	mulpd	%xmm0, %xmm1
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	 4 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	jg	.L66
	ALIGN_3

.L69:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
	movhpd	1 * SIZE(CO2), %xmm2
#endif

	/* Fold the second accumulator pair into the first. */
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm2,  %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)
	movhpd	%xmm9, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO over the K - KKK k-steps this tile did not use. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$2 * SIZE, CO1		# coffset += 2
	addq	$2 * SIZE, CO2		# coffset += 2
	ALIGN_3

.L70:
	/* 1-row x 2-column tile, taken when bit 0 of M is set. */
	testq	$1, M
	je	.L79
	ALIGN_3

.L71:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 1 A value and 4 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	  8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movsd	 16 * SIZE(BO), %xmm5
	movsd	 24 * SIZE(BO), %xmm7

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L75
	ALIGN_3

.L72:
	/* Scalar 8x-unrolled loop; this fragment holds its first part. */
	mulsd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 4 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm1
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm10
	movsd	32 * SIZE(BO), %xmm1
	addsd	%xmm0, %xmm11
	movsd	-14 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	10 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm8
	movsd	12 * SIZE(BO), %xmm3
	addsd	%xmm0, %xmm9
	movsd	-13 * SIZE(AO), %xmm0

	mulsd	%xmm0, %xmm3
	mulsd	14 * SIZE(BO), %xmm0
	addsd	%xmm3, %xmm10
	movsd	40 * SIZE(BO), %xmm3
/*
 * Fragment: second part of the 8x-unrolled .L72 loop (1-row x
 * 2-column tile), then its remainder loop (.L75/.L76) and write-back
 * (.L78).  The fragment starts mid-pass; the .L72 label and the first
 * groups of the body come before it.
 */
	addsd	%xmm0, %xmm11
	movsd	 -8 * SIZE(AO), %xmm0

	mulsd	%xmm2, %xmm5
	mulsd	18 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm8
	movsd	20 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm9
	movsd	-11 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm5
	mulsd	22 * SIZE(BO), %xmm2
	addsd	%xmm5, %xmm10
	movsd	48 * SIZE(BO), %xmm5
	addsd	%xmm2, %xmm11
	movsd	-10 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	26 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm8
	movsd	28 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm9
	movsd	 -9 * SIZE(AO), %xmm2

	mulsd	%xmm2, %xmm7
	mulsd	30 * SIZE(BO), %xmm2
	addsd	%xmm7, %xmm10
	movsd	56 * SIZE(BO), %xmm7
	addsd	%xmm2, %xmm11
	movsd	 -4 * SIZE(AO), %xmm2

	/* One pass consumed 8 A values and 32 BUFFER slots. */
	addq   $ 8 * SIZE, AO
	addq   $32 * SIZE, BO
	decq   %rax
	jne    .L72
	ALIGN_3

.L75:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L78
	ALIGN_3

.L76:
	/* One k-step; xmm0 / xmm1 are reloaded for the next step before the pointers move. */
	mulsd	%xmm0, %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm1, %xmm8
	addsd	%xmm0, %xmm9
	movsd	-15 * SIZE(AO), %xmm0
	movsd	 4 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$4 * SIZE, BO		# boffset += 4
	decq	%rax
	jg	.L76
	ALIGN_3

.L78:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movsd	0 * SIZE(CO2), %xmm2
#endif

	/* Fold the second accumulator pair into the first, then scale by xmm7 (ALPHA). */
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9

	mulsd	%xmm7, %xmm8
	mulsd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addsd	%xmm0,  %xmm8
	addsd	%xmm2,  %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movsd	%xmm9, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO over the K - KKK k-steps this tile did not use. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif
/*
 * Fragment: end of the 1-row x 2-column tile and of the N & 2 section
 * (.L79), then the N & 1 section -- pack one column of B into BUFFER
 * (.L81-.L84), the 4-row x 1-column tile (.L90-.L99) and the
 * 8x-unrolled loop of the 2-row x 1-column tile (.L100-.L102).  The
 * remainder code for that last tile follows this fragment.
 */

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_3

.L79:
#if defined(TRMMKERNEL) && !defined(LEFT)
	/*
	 * KK is a 64-bit stack slot: it is read with movq and bumped with
	 * addq on the LEFT path.  Use a 64-bit add here too; a 32-bit add
	 * would update only the low half of the slot and drop any carry
	 * into the high half.
	 */
	addq	$2, KK
#endif
	leaq	(C, LDC, 2), C		# c += 2 * ldc
	ALIGN_3

.L80:
	/* Single-column section, taken when bit 0 of N is set. */
	testq	$1, N
	je	.L999
	ALIGN_4

.L81:
/* Copying to Sub Buffer */
	leaq	BUFFER, BO

#if defined(TRMMKERNEL) && defined(LEFT)
	/* Reload KK from OFFSET. */
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	/* rax = K >> 3 passes of the 8-value copy loop below. */
	movq	K, %rax
	sarq	$3, %rax
	jle	.L83
	ALIGN_3

.L82:
	/* Read 8 values from B, then store each one twice, side by side. */
	PREFETCH	 56 * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	 1 * SIZE(B), %mm1
	movq	 2 * SIZE(B), %mm2
	movq	 3 * SIZE(B), %mm3
	movq	 4 * SIZE(B), %mm4
	movq	 5 * SIZE(B), %mm5
	movq	 6 * SIZE(B), %mm6
	movq	 7 * SIZE(B), %mm7

	addq	$ 8 * SIZE, B
	addq	$16 * SIZE, BO

	/* BO was already advanced, hence the negative offsets. */
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	decq	%rax
	jne	.L82
	ALIGN_3

.L83:
	/* Remaining K & 7 steps: one value from B per step. */
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L90
	ALIGN_3

.L84:
	movq	 0 * SIZE(B), %mm0

	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)

	addq	$1 * SIZE, B
	addq	$2 * SIZE, BO
	decq	%rax
	jne	.L84
	ALIGN_3

.L90:
	movq	C, CO1			# coffset1 = c
	movq	A, AO			# aoffset = a

	movq	M,  I
	sarq	$2, I			# i = (m >> 2)
	jle	.L100
	ALIGN_3

.L91:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 4 A values and 2 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	movapd	  0 * SIZE(AO), %xmm4
	movapd	  8 * SIZE(AO), %xmm6

	PREFETCHW      4 * SIZE(CO1)

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L95
	ALIGN_3

.L92:
	/*
	 * 8 k-steps per pass.  Here the BUFFER operand (xmm1 / xmm3) is
	 * multiplied against two A pairs per k-step; sums alternate
	 * between xmm8/xmm9 and xmm10/xmm11 and are combined at .L99.
	 */
	mulpd	%xmm1, %xmm0
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm0
	mulpd	-10 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm10
	movapd	 16 * SIZE(AO), %xmm0
	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	 -6 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm8
	movapd	 -4 * SIZE(AO), %xmm2
	addpd	%xmm1, %xmm9
	movapd	  6 * SIZE(BO), %xmm1
	mulpd	%xmm1, %xmm2
	mulpd	 -2 * SIZE(AO), %xmm1
	addpd	%xmm2, %xmm10
	movapd	 24 * SIZE(AO), %xmm2
	PREFETCH  (PREFETCHSIZE + 16) * SIZE(AO)
	addpd	%xmm1, %xmm11
	movapd	 16 * SIZE(BO), %xmm1
	mulpd	%xmm3, %xmm4
	mulpd	  2 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm8
	movapd	  4 * SIZE(AO), %xmm4
	addpd	%xmm3, %xmm9
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm4
	mulpd	  6 * SIZE(AO), %xmm3
	addpd	%xmm4, %xmm10
	movapd	 32 * SIZE(AO), %xmm4
	PREFETCH  (PREFETCHSIZE + 24) * SIZE(AO)
	addpd	%xmm3, %xmm11
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	 10 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm8
	movapd	 12 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm9
	movapd	 14 * SIZE(BO), %xmm3
	mulpd	%xmm3, %xmm6
	mulpd	 14 * SIZE(AO), %xmm3
	addpd	%xmm6, %xmm10
	movapd	 40 * SIZE(AO), %xmm6
	addpd	%xmm3, %xmm11
	movapd	 24 * SIZE(BO), %xmm3

	addq   $32 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L92
	ALIGN_3

.L95:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L99
	ALIGN_3

.L96:
	mulpd	%xmm1, %xmm0
	mulpd	-14 * SIZE(AO), %xmm1
	addpd	%xmm0, %xmm8
	movapd	-12 * SIZE(AO), %xmm0
	addpd	%xmm1, %xmm9
	movapd	  2 * SIZE(BO), %xmm1

	addq	$4 * SIZE, AO		# aoffset += 4
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L96
	ALIGN_3

.L99:
#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movsd	2 * SIZE(CO1), %xmm1
	movhpd	3 * SIZE(CO1), %xmm1
#endif

	/* Fold the second accumulator pair into the first, then scale by xmm7 (ALPHA). */
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

	mulpd	%xmm7, %xmm8
	mulpd	%xmm7, %xmm9

#ifndef TRMMKERNEL
	addpd	%xmm0,  %xmm8
	addpd	%xmm1,  %xmm9
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	movsd	%xmm9, 2 * SIZE(CO1)
	movhpd	%xmm9, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO over the K - KKK k-steps this tile did not use. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L91
	ALIGN_3

.L100:
	/* 2-row x 1-column tile, taken when bit 1 of M is set. */
	testq	$2, M
	je	.L110
	ALIGN_3

.L101:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 2 A values and 2 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L105
	ALIGN_3

.L102:
	/*
	 * 8 k-steps per pass, spread over four accumulators
	 * (xmm8-xmm11).  AO advances 16 * SIZE per pass.
	 */
	mulpd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	 2 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	 4 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm10
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	 6 * SIZE(BO), %xmm0
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	/*
	 * Second hint of the pass is 8 elements past the first, matching
	 * the sibling loop .L62, so the two hints do not target the same
	 * address.
	 */
	PREFETCH  (PREFETCHSIZE +  8) * SIZE(AO)
	mulpd	%xmm2, %xmm3
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	12 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm10
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	14 * SIZE(BO), %xmm2
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	addq   $16 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L102
/*
 * Fragment: remainder loop and write-back of the 2-row x 1-column
 * tile (.L105-.L109), the complete 1-row x 1-column tile
 * (.L110-.L118), and the common exit path (.L999).
 */
	ALIGN_3

.L105:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L109
	ALIGN_3

.L106:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(AO), %xmm0
	movapd	  2 * SIZE(BO), %xmm1

	addq	$2 * SIZE, AO		# aoffset += 2
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L106
	ALIGN_3

.L109:
	/* Reduce the four accumulators into xmm8, then scale by xmm7 (ALPHA). */
	addpd	%xmm9, %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm10, %xmm8

	mulpd	%xmm7, %xmm8

#ifndef TRMMKERNEL
	movsd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0

	addpd	%xmm0,  %xmm8
#endif

	movsd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	addq	$2 * SIZE, CO1		# coffset += 2

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	/* Advance AO and BO over the K - KKK k-steps this tile did not use. */
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 2), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif
	ALIGN_3

.L110:
	/* 1-row x 1-column tile, taken when bit 0 of M is set. */
	testq	$1, M
	je	.L999
	ALIGN_3

.L111:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	/* Skip the first KK k-steps: 1 A value and 2 BUFFER slots each. */
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 2), BO
#endif

	movsd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movsd	  0 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movsd	-12 * SIZE(AO), %xmm2
	pxor	%xmm10, %xmm10
	movsd	  8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

	/* rax = k-steps for this tile; the TRMM variants also store it in KKK. */
#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
	/* Both branches add 1 for this tile. */
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L115
	ALIGN_3

.L112:
	/* Scalar loop, 8 k-steps per pass, spread over xmm8-xmm11. */
	mulsd	%xmm0, %xmm1
	PREFETCH  (PREFETCHSIZE +  0) * SIZE(AO)
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	16 * SIZE(BO), %xmm1
	mulsd	 2 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm9
	movsd	-14 * SIZE(AO), %xmm0
	mulsd	 4 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm10
	movsd	-13 * SIZE(AO), %xmm0
	mulsd	 6 * SIZE(BO), %xmm0
	addsd	%xmm0, %xmm11
	movsd	 -8 * SIZE(AO), %xmm0
	mulsd	%xmm2, %xmm3
	movsd	-11 * SIZE(AO), %xmm2
	addsd	%xmm3, %xmm8
	movsd	24 * SIZE(BO), %xmm3
	mulsd	10 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm9
	movsd	-10 * SIZE(AO), %xmm2
	mulsd	12 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm10
	movsd	 -9 * SIZE(AO), %xmm2
	mulsd	14 * SIZE(BO), %xmm2
	addsd	%xmm2, %xmm11
	movsd	 -4 * SIZE(AO), %xmm2

	addq   $ 8 * SIZE, AO
	addq   $16 * SIZE, BO
	decq   %rax
	jne    .L112
	ALIGN_3

.L115:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movsd	ALPHA, %xmm7
	andq	$7, %rax		# k-steps left over from the 8x-unrolled loop
	BRANCH
	je .L118
	ALIGN_3

.L116:
	mulsd	%xmm0, %xmm1
	movsd	-15 * SIZE(AO), %xmm0
	addsd	%xmm1, %xmm8
	movsd	 2 * SIZE(BO), %xmm1

	addq	$1 * SIZE, AO		# aoffset += 1
	addq	$2 * SIZE, BO		# boffset += 2
	decq	%rax
	jg	.L116
	ALIGN_3

.L118:
	/* Reduce the four accumulators into xmm8, then scale by xmm7 (ALPHA). */
	addsd	%xmm10, %xmm8
	addsd	%xmm11, %xmm9
	addsd	%xmm9, %xmm8

	mulsd	%xmm7, %xmm8
#ifndef TRMMKERNEL
	addsd	0 * SIZE(CO1), %xmm8
#endif
	movsd	%xmm8, 0 * SIZE(CO1)
	ALIGN_3

.L999:
	/*
	 * Exit path.  rsp is reloaded from rbx -- presumably the stack
	 * pointer the prologue saved before setting up BUFFER; confirm
	 * against the prologue.
	 */
	movq	%rbx, %rsp

	/* The packing loops used %mm0-%mm7; EMMS presumably expands to the instruction that leaves MMX state -- confirm in common.h. */
	EMMS

	/* Reload the registers saved in the frame at 0..40(%rsp). */
	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	/* Windows builds also reload rdi, rsi and xmm6-xmm15. */
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	/* Release the STACKSIZE-byte frame and return. */
	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE