/*********************************************************************/
/*                                                                   */
/*  Optimized BLAS libraries                                         */
/*  By Kazushige Goto <kgoto@tacc.utexas.edu>                        */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

#define ASSEMBLER
#include "common.h"

#define OLD_M	%rdi
#define OLD_N	%rsi
#define M	%r13
#define N	%r14
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10
#define I	%r11
#define AO	%rdi
#define BO	%rsi
#define CO1	%r15
#define CO2	%rbp
#define BB	%r12

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	40 + STACKSIZE(%rsp)
#define OLD_A		48 + STACKSIZE(%rsp)
#define OLD_B		56 + STACKSIZE(%rsp)
#define OLD_C		64 + STACKSIZE(%rsp)
#define OLD_LDC		72 + STACKSIZE(%rsp)
#define OLD_OFFSET	80 + STACKSIZE(%rsp)

#endif

#define POSINV	  0(%rsp)
#define ALPHA_R	 16(%rsp)
#define ALPHA_I	 32(%rsp)
#define J	 48(%rsp)
#define OFFSET	 56(%rsp)
#define KK	 64(%rsp)
#define KKK	 72(%rsp)
#define BUFFER	256(%rsp)

#ifdef OPTERON
#define PREFETCH     prefetch
#define PREFETCHW    prefetchw
#define PREFETCHSIZE (8 * 9 + 4)

#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)
#endif

#ifdef GENERIC
#define PREFETCH     prefetcht0
#define PREFETCHW    prefetcht0
#define PREFETCHSIZE (8 * 5 + 4)

#define RPREFETCHSIZE (8 * 7 + 4)
#define WPREFETCHSIZE (8 * 8 + 4)
#endif
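
/* KERNEL1..KERNEL8 below are one 8-way unrolled, software-pipelined step
 * of the inner K loop for a 2 x 2 block of double-complex results.
 * xmm0, xmm2, xmm4 and xmm6 stream packed (real, imaginary) pairs from
 * the A panel, xmm1, xmm3, xmm5 and xmm7 hold B values that the copy
 * loops duplicated into BUFFER, and xmm8-xmm15 carry two partial sums
 * per C entry; the pairs are recombined, sign-corrected through POSINV
 * and scaled by ALPHA_R/ALPHA_I once the loop finishes (around
 * .L16/.L19). */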

#ifndef GENERIC
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 -8 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 -6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 -8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 -4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO, %rax, 4) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 -2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	  0 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 16 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	  2 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	  4 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 24 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 18 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 20 * SIZE + 2 * (xx) * SIZE(BO, %rax, 8), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	  6 * SIZE + 1 * (xx) * SIZE(AO, %rax, 4), %xmm6

#else
#define KERNEL1(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	-16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	-14 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	PREFETCH (PREFETCHSIZE + 0) * SIZE + 1 * (xx) * SIZE(AO) ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	-12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	 -8 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL2(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	-10 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	 -6 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL3(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	 -8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 -6 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 -4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	 -4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL4(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 -2 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	PREFETCH (PREFETCHSIZE + 8) * SIZE + 1 * (xx) * SIZE(AO) ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	 -2 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#define KERNEL5(xx) \
	mulpd	%xmm0, %xmm1 ;\
	addpd	%xmm1, %xmm8 ;\
	movapd	  0 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm0, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	  2 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm0, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO), %xmm0 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	  4 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm0, %xmm11 ;\
	movapd	  0 * SIZE + 1 * (xx) * SIZE(AO), %xmm0

#define KERNEL6(xx) \
	mulpd	%xmm2, %xmm1 ;\
	addpd	%xmm1, %xmm12 ;\
	movapd	 16 * SIZE + 2 * (xx) * SIZE(BO), %xmm1 ;\
	mulpd	%xmm2, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm2, %xmm5 ;\
	mulpd	  6 * SIZE + 2 * (xx) * SIZE(BO), %xmm2 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm2, %xmm15 ;\
	movapd	  2 * SIZE + 1 * (xx) * SIZE(AO), %xmm2

#define KERNEL7(xx) \
	mulpd	%xmm4, %xmm7 ;\
	addpd	%xmm7, %xmm8 ;\
	movapd	  8 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm4, %xmm3 ;\
	addpd	%xmm3, %xmm9 ;\
	movapd	 10 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm4, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm4 ;\
	addpd	%xmm5, %xmm10 ;\
	movapd	 12 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm4, %xmm11 ;\
	movapd	  4 * SIZE + 1 * (xx) * SIZE(AO), %xmm4

#define KERNEL8(xx) \
	mulpd	%xmm6, %xmm7 ;\
	addpd	%xmm7, %xmm12 ;\
	movapd	 24 * SIZE + 2 * (xx) * SIZE(BO), %xmm7 ;\
	mulpd	%xmm6, %xmm3 ;\
	addpd	%xmm3, %xmm13 ;\
	movapd	 18 * SIZE + 2 * (xx) * SIZE(BO), %xmm3 ;\
	mulpd	%xmm6, %xmm5 ;\
	mulpd	 14 * SIZE + 2 * (xx) * SIZE(BO), %xmm6 ;\
	addpd	%xmm5, %xmm14 ;\
	movapd	 20 * SIZE + 2 * (xx) * SIZE(BO), %xmm5 ;\
	addpd	%xmm6, %xmm15 ;\
	movapd	  6 * SIZE + 1 * (xx) * SIZE(AO), %xmm6

#endif
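
/* The routine below updates C two columns (CO1/CO2) and two rows of A at
 * a time, with the single-row and single-column remainders handled after
 * .L30 and .L100.  Per C entry the arithmetic amounts to the following
 * C-style sketch (illustrative only; the conjugation variants selected by
 * NN/.../CC and the packed buffer layouts are left out, and ar/ai, br/bi
 * stand for the real/imaginary streams of A and B; for TRMMKERNEL the
 * final += into C is a plain store):
 *
 *     double tr = 0.0, ti = 0.0;
 *     for (l = 0; l < k; l++) {              // sum over the K dimension
 *         tr += ar[l] * br[l] - ai[l] * bi[l];
 *         ti += ar[l] * bi[l] + ai[l] * br[l];
 *     }
 *     c_re += alpha_r * tr - alpha_i * ti;   // scale by alpha, update C
 *     c_im += alpha_r * ti + alpha_i * tr;
 */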

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	movq	%rbx,   0(%rsp)
	movq	%rbp,   8(%rsp)
	movq	%r12,  16(%rsp)
	movq	%r13,  24(%rsp)
	movq	%r14,  32(%rsp)
	movq	%r15,  40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1, OLD_M
	movq	ARG2, OLD_N
	movq	ARG3, K
	movq	OLD_A, A
	movq	OLD_B, B
	movq	OLD_C, C
	movq	OLD_LDC, LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm12
#endif
	movaps	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm12
#endif

#endif

	EMMS

	movq	%rsp, %rbx	# save old stack
	subq	$256 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	movq	OLD_M, M
	movq	OLD_N, N

	pcmpeqb	%xmm7, %xmm7
	psllq	$63, %xmm7	# Generate mask
	pxor	%xmm10, %xmm10

	movlpd	%xmm0, 0 + ALPHA_R
	movlpd	%xmm0, 8 + ALPHA_R

	movlpd	%xmm1, 8 + ALPHA_I
	xorpd	%xmm7, %xmm1
	movlpd	%xmm1, 0 + ALPHA_I

	movlpd	%xmm10, 0 + POSINV
	movlpd	%xmm7,  8 + POSINV

#ifdef TRMMKERNEL
	movlpd	%xmm12, OFFSET
	movlpd	%xmm12, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	subq	$-16 * SIZE, A

	salq	$ZBASE_SHIFT, LDC

	movq	N,  J
	sarq	$1, J			# j = (n >> 1)
	jle	.L100
	ALIGN_4

.L01:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc

#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

	leaq	16 * SIZE + BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4
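
/* Expand the current strip of B into BUFFER: each double of B is written
 * to two consecutive slots of BO, so one aligned movapd later delivers
 * the value duplicated in both lanes and can be multiplied straight
 * against a packed (real, imaginary) pair of A.  The MMX registers are
 * used here only as 64-bit copy temporaries (hence the EMMS above and in
 * the epilogue). */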

.L02:
	PREFETCH	(RPREFETCHSIZE +  0) * SIZE(B)

	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	PREFETCHW	(WPREFETCHSIZE +  0) * SIZE(BO)

	movq	 4 * SIZE(B), %mm4
	movq	%mm4,  -8 * SIZE(BO)
	movq	%mm4,  -7 * SIZE(BO)
	movq	 5 * SIZE(B), %mm5
	movq	%mm5,  -6 * SIZE(BO)
	movq	%mm5,  -5 * SIZE(BO)

	PREFETCHW	(WPREFETCHSIZE +  8) * SIZE(BO)

	movq	 6 * SIZE(B), %mm6
	movq	%mm6,  -4 * SIZE(BO)
	movq	%mm6,  -3 * SIZE(BO)
	movq	 7 * SIZE(B), %mm7
	movq	%mm7,  -2 * SIZE(BO)
	movq	%mm7,  -1 * SIZE(BO)

	PREFETCH	(RPREFETCHSIZE +  8) * SIZE(B)

	movq	 8 * SIZE(B), %mm0
	movq	%mm0,  0 * SIZE(BO)
	movq	%mm0,  1 * SIZE(BO)
	movq	 9 * SIZE(B), %mm1
	movq	%mm1,  2 * SIZE(BO)
	movq	%mm1,  3 * SIZE(BO)

	movq	10 * SIZE(B), %mm2
	movq	%mm2,  4 * SIZE(BO)
	movq	%mm2,  5 * SIZE(BO)
	movq	11 * SIZE(B), %mm3
	movq	%mm3,  6 * SIZE(BO)
	movq	%mm3,  7 * SIZE(BO)

	PREFETCHW	(WPREFETCHSIZE + 16) * SIZE(BO)

	movq	12 * SIZE(B), %mm4
	movq	%mm4,  8 * SIZE(BO)
	movq	%mm4,  9 * SIZE(BO)
	movq	13 * SIZE(B), %mm5
	movq	%mm5, 10 * SIZE(BO)
	movq	%mm5, 11 * SIZE(BO)

	PREFETCHW	(WPREFETCHSIZE + 24) * SIZE(BO)

	movq	14 * SIZE(B), %mm6
	movq	%mm6, 12 * SIZE(BO)
	movq	%mm6, 13 * SIZE(BO)
	movq	15 * SIZE(B), %mm7
	movq	%mm7, 14 * SIZE(BO)
	movq	%mm7, 15 * SIZE(BO)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, B
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L05
	ALIGN_4

.L04:
	movq	 0 * SIZE(B), %mm0
	movq	%mm0, -16 * SIZE(BO)
	movq	%mm0, -15 * SIZE(BO)
	movq	 1 * SIZE(B), %mm1
	movq	%mm1, -14 * SIZE(BO)
	movq	%mm1, -13 * SIZE(BO)

	movq	 2 * SIZE(B), %mm2
	movq	%mm2, -12 * SIZE(BO)
	movq	%mm2, -11 * SIZE(BO)
	movq	 3 * SIZE(B), %mm3
	movq	%mm3, -10 * SIZE(BO)
	movq	%mm3,  -9 * SIZE(BO)

	addq	$ 4 * SIZE, B
	addq	$ 8 * SIZE, BO

	decq	%rax
	jne	.L04
	ALIGN_4

.L05:
	movq	A, AO		# aoffset = a

	leaq	(RPREFETCHSIZE +  0) * SIZE(B), BB

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L30
	ALIGN_4

.L10:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm8, %xmm8
	PREFETCH	 0 * SIZE(BB)
	movapd	-14 * SIZE(AO), %xmm2
	movapd	-14 * SIZE(BO), %xmm3
	pxor	%xmm9, %xmm9
	movapd	-12 * SIZE(AO), %xmm4
	movapd	-12 * SIZE(BO), %xmm5
	pxor	%xmm10, %xmm10
	movapd	-10 * SIZE(AO), %xmm6
	movapd	 -8 * SIZE(BO), %xmm7
	pxor	%xmm11, %xmm11

	pxor	%xmm12, %xmm12
	PREFETCHW	3 * SIZE(CO1)
	pxor	%xmm13, %xmm13
	PREFETCHW	3 * SIZE(CO2)
	pxor	%xmm14, %xmm14
	pxor	%xmm15, %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
#ifndef GENERIC
	andq	$-8, %rax

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	NOBRANCH
	je	.L15
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	NOBRANCH
	je	.L15
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	KERNEL1(16 *  1)
	KERNEL2(16 *  1)
	KERNEL3(16 *  1)
	KERNEL4(16 *  1)
	KERNEL5(16 *  1)
	KERNEL6(16 *  1)
	KERNEL7(16 *  1)
	KERNEL8(16 *  1)

	addq	$8 * SIZE, %rax
	BRANCH
	jl	.L12
	ALIGN_3

.L15:
	PREFETCH	 8 * SIZE(BB)
	subq	$-16 * SIZE, BB

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	testq	$4, %rax
	je	.L16
	xorq	%rax, %rax
	ALIGN_3

	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$32 * SIZE, BO
	addq	$16 * SIZE, AO
	ALIGN_3
#else
	sarq	$2, %rax
	NOBRANCH
	jle	.L16
	ALIGN_3

.L12:
	KERNEL1(16 *  0)
	KERNEL2(16 *  0)
	KERNEL3(16 *  0)
	KERNEL4(16 *  0)
	KERNEL5(16 *  0)
	KERNEL6(16 *  0)
	KERNEL7(16 *  0)
	KERNEL8(16 *  0)

	addq	$ 32 * SIZE, BO
	subq	$-16 * SIZE, AO
	decq	%rax
	BRANCH
	jg	.L12
#endif

.L16:
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	je	.L19

	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
	negq	%rax
	ALIGN_3

.L17:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm0
	addpd	%xmm1, %xmm10
	movapd	-16 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO, %rax, 4), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm12
	movapd	-14 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm13
	movapd	-12 * SIZE(BO, %rax, 8), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	-10 * SIZE(BO, %rax, 8), %xmm2
	addpd	%xmm1, %xmm14
	movapd	 -8 * SIZE(BO, %rax, 8), %xmm1
	addpd	%xmm2, %xmm15
	movapd	-10 * SIZE(AO, %rax, 4), %xmm2

	addq	$SIZE, %rax
	jl	.L17
	ALIGN_3
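
/* Reduce the accumulators for this 2 x 2 block: SHUFPD_1 swaps the two
 * lanes of the odd registers, xorpd with POSINV flips one sign according
 * to the conjugation variant, and the add/sub below forms the final
 * complex products.  They are then scaled by ALPHA_R/ALPHA_I and, unless
 * TRMMKERNEL is defined, added to the previous contents of C before the
 * store. */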

.L19:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2

	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
	movlpd	2 * SIZE(CO2), %xmm3
	movhpd	3 * SIZE(CO2), %xmm3
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11
	SHUFPD_1 %xmm13, %xmm13
	SHUFPD_1 %xmm15, %xmm15

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
	xorpd	%xmm5, %xmm13
	xorpd	%xmm5, %xmm15
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
	xorpd	%xmm5, %xmm12
	xorpd	%xmm5, %xmm14
#endif

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
	subpd	%xmm13, %xmm12
	subpd	%xmm15, %xmm14
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11
	pshufd	$0x4e, %xmm12, %xmm13
	pshufd	$0x4e, %xmm14, %xmm15

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13
	mulpd	%xmm6, %xmm14
	mulpd	%xmm7, %xmm15

	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
	addpd	%xmm13, %xmm12
	addpd	%xmm15, %xmm14

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm12
	addpd	%xmm1, %xmm10
	addpd	%xmm3, %xmm14
#endif

	movlpd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)
	movlpd	%xmm14, 2 * SIZE(CO2)
	movhpd	%xmm14, 3 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L10
	ALIGN_4

.L30:
	testq	$1, M
	jle	.L99

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm9, %xmm9
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm10, %xmm10
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L44
	ALIGN_4
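
/* M is odd: one leftover row against both columns of the current B
 * strip.  The loop at .L41 is unrolled eight-fold over K; .L44 and .L46
 * mop up the K remainder with the same multiply/accumulate pattern. */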

.L41:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO)
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 18 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 20 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 22 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 32 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 26 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 28 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 30 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 40 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm8
	movapd	 34 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	addpd	%xmm1, %xmm9
	movapd	 36 * SIZE(BO), %xmm1
	mulpd	%xmm2, %xmm1
	mulpd	 38 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 48 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 42 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 44 * SIZE(BO), %xmm3
	mulpd	%xmm2, %xmm3
	mulpd	 46 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 56 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq	$-16 * SIZE, AO
	addq	$ 64 * SIZE, BO
	decq	%rax
	jne	.L41
	ALIGN_4

.L44:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax
	BRANCH
	jle	.L45

	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 -6 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 -4 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	  2 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	  4 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	  6 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm8
	movapd	 10 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	addpd	%xmm3, %xmm9
	movapd	 12 * SIZE(BO), %xmm3
	mulpd	%xmm0, %xmm3
	mulpd	 14 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq	$  8 * SIZE, AO
	addq	$ 32 * SIZE, BO
	ALIGN_4

.L45:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle	.L47
	ALIGN_4

.L46:
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm8
	movapd	-14 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	addpd	%xmm1, %xmm9
	movapd	-12 * SIZE(BO), %xmm1
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	 -8 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO

	decq	%rax
	jg	.L46
	ALIGN_4

.L47:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	0 * SIZE(CO2), %xmm1
	movhpd	1 * SIZE(CO2), %xmm1
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm11, %xmm11

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm11
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm10
#endif

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9,  %xmm8
	subpd	%xmm11, %xmm10
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm10, %xmm11

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm10
	mulpd	%xmm7, %xmm11

	addpd	%xmm9,  %xmm8
	addpd	%xmm11, %xmm10

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm1, %xmm10
#endif

	movlpd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movlpd	%xmm10, 0 * SIZE(CO2)
	movhpd	%xmm10, 1 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 8), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L99:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK
#endif

	leaq	(C, LDC, 2), C		# c += 2 * ldc
	decq	J			# j --
	jg	.L01

.L100:
	testq	$1, N
	jle	.L999

.L101:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif
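
/* N is odd: the last single column of C.  B is expanded into BUFFER once
 * more (each value duplicated), then .L110 walks two rows of A at a time
 * and .L130 handles a possible final row. */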

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L103
	ALIGN_4

.L102:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9
	movlpd	 2 * SIZE(B), %xmm10
	movlpd	 3 * SIZE(B), %xmm11
	movlpd	 4 * SIZE(B), %xmm12
	movlpd	 5 * SIZE(B), %xmm13
	movlpd	 6 * SIZE(B), %xmm14
	movlpd	 7 * SIZE(B), %xmm15

	movlpd	%xmm8,   0 * SIZE(BO)
	movlpd	%xmm8,   1 * SIZE(BO)
	movlpd	%xmm9,   2 * SIZE(BO)
	movlpd	%xmm9,   3 * SIZE(BO)
	movlpd	%xmm10,  4 * SIZE(BO)
	movlpd	%xmm10,  5 * SIZE(BO)
	movlpd	%xmm11,  6 * SIZE(BO)
	movlpd	%xmm11,  7 * SIZE(BO)
	movlpd	%xmm12,  8 * SIZE(BO)
	movlpd	%xmm12,  9 * SIZE(BO)
	movlpd	%xmm13, 10 * SIZE(BO)
	movlpd	%xmm13, 11 * SIZE(BO)
	movlpd	%xmm14, 12 * SIZE(BO)
	movlpd	%xmm14, 13 * SIZE(BO)
	movlpd	%xmm15, 14 * SIZE(BO)
	movlpd	%xmm15, 15 * SIZE(BO)

	subq	$-16 * SIZE, BO
	addq	$  8 * SIZE, B
	decq	%rax
	jne	.L102
	ALIGN_4

.L103:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L105
	ALIGN_4

.L104:
	movlpd	 0 * SIZE(B), %xmm8
	movlpd	 1 * SIZE(B), %xmm9

	movlpd	%xmm8,  0 * SIZE(BO)
	movlpd	%xmm8,  1 * SIZE(BO)
	movlpd	%xmm9,  2 * SIZE(BO)
	movlpd	%xmm9,  3 * SIZE(BO)

	addq	$4 * SIZE, BO
	addq	$2 * SIZE, B
	decq	%rax
	jne	.L104
	ALIGN_4

.L105:
	movq	C, CO1		# coffset1 = c
	movq	A, AO		# aoffset = a

	movq	M,  I
	sarq	$1, I		# i = (m >> 1)
	jle	.L130
	ALIGN_4

.L110:
#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	pxor	%xmm8, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	pxor	%xmm9, %xmm9
	movapd	 -8 * SIZE(AO), %xmm2
	pxor	%xmm12, %xmm12
	movapd	 -8 * SIZE(BO), %xmm3
	pxor	%xmm13, %xmm13
	PREFETCHW	3 * SIZE(CO1)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$2, %rax
	je	.L112

.L111:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm3
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO)
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm12
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm13
	movapd	  8 * SIZE(AO), %xmm2

	subq	$-16 * SIZE, AO
	subq	$-16 * SIZE, BO
	decq	%rax
	jne	.L111
	ALIGN_4

.L112:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle	.L114

.L113:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-16 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm12
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm13
	movapd	-12 * SIZE(AO), %xmm0

	addq	$4 * SIZE, AO		# aoffset  += 4
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L113
	ALIGN_4

.L114:
#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
	movlpd	2 * SIZE(CO1), %xmm2
	movhpd	3 * SIZE(CO1), %xmm2
#endif

	SHUFPD_1 %xmm9, %xmm9
	SHUFPD_1 %xmm13, %xmm13

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
	xorpd	%xmm5, %xmm13
#else
	xorpd	%xmm5, %xmm8
	xorpd	%xmm5, %xmm12
#endif

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9,  %xmm8
	subpd	%xmm13, %xmm12
#else
	addpd	%xmm9,  %xmm8
	addpd	%xmm13, %xmm12
#endif

	pshufd	$0x4e, %xmm8,  %xmm9
	pshufd	$0x4e, %xmm12, %xmm13

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9
	mulpd	%xmm6, %xmm12
	mulpd	%xmm7, %xmm13

	addpd	%xmm9,  %xmm8
	addpd	%xmm13, %xmm12

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
	addpd	%xmm2, %xmm12
#endif

	movlpd	%xmm8,  0 * SIZE(CO1)
	movhpd	%xmm8,  1 * SIZE(CO1)
	movlpd	%xmm12, 2 * SIZE(CO1)
	movhpd	%xmm12, 3 * SIZE(CO1)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, SIZE), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	decq	I			# i --
	jg	.L110
	ALIGN_4

.L130:
	testq	$1, M
	jle	.L999

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	16 * SIZE + BUFFER, BO
#else
	leaq	16 * SIZE + BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, SIZE), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movapd	-16 * SIZE(AO), %xmm0
	movapd	-16 * SIZE(BO), %xmm1
	movapd	 -8 * SIZE(AO), %xmm2
	movapd	 -8 * SIZE(BO), %xmm3

	pxor	%xmm8, %xmm8
	pxor	%xmm9, %xmm9
	pxor	%xmm10, %xmm10
	pxor	%xmm11, %xmm11

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$1, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L144
	ALIGN_4
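
/* Final corner: one remaining row times the single remaining column.
 * xmm8-xmm11 collect four partial sums that are folded together at .L148
 * before the sign fix-up, alpha scaling and store. */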

.L141:
	mulpd	%xmm0, %xmm1
	PREFETCH	(PREFETCHSIZE + 0) * SIZE(AO)
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	movapd	  8 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm11
	movapd	  0 * SIZE(AO), %xmm0
	mulpd	%xmm2, %xmm1
	PREFETCH	(PREFETCHSIZE + 8) * SIZE(AO)
	mulpd	  2 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm8
	movapd	  4 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm9
	movapd	 -6 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm1
	mulpd	  6 * SIZE(BO), %xmm2
	addpd	%xmm1, %xmm10
	movapd	 16 * SIZE(BO), %xmm1
	addpd	%xmm2, %xmm11
	movapd	 -4 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 10 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm8
	movapd	 12 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm9
	movapd	 -2 * SIZE(AO), %xmm2
	mulpd	%xmm2, %xmm3
	mulpd	 14 * SIZE(BO), %xmm2
	addpd	%xmm3, %xmm10
	movapd	 24 * SIZE(BO), %xmm3
	addpd	%xmm2, %xmm11
	movapd	  8 * SIZE(AO), %xmm2

	subq	$-16 * SIZE, AO
	subq	$-32 * SIZE, BO
	decq	%rax
	jne	.L141
	ALIGN_4


.L144:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$4, %rax		# if (k & 4)
	BRANCH
	jle	.L145

	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm1
	mulpd	-10 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm10
	movapd	  0 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm11
	movapd	-12 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -6 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm8
	movapd	 -4 * SIZE(BO), %xmm3
	addpd	%xmm0, %xmm9
	movapd	-10 * SIZE(AO), %xmm0
	mulpd	%xmm0, %xmm3
	mulpd	 -2 * SIZE(BO), %xmm0
	addpd	%xmm3, %xmm10
	addpd	%xmm0, %xmm11
	movapd	 -8 * SIZE(AO), %xmm0

	addq	$  8 * SIZE, AO
	subq	$-16 * SIZE, BO
	ALIGN_4

.L145:
	movapd	POSINV,  %xmm5
	movapd	ALPHA_R, %xmm6
	movapd	ALPHA_I, %xmm7

#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	andq	$3, %rax		# if (k & 3)
	BRANCH
	jle	.L148
	ALIGN_4

.L146:
	mulpd	%xmm0, %xmm1
	mulpd	-14 * SIZE(BO), %xmm0
	addpd	%xmm1, %xmm8
	movapd	-12 * SIZE(BO), %xmm1
	addpd	%xmm0, %xmm9
	movapd	-14 * SIZE(AO), %xmm0

	addq	$2 * SIZE, AO		# aoffset  += 2
	addq	$4 * SIZE, BO		# boffset1 += 4
	decq	%rax
	jg	.L146
	ALIGN_4

.L148:
	addpd	%xmm10, %xmm8
	addpd	%xmm11, %xmm9

#ifndef TRMMKERNEL
	movlpd	0 * SIZE(CO1), %xmm0
	movhpd	1 * SIZE(CO1), %xmm0
#endif

	SHUFPD_1 %xmm9, %xmm9

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(NR) || defined(NC) || defined(TR) || defined(TC)
	xorpd	%xmm5, %xmm9
#else
	xorpd	%xmm5, %xmm8
#endif

#if  defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
     defined(RR) || defined(RC) || defined(CR) || defined(CC)
	subpd	%xmm9, %xmm8
#else
	addpd	%xmm9, %xmm8
#endif

	pshufd	$0x4e, %xmm8, %xmm9

	mulpd	%xmm6, %xmm8
	mulpd	%xmm7, %xmm9

	addpd	%xmm9, %xmm8

#ifndef TRMMKERNEL
	addpd	%xmm0, %xmm8
#endif

	movlpd	%xmm8, 0 * SIZE(CO1)
	movhpd	%xmm8, 1 * SIZE(CO1)
	ALIGN_4

.L999:
	movq	%rbx, %rsp
	EMMS

	movq	  0(%rsp), %rbx
	movq	  8(%rsp), %rbp
	movq	 16(%rsp), %r12
	movq	 24(%rsp), %r13
	movq	 32(%rsp), %r14
	movq	 40(%rsp), %r15

#ifdef WINDOWS_ABI
	movq	 48(%rsp), %rdi
	movq	 56(%rsp), %rsi
	movups	 64(%rsp), %xmm6
	movups	 80(%rsp), %xmm7
	movups	 96(%rsp), %xmm8
	movups	112(%rsp), %xmm9
	movups	128(%rsp), %xmm10
	movups	144(%rsp), %xmm11
	movups	160(%rsp), %xmm12
	movups	176(%rsp), %xmm13
	movups	192(%rsp), %xmm14
	movups	208(%rsp), %xmm15
#endif

	addq	$STACKSIZE, %rsp
	ret

	EPILOGUE