/*********************************************************************/
/*                                                                   */
/*               Optimized BLAS libraries                            */
/*             By Kazushige Goto <kgoto@tacc.utexas.edu>             */
/*                                                                   */
/* Copyright (c) The University of Texas, 2009. All rights reserved. */
/* UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING  */
/* THIS SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF      */
/* MERCHANTABILITY, FITNESS FOR ANY PARTICULAR PURPOSE,              */
/* NON-INFRINGEMENT AND WARRANTIES OF PERFORMANCE, AND ANY WARRANTY  */
/* THAT MIGHT OTHERWISE ARISE FROM COURSE OF DEALING OR USAGE OF     */
/* TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH RESPECT TO   */
/* THE USE OF THE SOFTWARE OR DOCUMENTATION.                         */
/* Under no circumstances shall University be liable for incidental, */
/* special, indirect, direct or consequential damages or loss of     */
/* profits, interruption of business, or related expenses which may  */
/* arise from use of Software or Documentation, including but not    */
/* limited to those resulting from defects in Software and/or        */
/* Documentation, or loss or inaccuracy of data of any kind.         */
/*********************************************************************/

/* Complex single-precision GEMM/TRMM kernel, 4x2 unroll, SSE3
 * (movsldup/movshdup).  AT&T syntax; System V AMD64 by default,
 * Microsoft x64 when WINDOWS_ABI is defined.  B is repacked into a
 * duplicated-element sub-buffer (BUFFER) before the compute loops.  */

#define ASSEMBLER
#include "common.h"

/* Argument registers (SysV order; Windows entry copies into these). */
#define M	%rdi
#define N	%rsi
#define K	%rdx

#define A	%rcx
#define B	%r8
#define C	%r9
#define LDC	%r10

/* Loop-carried pointers/counters. */
#define I	%r11
#define AO	%r12
#define BO	%r13
#define CO1	%r14
#define CO2	%r15
#define BB	%rbp

#ifndef WINDOWS_ABI

#define STACKSIZE 64

#else

#define STACKSIZE 256

#define OLD_ALPHA_I	 40 + STACKSIZE(%rsp)
#define OLD_A		 48 + STACKSIZE(%rsp)
#define OLD_B		 56 + STACKSIZE(%rsp)
#define OLD_C		 64 + STACKSIZE(%rsp)
#define OLD_LDC		 72 + STACKSIZE(%rsp)
#define OLD_OFFSET	 80 + STACKSIZE(%rsp)

#endif

/* Local stack frame (after the page-aligned realignment below). */
#define ALPHA_R	  0(%rsp)
#define ALPHA_I	 16(%rsp)
#define J	 32(%rsp)
#define OFFSET	 40(%rsp)
#define KK	 48(%rsp)
#define KKK	 56(%rsp)
#define BUFFER	128(%rsp)

#define PREFETCH	prefetcht0
#define PREFETCHSIZE	320

/* Sign of the imaginary-part accumulation depends on conjugation mode. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(RN) || defined(RT) || defined(CN) || defined(CT)
#define ADDSUB	addps
#else
#define ADDSUB	subps
#endif

/* KERNEL1..KERNEL16: one unrolled rank-1 complex update each.
 * xmm0-xmm7 accumulate (real/imag pairs for CO1/CO2, low/high halves of
 * the 4-wide M strip); xmm8/10/12/14 carry A, xmm9/11/13/15 carry the
 * duplicated B values.  The trailing movsldup preloads the next step.  */

#define KERNEL1(address) \
	mulps	%xmm8, %xmm9; \
	PREFETCH (PREFETCHSIZE +  0) * SIZE + (address) * 2 * SIZE(AO); \
	addps	%xmm9, %xmm0; \
	movshdup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm1; \
	movsldup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm2; \
	movshdup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	  4 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm3; \
	movsldup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL2(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm4; \
	movshdup  0 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm5; \
	movsldup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm6; \
	movshdup  4 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	  8 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm7; \
	movsldup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL3(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm0; \
	movshdup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm1; \
	movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm2; \
	movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	 12 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm3; \
	movsldup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL4(address) \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm4; \
	movshdup  8 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	ADDSUB	%xmm9, %xmm5; \
	movsldup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	addps	%xmm9, %xmm6; \
	movshdup 12 * SIZE + (address) * 2 * SIZE(BO), %xmm9; \
	mulps	%xmm8, %xmm9; \
	movaps	 64 * SIZE + (address) * 2 * SIZE(AO), %xmm8; \
	ADDSUB	%xmm9, %xmm7; \
	movsldup 64 * SIZE + (address) * 2 * SIZE(BO), %xmm9

#define KERNEL5(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm0; \
	movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm1; \
	movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm2; \
	movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 20 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm3; \
	movsldup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL6(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm4; \
	movshdup 16 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm5; \
	movsldup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm6; \
	movshdup 20 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 24 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm7; \
	movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL7(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm0; \
	movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm1; \
	movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm2; \
	movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 28 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm3; \
	movsldup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL8(address) \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm4; \
	movshdup 24 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	ADDSUB	%xmm11, %xmm5; \
	movsldup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	addps	%xmm11, %xmm6; \
	movshdup 28 * SIZE + (address) * 2 * SIZE(BO), %xmm11; \
	mulps	%xmm10, %xmm11; \
	movaps	 80 * SIZE + (address) * 2 * SIZE(AO), %xmm10; \
	ADDSUB	%xmm11, %xmm7; \
	movsldup 80 * SIZE + (address) * 2 * SIZE(BO), %xmm11

#define KERNEL9(address) \
	mulps	%xmm12, %xmm13; \
	PREFETCH (PREFETCHSIZE + 32) * SIZE + (address) * 2 * SIZE(AO); \
	addps	%xmm13, %xmm0; \
	movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm1; \
	movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm2; \
	movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 36 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm3; \
	movsldup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL10(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm4; \
	movshdup 32 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm5; \
	movsldup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm6; \
	movshdup 36 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 40 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm7; \
	movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL11(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm0; \
	movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm1; \
	movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm2; \
	movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 44 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm3; \
	movsldup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL12(address) \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm4; \
	movshdup 40 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	ADDSUB	%xmm13, %xmm5; \
	movsldup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	addps	%xmm13, %xmm6; \
	movshdup 44 * SIZE + (address) * 2 * SIZE(BO), %xmm13; \
	mulps	%xmm12, %xmm13; \
	movaps	 96 * SIZE + (address) * 2 * SIZE(AO), %xmm12; \
	ADDSUB	%xmm13, %xmm7; \
	movsldup 96 * SIZE + (address) * 2 * SIZE(BO), %xmm13

#define KERNEL13(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm0; \
	movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm1; \
	movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm2; \
	movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 52 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm3; \
	movsldup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL14(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm4; \
	movshdup 48 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm5; \
	movsldup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm6; \
	movshdup 52 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 56 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm7; \
	movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL15(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm0; \
	movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm1; \
	movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm2; \
	movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	 60 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm3; \
	movsldup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15

#define KERNEL16(address) \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm4; \
	movshdup 56 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	ADDSUB	%xmm15, %xmm5; \
	movsldup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	addps	%xmm15, %xmm6; \
	movshdup 60 * SIZE + (address) * 2 * SIZE(BO), %xmm15; \
	mulps	%xmm14, %xmm15; \
	movaps	112 * SIZE + (address) * 2 * SIZE(AO), %xmm14; \
	ADDSUB	%xmm15, %xmm7; \
	movsldup 112 * SIZE + (address) * 2 * SIZE(BO), %xmm15

	PROLOGUE
	PROFCODE

	subq	$STACKSIZE, %rsp

	/* Save callee-saved GPRs (and, on Windows, xmm6-15 + arg regs). */
	movq	%rbx,  0(%rsp)
	movq	%rbp,  8(%rsp)
	movq	%r12, 16(%rsp)
	movq	%r13, 24(%rsp)
	movq	%r14, 32(%rsp)
	movq	%r15, 40(%rsp)

#ifdef WINDOWS_ABI
	movq	%rdi,    48(%rsp)
	movq	%rsi,    56(%rsp)
	movups	%xmm6,   64(%rsp)
	movups	%xmm7,   80(%rsp)
	movups	%xmm8,   96(%rsp)
	movups	%xmm9,  112(%rsp)
	movups	%xmm10, 128(%rsp)
	movups	%xmm11, 144(%rsp)
	movups	%xmm12, 160(%rsp)
	movups	%xmm13, 176(%rsp)
	movups	%xmm14, 192(%rsp)
	movups	%xmm15, 208(%rsp)

	movq	ARG1, M
	movq	ARG2, N
	movq	ARG3, K
	movq	OLD_A, A
	movq	OLD_B, B
	movq	OLD_C, C
	movq	OLD_LDC, LDC
#ifdef TRMMKERNEL
	movsd	OLD_OFFSET, %xmm4
#endif
	movaps	%xmm3, %xmm0
	movsd	OLD_ALPHA_I, %xmm1
#else
	movq	72(%rsp), LDC
#ifdef TRMMKERNEL
	movsd	80(%rsp), %xmm4
#endif

#endif

	movq	%rsp, %rbx	# save old stack
	subq	$128 + LOCAL_BUFFER_SIZE, %rsp
	andq	$-4096, %rsp	# align stack

	STACK_TOUCHING

	pxor	%xmm15, %xmm15
	cmpeqps	%xmm15, %xmm15
	pslld	$31, %xmm15	# Generate mask
	pxor	%xmm2, %xmm2

	/* Splat alpha_r; build (-ai, ai, -ai, ai) for complex scaling. */
	shufps	$0, %xmm0, %xmm0
	movaps	%xmm0,  0 + ALPHA_R

	movss	%xmm1,  4 + ALPHA_I
	movss	%xmm1, 12 + ALPHA_I
	xorps	%xmm15, %xmm1
	movss	%xmm1,  0 + ALPHA_I
	movss	%xmm1,  8 + ALPHA_I

#ifdef TRMMKERNEL
	movsd	%xmm4, OFFSET
	movsd	%xmm4, KK
#ifndef LEFT
	negq	KK
#endif
#endif

	salq	$ZBASE_SHIFT, LDC
	movq	N, J
	sarq	$1, J		# j = (n >> 1)
	jle	.L40
	ALIGN_4

.L01:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$2, %rax
	jle	.L03
	ALIGN_4

.L02:
	movddup	 0 * SIZE(B), %xmm0
	movddup	 2 * SIZE(B), %xmm1
	movddup	 4 * SIZE(B), %xmm2
	movddup	 6 * SIZE(B), %xmm3
	movddup	 8 * SIZE(B), %xmm4
	movddup	10 * SIZE(B), %xmm5
	movddup	12 * SIZE(B), %xmm6
	movddup	14 * SIZE(B), %xmm7

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	prefetcht1	128 * SIZE(BO)
	prefetcht0	112 * SIZE(B)

	addq	$16 * SIZE, B
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L02
	ALIGN_4

.L03:
	movq	K, %rax
	andq	$3, %rax
	BRANCH
	jle	.L10
	ALIGN_4

.L04:
	movddup	 0 * SIZE(B), %xmm0
	movddup	 2 * SIZE(B), %xmm1

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)

	addq	$4 * SIZE, B
	addq	$8 * SIZE, BO
	decq	%rax
	jne	.L04
	ALIGN_4

.L10:
	movq	C, CO1			# coffset1 = c
	leaq	(C, LDC, 1), CO2	# coffset2 = c + ldc
	movq	A, AO			# aoffset = a

	leaq	112 * SIZE(B), BB

	movq	M, I
	sarq	$2, I			# i = (m >> 2)
	jle	.L20
	ALIGN_4

.L11:
	prefetcht0	0 * SIZE(BB)
	subq	$-8 * SIZE, BB

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movaps	16 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movaps	32 * SIZE(AO), %xmm12
	pxor	%xmm2, %xmm2
	movaps	48 * SIZE(AO), %xmm14
	pxor	%xmm3, %xmm3

	movsldup  0 * SIZE(BO), %xmm9
	pxor	%xmm4, %xmm4
	movsldup 16 * SIZE(BO), %xmm11
	pxor	%xmm5, %xmm5
	movsldup 32 * SIZE(BO), %xmm13
	pxor	%xmm6, %xmm6
	movsldup 48 * SIZE(BO), %xmm15
	pxor	%xmm7, %xmm7

	prefetchnta	8 * SIZE(CO1)
	prefetchnta	8 * SIZE(CO2)

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$4, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
#if 1
	andq	$-8, %rax
	salq	$4, %rax
	je	.L15

.L1X:
	KERNEL1 (32 *  0)
	KERNEL2 (32 *  0)
	KERNEL3 (32 *  0)
	KERNEL4 (32 *  0)
	KERNEL5 (32 *  0)
	KERNEL6 (32 *  0)
	KERNEL7 (32 *  0)
	KERNEL8 (32 *  0)
	KERNEL9 (32 *  0)
	KERNEL10(32 *  0)
	KERNEL11(32 *  0)
	KERNEL12(32 *  0)
	KERNEL13(32 *  0)
	KERNEL14(32 *  0)
	KERNEL15(32 *  0)
	KERNEL16(32 *  0)
	cmpq	$128 *  1, %rax
	jle	.L12
	KERNEL1 (32 *  1)
	KERNEL2 (32 *  1)
	KERNEL3 (32 *  1)
	KERNEL4 (32 *  1)
	KERNEL5 (32 *  1)
	KERNEL6 (32 *  1)
	KERNEL7 (32 *  1)
	KERNEL8 (32 *  1)
	KERNEL9 (32 *  1)
	KERNEL10(32 *  1)
	KERNEL11(32 *  1)
	KERNEL12(32 *  1)
	KERNEL13(32 *  1)
	KERNEL14(32 *  1)
	KERNEL15(32 *  1)
	KERNEL16(32 *  1)
	cmpq	$128 *  2, %rax
	jle	.L12
	KERNEL1 (32 *  2)
	KERNEL2 (32 *  2)
	KERNEL3 (32 *  2)
	KERNEL4 (32 *  2)
	KERNEL5 (32 *  2)
	KERNEL6 (32 *  2)
	KERNEL7 (32 *  2)
	KERNEL8 (32 *  2)
	KERNEL9 (32 *  2)
	KERNEL10(32 *  2)
	KERNEL11(32 *  2)
	KERNEL12(32 *  2)
	KERNEL13(32 *  2)
	KERNEL14(32 *  2)
	KERNEL15(32 *  2)
	KERNEL16(32 *  2)
	cmpq	$128 *  3, %rax
	jle	.L12
	KERNEL1 (32 *  3)
	KERNEL2 (32 *  3)
	KERNEL3 (32 *  3)
	KERNEL4 (32 *  3)
	KERNEL5 (32 *  3)
	KERNEL6 (32 *  3)
	KERNEL7 (32 *  3)
	KERNEL8 (32 *  3)
	KERNEL9 (32 *  3)
	KERNEL10(32 *  3)
	KERNEL11(32 *  3)
	KERNEL12(32 *  3)
	KERNEL13(32 *  3)
	KERNEL14(32 *  3)
	KERNEL15(32 *  3)
	KERNEL16(32 *  3)
	cmpq	$128 *  4, %rax
	jle	.L12
	KERNEL1 (32 *  4)
	KERNEL2 (32 *  4)
	KERNEL3 (32 *  4)
	KERNEL4 (32 *  4)
	KERNEL5 (32 *  4)
	KERNEL6 (32 *  4)
	KERNEL7 (32 *  4)
	KERNEL8 (32 *  4)
	KERNEL9 (32 *  4)
	KERNEL10(32 *  4)
	KERNEL11(32 *  4)
	KERNEL12(32 *  4)
	KERNEL13(32 *  4)
	KERNEL14(32 *  4)
	KERNEL15(32 *  4)
	KERNEL16(32 *  4)
	cmpq	$128 *  5, %rax
	jle	.L12
	KERNEL1 (32 *  5)
	KERNEL2 (32 *  5)
	KERNEL3 (32 *  5)
	KERNEL4 (32 *  5)
	KERNEL5 (32 *  5)
	KERNEL6 (32 *  5)
	KERNEL7 (32 *  5)
	KERNEL8 (32 *  5)
	KERNEL9 (32 *  5)
	KERNEL10(32 *  5)
	KERNEL11(32 *  5)
	KERNEL12(32 *  5)
	KERNEL13(32 *  5)
	KERNEL14(32 *  5)
	KERNEL15(32 *  5)
	KERNEL16(32 *  5)
	cmpq	$128 *  6, %rax
	jle	.L12
	KERNEL1 (32 *  6)
	KERNEL2 (32 *  6)
	KERNEL3 (32 *  6)
	KERNEL4 (32 *  6)
	KERNEL5 (32 *  6)
	KERNEL6 (32 *  6)
	KERNEL7 (32 *  6)
	KERNEL8 (32 *  6)
	KERNEL9 (32 *  6)
	KERNEL10(32 *  6)
	KERNEL11(32 *  6)
	KERNEL12(32 *  6)
	KERNEL13(32 *  6)
	KERNEL14(32 *  6)
	KERNEL15(32 *  6)
	KERNEL16(32 *  6)
	cmpq	$128 *  7, %rax
	jle	.L12
	KERNEL1 (32 *  7)
	KERNEL2 (32 *  7)
	KERNEL3 (32 *  7)
	KERNEL4 (32 *  7)
	KERNEL5 (32 *  7)
	KERNEL6 (32 *  7)
	KERNEL7 (32 *  7)
	KERNEL8 (32 *  7)
	KERNEL9 (32 *  7)
	KERNEL10(32 *  7)
	KERNEL11(32 *  7)
	KERNEL12(32 *  7)
	KERNEL13(32 *  7)
	KERNEL14(32 *  7)
	KERNEL15(32 *  7)
	KERNEL16(32 *  7)

	addq	$64 * 8 * SIZE, AO
	addq	$64 * 8 * SIZE, BO
	subq	$128 * 8, %rax
	jg	.L1X

.L12:
	leaq	(AO, %rax, 2), AO	# * 16
	leaq	(BO, %rax, 2), BO	# * 64
#else
	sarq	$3, %rax
	je	.L15
	ALIGN_4

.L12:
	KERNEL1 (32 *  0)
	KERNEL2 (32 *  0)
	KERNEL3 (32 *  0)
	KERNEL4 (32 *  0)
	KERNEL5 (32 *  0)
	KERNEL6 (32 *  0)
	KERNEL7 (32 *  0)
	KERNEL8 (32 *  0)
	KERNEL9 (32 *  0)
	KERNEL10(32 *  0)
	KERNEL11(32 *  0)
	KERNEL12(32 *  0)
	KERNEL13(32 *  0)
	KERNEL14(32 *  0)
	KERNEL15(32 *  0)
	KERNEL16(32 *  0)

	addq	$64 * SIZE, AO
	addq	$64 * SIZE, BO
	decq	%rax
	jne	.L12
#endif
	ALIGN_4

.L15:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je	.L18
	ALIGN_4

.L16:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm4
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm5
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm6
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm7
	movsldup  8 * SIZE(BO), %xmm9

	addq	$8 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L16
	ALIGN_4

.L18:
	/* Fold the real/imag partial products and apply alpha. */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7

	addsubps	%xmm1, %xmm0
	addsubps	%xmm3, %xmm2
	addsubps	%xmm5, %xmm4
	addsubps	%xmm7, %xmm6

	movaps	%xmm0, %xmm1
	movaps	%xmm2, %xmm3
	movaps	%xmm4, %xmm5
	movaps	%xmm6, %xmm7

	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6
#else
	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
	shufps	$0xb1, %xmm4, %xmm4
	shufps	$0xb1, %xmm6, %xmm6

	addsubps	%xmm0, %xmm1
	addsubps	%xmm2, %xmm3
	addsubps	%xmm4, %xmm5
	addsubps	%xmm6, %xmm7

	movaps	%xmm1, %xmm0
	movaps	%xmm3, %xmm2
	movaps	%xmm5, %xmm4
	movaps	%xmm7, %xmm6

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
	shufps	$0xb1, %xmm5, %xmm5
	shufps	$0xb1, %xmm7, %xmm7
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	mulps	%xmm14, %xmm3
	mulps	%xmm15, %xmm2

	mulps	%xmm14, %xmm5
	mulps	%xmm15, %xmm4
	mulps	%xmm14, %xmm7
	mulps	%xmm15, %xmm6

	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2
	addps	%xmm5, %xmm4
	addps	%xmm7, %xmm6

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	shufps	$0xe4, %xmm8,  %xmm8
	shufps	$0xe4, %xmm9,  %xmm9
	shufps	$0xe4, %xmm10, %xmm10
	shufps	$0xe4, %xmm11, %xmm11

	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	4 * SIZE(CO1), %xmm10
	movhps	6 * SIZE(CO1), %xmm10

	movsd	0 * SIZE(CO2), %xmm9
	movhps	2 * SIZE(CO2), %xmm9
	movsd	4 * SIZE(CO2), %xmm11
	movhps	6 * SIZE(CO2), %xmm11

	addps	%xmm8,  %xmm0
	addps	%xmm9,  %xmm2
	addps	%xmm10, %xmm4
	addps	%xmm11, %xmm6
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movsd	%xmm4, 4 * SIZE(CO1)
	movhps	%xmm4, 6 * SIZE(CO1)

	movsd	%xmm2, 0 * SIZE(CO2)
	movhps	%xmm2, 2 * SIZE(CO2)
	movsd	%xmm6, 4 * SIZE(CO2)
	movhps	%xmm6, 6 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 4), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$4, KK
#endif

	addq	$8 * SIZE, CO1		# coffset += 4
	addq	$8 * SIZE, CO2		# coffset += 4
	decq	I			# i --
	jg	.L11
	ALIGN_4

.L20:
	testq	$2, M
	je	.L30

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

	movaps	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movaps	16 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1

	movsldup  0 * SIZE(BO), %xmm9
	pxor	%xmm2, %xmm2
	movsldup 16 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3
	movsldup 32 * SIZE(BO), %xmm13
	movsldup 48 * SIZE(BO), %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$2, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L25
	ALIGN_4

.L22:
	mulps	%xmm8, %xmm9
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  8 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup 12 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 8 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup 64 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movshdup 16 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	ADDSUB	%xmm11, %xmm1
	movsldup 20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movshdup 20 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movaps	12 * SIZE(AO), %xmm8
	ADDSUB	%xmm11, %xmm3
	movsldup 24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movshdup 24 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	ADDSUB	%xmm11, %xmm1
	movsldup 28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm2
	movshdup 28 * SIZE(BO), %xmm11
	mulps	%xmm8, %xmm11
	movaps	32 * SIZE(AO), %xmm8
	ADDSUB	%xmm11, %xmm3
	movsldup 80 * SIZE(BO), %xmm11
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movshdup 32 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	ADDSUB	%xmm13, %xmm1
	movsldup 36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movshdup 36 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movaps	20 * SIZE(AO), %xmm10
	ADDSUB	%xmm13, %xmm3
	movsldup 40 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movshdup 40 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	ADDSUB	%xmm13, %xmm1
	movsldup 44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm2
	movshdup 44 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm13
	movaps	24 * SIZE(AO), %xmm10
	ADDSUB	%xmm13, %xmm3
	movsldup 96 * SIZE(BO), %xmm13
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movshdup 48 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	ADDSUB	%xmm15, %xmm1
	movsldup 52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movshdup 52 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movaps	28 * SIZE(AO), %xmm10
	ADDSUB	%xmm15, %xmm3
	movsldup 56 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movshdup 56 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	ADDSUB	%xmm15, %xmm1
	movsldup 60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm2
	movshdup 60 * SIZE(BO), %xmm15
	mulps	%xmm10, %xmm15
	movaps	48 * SIZE(AO), %xmm10
	ADDSUB	%xmm15, %xmm3
	movsldup 112 * SIZE(BO), %xmm15

	addq	$32 * SIZE, AO
	addq	$64 * SIZE, BO

	decq	%rax
	jne	.L22
	ALIGN_4

.L25:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je	.L28
	ALIGN_4

.L26:
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movshdup  0 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	ADDSUB	%xmm9, %xmm1
	movsldup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm2
	movshdup  4 * SIZE(BO), %xmm9
	mulps	%xmm8, %xmm9
	movaps	 4 * SIZE(AO), %xmm8
	ADDSUB	%xmm9, %xmm3
	movsldup  8 * SIZE(BO), %xmm9

	addq	$ 4 * SIZE, AO
	addq	$ 8 * SIZE, BO
	decq	%rax
	jg	.L26
	ALIGN_4

.L28:
#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3

	addsubps	%xmm1, %xmm0
	addsubps	%xmm3, %xmm2

	movaps	%xmm0, %xmm1
	movaps	%xmm2, %xmm3

	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2
#else
	shufps	$0xb1, %xmm0, %xmm0
	shufps	$0xb1, %xmm2, %xmm2

	addsubps	%xmm0, %xmm1
	addsubps	%xmm2, %xmm3

	movaps	%xmm1, %xmm0
	movaps	%xmm3, %xmm2

	shufps	$0xb1, %xmm1, %xmm1
	shufps	$0xb1, %xmm3, %xmm3
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0
	mulps	%xmm14, %xmm3
	mulps	%xmm15, %xmm2

	addps	%xmm1, %xmm0
	addps	%xmm3, %xmm2

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	shufps	$0xe4, %xmm8,  %xmm8
	shufps	$0xe4, %xmm10, %xmm10

	movsd	0 * SIZE(CO1), %xmm8
	movhps	2 * SIZE(CO1), %xmm8
	movsd	0 * SIZE(CO2), %xmm10
	movhps	2 * SIZE(CO2), %xmm10

	addps	%xmm8,  %xmm0
	addps	%xmm10, %xmm2
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 2 * SIZE(CO1)
	movsd	%xmm2, 0 * SIZE(CO2)
	movhps	%xmm2, 2 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 2), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$2, KK
#endif

	addq	$4 * SIZE, CO1		# coffset += 4
	addq	$4 * SIZE, CO2		# coffset += 4
	ALIGN_4

.L30:
	testq	$1, M
	je	.L39

#if !defined(TRMMKERNEL) || \
	(defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
	(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))

	leaq	BUFFER, BO
#else
	leaq	BUFFER, BO
	movq	KK, %rax
	leaq	(, %rax, 8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

	movddup	 0 * SIZE(AO), %xmm8
	pxor	%xmm0, %xmm0
	movddup	 8 * SIZE(AO), %xmm10
	pxor	%xmm1, %xmm1
	movsd	 0 * SIZE(BO), %xmm9
	pxor	%xmm2, %xmm2
	movsd	16 * SIZE(BO), %xmm11
	pxor	%xmm3, %xmm3
	movsd	32 * SIZE(BO), %xmm13
	movsd	48 * SIZE(BO), %xmm15

#ifndef TRMMKERNEL
	movq	K, %rax
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	movq	K, %rax
	subq	KK, %rax
	movq	%rax, KKK
#else
	movq	KK, %rax
#ifdef LEFT
	addq	$1, %rax
#else
	addq	$2, %rax
#endif
	movq	%rax, KKK
#endif
	sarq	$3, %rax
	je	.L35
	ALIGN_4

.L32:
	shufps	$0x50, %xmm9, %xmm9
	PREFETCH (PREFETCHSIZE + 0) * SIZE(AO)
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	 4 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	 2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd	 8 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	12 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	 4 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd	64 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd	20 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	movddup	 6 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movsd	24 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	addps	%xmm11, %xmm0
	movsd	28 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm11, %xmm11
	mulps	%xmm8, %xmm11
	movddup	16 * SIZE(AO), %xmm8
	addps	%xmm11, %xmm1
	movsd	80 * SIZE(BO), %xmm11
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd	36 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	movddup	10 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movsd	40 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	addps	%xmm13, %xmm0
	movsd	44 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm13, %xmm13
	mulps	%xmm10, %xmm13
	movddup	12 * SIZE(AO), %xmm10
	addps	%xmm13, %xmm1
	movsd	96 * SIZE(BO), %xmm13
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd	52 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	movddup	14 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movsd	56 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	addps	%xmm15, %xmm0
	movsd	60 * SIZE(BO), %xmm15
	shufps	$0x50, %xmm15, %xmm15
	mulps	%xmm10, %xmm15
	movddup	24 * SIZE(AO), %xmm10
	addps	%xmm15, %xmm1
	movsd	112 * SIZE(BO), %xmm15

	addq	$16 * SIZE, AO
	addq	$64 * SIZE, BO

	decq	%rax
	jne	.L32
	ALIGN_4

.L35:
#ifndef TRMMKERNEL
	movq	K, %rax
#else
	movq	KKK, %rax
#endif
	movaps	ALPHA_R, %xmm14
	movaps	ALPHA_I, %xmm15
	andq	$7, %rax		# if (k & 1)
	BRANCH
	je	.L38
	ALIGN_4

.L36:
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	addps	%xmm9, %xmm0
	movsd	 4 * SIZE(BO), %xmm9
	shufps	$0x50, %xmm9, %xmm9
	mulps	%xmm8, %xmm9
	movddup	 2 * SIZE(AO), %xmm8
	addps	%xmm9, %xmm1
	movsd	 8 * SIZE(BO), %xmm9

	addq	$2 * SIZE, AO
	addq	$8 * SIZE, BO
	decq	%rax
	jg	.L36
	ALIGN_4

.L38:
	movaps	%xmm0, %xmm6
	movlhps	%xmm1, %xmm0
	movhlps	%xmm6, %xmm1

#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \
    defined(RR) || defined(RC) || defined(CR) || defined(CC)
	cmpeqps	%xmm7, %xmm7
	pslld	$31, %xmm7
	xorps	%xmm7, %xmm1
#endif

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(NR) || defined(NC) || defined(TR) || defined(TC)
	shufps	$0xb1, %xmm1, %xmm1

	addsubps	%xmm1, %xmm0

	movaps	%xmm0, %xmm1

	shufps	$0xb1, %xmm0, %xmm0
#else
	shufps	$0xb1, %xmm0, %xmm0

	addsubps	%xmm0, %xmm1

	movaps	%xmm1, %xmm0

	shufps	$0xb1, %xmm1, %xmm1
#endif

	mulps	%xmm14, %xmm1
	mulps	%xmm15, %xmm0

	addps	%xmm1, %xmm0

#if !defined(TRMMKERNEL) && !defined(BETAZERO)
	movsd	0 * SIZE(CO1), %xmm8
	movhps	0 * SIZE(CO2), %xmm8

	addps	%xmm8, %xmm0
#endif

	movsd	%xmm0, 0 * SIZE(CO1)
	movhps	%xmm0, 0 * SIZE(CO2)

#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
	movq	K, %rax
	subq	KKK, %rax
	leaq	(,%rax, 8), %rax
	leaq	(AO, %rax, 1), AO
	leaq	(BO, %rax, 4), BO
#endif

#if defined(TRMMKERNEL) && defined(LEFT)
	addq	$1, KK
#endif
	ALIGN_4

.L39:
#if defined(TRMMKERNEL) && !defined(LEFT)
	addq	$2, KK		# was addl: 32-bit add corrupts negative 64-bit KK
#endif

	leaq	(C, LDC, 2), C		# c += 2 * ldc
	decq	J			# j --
	jg	.L01
	ALIGN_4

.L40:
	testq	$1, N
	je	.L999
	ALIGN_4

.L41:
#if defined(TRMMKERNEL) && defined(LEFT)
	movq	OFFSET, %rax
	movq	%rax, KK
#endif

/* Copying to Sub Buffer */
	leaq	BUFFER, BO

	movq	K, %rax
	sarq	$3, %rax
	jle	.L43
	ALIGN_4

.L42:
	movddup	 0 * SIZE(B), %xmm0
	movddup	 2 * SIZE(B), %xmm1
	movddup	 4 * SIZE(B), %xmm2
	movddup	 6 * SIZE(B), %xmm3
	movddup	 8 * SIZE(B), %xmm4
	movddup	10 * SIZE(B), %xmm5
	movddup	12 * SIZE(B), %xmm6
	movddup	14 * SIZE(B), %xmm7

	movaps	%xmm0,  0 * SIZE(BO)
	movaps	%xmm1,  4 * SIZE(BO)
	movaps	%xmm2,  8 * SIZE(BO)
	movaps	%xmm3, 12 * SIZE(BO)
	movaps	%xmm4, 16 * SIZE(BO)
	movaps	%xmm5, 20 * SIZE(BO)
	movaps	%xmm6, 24 * SIZE(BO)
	movaps	%xmm7, 28 * SIZE(BO)

	prefetcht1	128 * SIZE(BO)
	prefetcht0	112 * SIZE(B)

	addq	$16 * SIZE, B
	addq	$32 * SIZE, BO
	decq	%rax
	jne	.L42
	ALIGN_4

.L43:
	movq	K, %rax
	andq	$7, %rax
	BRANCH
	jle	.L50
	ALIGN_4

.L44:
	movddup	 0 * SIZE(B), %xmm0

	movaps	%xmm0,  0 * SIZE(BO)

	addq	$2 * SIZE, B
	addq	$4 * SIZE, BO
	decq
%rax 1429 jne .L44 1430 ALIGN_4 1431 1432.L50: 1433 movq C, CO1 # coffset1 = c 1434 movq A, AO # aoffset = a 1435 1436 movq M, I 1437 sarq $2, I # i = (m >> 2) 1438 jle .L60 1439 ALIGN_4 1440 1441.L51: 1442#if !defined(TRMMKERNEL) || \ 1443 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1444 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1445 1446 leaq BUFFER, BO 1447#else 1448 leaq BUFFER, BO 1449 movq KK, %rax 1450 leaq (, %rax, 8), %rax 1451 leaq (AO, %rax, 4), AO 1452 leaq (BO, %rax, 2), BO 1453#endif 1454 1455 movaps 0 * SIZE(AO), %xmm8 1456 pxor %xmm0, %xmm0 1457 movaps 16 * SIZE(AO), %xmm10 1458 pxor %xmm1, %xmm1 1459 movaps 32 * SIZE(AO), %xmm12 1460 pxor %xmm4, %xmm4 1461 movaps 48 * SIZE(AO), %xmm14 1462 pxor %xmm5, %xmm5 1463 1464 movsldup 0 * SIZE(BO), %xmm9 1465 movsldup 16 * SIZE(BO), %xmm11 1466 1467 prefetchnta 4 * SIZE(CO1) 1468 1469#ifndef TRMMKERNEL 1470 movq K, %rax 1471#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1472 movq K, %rax 1473 subq KK, %rax 1474 movq %rax, KKK 1475#else 1476 movq KK, %rax 1477#ifdef LEFT 1478 addq $4, %rax 1479#else 1480 addq $1, %rax 1481#endif 1482 movq %rax, KKK 1483#endif 1484 sarq $3, %rax 1485 je .L55 1486 ALIGN_4 1487 1488.L52: 1489 mulps %xmm8, %xmm9 1490 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1491 addps %xmm9, %xmm0 1492 movshdup 0 * SIZE(BO), %xmm9 1493 mulps %xmm8, %xmm9 1494 movaps 4 * SIZE(AO), %xmm8 1495 ADDSUB %xmm9, %xmm1 1496 movsldup 0 * SIZE(BO), %xmm9 1497 mulps %xmm8, %xmm9 1498 addps %xmm9, %xmm4 1499 movshdup 0 * SIZE(BO), %xmm9 1500 mulps %xmm8, %xmm9 1501 movaps 8 * SIZE(AO), %xmm8 1502 ADDSUB %xmm9, %xmm5 1503 movsldup 4 * SIZE(BO), %xmm9 1504 mulps %xmm8, %xmm9 1505 addps %xmm9, %xmm0 1506 movshdup 4 * SIZE(BO), %xmm9 1507 mulps %xmm8, %xmm9 1508 movaps 12 * SIZE(AO), %xmm8 1509 ADDSUB %xmm9, %xmm1 1510 movsldup 4 * SIZE(BO), %xmm9 1511 mulps %xmm8, %xmm9 1512 addps %xmm9, %xmm4 1513 movshdup 4 * SIZE(BO), %xmm9 1514 mulps 
%xmm8, %xmm9 1515 movaps 64 * SIZE(AO), %xmm8 1516 ADDSUB %xmm9, %xmm5 1517 movsldup 8 * SIZE(BO), %xmm9 1518 mulps %xmm10, %xmm9 1519 addps %xmm9, %xmm0 1520 movshdup 8 * SIZE(BO), %xmm9 1521 mulps %xmm10, %xmm9 1522 movaps 20 * SIZE(AO), %xmm10 1523 ADDSUB %xmm9, %xmm1 1524 movsldup 8 * SIZE(BO), %xmm9 1525 mulps %xmm10, %xmm9 1526 addps %xmm9, %xmm4 1527 movshdup 8 * SIZE(BO), %xmm9 1528 mulps %xmm10, %xmm9 1529 movaps 24 * SIZE(AO), %xmm10 1530 ADDSUB %xmm9, %xmm5 1531 movsldup 12 * SIZE(BO), %xmm9 1532 mulps %xmm10, %xmm9 1533 addps %xmm9, %xmm0 1534 movshdup 12 * SIZE(BO), %xmm9 1535 mulps %xmm10, %xmm9 1536 movaps 28 * SIZE(AO), %xmm10 1537 ADDSUB %xmm9, %xmm1 1538 movsldup 12 * SIZE(BO), %xmm9 1539 mulps %xmm10, %xmm9 1540 addps %xmm9, %xmm4 1541 movshdup 12 * SIZE(BO), %xmm9 1542 mulps %xmm10, %xmm9 1543 movaps 80 * SIZE(AO), %xmm10 1544 ADDSUB %xmm9, %xmm5 1545 movsldup 32 * SIZE(BO), %xmm9 1546 mulps %xmm12, %xmm11 1547 PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) 1548 addps %xmm11, %xmm0 1549 movshdup 16 * SIZE(BO), %xmm11 1550 mulps %xmm12, %xmm11 1551 movaps 36 * SIZE(AO), %xmm12 1552 ADDSUB %xmm11, %xmm1 1553 movsldup 16 * SIZE(BO), %xmm11 1554 mulps %xmm12, %xmm11 1555 addps %xmm11, %xmm4 1556 movshdup 16 * SIZE(BO), %xmm11 1557 mulps %xmm12, %xmm11 1558 movaps 40 * SIZE(AO), %xmm12 1559 ADDSUB %xmm11, %xmm5 1560 movsldup 20 * SIZE(BO), %xmm11 1561 mulps %xmm12, %xmm11 1562 addps %xmm11, %xmm0 1563 movshdup 20 * SIZE(BO), %xmm11 1564 mulps %xmm12, %xmm11 1565 movaps 44 * SIZE(AO), %xmm12 1566 ADDSUB %xmm11, %xmm1 1567 movsldup 20 * SIZE(BO), %xmm11 1568 mulps %xmm12, %xmm11 1569 addps %xmm11, %xmm4 1570 movshdup 20 * SIZE(BO), %xmm11 1571 mulps %xmm12, %xmm11 1572 movaps 96 * SIZE(AO), %xmm12 1573 ADDSUB %xmm11, %xmm5 1574 movsldup 24 * SIZE(BO), %xmm11 1575 mulps %xmm14, %xmm11 1576 addps %xmm11, %xmm0 1577 movshdup 24 * SIZE(BO), %xmm11 1578 mulps %xmm14, %xmm11 1579 movaps 52 * SIZE(AO), %xmm14 1580 ADDSUB %xmm11, %xmm1 1581 movsldup 24 * SIZE(BO), 
%xmm11 1582 mulps %xmm14, %xmm11 1583 addps %xmm11, %xmm4 1584 movshdup 24 * SIZE(BO), %xmm11 1585 mulps %xmm14, %xmm11 1586 movaps 56 * SIZE(AO), %xmm14 1587 ADDSUB %xmm11, %xmm5 1588 movsldup 28 * SIZE(BO), %xmm11 1589 mulps %xmm14, %xmm11 1590 addps %xmm11, %xmm0 1591 movshdup 28 * SIZE(BO), %xmm11 1592 mulps %xmm14, %xmm11 1593 movaps 60 * SIZE(AO), %xmm14 1594 ADDSUB %xmm11, %xmm1 1595 movsldup 28 * SIZE(BO), %xmm11 1596 mulps %xmm14, %xmm11 1597 addps %xmm11, %xmm4 1598 movshdup 28 * SIZE(BO), %xmm11 1599 mulps %xmm14, %xmm11 1600 movaps 112 * SIZE(AO), %xmm14 1601 ADDSUB %xmm11, %xmm5 1602 movsldup 48 * SIZE(BO), %xmm11 1603 1604 addq $64 * SIZE, AO 1605 addq $32 * SIZE, BO 1606 decq %rax 1607 jne .L52 1608 ALIGN_4 1609 1610.L55: 1611#ifndef TRMMKERNEL 1612 movq K, %rax 1613#else 1614 movq KKK, %rax 1615#endif 1616 movaps ALPHA_R, %xmm14 1617 movaps ALPHA_I, %xmm15 1618 andq $7, %rax # if (k & 1) 1619 BRANCH 1620 je .L58 1621 ALIGN_4 1622 1623.L56: 1624 mulps %xmm8, %xmm9 1625 addps %xmm9, %xmm0 1626 movshdup 0 * SIZE(BO), %xmm9 1627 mulps %xmm8, %xmm9 1628 movaps 4 * SIZE(AO), %xmm8 1629 ADDSUB %xmm9, %xmm1 1630 movsldup 0 * SIZE(BO), %xmm9 1631 mulps %xmm8, %xmm9 1632 addps %xmm9, %xmm4 1633 movshdup 0 * SIZE(BO), %xmm9 1634 mulps %xmm8, %xmm9 1635 movaps 8 * SIZE(AO), %xmm8 1636 ADDSUB %xmm9, %xmm5 1637 movsldup 4 * SIZE(BO), %xmm9 1638 1639 addq $ 8 * SIZE, AO 1640 addq $ 4 * SIZE, BO 1641 decq %rax 1642 jg .L56 1643 ALIGN_4 1644 1645.L58: 1646#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1647 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1648 1649 shufps $0xb1, %xmm1, %xmm1 1650 shufps $0xb1, %xmm5, %xmm5 1651 1652 addsubps %xmm1, %xmm0 1653 addsubps %xmm5, %xmm4 1654 1655 movaps %xmm0, %xmm1 1656 movaps %xmm4, %xmm5 1657 1658 shufps $0xb1, %xmm0, %xmm0 1659 shufps $0xb1, %xmm4, %xmm4 1660#else 1661 shufps $0xb1, %xmm0, %xmm0 1662 shufps $0xb1, %xmm4, %xmm4 1663 1664 addsubps %xmm0, %xmm1 1665 addsubps %xmm4, %xmm5 1666 
1667 movaps %xmm1, %xmm0 1668 movaps %xmm5, %xmm4 1669 1670 shufps $0xb1, %xmm1, %xmm1 1671 shufps $0xb1, %xmm5, %xmm5 1672#endif 1673 1674 mulps %xmm14, %xmm1 1675 mulps %xmm15, %xmm0 1676 mulps %xmm14, %xmm5 1677 mulps %xmm15, %xmm4 1678 1679 addps %xmm1, %xmm0 1680 addps %xmm5, %xmm4 1681 1682#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1683 movsd 0 * SIZE(CO1), %xmm8 1684 movhps 2 * SIZE(CO1), %xmm8 1685 movsd 4 * SIZE(CO1), %xmm9 1686 movhps 6 * SIZE(CO1), %xmm9 1687 1688 addps %xmm8, %xmm0 1689 addps %xmm9, %xmm4 1690#endif 1691 1692 movsd %xmm0, 0 * SIZE(CO1) 1693 movhps %xmm0, 2 * SIZE(CO1) 1694 movsd %xmm4, 4 * SIZE(CO1) 1695 movhps %xmm4, 6 * SIZE(CO1) 1696 1697#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1698 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1699 movq K, %rax 1700 subq KKK, %rax 1701 leaq (,%rax, 8), %rax 1702 leaq (AO, %rax, 4), AO 1703 leaq (BO, %rax, 2), BO 1704#endif 1705 1706#if defined(TRMMKERNEL) && defined(LEFT) 1707 addq $4, KK 1708#endif 1709 1710 addq $8 * SIZE, CO1 # coffset += 4 1711 decq I # i -- 1712 jg .L51 1713 ALIGN_4 1714 1715.L60: 1716 testq $2, M 1717 je .L70 1718 1719#if !defined(TRMMKERNEL) || \ 1720 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1721 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1722 1723 leaq BUFFER, BO 1724#else 1725 leaq BUFFER, BO 1726 movq KK, %rax 1727 leaq (, %rax, 8), %rax 1728 leaq (AO, %rax, 2), AO 1729 leaq (BO, %rax, 2), BO 1730#endif 1731 1732 movaps 0 * SIZE(AO), %xmm8 1733 pxor %xmm0, %xmm0 1734 movsldup 0 * SIZE(BO), %xmm9 1735 pxor %xmm1, %xmm1 1736 movaps 16 * SIZE(AO), %xmm10 1737 movsldup 16 * SIZE(BO), %xmm11 1738 1739#ifndef TRMMKERNEL 1740 movq K, %rax 1741#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1742 movq K, %rax 1743 subq KK, %rax 1744 movq %rax, KKK 1745#else 1746 movq KK, %rax 1747#ifdef LEFT 1748 addq $2, %rax 1749#else 1750 addq $1, %rax 1751#endif 1752 movq %rax, 
KKK 1753#endif 1754 sarq $3, %rax 1755 je .L65 1756 ALIGN_4 1757 1758.L62: 1759 mulps %xmm8, %xmm9 1760 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1761 addps %xmm9, %xmm0 1762 movshdup 0 * SIZE(BO), %xmm9 1763 mulps %xmm8, %xmm9 1764 movaps 4 * SIZE(AO), %xmm8 1765 ADDSUB %xmm9, %xmm1 1766 movsldup 4 * SIZE(BO), %xmm9 1767 mulps %xmm8, %xmm9 1768 addps %xmm9, %xmm0 1769 movshdup 4 * SIZE(BO), %xmm9 1770 mulps %xmm8, %xmm9 1771 movaps 8 * SIZE(AO), %xmm8 1772 ADDSUB %xmm9, %xmm1 1773 movsldup 8 * SIZE(BO), %xmm9 1774 mulps %xmm8, %xmm9 1775 addps %xmm9, %xmm0 1776 movshdup 8 * SIZE(BO), %xmm9 1777 mulps %xmm8, %xmm9 1778 movaps 12 * SIZE(AO), %xmm8 1779 ADDSUB %xmm9, %xmm1 1780 movsldup 12 * SIZE(BO), %xmm9 1781 mulps %xmm8, %xmm9 1782 addps %xmm9, %xmm0 1783 movshdup 12 * SIZE(BO), %xmm9 1784 mulps %xmm8, %xmm9 1785 movaps 32 * SIZE(AO), %xmm8 1786 ADDSUB %xmm9, %xmm1 1787 movsldup 32 * SIZE(BO), %xmm9 1788 mulps %xmm10, %xmm11 1789 addps %xmm11, %xmm0 1790 movshdup 16 * SIZE(BO), %xmm11 1791 mulps %xmm10, %xmm11 1792 movaps 20 * SIZE(AO), %xmm10 1793 ADDSUB %xmm11, %xmm1 1794 movsldup 20 * SIZE(BO), %xmm11 1795 mulps %xmm10, %xmm11 1796 addps %xmm11, %xmm0 1797 movshdup 20 * SIZE(BO), %xmm11 1798 mulps %xmm10, %xmm11 1799 movaps 24 * SIZE(AO), %xmm10 1800 ADDSUB %xmm11, %xmm1 1801 movsldup 24 * SIZE(BO), %xmm11 1802 mulps %xmm10, %xmm11 1803 addps %xmm11, %xmm0 1804 movshdup 24 * SIZE(BO), %xmm11 1805 mulps %xmm10, %xmm11 1806 movaps 28 * SIZE(AO), %xmm10 1807 ADDSUB %xmm11, %xmm1 1808 movsldup 28 * SIZE(BO), %xmm11 1809 mulps %xmm10, %xmm11 1810 addps %xmm11, %xmm0 1811 movshdup 28 * SIZE(BO), %xmm11 1812 mulps %xmm10, %xmm11 1813 movaps 48 * SIZE(AO), %xmm10 1814 ADDSUB %xmm11, %xmm1 1815 movsldup 48 * SIZE(BO), %xmm11 1816 1817 addq $32 * SIZE, AO 1818 addq $32 * SIZE, BO 1819 1820 decq %rax 1821 jne .L62 1822 ALIGN_4 1823 1824.L65: 1825#ifndef TRMMKERNEL 1826 movq K, %rax 1827#else 1828 movq KKK, %rax 1829#endif 1830 movaps ALPHA_R, %xmm14 1831 movaps ALPHA_I, 
%xmm15 1832 andq $7, %rax # if (k & 1) 1833 BRANCH 1834 je .L68 1835 ALIGN_4 1836 1837.L66: 1838 mulps %xmm8, %xmm9 1839 addps %xmm9, %xmm0 1840 movshdup 0 * SIZE(BO), %xmm9 1841 mulps %xmm8, %xmm9 1842 movaps 4 * SIZE(AO), %xmm8 1843 ADDSUB %xmm9, %xmm1 1844 movsldup 4 * SIZE(BO), %xmm9 1845 1846 addq $4 * SIZE, AO # aoffset += 4 1847 addq $4 * SIZE, BO # boffset1 += 8 1848 decq %rax 1849 jg .L66 1850 ALIGN_4 1851 1852.L68: 1853#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1854 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1855 shufps $0xb1, %xmm1, %xmm1 1856 addsubps %xmm1, %xmm0 1857 movaps %xmm0, %xmm1 1858 shufps $0xb1, %xmm0, %xmm0 1859#else 1860 shufps $0xb1, %xmm0, %xmm0 1861 addsubps %xmm0, %xmm1 1862 movaps %xmm1, %xmm0 1863 shufps $0xb1, %xmm1, %xmm1 1864#endif 1865 1866 mulps %xmm14, %xmm1 1867 mulps %xmm15, %xmm0 1868 addps %xmm1, %xmm0 1869 1870#if! defined(TRMMKERNEL) && !defined(BETAZERO) 1871 movsd 0 * SIZE(CO1), %xmm8 1872 movhps 2 * SIZE(CO1), %xmm8 1873 1874 addps %xmm8, %xmm0 1875#endif 1876 1877 movsd %xmm0, 0 * SIZE(CO1) 1878 movhps %xmm0, 2 * SIZE(CO1) 1879 1880#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1881 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1882 movq K, %rax 1883 subq KKK, %rax 1884 leaq (,%rax, 8), %rax 1885 leaq (AO, %rax, 2), AO 1886 leaq (BO, %rax, 2), BO 1887#endif 1888 1889#if defined(TRMMKERNEL) && defined(LEFT) 1890 addq $2, KK 1891#endif 1892 addq $4 * SIZE, CO1 # coffset += 4 1893 ALIGN_4 1894 1895.L70: 1896 testq $1, M 1897 je .L999 1898 1899#if !defined(TRMMKERNEL) || \ 1900 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 1901 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 1902 1903 leaq BUFFER, BO 1904#else 1905 leaq BUFFER, BO 1906 movq KK, %rax 1907 leaq (, %rax, 8), %rax 1908 leaq (AO, %rax, 1), AO 1909 leaq (BO, %rax, 2), BO 1910#endif 1911 1912 movddup 0 * SIZE(AO), %xmm8 1913 pxor %xmm0, %xmm0 1914 movsd 0 * 
SIZE(BO), %xmm9 1915 pxor %xmm1, %xmm1 1916 movddup 8 * SIZE(AO), %xmm10 1917 movsd 16 * SIZE(BO), %xmm11 1918 1919#ifndef TRMMKERNEL 1920 movq K, %rax 1921#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 1922 movq K, %rax 1923 subq KK, %rax 1924 movq %rax, KKK 1925#else 1926 movq KK, %rax 1927#ifdef LEFT 1928 addq $1, %rax 1929#else 1930 addq $1, %rax 1931#endif 1932 movq %rax, KKK 1933#endif 1934 sarq $3, %rax 1935 je .L75 1936 ALIGN_4 1937 1938.L72: 1939 shufps $0x50, %xmm9, %xmm9 1940 PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) 1941 mulps %xmm8, %xmm9 1942 movddup 2 * SIZE(AO), %xmm8 1943 addps %xmm9, %xmm0 1944 movsd 4 * SIZE(BO), %xmm9 1945 shufps $0x50, %xmm9, %xmm9 1946 mulps %xmm8, %xmm9 1947 movddup 4 * SIZE(AO), %xmm8 1948 addps %xmm9, %xmm1 1949 movsd 8 * SIZE(BO), %xmm9 1950 shufps $0x50, %xmm9, %xmm9 1951 mulps %xmm8, %xmm9 1952 movddup 6 * SIZE(AO), %xmm8 1953 addps %xmm9, %xmm0 1954 movsd 12 * SIZE(BO), %xmm9 1955 shufps $0x50, %xmm9, %xmm9 1956 mulps %xmm8, %xmm9 1957 movddup 16 * SIZE(AO), %xmm8 1958 addps %xmm9, %xmm1 1959 movsd 32 * SIZE(BO), %xmm9 1960 shufps $0x50, %xmm11, %xmm11 1961 mulps %xmm10, %xmm11 1962 movddup 10 * SIZE(AO), %xmm10 1963 addps %xmm11, %xmm0 1964 movsd 20 * SIZE(BO), %xmm11 1965 shufps $0x50, %xmm11, %xmm11 1966 mulps %xmm10, %xmm11 1967 movddup 12 * SIZE(AO), %xmm10 1968 addps %xmm11, %xmm1 1969 movsd 24 * SIZE(BO), %xmm11 1970 shufps $0x50, %xmm11, %xmm11 1971 mulps %xmm10, %xmm11 1972 movddup 14 * SIZE(AO), %xmm10 1973 addps %xmm11, %xmm0 1974 movsd 28 * SIZE(BO), %xmm11 1975 shufps $0x50, %xmm11, %xmm11 1976 mulps %xmm10, %xmm11 1977 movddup 24 * SIZE(AO), %xmm10 1978 addps %xmm11, %xmm1 1979 movsd 48 * SIZE(BO), %xmm11 1980 1981 addq $16 * SIZE, AO 1982 addq $32 * SIZE, BO 1983 decq %rax 1984 jne .L72 1985 ALIGN_4 1986 1987.L75: 1988#ifndef TRMMKERNEL 1989 movq K, %rax 1990#else 1991 movq KKK, %rax 1992#endif 1993 movaps ALPHA_R, %xmm14 1994 movaps ALPHA_I, %xmm15 1995 andq $7, %rax # if 
(k & 1) 1996 BRANCH 1997 je .L78 1998 ALIGN_4 1999 2000.L76: 2001 shufps $0x50, %xmm9, %xmm9 2002 mulps %xmm8, %xmm9 2003 movddup 2 * SIZE(AO), %xmm8 2004 addps %xmm9, %xmm0 2005 movsd 4 * SIZE(BO), %xmm9 2006 2007 addq $2 * SIZE, AO 2008 addq $4 * SIZE, BO 2009 decq %rax 2010 jg .L76 2011 ALIGN_4 2012 2013.L78: 2014 addps %xmm1, %xmm0 2015 2016 movhlps %xmm0, %xmm1 2017 2018#if defined(NR) || defined(NC) || defined(TR) || defined(TC) || \ 2019 defined(RR) || defined(RC) || defined(CR) || defined(CC) 2020 cmpeqps %xmm7, %xmm7 2021 pslld $31, %xmm7 2022 xorps %xmm7, %xmm1 2023#endif 2024 2025#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 2026 defined(NR) || defined(NC) || defined(TR) || defined(TC) 2027 shufps $0xb1, %xmm1, %xmm1 2028 2029 addsubps %xmm1, %xmm0 2030 2031 movaps %xmm0, %xmm1 2032 2033 shufps $0xb1, %xmm0, %xmm0 2034#else 2035 shufps $0xb1, %xmm0, %xmm0 2036 2037 addsubps %xmm0, %xmm1 2038 2039 movaps %xmm1, %xmm0 2040 2041 shufps $0xb1, %xmm1, %xmm1 2042#endif 2043 2044 mulps %xmm14, %xmm1 2045 mulps %xmm15, %xmm0 2046 2047 addps %xmm1, %xmm0 2048 2049#if! defined(TRMMKERNEL) && !defined(BETAZERO) 2050 movsd 0 * SIZE(CO1), %xmm8 2051 addps %xmm8, %xmm0 2052#endif 2053 movsd %xmm0, 0 * SIZE(CO1) 2054 ALIGN_4 2055 2056.L999: 2057 movq %rbx, %rsp 2058 2059 movq 0(%rsp), %rbx 2060 movq 8(%rsp), %rbp 2061 movq 16(%rsp), %r12 2062 movq 24(%rsp), %r13 2063 movq 32(%rsp), %r14 2064 movq 40(%rsp), %r15 2065 2066#ifdef WINDOWS_ABI 2067 movq 48(%rsp), %rdi 2068 movq 56(%rsp), %rsi 2069 movups 64(%rsp), %xmm6 2070 movups 80(%rsp), %xmm7 2071 movups 96(%rsp), %xmm8 2072 movups 112(%rsp), %xmm9 2073 movups 128(%rsp), %xmm10 2074 movups 144(%rsp), %xmm11 2075 movups 160(%rsp), %xmm12 2076 movups 176(%rsp), %xmm13 2077 movups 192(%rsp), %xmm14 2078 movups 208(%rsp), %xmm15 2079#endif 2080 2081 addq $STACKSIZE, %rsp 2082 ret 2083 2084 EPILOGUE 2085