1/********************************************************************************* 2Copyright (c) 2013, The OpenBLAS Project 3All rights reserved. 4Redistribution and use in source and binary forms, with or without 5modification, are permitted provided that the following conditions are 6met: 71. Redistributions of source code must retain the above copyright 8notice, this list of conditions and the following disclaimer. 92. Redistributions in binary form must reproduce the above copyright 10notice, this list of conditions and the following disclaimer in 11the documentation and/or other materials provided with the 12distribution. 133. Neither the name of the OpenBLAS project nor the names of 14its contributors may be used to endorse or promote products 15derived from this software without specific prior written permission. 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 26**********************************************************************************/ 27 28/********************************************************************* 29* 2014/07/29 Saar 30* BLASTEST : OK 31* CTEST : OK 32* TEST : OK 33* 34* 2013/10/28 Saar 35* Parameter: 36* CGEMM_DEFAULT_UNROLL_N 2 37* CGEMM_DEFAULT_UNROLL_M 8 38* CGEMM_DEFAULT_P 384 39* CGEMM_DEFAULT_Q 192 40* A_PR1 512 41* B_PR1 512 42* 43* 2014/07/29 Saar 44* Performance at 6912x6912x6912: 45* 1 thread: 107 GFLOPS (SANDYBRIDGE: 60) (MKL: 86) 46* 2 threads: 208 GFLOPS (SANDYBRIDGE: 114) (MKL: 155) 47* 3 threads: 289 GFLOPS (SANDYBRIDGE: 162) (MKL: 222) 48* 4 threads: 377 GFLOPS (SANDYBRIDGE: 223) (MKL: 279) 49* 50* 51*********************************************************************/ 52 53 54 55#define ASSEMBLER 56#include "common.h" 57 58#define OLD_M %rdi 59#define OLD_N %rsi 60#define M %r13 61#define J %r14 62#define OLD_K %rdx 63 64#define A %rcx 65#define B %r8 66#define C %r9 67#define LDC %r10 68 69#define I %r11 70#define AO %rdi 71#define BO %rsi 72#define CO1 %r15 73#define K %r12 74#define BI %rbp 75#define SP %rbx 76 77#define BO1 %rdi 78#define BO2 %rbp 79 80#ifndef WINDOWS_ABI 81 82#define STACKSIZE 96 83 84#else 85 86#define STACKSIZE 320 87 88#define OLD_ALPHA_I 40 + STACKSIZE(%rsp) 89#define OLD_A 48 + STACKSIZE(%rsp) 90#define OLD_B 56 + STACKSIZE(%rsp) 91#define OLD_C 64 + STACKSIZE(%rsp) 92#define OLD_LDC 72 + STACKSIZE(%rsp) 93#define OLD_OFFSET 80 + STACKSIZE(%rsp) 94 95#endif 96 97#define L_BUFFER_SIZE 8192 98 99#define Ndiv6 24(%rsp) 100#define Nmod6 32(%rsp) 101#define N 40(%rsp) 102#define ALPHA_R 48(%rsp) 103#define ALPHA_I 56(%rsp) 104#define OFFSET 64(%rsp) 105#define KK 72(%rsp) 106#define KKK 80(%rsp) 107#define BUFFER1 128(%rsp) 108 109#if defined(OS_WINDOWS) 110#if L_BUFFER_SIZE > 16384 111#define STACK_TOUCH \ 112 movl $ 0, 4096 * 4(%rsp);\ 113 movl $ 0, 4096 * 3(%rsp);\ 114 movl $ 0, 4096 * 2(%rsp);\ 115 movl $ 0, 4096 * 1(%rsp); 116#elif L_BUFFER_SIZE > 12288 117#define STACK_TOUCH \ 118 movl $ 0, 4096 * 3(%rsp);\ 119 movl $ 0, 4096 * 2(%rsp);\ 120 movl $ 0, 4096 * 1(%rsp); 121#elif L_BUFFER_SIZE > 8192 122#define STACK_TOUCH \ 123 movl $ 0, 4096 * 2(%rsp);\ 124 movl $ 0, 4096 * 1(%rsp); 125#elif L_BUFFER_SIZE > 4096 126#define STACK_TOUCH \ 127 movl $ 0, 4096 * 1(%rsp); 128#else 129#define STACK_TOUCH 130#endif 131#else 132#define STACK_TOUCH 133#endif 134 135 136#if defined(BULLDOZER) 137 138#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 139 140#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 141 142#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 143 144#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 145 146#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 147 148#define VFMADDPS_I( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 149 150#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 151 152#define VFMADDPS_R( y0,y1,y2 ) vfmaddps y0,y1,y2,y0 153 154#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 155 156#else 157 158#define VFMADDPS_R( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 159 160#define VFMADDPS_I( y0,y1,y2 ) vfnmaddps y0,y1,y2,y0 161 162#endif 163 164#else 165 166#if defined(NN) || defined(NT) || defined(TN) || defined(TT) 167 168#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 169 170#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 171 172#elif defined(RN) || defined(RT) || defined(CN) || defined(CT) 173 174#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 175 176#define VFMADDPS_I( y0,y1,y2 ) vfmadd231ps y1,y2,y0 177 178#elif defined(NR) || defined(NC) || defined(TR) || defined(TC) 179 180#define VFMADDPS_R( y0,y1,y2 ) vfmadd231ps y1,y2,y0 181 182#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 183 184#else 185 186#define VFMADDPS_R( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 187 188#define VFMADDPS_I( y0,y1,y2 ) vfnmadd231ps y1,y2,y0 189 190#endif 191 192#endif 193 194 195#define A_PR1 512 196#define B_PR1 512 197 198 199 200/***************************************************************************************************************************/ 201 202.macro KERNEL8x3_SUB 203 204 vmovups -16 * SIZE(AO), %ymm0 205 vmovups -8 * SIZE(AO), %ymm1 206 vbroadcastss -8 * SIZE(BO), %ymm2 207 vbroadcastss -7 * SIZE(BO), %ymm3 208 prefetcht0 A_PR1(AO) 209 210 VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) 211 VFMADDPS_R( %ymm12,%ymm2,%ymm1 ) 212 VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) 213 VFMADDPS_I( %ymm13,%ymm3,%ymm1 ) 214 215 vbroadcastss -6 * SIZE(BO), %ymm2 216 vbroadcastss -5 * SIZE(BO), %ymm3 217 VFMADDPS_R( %ymm10,%ymm2,%ymm0 ) 218 VFMADDPS_R( %ymm14,%ymm2,%ymm1 ) 219 VFMADDPS_I( %ymm11,%ymm3,%ymm0 ) 220 VFMADDPS_I( %ymm15,%ymm3,%ymm1 ) 221 222 vbroadcastss -4 * SIZE(BO), %ymm2 223 vbroadcastss -3 * SIZE(BO), %ymm3 224 VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) 225 VFMADDPS_R( %ymm6 ,%ymm2,%ymm1 ) 226 VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) 227 VFMADDPS_I( %ymm7 ,%ymm3,%ymm1 ) 228 229 230 addq $ 6*SIZE, BO 231 addq $ 16*SIZE, AO 232 decq %rax 233.endm 234 235.macro SAVE8x3 236 237 vbroadcastss ALPHA_R, %ymm0 238 vbroadcastss ALPHA_I, %ymm1 239 240 // swap high and low 64 bytes 241 vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 242 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 243 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 244 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 245 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 246 vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 247 248#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 249 defined(NR) || defined(NC) || defined(TR) || defined(TC) 250 251 vaddsubps %ymm9, %ymm8 , %ymm8 252 vaddsubps %ymm11,%ymm10, %ymm10 253 vaddsubps %ymm13,%ymm12, %ymm12 254 vaddsubps %ymm15,%ymm14, %ymm14 255 vaddsubps %ymm5, %ymm4 , %ymm4 256 vaddsubps %ymm7, %ymm6 , %ymm6 257 258 vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 259 vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 260 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 261 vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 262 vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 263 vshufps $ 0xb1, %ymm6 , %ymm6 , %ymm7 264 265#else 266 vaddsubps %ymm8, %ymm9 ,%ymm9 267 vaddsubps %ymm10, %ymm11,%ymm11 268 vaddsubps %ymm12, %ymm13,%ymm13 269 vaddsubps %ymm14, %ymm15,%ymm15 270 vaddsubps %ymm4, %ymm5 ,%ymm5 271 vaddsubps %ymm6, %ymm7 ,%ymm7 272 273 vmovaps %ymm9, %ymm8 274 vmovaps %ymm11, %ymm10 275 vmovaps %ymm13, %ymm12 276 vmovaps %ymm15, %ymm14 277 vmovaps %ymm5, %ymm4 278 vmovaps %ymm7, %ymm6 279 280 // swap high and low 64 bytes 281 vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 282 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 283 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 284 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 285 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 286 vshufps $ 0xb1, %ymm7 , %ymm7 , %ymm7 287 288#endif 289 290 // multiply with ALPHA_R 291 vmulps %ymm8 , %ymm0, %ymm8 292 vmulps %ymm10, %ymm0, %ymm10 293 vmulps %ymm12, %ymm0, %ymm12 294 vmulps %ymm14, %ymm0, %ymm14 295 vmulps %ymm4 , %ymm0, %ymm4 296 vmulps %ymm6 , %ymm0, %ymm6 297 298 // multiply with ALPHA_I 299 vmulps %ymm9 , %ymm1, %ymm9 300 vmulps %ymm11, %ymm1, %ymm11 301 vmulps %ymm13, %ymm1, %ymm13 302 vmulps %ymm15, %ymm1, %ymm15 303 vmulps %ymm5 , %ymm1, %ymm5 304 vmulps %ymm7 , %ymm1, %ymm7 305 306 vaddsubps %ymm9, %ymm8 , %ymm8 307 vaddsubps %ymm11,%ymm10, %ymm10 308 vaddsubps %ymm13,%ymm12, %ymm12 309 vaddsubps %ymm15,%ymm14, %ymm14 310 vaddsubps %ymm5, %ymm4 , %ymm4 311 vaddsubps %ymm7, %ymm6 , %ymm6 312 313#if !defined(TRMMKERNEL) 314 315 vaddps (CO1), %ymm8 , %ymm8 316 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 317 318 vaddps (CO1, LDC), %ymm10, %ymm10 319 vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 320 321 vaddps (CO1, LDC,2), %ymm4, %ymm4 322 vaddps 8 * SIZE(CO1, LDC,2), %ymm6, %ymm6 323 324#endif 325 326 vmovups %ymm8 , (CO1) 327 vmovups %ymm12 , 8 * SIZE(CO1) 328 329 vmovups %ymm10 , (CO1, LDC) 330 vmovups %ymm14 , 8 * SIZE(CO1, LDC) 331 332 vmovups %ymm4 , (CO1, LDC,2) 333 vmovups %ymm6 , 8 * SIZE(CO1, LDC,2) 334 335.endm 336 337 338/***************************************************************************************************************************/ 339 340.macro KERNEL4x3_SUB 341 342 vmovups -16 * SIZE(AO), %ymm0 343 vbroadcastss -8 * SIZE(BO), %ymm2 344 vbroadcastss -7 * SIZE(BO), %ymm3 345 346 VFMADDPS_R( %ymm8 ,%ymm2,%ymm0 ) 347 VFMADDPS_I( %ymm9 ,%ymm3,%ymm0 ) 348 349 vbroadcastss -6 * SIZE(BO), %ymm2 350 vbroadcastss -5 * SIZE(BO), %ymm3 351 VFMADDPS_R( %ymm12,%ymm2,%ymm0 ) 352 VFMADDPS_I( %ymm13,%ymm3,%ymm0 ) 353 354 vbroadcastss -4 * SIZE(BO), %ymm2 355 vbroadcastss -3 * SIZE(BO), %ymm3 356 VFMADDPS_R( %ymm4 ,%ymm2,%ymm0 ) 357 VFMADDPS_I( %ymm5 ,%ymm3,%ymm0 ) 358 359 addq $ 6*SIZE, BO 360 addq $ 8*SIZE, AO 361 decq %rax 362.endm 363 364.macro SAVE4x3 365 366 vbroadcastss ALPHA_R, %ymm0 367 vbroadcastss ALPHA_I, %ymm1 368 369 // swap high and low 64 bytes 370 vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 371 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 372 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 373 374#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 375 defined(NR) || defined(NC) || defined(TR) || defined(TC) 376 377 vaddsubps %ymm9, %ymm8 , %ymm8 378 vaddsubps %ymm13,%ymm12, %ymm12 379 vaddsubps %ymm5, %ymm4 , %ymm4 380 381 vshufps $ 0xb1, %ymm8 , %ymm8 , %ymm9 382 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 383 vshufps $ 0xb1, %ymm4 , %ymm4 , %ymm5 384 385#else 386 vaddsubps %ymm8, %ymm9 ,%ymm9 387 vaddsubps %ymm12, %ymm13,%ymm13 388 vaddsubps %ymm4, %ymm5 ,%ymm5 389 390 vmovaps %ymm9, %ymm8 391 vmovaps %ymm13, %ymm12 392 vmovaps %ymm5, %ymm4 393 394 // swap high and low 64 bytes 395 vshufps $ 0xb1, %ymm9 , %ymm9 , %ymm9 396 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 397 vshufps $ 0xb1, %ymm5 , %ymm5 , %ymm5 398 399#endif 400 401 // multiply with ALPHA_R 402 vmulps %ymm8 , %ymm0, %ymm8 403 vmulps %ymm12, %ymm0, %ymm12 404 vmulps %ymm4 , %ymm0, %ymm4 405 406 // multiply with ALPHA_I 407 vmulps %ymm9 , %ymm1, %ymm9 408 vmulps %ymm13, %ymm1, %ymm13 409 vmulps %ymm5 , %ymm1, %ymm5 410 411 vaddsubps %ymm9, %ymm8 , %ymm8 412 vaddsubps %ymm13,%ymm12, %ymm12 413 vaddsubps %ymm5, %ymm4 , %ymm4 414 415#if !defined(TRMMKERNEL) 416 417 vaddps (CO1), %ymm8 , %ymm8 418 vaddps (CO1, LDC), %ymm12, %ymm12 419 vaddps (CO1, LDC,2), %ymm4, %ymm4 420 421#endif 422 423 vmovups %ymm8 , (CO1) 424 vmovups %ymm12 , (CO1, LDC) 425 vmovups %ymm4 , (CO1, LDC,2) 426 427.endm 428 429/***************************************************************************************************************************/ 430 431.macro KERNEL2x3_SUB 432 433 vmovups -16 * SIZE(AO), %xmm0 434 vbroadcastss -8 * SIZE(BO), %xmm2 435 vbroadcastss -7 * SIZE(BO), %xmm3 436 437 VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) 438 VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) 439 440 vbroadcastss -6 * SIZE(BO), %xmm2 441 vbroadcastss -5 * SIZE(BO), %xmm3 442 VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) 443 VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) 444 445 vbroadcastss -4 * SIZE(BO), %xmm2 446 vbroadcastss -3 * SIZE(BO), %xmm3 447 VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) 448 VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) 449 450 addq $ 6*SIZE, BO 451 addq $ 4*SIZE, AO 452 decq %rax 453 454.endm 455 456.macro SAVE2x3 457 458 vbroadcastss ALPHA_R, %xmm0 459 vbroadcastss ALPHA_I, %xmm1 460 461 // swap high and low 64 bytes 462 vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 463 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 464 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 465 466#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 467 defined(NR) || defined(NC) || defined(TR) || defined(TC) 468 469 vaddsubps %xmm9, %xmm8 , %xmm8 470 vaddsubps %xmm13,%xmm12, %xmm12 471 vaddsubps %xmm5, %xmm4 , %xmm4 472 473 vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 474 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 475 vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 476 477#else 478 vaddsubps %xmm8, %xmm9 ,%xmm9 479 vaddsubps %xmm12, %xmm13,%xmm13 480 vaddsubps %xmm4, %xmm5 ,%xmm5 481 482 vmovaps %xmm9, %xmm8 483 vmovaps %xmm13, %xmm12 484 vmovaps %xmm5, %xmm4 485 486 // swap high and low 64 bytes 487 vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 488 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 489 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 490 491#endif 492 493 // multiply with ALPHA_R 494 vmulps %xmm8 , %xmm0, %xmm8 495 vmulps %xmm12, %xmm0, %xmm12 496 vmulps %xmm4 , %xmm0, %xmm4 497 498 // multiply with ALPHA_I 499 vmulps %xmm9 , %xmm1, %xmm9 500 vmulps %xmm13, %xmm1, %xmm13 501 vmulps %xmm5 , %xmm1, %xmm5 502 503 vaddsubps %xmm9, %xmm8 , %xmm8 504 vaddsubps %xmm13,%xmm12, %xmm12 505 vaddsubps %xmm5, %xmm4 , %xmm4 506 507#if !defined(TRMMKERNEL) 508 509 vaddps (CO1), %xmm8 , %xmm8 510 vaddps (CO1, LDC), %xmm12, %xmm12 511 vaddps (CO1, LDC,2), %xmm4, %xmm4 512 513#endif 514 515 vmovups %xmm8 , (CO1) 516 vmovups %xmm12 , (CO1, LDC) 517 vmovups %xmm4 , (CO1, LDC,2) 518 519.endm 520 521 522/***************************************************************************************************************************/ 523 524.macro KERNEL1x3_SUB 525 526 vmovsd -16 * SIZE(AO), %xmm0 527 vbroadcastss -8 * SIZE(BO), %xmm2 528 vbroadcastss -7 * SIZE(BO), %xmm3 529 530 VFMADDPS_R( %xmm8 ,%xmm2,%xmm0 ) 531 VFMADDPS_I( %xmm9 ,%xmm3,%xmm0 ) 532 533 vbroadcastss -6 * SIZE(BO), %xmm2 534 vbroadcastss -5 * SIZE(BO), %xmm3 535 VFMADDPS_R( %xmm12,%xmm2,%xmm0 ) 536 VFMADDPS_I( %xmm13,%xmm3,%xmm0 ) 537 538 vbroadcastss -4 * SIZE(BO), %xmm2 539 vbroadcastss -3 * SIZE(BO), %xmm3 540 VFMADDPS_R( %xmm4 ,%xmm2,%xmm0 ) 541 VFMADDPS_I( %xmm5 ,%xmm3,%xmm0 ) 542 543 addq $ 6*SIZE, BO 544 addq $ 2*SIZE, AO 545 decq %rax 546 547.endm 548 549.macro SAVE1x3 550 551 vbroadcastss ALPHA_R, %xmm0 552 vbroadcastss ALPHA_I, %xmm1 553 554 // swap high and low 64 bytes 555 vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 556 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 557 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 558 559#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 560 defined(NR) || defined(NC) || defined(TR) || defined(TC) 561 562 vaddsubps %xmm9, %xmm8 , %xmm8 563 vaddsubps %xmm13,%xmm12, %xmm12 564 vaddsubps %xmm5, %xmm4 , %xmm4 565 566 vshufps $ 0xb1, %xmm8 , %xmm8 , %xmm9 567 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 568 vshufps $ 0xb1, %xmm4 , %xmm4 , %xmm5 569 570#else 571 vaddsubps %xmm8, %xmm9 ,%xmm9 572 vaddsubps %xmm12, %xmm13,%xmm13 573 vaddsubps %xmm4, %xmm5 ,%xmm5 574 575 vmovaps %xmm9, %xmm8 576 vmovaps %xmm13, %xmm12 577 vmovaps %xmm5, %xmm4 578 579 // swap high and low 64 bytes 580 vshufps $ 0xb1, %xmm9 , %xmm9 , %xmm9 581 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 582 vshufps $ 0xb1, %xmm5 , %xmm5 , %xmm5 583 584#endif 585 586 // multiply with ALPHA_R 587 vmulps %xmm8 , %xmm0, %xmm8 588 vmulps %xmm12, %xmm0, %xmm12 589 vmulps %xmm4 , %xmm0, %xmm4 590 591 // multiply with ALPHA_I 592 vmulps %xmm9 , %xmm1, %xmm9 593 vmulps %xmm13, %xmm1, %xmm13 594 vmulps %xmm5 , %xmm1, %xmm5 595 596 vaddsubps %xmm9, %xmm8 , %xmm8 597 vaddsubps %xmm13,%xmm12, %xmm12 598 vaddsubps %xmm5, %xmm4 , %xmm4 599 600#if !defined(TRMMKERNEL) 601 602 vmovsd (CO1) , %xmm9 603 vmovsd (CO1,LDC) , %xmm13 604 vmovsd (CO1,LDC,2), %xmm5 605 vaddps %xmm9 , %xmm8 , %xmm8 606 vaddps %xmm13, %xmm12, %xmm12 607 vaddps %xmm5 , %xmm4, %xmm4 608 609#endif 610 611 vmovsd %xmm8 , (CO1) 612 vmovsd %xmm12 , (CO1, LDC) 613 vmovsd %xmm4 , (CO1, LDC,2) 614 615.endm 616 617 618/***************************************************************************************************************************/ 619 620.macro KERNEL8x2_SUB 621 622 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 623 vbroadcastss -8 * SIZE(BO, BI, SIZE), %ymm4 624 VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) 625 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 626 VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) 627 vbroadcastss -7 * SIZE(BO, BI, SIZE), %ymm5 628 VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) 629 VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) 630 vbroadcastss -6 * SIZE(BO, BI, SIZE), %ymm6 631 VFMADDPS_R( %ymm10,%ymm6,%ymm0 ) 632 VFMADDPS_R( %ymm14,%ymm6,%ymm1 ) 633 vbroadcastss -5 * SIZE(BO, BI, SIZE), %ymm7 634 VFMADDPS_I( %ymm11,%ymm7,%ymm0 ) 635 VFMADDPS_I( %ymm15,%ymm7,%ymm1 ) 636 addq $ 4 , BI 637 addq $ 16, %rax 638.endm 639 640.macro SAVE8x2 641 642 vbroadcastss ALPHA_R, %ymm0 643 vbroadcastss ALPHA_I, %ymm1 644 645 // swap high and low 64 bytes 646 vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 647 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 648 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 649 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 650 651#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 652 defined(NR) || defined(NC) || defined(TR) || defined(TC) 653 654 vaddsubps %ymm9, %ymm8 , %ymm8 655 vaddsubps %ymm11,%ymm10, %ymm10 656 vaddsubps %ymm13,%ymm12, %ymm12 657 vaddsubps %ymm15,%ymm14, %ymm14 658 659 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 660 vshufps $ 0xb1, %ymm10, %ymm10, %ymm11 661 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 662 vshufps $ 0xb1, %ymm14, %ymm14, %ymm15 663 664#else 665 vaddsubps %ymm8, %ymm9 ,%ymm9 666 vaddsubps %ymm10, %ymm11,%ymm11 667 vaddsubps %ymm12, %ymm13,%ymm13 668 vaddsubps %ymm14, %ymm15,%ymm15 669 670 vmovaps %ymm9, %ymm8 671 vmovaps %ymm11, %ymm10 672 vmovaps %ymm13, %ymm12 673 vmovaps %ymm15, %ymm14 674 675 // swap high and low 64 bytes 676 vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 677 vshufps $ 0xb1, %ymm11, %ymm11, %ymm11 678 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 679 vshufps $ 0xb1, %ymm15, %ymm15, %ymm15 680 681#endif 682 683 // multiply with ALPHA_R 684 vmulps %ymm8 , %ymm0, %ymm8 685 vmulps %ymm10, %ymm0, %ymm10 686 vmulps %ymm12, %ymm0, %ymm12 687 vmulps %ymm14, %ymm0, %ymm14 688 689 // multiply with ALPHA_I 690 vmulps %ymm9 , %ymm1, %ymm9 691 vmulps %ymm11, %ymm1, %ymm11 692 vmulps %ymm13, %ymm1, %ymm13 693 vmulps %ymm15, %ymm1, %ymm15 694 695 vaddsubps %ymm9, %ymm8 , %ymm8 696 vaddsubps %ymm11,%ymm10, %ymm10 697 vaddsubps %ymm13,%ymm12, %ymm12 698 vaddsubps %ymm15,%ymm14, %ymm14 699 700 701 702#if !defined(TRMMKERNEL) 703 704 vaddps (CO1), %ymm8 , %ymm8 705 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 706 707 vaddps (CO1, LDC), %ymm10, %ymm10 708 vaddps 8 * SIZE(CO1, LDC), %ymm14, %ymm14 709 710#endif 711 712 vmovups %ymm8 , (CO1) 713 vmovups %ymm12 , 8 * SIZE(CO1) 714 715 vmovups %ymm10 , (CO1, LDC) 716 vmovups %ymm14 , 8 * SIZE(CO1, LDC) 717 718 prefetcht0 64(CO1) 719 prefetcht0 64(CO1, LDC) 720 721.endm 722 723/***************************************************************************************************************************/ 724 725.macro KERNEL4x2_SUB 726 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 727 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 728 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 729 vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 730 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) 731 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 732 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 733 VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) 734 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 735 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) 736 VFMADDPS_R( %xmm14,%xmm6,%xmm1 ) 737 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 738 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) 739 VFMADDPS_I( %xmm15,%xmm7,%xmm1 ) 740 addq $ 4, BI 741 addq $ 8, %rax 742.endm 743 744.macro SAVE4x2 745 746 vbroadcastss ALPHA_R, %xmm0 747 vbroadcastss ALPHA_I, %xmm1 748 749 // swap high and low 64 bytes 750 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 751 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 752 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 753 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 754 755#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 756 defined(NR) || defined(NC) || defined(TR) || defined(TC) 757 758 vaddsubps %xmm9, %xmm8 , %xmm8 759 vaddsubps %xmm11,%xmm10, %xmm10 760 vaddsubps %xmm13,%xmm12, %xmm12 761 vaddsubps %xmm15,%xmm14, %xmm14 762 763 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 764 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 765 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 766 vshufps $ 0xb1, %xmm14, %xmm14, %xmm15 767 768#else 769 vaddsubps %xmm8, %xmm9 ,%xmm9 770 vaddsubps %xmm10, %xmm11,%xmm11 771 vaddsubps %xmm12, %xmm13,%xmm13 772 vaddsubps %xmm14, %xmm15,%xmm15 773 774 vmovaps %xmm9, %xmm8 775 vmovaps %xmm11, %xmm10 776 vmovaps %xmm13, %xmm12 777 vmovaps %xmm15, %xmm14 778 779 // swap high and low 64 bytes 780 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 781 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 782 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 783 vshufps $ 0xb1, %xmm15, %xmm15, %xmm15 784 785#endif 786 787 // multiply with ALPHA_R 788 vmulps %xmm8 , %xmm0, %xmm8 789 vmulps %xmm10, %xmm0, %xmm10 790 vmulps %xmm12, %xmm0, %xmm12 791 vmulps %xmm14, %xmm0, %xmm14 792 793 // multiply with ALPHA_I 794 vmulps %xmm9 , %xmm1, %xmm9 795 vmulps %xmm11, %xmm1, %xmm11 796 vmulps %xmm13, %xmm1, %xmm13 797 vmulps %xmm15, %xmm1, %xmm15 798 799 vaddsubps %xmm9, %xmm8 , %xmm8 800 vaddsubps %xmm11,%xmm10, %xmm10 801 vaddsubps %xmm13,%xmm12, %xmm12 802 vaddsubps %xmm15,%xmm14, %xmm14 803 804#if !defined(TRMMKERNEL) 805 806 vaddps (CO1), %xmm8 , %xmm8 807 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 808 809 vaddps (CO1, LDC), %xmm10, %xmm10 810 vaddps 4 * SIZE(CO1, LDC), %xmm14, %xmm14 811 812#endif 813 814 vmovups %xmm8 , (CO1) 815 vmovups %xmm12 , 4 * SIZE(CO1) 816 817 vmovups %xmm10 , (CO1, LDC) 818 vmovups %xmm14 , 4 * SIZE(CO1, LDC) 819 820.endm 821 822/************************************************************************************************/ 823 824.macro KERNEL2x2_SUB 825 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 826 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 827 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 828 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 829 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 830 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 831 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) 832 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 833 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) 834 addq $ 4, BI 835 addq $ 4, %rax 836.endm 837 838.macro SAVE2x2 839 840 vbroadcastss ALPHA_R, %xmm0 841 vbroadcastss ALPHA_I, %xmm1 842 843 // swap high and low 4 bytes 844 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 845 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 846 847#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 848 defined(NR) || defined(NC) || defined(TR) || defined(TC) 849 850 vaddsubps %xmm9, %xmm8 , %xmm8 851 vaddsubps %xmm11,%xmm10, %xmm10 852 853 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 854 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 855 856#else 857 vaddsubps %xmm8, %xmm9 ,%xmm9 858 vaddsubps %xmm10, %xmm11,%xmm11 859 860 vmovaps %xmm9, %xmm8 861 vmovaps %xmm11, %xmm10 862 863 // swap high and low 4 bytes 864 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 865 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 866 867#endif 868 869 // multiply with ALPHA_R 870 vmulps %xmm8 , %xmm0, %xmm8 871 vmulps %xmm10, %xmm0, %xmm10 872 873 // multiply with ALPHA_I 874 vmulps %xmm9 , %xmm1, %xmm9 875 vmulps %xmm11, %xmm1, %xmm11 876 877 vaddsubps %xmm9, %xmm8 , %xmm8 878 vaddsubps %xmm11,%xmm10, %xmm10 879 880#if !defined(TRMMKERNEL) 881 882 vaddps (CO1), %xmm8 , %xmm8 883 884 vaddps (CO1, LDC), %xmm10, %xmm10 885 886#endif 887 888 vmovups %xmm8 , (CO1) 889 890 vmovups %xmm10 , (CO1, LDC) 891 892.endm 893 894/************************************************************************************************/ 895 896.macro KERNEL1x2_SUB 897 vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 898 vbroadcastss -8 * SIZE(BO, BI, SIZE), %xmm4 899 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 900 vbroadcastss -7 * SIZE(BO, BI, SIZE), %xmm5 901 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 902 vbroadcastss -6 * SIZE(BO, BI, SIZE), %xmm6 903 VFMADDPS_R( %xmm10,%xmm6,%xmm0 ) 904 vbroadcastss -5 * SIZE(BO, BI, SIZE), %xmm7 905 VFMADDPS_I( %xmm11,%xmm7,%xmm0 ) 906 addq $ 4, BI 907 addq $ 2, %rax 908.endm 909 910.macro SAVE1x2 911 912 vbroadcastss ALPHA_R, %xmm0 913 vbroadcastss ALPHA_I, %xmm1 914 915 // swap high and low 64 bytes 916 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 917 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 918 919#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 920 defined(NR) || defined(NC) || defined(TR) || defined(TC) 921 922 vaddsubps %xmm9, %xmm8 , %xmm8 923 vaddsubps %xmm11,%xmm10, %xmm10 924 925 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 926 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 927 928#else 929 vaddsubps %xmm8, %xmm9 ,%xmm9 930 vaddsubps %xmm10, %xmm11,%xmm11 931 932 vmovaps %xmm9, %xmm8 933 vmovaps %xmm11, %xmm10 934 935 // swap high and low 64 bytes 936 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 937 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 938 939#endif 940 941 // multiply with ALPHA_R 942 vmulps %xmm8 , %xmm0, %xmm8 943 vmulps %xmm10, %xmm0, %xmm10 944 945 // multiply with ALPHA_I 946 vmulps %xmm9 , %xmm1, %xmm9 947 vmulps %xmm11, %xmm1, %xmm11 948 949 vaddsubps %xmm9, %xmm8 , %xmm8 950 vaddsubps %xmm11,%xmm10, %xmm10 951 952#if !defined(TRMMKERNEL) 953 954 vmovsd (CO1), %xmm14 955 vaddps %xmm14, %xmm8 , %xmm8 956 957 vmovsd (CO1, LDC), %xmm15 958 vaddps %xmm15, %xmm10, %xmm10 959 960#endif 961 962 vmovsd %xmm8 , (CO1) 963 vmovsd %xmm10 , (CO1, LDC) 964 965.endm 966 967/************************************************************************************************/ 968 969.macro KERNEL8x1_SUB 970 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 971 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm1 972 vbroadcastss -4 * SIZE(BO, BI, SIZE), %ymm4 973 VFMADDPS_R( %ymm8,%ymm4,%ymm0 ) 974 VFMADDPS_R( %ymm12,%ymm4,%ymm1 ) 975 vbroadcastss -3 * SIZE(BO, BI, SIZE), %ymm5 976 VFMADDPS_I( %ymm9,%ymm5,%ymm0 ) 977 VFMADDPS_I( %ymm13,%ymm5,%ymm1 ) 978 addq $ 2 , BI 979 addq $ 16, %rax 980.endm 981 982.macro SAVE8x1 983 984 vbroadcastss ALPHA_R, %ymm0 985 vbroadcastss ALPHA_I, %ymm1 986 987 // swap high and low 64 bytes 988 vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 989 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 990 991#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 992 defined(NR) || defined(NC) || defined(TR) || defined(TC) 993 994 vaddsubps %ymm9, %ymm8 , %ymm8 995 vaddsubps %ymm13,%ymm12, %ymm12 996 997 vshufps $ 0xb1, %ymm8 , %ymm8, %ymm9 998 vshufps $ 0xb1, %ymm12, %ymm12, %ymm13 999 1000#else 1001 vaddsubps %ymm8, %ymm9 ,%ymm9 1002 vaddsubps %ymm12, %ymm13,%ymm13 1003 1004 vmovaps %ymm9, %ymm8 1005 vmovaps %ymm13, %ymm12 1006 1007 // swap high and low 64 bytes 1008 vshufps $ 0xb1, %ymm9 , %ymm9, %ymm9 1009 vshufps $ 0xb1, %ymm13, %ymm13, %ymm13 1010 1011#endif 1012 1013 // multiply with ALPHA_R 1014 vmulps %ymm8 , %ymm0, %ymm8 1015 vmulps %ymm12, %ymm0, %ymm12 1016 1017 // multiply with ALPHA_I 1018 vmulps %ymm9 , %ymm1, %ymm9 1019 vmulps %ymm13, %ymm1, %ymm13 1020 1021 vaddsubps %ymm9, %ymm8 , %ymm8 1022 vaddsubps %ymm13,%ymm12, %ymm12 1023 1024 1025 1026#if !defined(TRMMKERNEL) 1027 1028 vaddps (CO1), %ymm8 , %ymm8 1029 vaddps 8 * SIZE(CO1), %ymm12, %ymm12 1030 1031#endif 1032 1033 vmovups %ymm8 , (CO1) 1034 vmovups %ymm12 , 8 * SIZE(CO1) 1035 1036.endm 1037 1038 1039/************************************************************************************************/ 1040 1041.macro KERNEL4x1_SUB 1042 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 1043 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 1044 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 1045 vmovups -12 * SIZE(AO, %rax, SIZE), %xmm1 1046 VFMADDPS_R( %xmm12,%xmm4,%xmm1 ) 1047 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 1048 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 1049 VFMADDPS_I( %xmm13,%xmm5,%xmm1 ) 1050 addq $ 2, BI 1051 addq $ 8, %rax 1052.endm 1053 1054.macro SAVE4x1 1055 1056 vbroadcastss ALPHA_R, %xmm0 1057 vbroadcastss ALPHA_I, %xmm1 1058 1059 // swap high and low 4 bytes 1060 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1061 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 1062 1063#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1064 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1065 1066 vaddsubps %xmm9, %xmm8 , %xmm8 1067 vaddsubps %xmm13,%xmm12, %xmm12 1068 1069 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 1070 vshufps $ 0xb1, %xmm12, %xmm12, %xmm13 1071 1072#else 1073 vaddsubps %xmm8, %xmm9 ,%xmm9 1074 vaddsubps %xmm12, %xmm13,%xmm13 1075 1076 vmovaps %xmm9, %xmm8 1077 vmovaps %xmm13, %xmm12 1078 1079 // swap high and low 4 bytes 1080 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1081 vshufps $ 0xb1, %xmm13, %xmm13, %xmm13 1082 1083#endif 1084 1085 // multiply with ALPHA_R 1086 vmulps %xmm8 , %xmm0, %xmm8 1087 vmulps %xmm12, %xmm0, %xmm12 1088 1089 // multiply with ALPHA_I 1090 vmulps %xmm9 , %xmm1, %xmm9 1091 vmulps %xmm13, %xmm1, %xmm13 1092 1093 vaddsubps %xmm9, %xmm8 , %xmm8 1094 vaddsubps %xmm13,%xmm12, %xmm12 1095 1096#ifndef TRMMKERNEL 1097 1098 vaddps (CO1), %xmm8 , %xmm8 1099 vaddps 4 * SIZE(CO1), %xmm12, %xmm12 1100 1101#endif 1102 1103 vmovups %xmm8 , (CO1) 1104 vmovups %xmm12 , 4 * SIZE(CO1) 1105 1106.endm 1107 1108/************************************************************************************************/ 1109 1110.macro KERNEL2x1_SUB 1111 vmovups -16 * SIZE(AO, %rax, SIZE), %xmm0 1112 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 1113 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 1114 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 1115 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 1116 addq $ 2, BI 1117 addq $ 4, %rax 1118.endm 1119 1120.macro SAVE2x1 1121 1122 vbroadcastss ALPHA_R, %xmm0 1123 vbroadcastss ALPHA_I, %xmm1 1124 1125 // swap high and low 64 bytes 1126 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1127 1128#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1129 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1130 1131 vaddsubps %xmm9, %xmm8 , %xmm8 1132 1133 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 1134 1135#else 1136 vaddsubps %xmm8, %xmm9 ,%xmm9 1137 1138 vmovaps %xmm9, %xmm8 1139 1140 // swap high and low 64 bytes 1141 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1142 1143#endif 1144 1145 // multiply with ALPHA_R 1146 vmulps %xmm8 , %xmm0, %xmm8 1147 1148 // multiply with ALPHA_I 1149 vmulps %xmm9 , %xmm1, %xmm9 1150 1151 vaddsubps %xmm9, %xmm8 , %xmm8 1152 1153#if !defined(TRMMKERNEL) 1154 1155 vaddps (CO1), %xmm8 , %xmm8 1156 1157#endif 1158 1159 vmovups %xmm8 , (CO1) 1160 1161.endm 1162 1163/************************************************************************************************/ 1164 1165.macro KERNEL1x1_SUB 1166 vmovsd -16 * SIZE(AO, %rax, SIZE), %xmm0 1167 vbroadcastss -4 * SIZE(BO, BI, SIZE), %xmm4 1168 VFMADDPS_R( %xmm8,%xmm4,%xmm0 ) 1169 vbroadcastss -3 * SIZE(BO, BI, SIZE), %xmm5 1170 VFMADDPS_I( %xmm9,%xmm5,%xmm0 ) 1171 addq $ 2, BI 1172 addq $ 2, %rax 1173.endm 1174 1175.macro SAVE1x1 1176 1177 vbroadcastss ALPHA_R, %xmm0 1178 vbroadcastss ALPHA_I, %xmm1 1179 1180 // swap high and low 64 bytes 1181 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1182 1183#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 1184 defined(NR) || defined(NC) || defined(TR) || defined(TC) 1185 1186 vaddsubps %xmm9, %xmm8 , %xmm8 1187 1188 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 1189 1190#else 1191 vaddsubps %xmm8, %xmm9 ,%xmm9 1192 1193 vmovaps %xmm9, %xmm8 1194 1195 // swap high and low 64 bytes 1196 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 1197 1198#endif 1199 1200 // multiply with ALPHA_R 1201 vmulps %xmm8 , %xmm0, %xmm8 1202 1203 // multiply with ALPHA_I 1204 vmulps %xmm9 , %xmm1, %xmm9 1205 1206 vaddsubps %xmm9, %xmm8 , %xmm8 1207 1208#if !defined(TRMMKERNEL) 1209 1210 vmovsd (CO1), %xmm14 1211 vaddps %xmm14, %xmm8 , %xmm8 1212 1213#endif 1214 1215 vmovsd %xmm8 , (CO1) 1216 1217.endm 1218 1219 1220#if !defined(TRMMKERNEL) 1221 1222 PROLOGUE 1223 PROFCODE 1224 1225 subq $STACKSIZE, %rsp 1226 movq %rbx, (%rsp) 1227 movq %rbp, 8(%rsp) 1228 movq %r12, 16(%rsp) 1229 movq %r13, 24(%rsp) 1230 movq %r14, 32(%rsp) 1231 movq %r15, 40(%rsp) 1232 1233 vzeroupper 1234 1235#ifdef WINDOWS_ABI 1236 movq %rdi, 48(%rsp) 1237 movq %rsi, 56(%rsp) 1238 vmovups %xmm6, 64(%rsp) 1239 vmovups %xmm7, 80(%rsp) 1240 vmovups %xmm8, 96(%rsp) 1241 vmovups %xmm9, 112(%rsp) 1242 vmovups %xmm10, 128(%rsp) 1243 vmovups %xmm11, 144(%rsp) 1244 vmovups %xmm12, 160(%rsp) 1245 vmovups %xmm13, 176(%rsp) 1246 vmovups %xmm14, 192(%rsp) 1247 vmovups %xmm15, 208(%rsp) 1248 1249 movq ARG1, OLD_M 1250 movq ARG2, OLD_N 1251 movq ARG3, OLD_K 1252 movq OLD_A, A 1253 movq OLD_B, B 1254 movq OLD_C, C 1255 movq OLD_LDC, LDC 1256 vmovaps %xmm3, %xmm0 1257 vmovsd OLD_ALPHA_I, %xmm1 1258 1259#else 1260 movq STACKSIZE + 8(%rsp), LDC 1261 1262#endif 1263 1264 movq %rsp, SP # save old stack 1265 subq $ 128 + L_BUFFER_SIZE, %rsp 1266 andq $ -4096, %rsp # align stack 1267 1268 STACK_TOUCH 1269 1270 cmpq $ 0, OLD_M 1271 je .L999 1272 1273 cmpq $ 0, OLD_N 1274 je .L999 1275 1276 cmpq $ 0, OLD_K 1277 je .L999 1278 1279 movq OLD_M, M 1280 movq OLD_N, N 1281 movq OLD_K, K 1282 1283 vmovss %xmm0, ALPHA_R 1284 vmovss %xmm1, ALPHA_I 1285 1286 salq $ ZBASE_SHIFT, LDC 1287 1288 movq N, %rax 1289 xorq %rdx, %rdx 1290 movq $ 6, %rdi 1291 divq %rdi // N / 6 1292 movq %rax, Ndiv6 // N / 6 1293 movq %rdx, Nmod6 // N % 6 1294 1295/************************************************************************************************/ 1296 1297.L6_0: 1298 1299 movq Ndiv6, J 1300 cmpq $ 0, J 1301 je .L2_00 1302 ALIGN_4 1303 1304 1305 1306.L6_01: 1307 // copy to sub buffer 1308 movq B, BO1 1309 leaq BUFFER1, BO // first buffer to BO 1310 movq K, %rax 1311 salq $2, %rax // 2 * COMPSIZE 1312 leaq (B, %rax,4), BO2 1313 movq BO2, B // next offset of B 1314 movq K, %rax 1315 1316 ALIGN_4 1317 1318.L6_02b: 1319 1320 vmovups (BO1), %xmm0 1321 vmovsd (BO2), %xmm1 1322 vmovups %xmm0, (BO) 1323 vmovsd %xmm1, 4*SIZE(BO) 1324 addq $ 4*SIZE,BO1 1325 addq $ 4*SIZE,BO2 1326 addq $ 6*SIZE,BO 1327 decq %rax 1328 jnz .L6_02b 1329 1330 1331.L6_10: 1332 movq C, CO1 1333 leaq (C, LDC, 2), C // c += 2 * ldc 1334 leaq (C, LDC, 1), C // c += 1 * ldc 1335 1336 movq A, AO // aoffset = a 1337 addq $ 16 * SIZE, AO 1338 1339 movq M, I 1340 sarq $ 3, I // i = (m >> 3) 1341 je .L6_4_10 1342 1343 ALIGN_4 1344/**********************************************************************************************************/ 1345 1346.L6_8_11: 1347 1348 leaq BUFFER1, BO // first buffer to BO 1349 addq $ 8 * SIZE, BO 1350 1351 vzeroall 1352 1353 movq K, %rax 1354 1355 andq $ -8, %rax // K = K - ( K % 8 ) 1356 je .L6_8_16 1357 1358 ALIGN_4 1359 1360.L6_8_12: 1361 1362 KERNEL8x3_SUB 1363 KERNEL8x3_SUB 1364 KERNEL8x3_SUB 1365 KERNEL8x3_SUB 1366 1367 KERNEL8x3_SUB 1368 KERNEL8x3_SUB 1369 KERNEL8x3_SUB 1370 KERNEL8x3_SUB 1371 1372 je .L6_8_16 1373 1374 KERNEL8x3_SUB 1375 KERNEL8x3_SUB 1376 KERNEL8x3_SUB 1377 KERNEL8x3_SUB 1378 1379 KERNEL8x3_SUB 1380 KERNEL8x3_SUB 1381 KERNEL8x3_SUB 1382 KERNEL8x3_SUB 1383 1384 je .L6_8_16 1385 1386 jmp .L6_8_12 1387 ALIGN_4 1388 1389.L6_8_16: 1390 movq K, %rax 1391 1392 andq $ 7, %rax # if (k & 1) 1393 je .L6_8_19 1394 1395 ALIGN_4 1396 1397.L6_8_17: 1398 1399 KERNEL8x3_SUB 1400 1401 jnz .L6_8_17 1402 ALIGN_4 1403 1404 1405.L6_8_19: 1406 1407 SAVE8x3 1408 1409 addq $ 16 * SIZE, CO1 # coffset += 16 1410 decq I # i -- 1411 jg .L6_8_11 1412 ALIGN_4 1413 1414 1415/**********************************************************************************************************/ 1416 1417 1418.L6_4_10: 1419 testq $ 7, M 1420 jz .L6_4_60 // to next 2 lines of N 1421 1422 testq $ 4, M 1423 jz .L6_4_20 1424 ALIGN_4 1425 1426 1427.L6_4_11: 1428 1429 leaq BUFFER1, BO // first buffer to BO 1430 addq $ 8 * SIZE, BO 1431 1432 vzeroall 1433 1434 movq K, %rax 1435 1436 andq $ -8, %rax // K = K - ( K % 8 ) 1437 je .L6_4_16 1438 1439 ALIGN_4 1440 1441.L6_4_12: 1442 1443 prefetcht0 A_PR1(AO) 1444 KERNEL4x3_SUB 1445 KERNEL4x3_SUB 1446 prefetcht0 A_PR1(AO) 1447 KERNEL4x3_SUB 1448 KERNEL4x3_SUB 1449 1450 prefetcht0 A_PR1(AO) 1451 KERNEL4x3_SUB 1452 KERNEL4x3_SUB 1453 prefetcht0 A_PR1(AO) 1454 KERNEL4x3_SUB 1455 KERNEL4x3_SUB 1456 1457 je .L6_4_16 1458 1459 prefetcht0 A_PR1(AO) 1460 KERNEL4x3_SUB 1461 KERNEL4x3_SUB 1462 prefetcht0 A_PR1(AO) 1463 KERNEL4x3_SUB 1464 KERNEL4x3_SUB 1465 1466 prefetcht0 A_PR1(AO) 1467 KERNEL4x3_SUB 1468 KERNEL4x3_SUB 1469 prefetcht0 A_PR1(AO) 1470 KERNEL4x3_SUB 1471 KERNEL4x3_SUB 1472 1473 je .L6_4_16 1474 1475 jmp .L6_4_12 1476 ALIGN_4 1477 1478.L6_4_16: 1479 movq K, %rax 1480 1481 andq $ 7, %rax # if (k & 1) 1482 je .L6_4_19 1483 1484 ALIGN_4 1485 1486.L6_4_17: 1487 1488 KERNEL4x3_SUB 1489 1490 jnz .L6_4_17 1491 ALIGN_4 1492 1493 1494.L6_4_19: 1495 1496 SAVE4x3 1497 1498 addq $ 8 * SIZE, CO1 # coffset += 8 1499 ALIGN_4 1500 1501 1502 1503/************************************************************************** 1504* Rest of M 1505***************************************************************************/ 1506 1507.L6_4_20: 1508 1509 testq $ 2, M 1510 jz .L6_4_40 1511 ALIGN_4 1512 1513.L6_4_21: 1514 1515 leaq BUFFER1, BO // first buffer to BO 1516 addq $ 8 * SIZE, BO 1517 1518 vzeroall 1519 1520 movq K, %rax 1521 1522 andq $ -8, %rax // K = K - ( K % 8 ) 1523 je .L6_4_26 1524 1525 ALIGN_4 1526 1527.L6_4_22: 1528 1529 prefetcht0 A_PR1(AO) 1530 KERNEL2x3_SUB 1531 KERNEL2x3_SUB 1532 KERNEL2x3_SUB 1533 KERNEL2x3_SUB 1534 1535 prefetcht0 A_PR1(AO) 1536 KERNEL2x3_SUB 1537 KERNEL2x3_SUB 1538 KERNEL2x3_SUB 1539 KERNEL2x3_SUB 1540 1541 je .L6_4_26 1542 1543 prefetcht0 A_PR1(AO) 1544 KERNEL2x3_SUB 1545 KERNEL2x3_SUB 1546 KERNEL2x3_SUB 1547 KERNEL2x3_SUB 1548 1549 prefetcht0 A_PR1(AO) 1550 KERNEL2x3_SUB 1551 KERNEL2x3_SUB 1552 KERNEL2x3_SUB 1553 KERNEL2x3_SUB 1554 1555 je .L6_4_26 1556 1557 jmp .L6_4_22 1558 ALIGN_4 1559 1560.L6_4_26: 1561 movq K, %rax 1562 1563 andq $ 7, %rax # if (k & 1) 1564 je .L6_4_29 1565 1566 ALIGN_4 1567 1568.L6_4_27: 1569 1570 KERNEL2x3_SUB 1571 1572 jnz .L6_4_27 1573 ALIGN_4 1574 1575 1576.L6_4_29: 1577 1578 SAVE2x3 1579 1580 addq $ 4 * SIZE, CO1 # coffset += 4 1581 decq I # i -- 1582 jg .L6_4_21 1583 ALIGN_4 1584 1585 1586 1587/**************************************************************************/ 1588.L6_4_40: 1589 testq $ 1, M 1590 jz .L6_4_60 // to next 2 lines of N 1591 1592 ALIGN_4 1593 1594.L6_4_41: 1595 1596 leaq BUFFER1, BO // first buffer to BO 1597 addq $ 8 * SIZE, BO 1598 1599 vzeroall 1600 1601 movq K, %rax 1602 1603 andq $ -8, %rax // K = K - ( K % 8 ) 1604 je .L6_4_46 1605 1606 ALIGN_4 1607 1608.L6_4_42: 1609 1610 prefetcht0 A_PR1(AO) 1611 KERNEL1x3_SUB 1612 KERNEL1x3_SUB 1613 KERNEL1x3_SUB 1614 KERNEL1x3_SUB 1615 1616 KERNEL1x3_SUB 1617 KERNEL1x3_SUB 1618 KERNEL1x3_SUB 1619 KERNEL1x3_SUB 1620 1621 je .L6_4_46 1622 1623 prefetcht0 A_PR1(AO) 1624 KERNEL1x3_SUB 1625 KERNEL1x3_SUB 1626 KERNEL1x3_SUB 1627 KERNEL1x3_SUB 1628 1629 KERNEL1x3_SUB 1630 KERNEL1x3_SUB 1631 KERNEL1x3_SUB 1632 KERNEL1x3_SUB 1633 1634 je .L6_4_46 1635 1636 jmp .L6_4_42 1637 ALIGN_4 1638 1639.L6_4_46: 1640 movq K, %rax 1641 1642 andq $ 7, %rax # if (k & 1) 1643 je .L6_4_49 1644 ALIGN_4 1645 1646.L6_4_47: 1647 1648 KERNEL1x3_SUB 1649 1650 jnz .L6_4_47 1651 ALIGN_4 1652 1653 1654.L6_4_49: 1655 1656 SAVE1x3 1657 1658 addq $ 2 * SIZE, CO1 # coffset += 2 1659 decq I # i -- 1660 jg .L6_4_41 1661 ALIGN_4 1662 1663 1664 1665 1666.L6_4_60: 1667 1668 1669/*******************************************************************************************/ 1670 1671.L7_01: 1672 // copy to sub buffer 1673 movq B, BO1 1674 leaq BUFFER1, BO // first buffer to BO 1675 movq K, %rax 1676 salq $2, %rax // 2 * COMPSIZE 1677 leaq (B, %rax,4), BO2 1678 movq K, %rax 1679 1680 ALIGN_4 1681 1682.L7_02b: 1683 1684 vmovsd 2*SIZE(BO1), %xmm0 1685 vmovups (BO2), %xmm1 1686 vmovsd %xmm0, (BO) 1687 vmovups %xmm1, 2*SIZE(BO) 1688 addq $ 4*SIZE,BO1 1689 addq $ 4*SIZE,BO2 1690 addq $ 6*SIZE,BO 1691 decq %rax 1692 jnz .L7_02b 1693 1694 movq BO2, B // next offset of B 1695 1696.L7_10: 1697 movq C, CO1 1698 leaq (C, LDC, 2), C // c += 2 * ldc 1699 leaq (C, LDC, 1), C // c += 1 * ldc 1700 1701 movq A, AO // aoffset = a 1702 addq $ 16 * SIZE, AO 1703 1704 movq M, I 1705 sarq $ 3, I // i = (m >> 3) 1706 je .L7_4_10 1707 1708 ALIGN_4 1709/**********************************************************************************************************/ 1710 1711.L7_8_11: 1712 1713 leaq BUFFER1, BO // first buffer to BO 1714 addq $ 8 * SIZE, BO 1715 1716 vzeroall 1717 1718 movq K, %rax 1719 1720 andq $ -8, %rax // K = K - ( K % 8 ) 1721 je .L7_8_16 1722 1723 ALIGN_4 1724 1725.L7_8_12: 1726 1727 KERNEL8x3_SUB 1728 KERNEL8x3_SUB 1729 KERNEL8x3_SUB 1730 KERNEL8x3_SUB 1731 1732 KERNEL8x3_SUB 1733 KERNEL8x3_SUB 1734 KERNEL8x3_SUB 1735 KERNEL8x3_SUB 1736 1737 je .L7_8_16 1738 1739 KERNEL8x3_SUB 1740 KERNEL8x3_SUB 1741 KERNEL8x3_SUB 1742 KERNEL8x3_SUB 1743 1744 KERNEL8x3_SUB 1745 KERNEL8x3_SUB 1746 KERNEL8x3_SUB 1747 KERNEL8x3_SUB 1748 1749 je .L7_8_16 1750 1751 jmp .L7_8_12 1752 ALIGN_4 1753 1754.L7_8_16: 1755 movq K, %rax 1756 1757 andq $ 7, %rax # if (k & 1) 1758 je .L7_8_19 1759 1760 ALIGN_4 1761 1762.L7_8_17: 1763 1764 KERNEL8x3_SUB 1765 1766 jnz .L7_8_17 1767 ALIGN_4 1768 1769 1770.L7_8_19: 1771 1772 SAVE8x3 1773 1774 addq $ 16 * SIZE, CO1 # coffset += 16 1775 decq I # i -- 1776 jg .L7_8_11 1777 ALIGN_4 1778 1779 1780/**********************************************************************************************************/ 1781 1782 1783.L7_4_10: 1784 testq $ 7, M 1785 jz .L7_4_60 // to next 2 lines of N 1786 1787 testq $ 4, M 1788 jz .L7_4_20 1789 ALIGN_4 1790 1791 1792.L7_4_11: 1793 1794 leaq BUFFER1, BO // first buffer to BO 1795 addq $ 8 * SIZE, BO 1796 1797 vzeroall 1798 1799 movq K, %rax 1800 1801 andq $ -8, %rax // K = K - ( K % 8 ) 1802 je .L7_4_16 1803 1804 ALIGN_4 1805 1806.L7_4_12: 1807 1808 prefetcht0 A_PR1(AO) 1809 KERNEL4x3_SUB 1810 KERNEL4x3_SUB 1811 prefetcht0 A_PR1(AO) 1812 KERNEL4x3_SUB 1813 KERNEL4x3_SUB 1814 1815 prefetcht0 A_PR1(AO) 1816 KERNEL4x3_SUB 1817 KERNEL4x3_SUB 1818 prefetcht0 A_PR1(AO) 1819 KERNEL4x3_SUB 1820 KERNEL4x3_SUB 1821 1822 je .L7_4_16 1823 1824 prefetcht0 A_PR1(AO) 1825 KERNEL4x3_SUB 1826 KERNEL4x3_SUB 1827 prefetcht0 A_PR1(AO) 1828 KERNEL4x3_SUB 1829 KERNEL4x3_SUB 1830 1831 prefetcht0 A_PR1(AO) 1832 KERNEL4x3_SUB 1833 KERNEL4x3_SUB 1834 prefetcht0 A_PR1(AO) 1835 KERNEL4x3_SUB 1836 KERNEL4x3_SUB 1837 1838 je .L7_4_16 1839 1840 jmp .L7_4_12 1841 ALIGN_4 1842 1843.L7_4_16: 1844 movq K, %rax 1845 1846 andq $ 7, %rax # if (k & 1) 1847 je .L7_4_19 1848 1849 ALIGN_4 1850 1851.L7_4_17: 1852 1853 KERNEL4x3_SUB 1854 1855 jnz .L7_4_17 1856 ALIGN_4 1857 1858 1859.L7_4_19: 1860 1861 SAVE4x3 1862 1863 addq $ 8 * SIZE, CO1 # coffset += 8 1864 ALIGN_4 1865 1866 1867 1868/************************************************************************** 1869* Rest of M 1870***************************************************************************/ 1871 1872.L7_4_20: 1873 1874 testq $ 2, M 1875 jz .L7_4_40 1876 ALIGN_4 1877 1878.L7_4_21: 1879 1880 leaq BUFFER1, BO // first buffer to BO 1881 addq $ 8 * SIZE, BO 1882 1883 vzeroall 1884 1885 movq K, %rax 1886 1887 andq $ -8, %rax // K = K - ( K % 8 ) 1888 je .L7_4_26 1889 1890 ALIGN_4 1891 1892.L7_4_22: 1893 1894 prefetcht0 A_PR1(AO) 1895 KERNEL2x3_SUB 1896 KERNEL2x3_SUB 1897 KERNEL2x3_SUB 1898 KERNEL2x3_SUB 1899 1900 prefetcht0 A_PR1(AO) 1901 KERNEL2x3_SUB 1902 KERNEL2x3_SUB 1903 KERNEL2x3_SUB 1904 KERNEL2x3_SUB 1905 1906 je .L7_4_26 1907 1908 prefetcht0 A_PR1(AO) 1909 KERNEL2x3_SUB 1910 KERNEL2x3_SUB 1911 KERNEL2x3_SUB 1912 KERNEL2x3_SUB 1913 1914 prefetcht0 A_PR1(AO) 1915 KERNEL2x3_SUB 1916 KERNEL2x3_SUB 1917 KERNEL2x3_SUB 1918 KERNEL2x3_SUB 1919 1920 je .L7_4_26 1921 1922 jmp .L7_4_22 1923 ALIGN_4 1924 1925.L7_4_26: 1926 movq K, %rax 1927 1928 andq $ 7, %rax # if (k & 1) 1929 je .L7_4_29 1930 1931 ALIGN_4 1932 1933.L7_4_27: 1934 1935 KERNEL2x3_SUB 1936 1937 jnz .L7_4_27 1938 ALIGN_4 1939 1940 1941.L7_4_29: 1942 1943 SAVE2x3 1944 1945 addq $ 4 * SIZE, CO1 # coffset += 4 1946 decq I # i -- 1947 jg .L7_4_21 1948 ALIGN_4 1949 1950 1951 1952/**************************************************************************/ 1953.L7_4_40: 1954 testq $ 1, M 1955 jz .L7_4_60 // to next 2 lines of N 1956 1957 ALIGN_4 1958 1959.L7_4_41: 1960 1961 leaq BUFFER1, BO // first buffer to BO 1962 addq $ 8 * SIZE, BO 1963 1964 vzeroall 1965 1966 movq K, %rax 1967 1968 andq $ -8, %rax // K = K - ( K % 8 ) 1969 je .L7_4_46 1970 1971 ALIGN_4 1972 1973.L7_4_42: 1974 1975 prefetcht0 A_PR1(AO) 1976 KERNEL1x3_SUB 1977 KERNEL1x3_SUB 1978 KERNEL1x3_SUB 1979 KERNEL1x3_SUB 1980 1981 KERNEL1x3_SUB 1982 KERNEL1x3_SUB 1983 KERNEL1x3_SUB 1984 KERNEL1x3_SUB 1985 1986 je .L7_4_46 1987 1988 prefetcht0 A_PR1(AO) 1989 KERNEL1x3_SUB 1990 KERNEL1x3_SUB 1991 KERNEL1x3_SUB 1992 KERNEL1x3_SUB 1993 1994 KERNEL1x3_SUB 1995 KERNEL1x3_SUB 1996 KERNEL1x3_SUB 1997 KERNEL1x3_SUB 1998 1999 je .L7_4_46 2000 2001 jmp .L7_4_42 2002 ALIGN_4 2003 2004.L7_4_46: 2005 movq K, %rax 2006 2007 andq $ 7, %rax # if (k & 1) 2008 je .L7_4_49 2009 ALIGN_4 2010 2011.L7_4_47: 2012 2013 KERNEL1x3_SUB 2014 2015 jnz .L7_4_47 2016 ALIGN_4 2017 2018 2019.L7_4_49: 2020 2021 SAVE1x3 2022 2023 addq $ 2 * SIZE, CO1 # coffset += 2 2024 decq I # i -- 2025 jg .L7_4_41 2026 ALIGN_4 2027 2028 2029 2030 2031.L7_4_60: 2032 2033 decq J // j -- 2034 jg .L6_01 // next 6 lines of N 2035 2036 2037 2038/************************************************************************************************/ 2039 2040.L2_00: 2041 2042 movq Nmod6, J 2043 sarq $1, J // j = j / 2 2044 cmpq $ 0, J 2045 je .L1_0 2046 ALIGN_4 2047 2048 2049 2050.L2_01: 2051 // copy to sub buffer 2052 movq B, BO1 2053 leaq BUFFER1, BO // first buffer to BO 2054 movq K, %rax 2055 ALIGN_4 2056 2057.L2_02b: 2058 2059 vmovups (BO1), %xmm0 2060 vmovups %xmm0, (BO) 2061 addq $ 4*SIZE,BO1 2062 addq $ 4*SIZE,BO 2063 decq %rax 2064 jnz .L2_02b 2065 2066.L2_02c: 2067 2068 movq BO1, B // next offset of B 2069 2070.L2_10: 2071 movq C, CO1 2072 leaq (C, LDC, 2), C // c += 2 * ldc 2073 2074#if defined(TRMMKERNEL) && defined(LEFT) 2075 movq OFFSET, %rax 2076 movq %rax, KK 2077#endif 2078 2079 movq A, AO // aoffset = a 2080 addq $ 16 * SIZE, AO 2081 2082 movq M, I 2083 sarq $ 3, I // i = (m >> 3) 2084 je .L2_4_10 2085 2086 ALIGN_4 2087/**********************************************************************************************************/ 2088 2089.L2_8_11: 2090 2091#if !defined(TRMMKERNEL) || \ 2092 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2093 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2094 leaq BUFFER1, BO // first buffer to BO 2095 addq $ 8 * SIZE, BO 2096#else 2097 movq KK, %rax 2098 leaq BUFFER1, BO // first buffer to BO 2099 addq $ 8 * SIZE, BO 2100 movq %rax, BI // Index for BO 2101 leaq (,BI,4), BI // BI = BI * 4 ; number of values 2102 leaq (BO, BI, SIZE), BO 2103 salq $ 4, %rax // rax = rax *16 ; number of values 2104 leaq (AO, %rax, SIZE), AO 2105#endif 2106 2107 vzeroall 2108 2109#ifndef TRMMKERNEL 2110 movq K, %rax 2111#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2112 movq K, %rax 2113 subq KK, %rax 2114 movq %rax, KKK 2115#else 2116 movq KK, %rax 2117#ifdef LEFT 2118 addq $ 8, %rax // number of values in AO 2119#else 2120 addq $ 2, %rax // number of values in BO 2121#endif 2122 movq %rax, KKK 2123#endif 2124 2125 2126 andq $ -8, %rax // K = K - ( K % 8 ) 2127 je .L2_8_16 2128 movq %rax, BI // Index for BO 2129 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2130 2131 salq $ 4, %rax // rax = rax *16 ; number of values 2132 leaq (AO, %rax, SIZE), AO 2133 leaq (BO, BI, SIZE), BO 2134 negq BI 2135 negq %rax 2136 ALIGN_4 2137 2138.L2_8_12: 2139 2140 prefetcht0 A_PR1(AO,%rax,SIZE) 2141 prefetcht0 B_PR1(BO,BI,SIZE) 2142 KERNEL8x2_SUB 2143 prefetcht0 A_PR1(AO,%rax,SIZE) 2144 KERNEL8x2_SUB 2145 prefetcht0 A_PR1(AO,%rax,SIZE) 2146 KERNEL8x2_SUB 2147 prefetcht0 A_PR1(AO,%rax,SIZE) 2148 KERNEL8x2_SUB 2149 2150 prefetcht0 A_PR1(AO,%rax,SIZE) 2151 prefetcht0 B_PR1(BO,BI,SIZE) 2152 KERNEL8x2_SUB 2153 prefetcht0 A_PR1(AO,%rax,SIZE) 2154 KERNEL8x2_SUB 2155 prefetcht0 A_PR1(AO,%rax,SIZE) 2156 KERNEL8x2_SUB 2157 prefetcht0 A_PR1(AO,%rax,SIZE) 2158 KERNEL8x2_SUB 2159 2160 je .L2_8_16 2161 2162 prefetcht0 A_PR1(AO,%rax,SIZE) 2163 prefetcht0 B_PR1(BO,BI,SIZE) 2164 KERNEL8x2_SUB 2165 prefetcht0 A_PR1(AO,%rax,SIZE) 2166 KERNEL8x2_SUB 2167 prefetcht0 A_PR1(AO,%rax,SIZE) 2168 KERNEL8x2_SUB 2169 prefetcht0 A_PR1(AO,%rax,SIZE) 2170 KERNEL8x2_SUB 2171 2172 prefetcht0 A_PR1(AO,%rax,SIZE) 2173 prefetcht0 B_PR1(BO,BI,SIZE) 2174 KERNEL8x2_SUB 2175 prefetcht0 A_PR1(AO,%rax,SIZE) 2176 KERNEL8x2_SUB 2177 prefetcht0 A_PR1(AO,%rax,SIZE) 2178 KERNEL8x2_SUB 2179 prefetcht0 A_PR1(AO,%rax,SIZE) 2180 KERNEL8x2_SUB 2181 2182 je .L2_8_16 2183 2184 jmp .L2_8_12 2185 ALIGN_4 2186 2187.L2_8_16: 2188#ifndef TRMMKERNEL 2189 movq K, %rax 2190#else 2191 movq KKK, %rax 2192#endif 2193 2194 andq $ 7, %rax # if (k & 1) 2195 je .L2_8_19 2196 2197 movq %rax, BI // Index for BO 2198 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2199 2200 salq $ 4, %rax // rax = rax *16 ; number of values 2201 leaq (AO, %rax, SIZE), AO 2202 leaq (BO, BI, SIZE), BO 2203 negq BI 2204 negq %rax 2205 ALIGN_4 2206 2207.L2_8_17: 2208 2209 KERNEL8x2_SUB 2210 2211 jl .L2_8_17 2212 ALIGN_4 2213 2214 2215.L2_8_19: 2216 2217 SAVE8x2 2218 2219 2220#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2221 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2222 movq K, %rax 2223 subq KKK, %rax 2224 movq %rax, BI // Index for BO 2225 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2226 leaq (BO, BI, SIZE), BO 2227 salq $ 4, %rax // rax = rax *16 ; number of values 2228 leaq (AO, %rax, SIZE), AO 2229#endif 2230 2231 2232#if defined(TRMMKERNEL) && defined(LEFT) 2233 addq $ 8, KK 2234#endif 2235 2236 addq $ 16 * SIZE, CO1 # coffset += 16 2237 decq I # i -- 2238 jg .L2_8_11 2239 ALIGN_4 2240 2241 2242/**********************************************************************************************************/ 2243 2244 2245 2246 2247.L2_4_10: 2248 testq $ 7, M 2249 jz .L2_4_60 // to next 2 lines of N 2250 2251 testq $ 4, M 2252 jz .L2_4_20 2253 ALIGN_4 2254 2255 2256.L2_4_11: 2257 2258#if !defined(TRMMKERNEL) || \ 2259 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2260 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2261 leaq BUFFER1, BO // first buffer to BO 2262 addq $ 8 * SIZE, BO 2263#else 2264 movq KK, %rax 2265 leaq BUFFER1, BO // first buffer to BO 2266 addq $ 8 * SIZE, BO 2267 movq %rax, BI // Index for BO 2268 leaq (,BI,4), BI // BI = BI * 4 ; number of values 2269 leaq (BO, BI, SIZE), BO 2270 salq $ 3, %rax // rax = rax * 8 ; number of values 2271 leaq (AO, %rax, SIZE), AO 2272#endif 2273 2274 vzeroall 2275 2276#ifndef TRMMKERNEL 2277 movq K, %rax 2278#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2279 movq K, %rax 2280 subq KK, %rax 2281 movq %rax, KKK 2282#else 2283 movq KK, %rax 2284#ifdef LEFT 2285 addq $ 4, %rax // number of values in AO 2286#else 2287 addq $ 2, %rax // number of values in BO 2288#endif 2289 movq %rax, KKK 2290#endif 2291 2292 2293 andq $ -8, %rax // K = K - ( K % 8 ) 2294 je .L2_4_16 2295 movq %rax, BI // Index for BO 2296 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2297 2298 salq $ 3, %rax // rax = rax * 8 ; number of values 2299 leaq (AO, %rax, SIZE), AO 2300 leaq (BO, BI, SIZE), BO 2301 negq BI 2302 negq %rax 2303 ALIGN_4 2304 2305.L2_4_12: 2306 2307 prefetcht0 A_PR1(AO,%rax,SIZE) 2308 prefetcht0 B_PR1(BO,BI,SIZE) 2309 KERNEL4x2_SUB 2310 KERNEL4x2_SUB 2311 prefetcht0 A_PR1(AO,%rax,SIZE) 2312 KERNEL4x2_SUB 2313 KERNEL4x2_SUB 2314 2315 prefetcht0 A_PR1(AO,%rax,SIZE) 2316 prefetcht0 B_PR1(BO,BI,SIZE) 2317 KERNEL4x2_SUB 2318 KERNEL4x2_SUB 2319 prefetcht0 A_PR1(AO,%rax,SIZE) 2320 KERNEL4x2_SUB 2321 KERNEL4x2_SUB 2322 2323 je .L2_4_16 2324 2325 prefetcht0 A_PR1(AO,%rax,SIZE) 2326 prefetcht0 B_PR1(BO,BI,SIZE) 2327 KERNEL4x2_SUB 2328 KERNEL4x2_SUB 2329 prefetcht0 A_PR1(AO,%rax,SIZE) 2330 KERNEL4x2_SUB 2331 KERNEL4x2_SUB 2332 2333 prefetcht0 A_PR1(AO,%rax,SIZE) 2334 prefetcht0 B_PR1(BO,BI,SIZE) 2335 KERNEL4x2_SUB 2336 KERNEL4x2_SUB 2337 prefetcht0 A_PR1(AO,%rax,SIZE) 2338 KERNEL4x2_SUB 2339 KERNEL4x2_SUB 2340 2341 je .L2_4_16 2342 2343 jmp .L2_4_12 2344 ALIGN_4 2345 2346.L2_4_16: 2347#ifndef TRMMKERNEL 2348 movq K, %rax 2349#else 2350 movq KKK, %rax 2351#endif 2352 2353 andq $ 7, %rax # if (k & 1) 2354 je .L2_4_19 2355 2356 movq %rax, BI // Index for BO 2357 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2358 2359 salq $ 3, %rax // rax = rax * 8 ; number of values 2360 leaq (AO, %rax, SIZE), AO 2361 leaq (BO, BI, SIZE), BO 2362 negq BI 2363 negq %rax 2364 ALIGN_4 2365 2366.L2_4_17: 2367 2368 KERNEL4x2_SUB 2369 2370 jl .L2_4_17 2371 ALIGN_4 2372 2373 2374.L2_4_19: 2375 2376 SAVE4x2 2377 2378#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2379 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2380 movq K, %rax 2381 subq KKK, %rax 2382 movq %rax, BI // Index for BO 2383 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2384 leaq (BO, BI, SIZE), BO 2385 salq $ 3, %rax // rax = rax * 8 ; number of values 2386 leaq (AO, %rax, SIZE), AO 2387#endif 2388 2389 2390#if defined(TRMMKERNEL) && defined(LEFT) 2391 addq $ 4, KK 2392#endif 2393 2394 addq $ 8 * SIZE, CO1 # coffset += 8 2395 ALIGN_4 2396 2397 2398 2399/************************************************************************** 2400* Rest of M 2401***************************************************************************/ 2402 2403.L2_4_20: 2404 2405 testq $ 2, M 2406 jz .L2_4_40 2407 ALIGN_4 2408 2409.L2_4_21: 2410 2411#if !defined(TRMMKERNEL) || \ 2412 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2413 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2414 leaq BUFFER1, BO // first buffer to BO 2415 addq $ 8 * SIZE, BO 2416#else 2417 movq KK, %rax 2418 leaq BUFFER1, BO // first buffer to BO 2419 addq $ 8 * SIZE, BO 2420 movq %rax, BI // Index for BO 2421 leaq (,BI,4), BI // BI = BI * 4 ; number of values 2422 leaq (BO, BI, SIZE), BO 2423 salq $ 2, %rax // rax = rax * 4 ; number of values 2424 leaq (AO, %rax, SIZE), AO 2425#endif 2426 2427 vzeroall 2428 2429#ifndef TRMMKERNEL 2430 movq K, %rax 2431#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2432 movq K, %rax 2433 subq KK, %rax 2434 movq %rax, KKK 2435#else 2436 movq KK, %rax 2437#ifdef LEFT 2438 addq $ 2, %rax // number of values in AO 2439#else 2440 addq $ 2, %rax // number of values in BO 2441#endif 2442 movq %rax, KKK 2443#endif 2444 2445 2446 andq $ -8, %rax // K = K - ( K % 8 ) 2447 je .L2_4_26 2448 movq %rax, BI // Index for BO 2449 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2450 2451 salq $ 2, %rax // rax = rax * 4 ; number of values 2452 leaq (AO, %rax, SIZE), AO 2453 leaq (BO, BI, SIZE), BO 2454 negq BI 2455 negq %rax 2456 ALIGN_4 2457 2458.L2_4_22: 2459 2460 prefetcht0 A_PR1(AO,%rax,SIZE) 2461 prefetcht0 B_PR1(BO,BI,SIZE) 2462 KERNEL2x2_SUB 2463 KERNEL2x2_SUB 2464 KERNEL2x2_SUB 2465 KERNEL2x2_SUB 2466 2467 prefetcht0 A_PR1(AO,%rax,SIZE) 2468 prefetcht0 B_PR1(BO,BI,SIZE) 2469 KERNEL2x2_SUB 2470 KERNEL2x2_SUB 2471 KERNEL2x2_SUB 2472 KERNEL2x2_SUB 2473 2474 je .L2_4_26 2475 2476 prefetcht0 A_PR1(AO,%rax,SIZE) 2477 prefetcht0 B_PR1(BO,BI,SIZE) 2478 KERNEL2x2_SUB 2479 KERNEL2x2_SUB 2480 KERNEL2x2_SUB 2481 KERNEL2x2_SUB 2482 2483 prefetcht0 A_PR1(AO,%rax,SIZE) 2484 prefetcht0 B_PR1(BO,BI,SIZE) 2485 KERNEL2x2_SUB 2486 KERNEL2x2_SUB 2487 KERNEL2x2_SUB 2488 KERNEL2x2_SUB 2489 2490 je .L2_4_26 2491 2492 jmp .L2_4_22 2493 ALIGN_4 2494 2495.L2_4_26: 2496#ifndef TRMMKERNEL 2497 movq K, %rax 2498#else 2499 movq KKK, %rax 2500#endif 2501 2502 andq $ 7, %rax # if (k & 1) 2503 je .L2_4_29 2504 2505 movq %rax, BI // Index for BO 2506 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2507 2508 salq $ 2, %rax // rax = rax * 4 ; number of values 2509 leaq (AO, %rax, SIZE), AO 2510 leaq (BO, BI, SIZE), BO 2511 negq BI 2512 negq %rax 2513 ALIGN_4 2514 2515.L2_4_27: 2516 2517 KERNEL2x2_SUB 2518 2519 jl .L2_4_27 2520 ALIGN_4 2521 2522 2523.L2_4_29: 2524 2525 vbroadcastss ALPHA_R, %xmm0 2526 vbroadcastss ALPHA_I, %xmm1 2527 2528 // swap high and low 64 bytes 2529 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 2530 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 2531 2532#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 2533 defined(NR) || defined(NC) || defined(TR) || defined(TC) 2534 2535 vaddsubps %xmm9, %xmm8 , %xmm8 2536 vaddsubps %xmm11,%xmm10, %xmm10 2537 2538 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 2539 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 2540 2541#else 2542 vaddsubps %xmm8, %xmm9 ,%xmm9 2543 vaddsubps %xmm10, %xmm11,%xmm11 2544 2545 vmovaps %xmm9, %xmm8 2546 vmovaps %xmm11, %xmm10 2547 2548 // swap high and low 64 bytes 2549 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 2550 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 2551 2552#endif 2553 2554 // multiply with ALPHA_R 2555 vmulps %xmm8 , %xmm0, %xmm8 2556 vmulps %xmm10, %xmm0, %xmm10 2557 2558 // multiply with ALPHA_I 2559 vmulps %xmm9 , %xmm1, %xmm9 2560 vmulps %xmm11, %xmm1, %xmm11 2561 2562 vaddsubps %xmm9, %xmm8 , %xmm8 2563 vaddsubps %xmm11,%xmm10, %xmm10 2564 2565 2566 2567#ifndef TRMMKERNEL 2568 2569 vaddps (CO1), %xmm8 , %xmm8 2570 2571 vaddps (CO1, LDC), %xmm10, %xmm10 2572 2573#endif 2574 2575 vmovups %xmm8 , (CO1) 2576 2577 vmovups %xmm10 , (CO1, LDC) 2578 2579 2580 2581#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2582 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2583 movq K, %rax 2584 subq KKK, %rax 2585 movq %rax, BI // Index for BO 2586 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2587 leaq (BO, BI, SIZE), BO 2588 salq $ 2, %rax // rax = rax * 4 ; number of values 2589 leaq (AO, %rax, SIZE), AO 2590#endif 2591 2592 2593#if defined(TRMMKERNEL) && defined(LEFT) 2594 addq $ 2, KK 2595#endif 2596 2597 addq $ 4 * SIZE, CO1 # coffset += 4 2598 decq I # i -- 2599 jg .L2_4_21 2600 ALIGN_4 2601 2602 2603 2604/**************************************************************************/ 2605.L2_4_40: 2606 testq $ 1, M 2607 jz .L2_4_60 // to next 2 lines of N 2608 2609 ALIGN_4 2610 2611.L2_4_41: 2612 2613#if !defined(TRMMKERNEL) || \ 2614 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2615 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2616 leaq BUFFER1, BO // first buffer to BO 2617 addq $ 8 * SIZE, BO 2618#else 2619 movq KK, %rax 2620 leaq BUFFER1, BO // first buffer to BO 2621 addq $ 8 * SIZE, BO 2622 movq %rax, BI // Index for BO 2623 leaq (,BI,4), BI // BI = BI * 4 ; number of values 2624 leaq (BO, BI, SIZE), BO 2625 salq $ 1, %rax // rax = rax * 2 ; number of values 2626 leaq (AO, %rax, SIZE), AO 2627#endif 2628 2629 vzeroall 2630 2631#ifndef TRMMKERNEL 2632 movq K, %rax 2633#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2634 movq K, %rax 2635 subq KK, %rax 2636 movq %rax, KKK 2637#else 2638 movq KK, %rax 2639#ifdef LEFT 2640 addq $ 1, %rax // number of values in AO 2641#else 2642 addq $ 2, %rax // number of values in BO 2643#endif 2644 movq %rax, KKK 2645#endif 2646 2647 2648 andq $ -8, %rax // K = K - ( K % 8 ) 2649 je .L2_4_46 2650 movq %rax, BI // Index for BO 2651 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2652 2653 salq $ 1, %rax // rax = rax * 2 ; number of values 2654 leaq (AO, %rax, SIZE), AO 2655 leaq (BO, BI, SIZE), BO 2656 negq BI 2657 negq %rax 2658 ALIGN_4 2659 2660.L2_4_42: 2661 2662 prefetcht0 A_PR1(AO,%rax,SIZE) 2663 prefetcht0 B_PR1(BO,BI,SIZE) 2664 KERNEL1x2_SUB 2665 KERNEL1x2_SUB 2666 KERNEL1x2_SUB 2667 KERNEL1x2_SUB 2668 2669 prefetcht0 B_PR1(BO,BI,SIZE) 2670 KERNEL1x2_SUB 2671 KERNEL1x2_SUB 2672 KERNEL1x2_SUB 2673 KERNEL1x2_SUB 2674 2675 je .L2_4_46 2676 2677 prefetcht0 A_PR1(AO,%rax,SIZE) 2678 prefetcht0 B_PR1(BO,BI,SIZE) 2679 KERNEL1x2_SUB 2680 KERNEL1x2_SUB 2681 KERNEL1x2_SUB 2682 KERNEL1x2_SUB 2683 2684 prefetcht0 B_PR1(BO,BI,SIZE) 2685 KERNEL1x2_SUB 2686 KERNEL1x2_SUB 2687 KERNEL1x2_SUB 2688 KERNEL1x2_SUB 2689 2690 je .L2_4_46 2691 2692 jmp .L2_4_42 2693 ALIGN_4 2694 2695.L2_4_46: 2696#ifndef TRMMKERNEL 2697 movq K, %rax 2698#else 2699 movq KKK, %rax 2700#endif 2701 2702 andq $ 7, %rax # if (k & 1) 2703 je .L2_4_49 2704 2705 movq %rax, BI // Index for BO 2706 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2707 2708 salq $ 1, %rax // rax = rax * 2 ; number of values 2709 leaq (AO, %rax, SIZE), AO 2710 leaq (BO, BI, SIZE), BO 2711 negq BI 2712 negq %rax 2713 ALIGN_4 2714 2715.L2_4_47: 2716 2717 KERNEL1x2_SUB 2718 2719 jl .L2_4_47 2720 ALIGN_4 2721 2722 2723.L2_4_49: 2724 2725 SAVE1x2 2726 2727#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2728 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2729 movq K, %rax 2730 subq KKK, %rax 2731 movq %rax, BI // Index for BO 2732 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 2733 leaq (BO, BI, SIZE), BO 2734 salq $ 1, %rax // rax = rax * 2 ; number of values 2735 leaq (AO, %rax, SIZE), AO 2736#endif 2737 2738 2739#if defined(TRMMKERNEL) && defined(LEFT) 2740 addq $ 1, KK 2741#endif 2742 2743 addq $ 2 * SIZE, CO1 # coffset += 2 2744 decq I # i -- 2745 jg .L2_4_41 2746 ALIGN_4 2747 2748 2749 2750 2751.L2_4_60: 2752#if defined(TRMMKERNEL) && !defined(LEFT) 2753 addq $ 2, KK 2754#endif 2755 2756 decq J // j -- 2757 jg .L2_01 // next 2 lines of N 2758 2759 2760 2761.L1_0: 2762 2763/************************************************************************************************ 2764* Loop for Nmod6 % 2 > 0 2765*************************************************************************************************/ 2766 2767 movq Nmod6, J 2768 andq $ 1, J // j % 2 2769 je .L999 2770 ALIGN_4 2771 2772.L1_01: 2773 // copy to sub buffer 2774 movq B, BO1 2775 leaq BUFFER1, BO // first buffer to BO 2776 movq K, %rax 2777 ALIGN_4 2778 2779.L1_02b: 2780 2781 vmovsd (BO1), %xmm0 2782 vmovsd %xmm0, (BO) 2783 addq $ 2*SIZE,BO1 2784 addq $ 2*SIZE,BO 2785 decq %rax 2786 jnz .L1_02b 2787 2788.L1_02c: 2789 2790 movq BO1, B // next offset of B 2791 2792.L1_10: 2793 movq C, CO1 2794 leaq (C, LDC, 1), C // c += 1 * ldc 2795 2796#if defined(TRMMKERNEL) && defined(LEFT) 2797 movq OFFSET, %rax 2798 movq %rax, KK 2799#endif 2800 2801 movq A, AO // aoffset = a 2802 addq $ 16 * SIZE, AO 2803 2804 movq M, I 2805 sarq $ 3, I // i = (m >> 3) 2806 je .L1_4_10 2807 2808 ALIGN_4 2809 2810/**************************************************************************************************/ 2811 2812.L1_8_11: 2813 2814#if !defined(TRMMKERNEL) || \ 2815 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2816 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2817 leaq BUFFER1, BO // first buffer to BO 2818 addq $ 4 * SIZE, BO 2819#else 2820 movq KK, %rax 2821 leaq BUFFER1, BO // first buffer to BO 2822 addq $ 4 * SIZE, BO 2823 movq %rax, BI // Index for BO 2824 leaq (,BI,2), BI // BI = BI * 2 ; number of values 2825 leaq (BO, BI, SIZE), BO 2826 salq $ 4, %rax // rax = rax *16 ; number of values 2827 leaq (AO, %rax, SIZE), AO 2828#endif 2829 2830 vzeroall 2831 2832#ifndef TRMMKERNEL 2833 movq K, %rax 2834#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2835 movq K, %rax 2836 subq KK, %rax 2837 movq %rax, KKK 2838#else 2839 movq KK, %rax 2840#ifdef LEFT 2841 addq $ 8, %rax // number of values in AO 2842#else 2843 addq $ 1, %rax // number of values in BO 2844#endif 2845 movq %rax, KKK 2846#endif 2847 2848 2849 andq $ -8, %rax // K = K - ( K % 8 ) 2850 je .L1_8_16 2851 movq %rax, BI // Index for BO 2852 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 2853 2854 salq $ 4, %rax // rax = rax *16 ; number of values 2855 leaq (AO, %rax, SIZE), AO 2856 leaq (BO, BI, SIZE), BO 2857 negq BI 2858 negq %rax 2859 ALIGN_4 2860 2861.L1_8_12: 2862 2863 prefetcht0 A_PR1(AO,%rax,SIZE) 2864 prefetcht0 B_PR1(BO,BI,SIZE) 2865 KERNEL8x1_SUB 2866 prefetcht0 A_PR1(AO,%rax,SIZE) 2867 KERNEL8x1_SUB 2868 prefetcht0 A_PR1(AO,%rax,SIZE) 2869 KERNEL8x1_SUB 2870 prefetcht0 A_PR1(AO,%rax,SIZE) 2871 KERNEL8x1_SUB 2872 2873 prefetcht0 A_PR1(AO,%rax,SIZE) 2874 KERNEL8x1_SUB 2875 prefetcht0 A_PR1(AO,%rax,SIZE) 2876 KERNEL8x1_SUB 2877 prefetcht0 A_PR1(AO,%rax,SIZE) 2878 KERNEL8x1_SUB 2879 prefetcht0 A_PR1(AO,%rax,SIZE) 2880 KERNEL8x1_SUB 2881 2882 je .L1_8_16 2883 2884 prefetcht0 A_PR1(AO,%rax,SIZE) 2885 prefetcht0 B_PR1(BO,BI,SIZE) 2886 KERNEL8x1_SUB 2887 prefetcht0 A_PR1(AO,%rax,SIZE) 2888 KERNEL8x1_SUB 2889 prefetcht0 A_PR1(AO,%rax,SIZE) 2890 KERNEL8x1_SUB 2891 prefetcht0 A_PR1(AO,%rax,SIZE) 2892 KERNEL8x1_SUB 2893 2894 prefetcht0 A_PR1(AO,%rax,SIZE) 2895 KERNEL8x1_SUB 2896 prefetcht0 A_PR1(AO,%rax,SIZE) 2897 KERNEL8x1_SUB 2898 prefetcht0 A_PR1(AO,%rax,SIZE) 2899 KERNEL8x1_SUB 2900 prefetcht0 A_PR1(AO,%rax,SIZE) 2901 KERNEL8x1_SUB 2902 2903 je .L1_8_16 2904 2905 jmp .L1_8_12 2906 ALIGN_4 2907 2908.L1_8_16: 2909#ifndef TRMMKERNEL 2910 movq K, %rax 2911#else 2912 movq KKK, %rax 2913#endif 2914 2915 andq $ 7, %rax # if (k & 1) 2916 je .L1_8_19 2917 2918 movq %rax, BI // Index for BO 2919 leaq ( ,BI,2), BI // BI = BI * 4 ; number of values 2920 2921 salq $ 4, %rax // rax = rax *16 ; number of values 2922 leaq (AO, %rax, SIZE), AO 2923 leaq (BO, BI, SIZE), BO 2924 negq BI 2925 negq %rax 2926 ALIGN_4 2927 2928.L1_8_17: 2929 2930 KERNEL8x1_SUB 2931 2932 jl .L1_8_17 2933 ALIGN_4 2934 2935 2936.L1_8_19: 2937 2938 SAVE8x1 2939 2940 2941#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2942 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2943 movq K, %rax 2944 subq KKK, %rax 2945 movq %rax, BI // Index for BO 2946 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 2947 leaq (BO, BI, SIZE), BO 2948 salq $ 4, %rax // rax = rax *16 ; number of values 2949 leaq (AO, %rax, SIZE), AO 2950#endif 2951 2952 2953#if defined(TRMMKERNEL) && defined(LEFT) 2954 addq $ 8, KK 2955#endif 2956 2957 addq $ 16 * SIZE, CO1 # coffset += 16 2958 decq I # i -- 2959 jg .L1_8_11 2960 ALIGN_4 2961 2962 2963 2964/**************************************************************************************************/ 2965.L1_4_10: 2966 2967 testq $ 7, M 2968 jz .L999 2969 2970 testq $ 4, M 2971 jz .L1_4_20 2972 2973 2974.L1_4_11: 2975 2976#if !defined(TRMMKERNEL) || \ 2977 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 2978 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 2979 leaq BUFFER1, BO // first buffer to BO 2980 addq $ 4 * SIZE, BO 2981#else 2982 movq KK, %rax 2983 leaq BUFFER1, BO // first buffer to BO 2984 addq $ 4 * SIZE, BO 2985 movq %rax, BI // Index for BO 2986 leaq (,BI,2), BI // BI = BI * 2 ; number of values 2987 leaq (BO, BI, SIZE), BO 2988 salq $ 3, %rax // rax = rax * 8 ; number of values 2989 leaq (AO, %rax, SIZE), AO 2990#endif 2991 2992 vzeroall 2993 2994#ifndef TRMMKERNEL 2995 movq K, %rax 2996#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 2997 movq K, %rax 2998 subq KK, %rax 2999 movq %rax, KKK 3000#else 3001 movq KK, %rax 3002#ifdef LEFT 3003 addq $ 4, %rax // number of values in AO 3004#else 3005 addq $ 1, %rax // number of values in BO 3006#endif 3007 movq %rax, KKK 3008#endif 3009 3010 3011 andq $ -8, %rax // K = K - ( K % 8 ) 3012 je .L1_4_16 3013 movq %rax, BI // Index for BO 3014 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3015 3016 salq $ 3, %rax // rax = rax * 8 ; number of values 3017 leaq (AO, %rax, SIZE), AO 3018 leaq (BO, BI, SIZE), BO 3019 negq BI 3020 negq %rax 3021 ALIGN_4 3022 3023.L1_4_12: 3024 3025 prefetcht0 A_PR1(AO,%rax,SIZE) 3026 prefetcht0 B_PR1(BO,BI,SIZE) 3027 KERNEL4x1_SUB 3028 KERNEL4x1_SUB 3029 prefetcht0 A_PR1(AO,%rax,SIZE) 3030 KERNEL4x1_SUB 3031 KERNEL4x1_SUB 3032 3033 prefetcht0 A_PR1(AO,%rax,SIZE) 3034 KERNEL4x1_SUB 3035 KERNEL4x1_SUB 3036 prefetcht0 A_PR1(AO,%rax,SIZE) 3037 KERNEL4x1_SUB 3038 KERNEL4x1_SUB 3039 3040 je .L1_4_16 3041 3042 prefetcht0 A_PR1(AO,%rax,SIZE) 3043 prefetcht0 B_PR1(BO,BI,SIZE) 3044 KERNEL4x1_SUB 3045 KERNEL4x1_SUB 3046 prefetcht0 A_PR1(AO,%rax,SIZE) 3047 KERNEL4x1_SUB 3048 KERNEL4x1_SUB 3049 3050 prefetcht0 A_PR1(AO,%rax,SIZE) 3051 KERNEL4x1_SUB 3052 KERNEL4x1_SUB 3053 prefetcht0 A_PR1(AO,%rax,SIZE) 3054 KERNEL4x1_SUB 3055 KERNEL4x1_SUB 3056 3057 je .L1_4_16 3058 3059 jmp .L1_4_12 3060 ALIGN_4 3061 3062.L1_4_16: 3063#ifndef TRMMKERNEL 3064 movq K, %rax 3065#else 3066 movq KKK, %rax 3067#endif 3068 3069 andq $ 7, %rax # if (k & 1) 3070 je .L1_4_19 3071 3072 movq %rax, BI // Index for BO 3073 leaq ( ,BI,2), BI // BI = BI * 4 ; number of values 3074 3075 salq $ 3, %rax // rax = rax * 8 ; number of values 3076 leaq (AO, %rax, SIZE), AO 3077 leaq (BO, BI, SIZE), BO 3078 negq BI 3079 negq %rax 3080 ALIGN_4 3081 3082.L1_4_17: 3083 3084 KERNEL4x1_SUB 3085 3086 jl .L1_4_17 3087 ALIGN_4 3088 3089 3090.L1_4_19: 3091 3092 SAVE4x1 3093 3094#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3095 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3096 movq K, %rax 3097 subq KKK, %rax 3098 movq %rax, BI // Index for BO 3099 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3100 leaq (BO, BI, SIZE), BO 3101 salq $ 3, %rax // rax = rax * 8 ; number of values 3102 leaq (AO, %rax, SIZE), AO 3103#endif 3104 3105 3106#if defined(TRMMKERNEL) && defined(LEFT) 3107 addq $ 4, KK 3108#endif 3109 3110 addq $ 8 * SIZE, CO1 # coffset += 8 3111 ALIGN_4 3112 3113 3114 3115/************************************************************************** 3116* Rest of M 3117***************************************************************************/ 3118 3119.L1_4_20: 3120 3121 testq $ 2, M 3122 jz .L1_4_40 3123 ALIGN_4 3124 3125.L1_4_21: 3126 3127#if !defined(TRMMKERNEL) || \ 3128 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3129 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3130 leaq BUFFER1, BO // first buffer to BO 3131 addq $ 4 * SIZE, BO 3132#else 3133 movq KK, %rax 3134 leaq BUFFER1, BO // first buffer to BO 3135 addq $ 4 * SIZE, BO 3136 movq %rax, BI // Index for BO 3137 leaq (,BI,2), BI // BI = BI * 2 ; number of values 3138 leaq (BO, BI, SIZE), BO 3139 salq $ 2, %rax // rax = rax * 4 ; number of values 3140 leaq (AO, %rax, SIZE), AO 3141#endif 3142 3143 vzeroall 3144 3145#ifndef TRMMKERNEL 3146 movq K, %rax 3147#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3148 movq K, %rax 3149 subq KK, %rax 3150 movq %rax, KKK 3151#else 3152 movq KK, %rax 3153#ifdef LEFT 3154 addq $ 2, %rax // number of values in AO 3155#else 3156 addq $ 1, %rax // number of values in BO 3157#endif 3158 movq %rax, KKK 3159#endif 3160 3161 3162 andq $ -8, %rax // K = K - ( K % 8 ) 3163 je .L1_4_26 3164 movq %rax, BI // Index for BO 3165 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3166 3167 salq $ 2, %rax // rax = rax * 4 ; number of values 3168 leaq (AO, %rax, SIZE), AO 3169 leaq (BO, BI, SIZE), BO 3170 negq BI 3171 negq %rax 3172 ALIGN_4 3173 3174.L1_4_22: 3175 3176 prefetcht0 A_PR1(AO,%rax,SIZE) 3177 prefetcht0 B_PR1(BO,BI,SIZE) 3178 KERNEL2x1_SUB 3179 KERNEL2x1_SUB 3180 KERNEL2x1_SUB 3181 KERNEL2x1_SUB 3182 3183 prefetcht0 A_PR1(AO,%rax,SIZE) 3184 KERNEL2x1_SUB 3185 KERNEL2x1_SUB 3186 KERNEL2x1_SUB 3187 KERNEL2x1_SUB 3188 3189 je .L1_4_26 3190 3191 prefetcht0 A_PR1(AO,%rax,SIZE) 3192 prefetcht0 B_PR1(BO,BI,SIZE) 3193 KERNEL2x1_SUB 3194 KERNEL2x1_SUB 3195 KERNEL2x1_SUB 3196 KERNEL2x1_SUB 3197 3198 prefetcht0 A_PR1(AO,%rax,SIZE) 3199 KERNEL2x1_SUB 3200 KERNEL2x1_SUB 3201 KERNEL2x1_SUB 3202 KERNEL2x1_SUB 3203 3204 je .L1_4_26 3205 3206 jmp .L1_4_22 3207 ALIGN_4 3208 3209.L1_4_26: 3210#ifndef TRMMKERNEL 3211 movq K, %rax 3212#else 3213 movq KKK, %rax 3214#endif 3215 3216 andq $ 7, %rax # if (k & 1) 3217 je .L1_4_29 3218 3219 movq %rax, BI // Index for BO 3220 leaq ( ,BI,2), BI // BI = BI * 2; number of values 3221 3222 salq $ 2, %rax // rax = rax * 4 ; number of values 3223 leaq (AO, %rax, SIZE), AO 3224 leaq (BO, BI, SIZE), BO 3225 negq BI 3226 negq %rax 3227 ALIGN_4 3228 3229.L1_4_27: 3230 3231 KERNEL2x1_SUB 3232 3233 jl .L1_4_27 3234 ALIGN_4 3235 3236 3237.L1_4_29: 3238 3239 SAVE2x1 3240 3241#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3242 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3243 movq K, %rax 3244 subq KKK, %rax 3245 movq %rax, BI // Index for BO 3246 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3247 leaq (BO, BI, SIZE), BO 3248 salq $ 2, %rax // rax = rax * 4 ; number of values 3249 leaq (AO, %rax, SIZE), AO 3250#endif 3251 3252 3253#if defined(TRMMKERNEL) && defined(LEFT) 3254 addq $ 2, KK 3255#endif 3256 3257 addq $ 4 * SIZE, CO1 # coffset += 4 3258 ALIGN_4 3259 3260 3261 3262/**************************************************************************/ 3263.L1_4_40: 3264 testq $ 1, M 3265 jz .L999 // to next 2 lines of N 3266 3267 ALIGN_4 3268 3269.L1_4_41: 3270 3271#if !defined(TRMMKERNEL) || \ 3272 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3273 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3274 leaq BUFFER1, BO // first buffer to BO 3275 addq $ 4 * SIZE, BO 3276#else 3277 movq KK, %rax 3278 leaq BUFFER1, BO // first buffer to BO 3279 addq $ 4 * SIZE, BO 3280 movq %rax, BI // Index for BO 3281 leaq (,BI,2), BI // BI = BI * 2 ; number of values 3282 leaq (BO, BI, SIZE), BO 3283 salq $ 1, %rax // rax = rax * 2 ; number of values 3284 leaq (AO, %rax, SIZE), AO 3285#endif 3286 3287 vzeroall 3288 3289#ifndef TRMMKERNEL 3290 movq K, %rax 3291#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3292 movq K, %rax 3293 subq KK, %rax 3294 movq %rax, KKK 3295#else 3296 movq KK, %rax 3297#ifdef LEFT 3298 addq $ 1, %rax // number of values in AO 3299#else 3300 addq $ 1, %rax // number of values in BO 3301#endif 3302 movq %rax, KKK 3303#endif 3304 3305 3306 andq $ -8, %rax // K = K - ( K % 8 ) 3307 je .L1_4_46 3308 movq %rax, BI // Index for BO 3309 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3310 3311 salq $ 1, %rax // rax = rax * 2 ; number of values 3312 leaq (AO, %rax, SIZE), AO 3313 leaq (BO, BI, SIZE), BO 3314 negq BI 3315 negq %rax 3316 ALIGN_4 3317 3318.L1_4_42: 3319 3320 prefetcht0 A_PR1(AO,%rax,SIZE) 3321 prefetcht0 B_PR1(BO,BI,SIZE) 3322 KERNEL1x1_SUB 3323 KERNEL1x1_SUB 3324 KERNEL1x1_SUB 3325 KERNEL1x1_SUB 3326 3327 KERNEL1x1_SUB 3328 KERNEL1x1_SUB 3329 KERNEL1x1_SUB 3330 KERNEL1x1_SUB 3331 3332 je .L1_4_46 3333 3334 prefetcht0 A_PR1(AO,%rax,SIZE) 3335 prefetcht0 B_PR1(BO,BI,SIZE) 3336 KERNEL1x1_SUB 3337 KERNEL1x1_SUB 3338 KERNEL1x1_SUB 3339 KERNEL1x1_SUB 3340 3341 KERNEL1x1_SUB 3342 KERNEL1x1_SUB 3343 KERNEL1x1_SUB 3344 KERNEL1x1_SUB 3345 3346 je .L1_4_46 3347 3348 jmp .L1_4_42 3349 ALIGN_4 3350 3351.L1_4_46: 3352#ifndef TRMMKERNEL 3353 movq K, %rax 3354#else 3355 movq KKK, %rax 3356#endif 3357 3358 andq $ 7, %rax # if (k & 1) 3359 je .L1_4_49 3360 3361 movq %rax, BI // Index for BO 3362 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3363 3364 salq $ 1, %rax // rax = rax * 2 ; number of values 3365 leaq (AO, %rax, SIZE), AO 3366 leaq (BO, BI, SIZE), BO 3367 negq BI 3368 negq %rax 3369 ALIGN_4 3370 3371.L1_4_47: 3372 3373 KERNEL1x1_SUB 3374 3375 jl .L1_4_47 3376 ALIGN_4 3377 3378 3379.L1_4_49: 3380 3381 SAVE1x1 3382 3383 3384 3385#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3386 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3387 movq K, %rax 3388 subq KKK, %rax 3389 movq %rax, BI // Index for BO 3390 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 3391 leaq (BO, BI, SIZE), BO 3392 salq $ 1, %rax // rax = rax * 2 ; number of values 3393 leaq (AO, %rax, SIZE), AO 3394#endif 3395 3396 3397#if defined(TRMMKERNEL) && defined(LEFT) 3398 addq $ 1, KK 3399#endif 3400 3401 addq $ 2 * SIZE, CO1 # coffset += 2 3402 ALIGN_4 3403 3404 3405.L999: 3406 vzeroupper 3407 3408 movq SP, %rsp 3409 movq (%rsp), %rbx 3410 movq 8(%rsp), %rbp 3411 movq 16(%rsp), %r12 3412 movq 24(%rsp), %r13 3413 movq 32(%rsp), %r14 3414 movq 40(%rsp), %r15 3415 3416#ifdef WINDOWS_ABI 3417 movq 48(%rsp), %rdi 3418 movq 56(%rsp), %rsi 3419 vmovups 64(%rsp), %xmm6 3420 vmovups 80(%rsp), %xmm7 3421 vmovups 96(%rsp), %xmm8 3422 vmovups 112(%rsp), %xmm9 3423 vmovups 128(%rsp), %xmm10 3424 vmovups 144(%rsp), %xmm11 3425 vmovups 160(%rsp), %xmm12 3426 vmovups 176(%rsp), %xmm13 3427 vmovups 192(%rsp), %xmm14 3428 vmovups 208(%rsp), %xmm15 3429#endif 3430 3431 addq $ STACKSIZE, %rsp 3432 ret 3433 3434 EPILOGUE 3435 3436#else 3437 3438/************************************************************************************************/ 3439 3440 3441 PROLOGUE 3442 PROFCODE 3443 3444 subq $ STACKSIZE, %rsp 3445 movq %rbx, (%rsp) 3446 movq %rbp, 8(%rsp) 3447 movq %r12, 16(%rsp) 3448 movq %r13, 24(%rsp) 3449 movq %r14, 32(%rsp) 3450 movq %r15, 40(%rsp) 3451 3452 vzeroupper 3453 3454#ifdef WINDOWS_ABI 3455 movq %rdi, 48(%rsp) 3456 movq %rsi, 56(%rsp) 3457 vmovups %xmm6, 64(%rsp) 3458 vmovups %xmm7, 80(%rsp) 3459 vmovups %xmm8, 96(%rsp) 3460 vmovups %xmm9, 112(%rsp) 3461 vmovups %xmm10, 128(%rsp) 3462 vmovups %xmm11, 144(%rsp) 3463 vmovups %xmm12, 160(%rsp) 3464 vmovups %xmm13, 176(%rsp) 3465 vmovups %xmm14, 192(%rsp) 3466 vmovups %xmm15, 208(%rsp) 3467 3468 movq ARG1, OLD_M 3469 movq ARG2, OLD_N 3470 movq ARG3, OLD_K 3471 movq OLD_A, A 3472 movq OLD_B, B 3473 movq OLD_C, C 3474 movq OLD_LDC, LDC 3475#ifdef TRMMKERNEL 3476 movsd OLD_OFFSET, %xmm12 3477#endif 3478 vmovaps %xmm3, %xmm0 3479 vmovsd OLD_ALPHA_I, %xmm1 3480 3481#else 3482 movq STACKSIZE + 8(%rsp), LDC 3483#ifdef TRMMKERNEL 3484 movsd STACKSIZE + 16(%rsp), %xmm12 3485#endif 3486 3487#endif 3488 3489 movq %rsp, SP # save old stack 3490 subq $ 128 + L_BUFFER_SIZE, %rsp 3491 andq $ -4096, %rsp # align stack 3492 3493 STACK_TOUCH 3494 3495 cmpq $ 0, OLD_M 3496 je .L999 3497 3498 cmpq $ 0, OLD_N 3499 je .L999 3500 3501 cmpq $ 0, OLD_K 3502 je .L999 3503 3504 movq OLD_M, M 3505 movq OLD_N, N 3506 movq OLD_K, K 3507 3508 vmovss %xmm0, ALPHA_R 3509 vmovss %xmm1, ALPHA_I 3510 3511 salq $ ZBASE_SHIFT, LDC 3512 3513 movq N, %rax 3514 xorq %rdx, %rdx 3515 movq $ 2, %rdi 3516 divq %rdi // N / 2 3517 movq %rax, Ndiv6 // N / 2 3518 movq %rdx, Nmod6 // N % 2 3519 3520 3521 3522#ifdef TRMMKERNEL 3523 vmovsd %xmm12, OFFSET 3524 vmovsd %xmm12, KK 3525#ifndef LEFT 3526 negq KK 3527#endif 3528#endif 3529 3530.L2_0: 3531 3532 movq Ndiv6, J 3533 cmpq $ 0, J 3534 je .L1_0 3535 ALIGN_4 3536 3537 3538 3539.L2_01: 3540 // copy to sub buffer 3541 movq B, BO1 3542 leaq BUFFER1, BO // first buffer to BO 3543 movq K, %rax 3544 ALIGN_4 3545 3546.L2_02b: 3547 3548 vmovups (BO1), %xmm0 3549 vmovups %xmm0, (BO) 3550 addq $ 4*SIZE,BO1 3551 addq $ 4*SIZE,BO 3552 decq %rax 3553 jnz .L2_02b 3554 3555.L2_02c: 3556 3557 movq BO1, B // next offset of B 3558 3559.L2_10: 3560 movq C, CO1 3561 leaq (C, LDC, 2), C // c += 2 * ldc 3562 3563#if defined(TRMMKERNEL) && defined(LEFT) 3564 movq OFFSET, %rax 3565 movq %rax, KK 3566#endif 3567 3568 movq A, AO // aoffset = a 3569 addq $ 16 * SIZE, AO 3570 3571 movq M, I 3572 sarq $ 3, I // i = (m >> 3) 3573 je .L2_4_10 3574 3575 ALIGN_4 3576/**********************************************************************************************************/ 3577 3578.L2_8_11: 3579 3580#if !defined(TRMMKERNEL) || \ 3581 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3582 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3583 leaq BUFFER1, BO // first buffer to BO 3584 addq $ 8 * SIZE, BO 3585#else 3586 movq KK, %rax 3587 leaq BUFFER1, BO // first buffer to BO 3588 addq $ 8 * SIZE, BO 3589 movq %rax, BI // Index for BO 3590 leaq (,BI,4), BI // BI = BI * 4 ; number of values 3591 leaq (BO, BI, SIZE), BO 3592 salq $ 4, %rax // rax = rax *16 ; number of values 3593 leaq (AO, %rax, SIZE), AO 3594#endif 3595 3596 vzeroall 3597 3598#ifndef TRMMKERNEL 3599 movq K, %rax 3600#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3601 movq K, %rax 3602 subq KK, %rax 3603 movq %rax, KKK 3604#else 3605 movq KK, %rax 3606#ifdef LEFT 3607 addq $ 8, %rax // number of values in AO 3608#else 3609 addq $ 2, %rax // number of values in BO 3610#endif 3611 movq %rax, KKK 3612#endif 3613 3614 3615 andq $ -8, %rax // K = K - ( K % 8 ) 3616 je .L2_8_16 3617 movq %rax, BI // Index for BO 3618 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3619 3620 salq $ 4, %rax // rax = rax *16 ; number of values 3621 leaq (AO, %rax, SIZE), AO 3622 leaq (BO, BI, SIZE), BO 3623 negq BI 3624 negq %rax 3625 ALIGN_4 3626 3627.L2_8_12: 3628 3629 prefetcht0 A_PR1(AO,%rax,SIZE) 3630 prefetcht0 B_PR1(BO,BI,SIZE) 3631 KERNEL8x2_SUB 3632 prefetcht0 A_PR1(AO,%rax,SIZE) 3633 KERNEL8x2_SUB 3634 prefetcht0 A_PR1(AO,%rax,SIZE) 3635 KERNEL8x2_SUB 3636 prefetcht0 A_PR1(AO,%rax,SIZE) 3637 KERNEL8x2_SUB 3638 3639 prefetcht0 A_PR1(AO,%rax,SIZE) 3640 prefetcht0 B_PR1(BO,BI,SIZE) 3641 KERNEL8x2_SUB 3642 prefetcht0 A_PR1(AO,%rax,SIZE) 3643 KERNEL8x2_SUB 3644 prefetcht0 A_PR1(AO,%rax,SIZE) 3645 KERNEL8x2_SUB 3646 prefetcht0 A_PR1(AO,%rax,SIZE) 3647 KERNEL8x2_SUB 3648 3649 je .L2_8_16 3650 3651 prefetcht0 A_PR1(AO,%rax,SIZE) 3652 prefetcht0 B_PR1(BO,BI,SIZE) 3653 KERNEL8x2_SUB 3654 prefetcht0 A_PR1(AO,%rax,SIZE) 3655 KERNEL8x2_SUB 3656 prefetcht0 A_PR1(AO,%rax,SIZE) 3657 KERNEL8x2_SUB 3658 prefetcht0 A_PR1(AO,%rax,SIZE) 3659 KERNEL8x2_SUB 3660 3661 prefetcht0 A_PR1(AO,%rax,SIZE) 3662 prefetcht0 B_PR1(BO,BI,SIZE) 3663 KERNEL8x2_SUB 3664 prefetcht0 A_PR1(AO,%rax,SIZE) 3665 KERNEL8x2_SUB 3666 prefetcht0 A_PR1(AO,%rax,SIZE) 3667 KERNEL8x2_SUB 3668 prefetcht0 A_PR1(AO,%rax,SIZE) 3669 KERNEL8x2_SUB 3670 3671 je .L2_8_16 3672 3673 jmp .L2_8_12 3674 ALIGN_4 3675 3676.L2_8_16: 3677#ifndef TRMMKERNEL 3678 movq K, %rax 3679#else 3680 movq KKK, %rax 3681#endif 3682 3683 andq $ 7, %rax # if (k & 1) 3684 je .L2_8_19 3685 3686 movq %rax, BI // Index for BO 3687 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3688 3689 salq $ 4, %rax // rax = rax *16 ; number of values 3690 leaq (AO, %rax, SIZE), AO 3691 leaq (BO, BI, SIZE), BO 3692 negq BI 3693 negq %rax 3694 ALIGN_4 3695 3696.L2_8_17: 3697 3698 KERNEL8x2_SUB 3699 3700 jl .L2_8_17 3701 ALIGN_4 3702 3703 3704.L2_8_19: 3705 3706 SAVE8x2 3707 3708 3709#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3710 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3711 movq K, %rax 3712 subq KKK, %rax 3713 movq %rax, BI // Index for BO 3714 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3715 leaq (BO, BI, SIZE), BO 3716 salq $ 4, %rax // rax = rax *16 ; number of values 3717 leaq (AO, %rax, SIZE), AO 3718#endif 3719 3720 3721#if defined(TRMMKERNEL) && defined(LEFT) 3722 addq $ 8, KK 3723#endif 3724 3725 addq $ 16 * SIZE, CO1 # coffset += 16 3726 decq I # i -- 3727 jg .L2_8_11 3728 ALIGN_4 3729 3730 3731/**********************************************************************************************************/ 3732 3733 3734 3735 3736.L2_4_10: 3737 testq $ 7, M 3738 jz .L2_4_60 // to next 2 lines of N 3739 3740 testq $ 4, M 3741 jz .L2_4_20 3742 ALIGN_4 3743 3744 3745.L2_4_11: 3746 3747#if !defined(TRMMKERNEL) || \ 3748 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3749 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3750 leaq BUFFER1, BO // first buffer to BO 3751 addq $ 8 * SIZE, BO 3752#else 3753 movq KK, %rax 3754 leaq BUFFER1, BO // first buffer to BO 3755 addq $ 8 * SIZE, BO 3756 movq %rax, BI // Index for BO 3757 leaq (,BI,4), BI // BI = BI * 4 ; number of values 3758 leaq (BO, BI, SIZE), BO 3759 salq $ 3, %rax // rax = rax * 8 ; number of values 3760 leaq (AO, %rax, SIZE), AO 3761#endif 3762 3763 vzeroall 3764 3765#ifndef TRMMKERNEL 3766 movq K, %rax 3767#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3768 movq K, %rax 3769 subq KK, %rax 3770 movq %rax, KKK 3771#else 3772 movq KK, %rax 3773#ifdef LEFT 3774 addq $ 4, %rax // number of values in AO 3775#else 3776 addq $ 2, %rax // number of values in BO 3777#endif 3778 movq %rax, KKK 3779#endif 3780 3781 3782 andq $ -8, %rax // K = K - ( K % 8 ) 3783 je .L2_4_16 3784 movq %rax, BI // Index for BO 3785 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3786 3787 salq $ 3, %rax // rax = rax * 8 ; number of values 3788 leaq (AO, %rax, SIZE), AO 3789 leaq (BO, BI, SIZE), BO 3790 negq BI 3791 negq %rax 3792 ALIGN_4 3793 3794.L2_4_12: 3795 3796 prefetcht0 A_PR1(AO,%rax,SIZE) 3797 prefetcht0 B_PR1(BO,BI,SIZE) 3798 KERNEL4x2_SUB 3799 KERNEL4x2_SUB 3800 prefetcht0 A_PR1(AO,%rax,SIZE) 3801 KERNEL4x2_SUB 3802 KERNEL4x2_SUB 3803 3804 prefetcht0 A_PR1(AO,%rax,SIZE) 3805 prefetcht0 B_PR1(BO,BI,SIZE) 3806 KERNEL4x2_SUB 3807 KERNEL4x2_SUB 3808 prefetcht0 A_PR1(AO,%rax,SIZE) 3809 KERNEL4x2_SUB 3810 KERNEL4x2_SUB 3811 3812 je .L2_4_16 3813 3814 prefetcht0 A_PR1(AO,%rax,SIZE) 3815 prefetcht0 B_PR1(BO,BI,SIZE) 3816 KERNEL4x2_SUB 3817 KERNEL4x2_SUB 3818 prefetcht0 A_PR1(AO,%rax,SIZE) 3819 KERNEL4x2_SUB 3820 KERNEL4x2_SUB 3821 3822 prefetcht0 A_PR1(AO,%rax,SIZE) 3823 prefetcht0 B_PR1(BO,BI,SIZE) 3824 KERNEL4x2_SUB 3825 KERNEL4x2_SUB 3826 prefetcht0 A_PR1(AO,%rax,SIZE) 3827 KERNEL4x2_SUB 3828 KERNEL4x2_SUB 3829 3830 je .L2_4_16 3831 3832 jmp .L2_4_12 3833 ALIGN_4 3834 3835.L2_4_16: 3836#ifndef TRMMKERNEL 3837 movq K, %rax 3838#else 3839 movq KKK, %rax 3840#endif 3841 3842 andq $ 7, %rax # if (k & 1) 3843 je .L2_4_19 3844 3845 movq %rax, BI // Index for BO 3846 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3847 3848 salq $ 3, %rax // rax = rax * 8 ; number of values 3849 leaq (AO, %rax, SIZE), AO 3850 leaq (BO, BI, SIZE), BO 3851 negq BI 3852 negq %rax 3853 ALIGN_4 3854 3855.L2_4_17: 3856 3857 KERNEL4x2_SUB 3858 3859 jl .L2_4_17 3860 ALIGN_4 3861 3862 3863.L2_4_19: 3864 3865 SAVE4x2 3866 3867#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3868 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3869 movq K, %rax 3870 subq KKK, %rax 3871 movq %rax, BI // Index for BO 3872 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3873 leaq (BO, BI, SIZE), BO 3874 salq $ 3, %rax // rax = rax * 8 ; number of values 3875 leaq (AO, %rax, SIZE), AO 3876#endif 3877 3878 3879#if defined(TRMMKERNEL) && defined(LEFT) 3880 addq $ 4, KK 3881#endif 3882 3883 addq $ 8 * SIZE, CO1 # coffset += 8 3884 ALIGN_4 3885 3886 3887 3888/************************************************************************** 3889* Rest of M 3890***************************************************************************/ 3891 3892.L2_4_20: 3893 3894 testq $ 2, M 3895 jz .L2_4_40 3896 ALIGN_4 3897 3898.L2_4_21: 3899 3900#if !defined(TRMMKERNEL) || \ 3901 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3902 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3903 leaq BUFFER1, BO // first buffer to BO 3904 addq $ 8 * SIZE, BO 3905#else 3906 movq KK, %rax 3907 leaq BUFFER1, BO // first buffer to BO 3908 addq $ 8 * SIZE, BO 3909 movq %rax, BI // Index for BO 3910 leaq (,BI,4), BI // BI = BI * 4 ; number of values 3911 leaq (BO, BI, SIZE), BO 3912 salq $ 2, %rax // rax = rax * 4 ; number of values 3913 leaq (AO, %rax, SIZE), AO 3914#endif 3915 3916 vzeroall 3917 3918#ifndef TRMMKERNEL 3919 movq K, %rax 3920#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3921 movq K, %rax 3922 subq KK, %rax 3923 movq %rax, KKK 3924#else 3925 movq KK, %rax 3926#ifdef LEFT 3927 addq $ 2, %rax // number of values in AO 3928#else 3929 addq $ 2, %rax // number of values in BO 3930#endif 3931 movq %rax, KKK 3932#endif 3933 3934 3935 andq $ -8, %rax // K = K - ( K % 8 ) 3936 je .L2_4_26 3937 movq %rax, BI // Index for BO 3938 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3939 3940 salq $ 2, %rax // rax = rax * 4 ; number of values 3941 leaq (AO, %rax, SIZE), AO 3942 leaq (BO, BI, SIZE), BO 3943 negq BI 3944 negq %rax 3945 ALIGN_4 3946 3947.L2_4_22: 3948 3949 prefetcht0 A_PR1(AO,%rax,SIZE) 3950 prefetcht0 B_PR1(BO,BI,SIZE) 3951 KERNEL2x2_SUB 3952 KERNEL2x2_SUB 3953 KERNEL2x2_SUB 3954 KERNEL2x2_SUB 3955 3956 prefetcht0 A_PR1(AO,%rax,SIZE) 3957 prefetcht0 B_PR1(BO,BI,SIZE) 3958 KERNEL2x2_SUB 3959 KERNEL2x2_SUB 3960 KERNEL2x2_SUB 3961 KERNEL2x2_SUB 3962 3963 je .L2_4_26 3964 3965 prefetcht0 A_PR1(AO,%rax,SIZE) 3966 prefetcht0 B_PR1(BO,BI,SIZE) 3967 KERNEL2x2_SUB 3968 KERNEL2x2_SUB 3969 KERNEL2x2_SUB 3970 KERNEL2x2_SUB 3971 3972 prefetcht0 A_PR1(AO,%rax,SIZE) 3973 prefetcht0 B_PR1(BO,BI,SIZE) 3974 KERNEL2x2_SUB 3975 KERNEL2x2_SUB 3976 KERNEL2x2_SUB 3977 KERNEL2x2_SUB 3978 3979 je .L2_4_26 3980 3981 jmp .L2_4_22 3982 ALIGN_4 3983 3984.L2_4_26: 3985#ifndef TRMMKERNEL 3986 movq K, %rax 3987#else 3988 movq KKK, %rax 3989#endif 3990 3991 andq $ 7, %rax # if (k & 1) 3992 je .L2_4_29 3993 3994 movq %rax, BI // Index for BO 3995 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 3996 3997 salq $ 2, %rax // rax = rax * 4 ; number of values 3998 leaq (AO, %rax, SIZE), AO 3999 leaq (BO, BI, SIZE), BO 4000 negq BI 4001 negq %rax 4002 ALIGN_4 4003 4004.L2_4_27: 4005 4006 KERNEL2x2_SUB 4007 4008 jl .L2_4_27 4009 ALIGN_4 4010 4011 4012.L2_4_29: 4013 4014 vbroadcastss ALPHA_R, %xmm0 4015 vbroadcastss ALPHA_I, %xmm1 4016 4017 // swap high and low 64 bytes 4018 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 4019 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 4020 4021#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \ 4022 defined(NR) || defined(NC) || defined(TR) || defined(TC) 4023 4024 vaddsubps %xmm9, %xmm8 , %xmm8 4025 vaddsubps %xmm11,%xmm10, %xmm10 4026 4027 vshufps $ 0xb1, %xmm8 , %xmm8, %xmm9 4028 vshufps $ 0xb1, %xmm10, %xmm10, %xmm11 4029 4030#else 4031 vaddsubps %xmm8, %xmm9 ,%xmm9 4032 vaddsubps %xmm10, %xmm11,%xmm11 4033 4034 vmovaps %xmm9, %xmm8 4035 vmovaps %xmm11, %xmm10 4036 4037 // swap high and low 64 bytes 4038 vshufps $ 0xb1, %xmm9 , %xmm9, %xmm9 4039 vshufps $ 0xb1, %xmm11, %xmm11, %xmm11 4040 4041#endif 4042 4043 // multiply with ALPHA_R 4044 vmulps %xmm8 , %xmm0, %xmm8 4045 vmulps %xmm10, %xmm0, %xmm10 4046 4047 // multiply with ALPHA_I 4048 vmulps %xmm9 , %xmm1, %xmm9 4049 vmulps %xmm11, %xmm1, %xmm11 4050 4051 vaddsubps %xmm9, %xmm8 , %xmm8 4052 vaddsubps %xmm11,%xmm10, %xmm10 4053 4054 4055 4056#ifndef TRMMKERNEL 4057 4058 vaddps (CO1), %xmm8 , %xmm8 4059 4060 vaddps (CO1, LDC), %xmm10, %xmm10 4061 4062#endif 4063 4064 vmovups %xmm8 , (CO1) 4065 4066 vmovups %xmm10 , (CO1, LDC) 4067 4068 4069 4070#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4071 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4072 movq K, %rax 4073 subq KKK, %rax 4074 movq %rax, BI // Index for BO 4075 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 4076 leaq (BO, BI, SIZE), BO 4077 salq $ 2, %rax // rax = rax * 4 ; number of values 4078 leaq (AO, %rax, SIZE), AO 4079#endif 4080 4081 4082#if defined(TRMMKERNEL) && defined(LEFT) 4083 addq $ 2, KK 4084#endif 4085 4086 addq $ 4 * SIZE, CO1 # coffset += 4 4087 decq I # i -- 4088 jg .L2_4_21 4089 ALIGN_4 4090 4091 4092 4093/**************************************************************************/ 4094.L2_4_40: 4095 testq $ 1, M 4096 jz .L2_4_60 // to next 2 lines of N 4097 4098 ALIGN_4 4099 4100.L2_4_41: 4101 4102#if !defined(TRMMKERNEL) || \ 4103 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4104 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4105 leaq BUFFER1, BO // first buffer to BO 4106 addq $ 8 * SIZE, BO 4107#else 4108 movq KK, %rax 4109 leaq BUFFER1, BO // first buffer to BO 4110 addq $ 8 * SIZE, BO 4111 movq %rax, BI // Index for BO 4112 leaq (,BI,4), BI // BI = BI * 4 ; number of values 4113 leaq (BO, BI, SIZE), BO 4114 salq $ 1, %rax // rax = rax * 2 ; number of values 4115 leaq (AO, %rax, SIZE), AO 4116#endif 4117 4118 vzeroall 4119 4120#ifndef TRMMKERNEL 4121 movq K, %rax 4122#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4123 movq K, %rax 4124 subq KK, %rax 4125 movq %rax, KKK 4126#else 4127 movq KK, %rax 4128#ifdef LEFT 4129 addq $ 1, %rax // number of values in AO 4130#else 4131 addq $ 2, %rax // number of values in BO 4132#endif 4133 movq %rax, KKK 4134#endif 4135 4136 4137 andq $ -8, %rax // K = K - ( K % 8 ) 4138 je .L2_4_46 4139 movq %rax, BI // Index for BO 4140 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 4141 4142 salq $ 1, %rax // rax = rax * 2 ; number of values 4143 leaq (AO, %rax, SIZE), AO 4144 leaq (BO, BI, SIZE), BO 4145 negq BI 4146 negq %rax 4147 ALIGN_4 4148 4149.L2_4_42: 4150 4151 prefetcht0 A_PR1(AO,%rax,SIZE) 4152 prefetcht0 B_PR1(BO,BI,SIZE) 4153 KERNEL1x2_SUB 4154 KERNEL1x2_SUB 4155 KERNEL1x2_SUB 4156 KERNEL1x2_SUB 4157 4158 prefetcht0 B_PR1(BO,BI,SIZE) 4159 KERNEL1x2_SUB 4160 KERNEL1x2_SUB 4161 KERNEL1x2_SUB 4162 KERNEL1x2_SUB 4163 4164 je .L2_4_46 4165 4166 prefetcht0 A_PR1(AO,%rax,SIZE) 4167 prefetcht0 B_PR1(BO,BI,SIZE) 4168 KERNEL1x2_SUB 4169 KERNEL1x2_SUB 4170 KERNEL1x2_SUB 4171 KERNEL1x2_SUB 4172 4173 prefetcht0 B_PR1(BO,BI,SIZE) 4174 KERNEL1x2_SUB 4175 KERNEL1x2_SUB 4176 KERNEL1x2_SUB 4177 KERNEL1x2_SUB 4178 4179 je .L2_4_46 4180 4181 jmp .L2_4_42 4182 ALIGN_4 4183 4184.L2_4_46: 4185#ifndef TRMMKERNEL 4186 movq K, %rax 4187#else 4188 movq KKK, %rax 4189#endif 4190 4191 andq $ 7, %rax # if (k & 1) 4192 je .L2_4_49 4193 4194 movq %rax, BI // Index for BO 4195 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 4196 4197 salq $ 1, %rax // rax = rax * 2 ; number of values 4198 leaq (AO, %rax, SIZE), AO 4199 leaq (BO, BI, SIZE), BO 4200 negq BI 4201 negq %rax 4202 ALIGN_4 4203 4204.L2_4_47: 4205 4206 KERNEL1x2_SUB 4207 4208 jl .L2_4_47 4209 ALIGN_4 4210 4211 4212.L2_4_49: 4213 4214 SAVE1x2 4215 4216#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4217 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4218 movq K, %rax 4219 subq KKK, %rax 4220 movq %rax, BI // Index for BO 4221 leaq ( ,BI,4), BI // BI = BI * 4 ; number of values 4222 leaq (BO, BI, SIZE), BO 4223 salq $ 1, %rax // rax = rax * 2 ; number of values 4224 leaq (AO, %rax, SIZE), AO 4225#endif 4226 4227 4228#if defined(TRMMKERNEL) && defined(LEFT) 4229 addq $ 1, KK 4230#endif 4231 4232 addq $ 2 * SIZE, CO1 # coffset += 2 4233 decq I # i -- 4234 jg .L2_4_41 4235 ALIGN_4 4236 4237 4238 4239 4240.L2_4_60: 4241#if defined(TRMMKERNEL) && !defined(LEFT) 4242 addq $ 2, KK 4243#endif 4244 4245 decq J // j -- 4246 jg .L2_01 // next 2 lines of N 4247 4248 4249 4250.L1_0: 4251 4252/************************************************************************************************ 4253* Loop for Nmod6 % 2 > 0 4254*************************************************************************************************/ 4255 4256 movq Nmod6, J 4257 andq $ 1, J // j % 2 4258 je .L999 4259 ALIGN_4 4260 4261.L1_01: 4262 // copy to sub buffer 4263 movq B, BO1 4264 leaq BUFFER1, BO // first buffer to BO 4265 movq K, %rax 4266 ALIGN_4 4267 4268.L1_02b: 4269 4270 vmovsd (BO1), %xmm0 4271 vmovsd %xmm0, (BO) 4272 addq $ 2*SIZE,BO1 4273 addq $ 2*SIZE,BO 4274 decq %rax 4275 jnz .L1_02b 4276 4277.L1_02c: 4278 4279 movq BO1, B // next offset of B 4280 4281.L1_10: 4282 movq C, CO1 4283 leaq (C, LDC, 1), C // c += 1 * ldc 4284 4285#if defined(TRMMKERNEL) && defined(LEFT) 4286 movq OFFSET, %rax 4287 movq %rax, KK 4288#endif 4289 4290 movq A, AO // aoffset = a 4291 addq $ 16 * SIZE, AO 4292 4293 movq M, I 4294 sarq $ 3, I // i = (m >> 3) 4295 je .L1_4_10 4296 4297 ALIGN_4 4298 4299/**************************************************************************************************/ 4300 4301.L1_8_11: 4302 4303#if !defined(TRMMKERNEL) || \ 4304 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4305 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4306 leaq BUFFER1, BO // first buffer to BO 4307 addq $ 4 * SIZE, BO 4308#else 4309 movq KK, %rax 4310 leaq BUFFER1, BO // first buffer to BO 4311 addq $ 4 * SIZE, BO 4312 movq %rax, BI // Index for BO 4313 leaq (,BI,2), BI // BI = BI * 2 ; number of values 4314 leaq (BO, BI, SIZE), BO 4315 salq $ 4, %rax // rax = rax *16 ; number of values 4316 leaq (AO, %rax, SIZE), AO 4317#endif 4318 4319 vzeroall 4320 4321#ifndef TRMMKERNEL 4322 movq K, %rax 4323#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4324 movq K, %rax 4325 subq KK, %rax 4326 movq %rax, KKK 4327#else 4328 movq KK, %rax 4329#ifdef LEFT 4330 addq $ 8, %rax // number of values in AO 4331#else 4332 addq $ 1, %rax // number of values in BO 4333#endif 4334 movq %rax, KKK 4335#endif 4336 4337 4338 andq $ -8, %rax // K = K - ( K % 8 ) 4339 je .L1_8_16 4340 movq %rax, BI // Index for BO 4341 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4342 4343 salq $ 4, %rax // rax = rax *16 ; number of values 4344 leaq (AO, %rax, SIZE), AO 4345 leaq (BO, BI, SIZE), BO 4346 negq BI 4347 negq %rax 4348 ALIGN_4 4349 4350.L1_8_12: 4351 4352 prefetcht0 A_PR1(AO,%rax,SIZE) 4353 prefetcht0 B_PR1(BO,BI,SIZE) 4354 KERNEL8x1_SUB 4355 prefetcht0 A_PR1(AO,%rax,SIZE) 4356 KERNEL8x1_SUB 4357 prefetcht0 A_PR1(AO,%rax,SIZE) 4358 KERNEL8x1_SUB 4359 prefetcht0 A_PR1(AO,%rax,SIZE) 4360 KERNEL8x1_SUB 4361 4362 prefetcht0 A_PR1(AO,%rax,SIZE) 4363 KERNEL8x1_SUB 4364 prefetcht0 A_PR1(AO,%rax,SIZE) 4365 KERNEL8x1_SUB 4366 prefetcht0 A_PR1(AO,%rax,SIZE) 4367 KERNEL8x1_SUB 4368 prefetcht0 A_PR1(AO,%rax,SIZE) 4369 KERNEL8x1_SUB 4370 4371 je .L1_8_16 4372 4373 prefetcht0 A_PR1(AO,%rax,SIZE) 4374 prefetcht0 B_PR1(BO,BI,SIZE) 4375 KERNEL8x1_SUB 4376 prefetcht0 A_PR1(AO,%rax,SIZE) 4377 KERNEL8x1_SUB 4378 prefetcht0 A_PR1(AO,%rax,SIZE) 4379 KERNEL8x1_SUB 4380 prefetcht0 A_PR1(AO,%rax,SIZE) 4381 KERNEL8x1_SUB 4382 4383 prefetcht0 A_PR1(AO,%rax,SIZE) 4384 KERNEL8x1_SUB 4385 prefetcht0 A_PR1(AO,%rax,SIZE) 4386 KERNEL8x1_SUB 4387 prefetcht0 A_PR1(AO,%rax,SIZE) 4388 KERNEL8x1_SUB 4389 prefetcht0 A_PR1(AO,%rax,SIZE) 4390 KERNEL8x1_SUB 4391 4392 je .L1_8_16 4393 4394 jmp .L1_8_12 4395 ALIGN_4 4396 4397.L1_8_16: 4398#ifndef TRMMKERNEL 4399 movq K, %rax 4400#else 4401 movq KKK, %rax 4402#endif 4403 4404 andq $ 7, %rax # if (k & 1) 4405 je .L1_8_19 4406 4407 movq %rax, BI // Index for BO 4408 leaq ( ,BI,2), BI // BI = BI * 4 ; number of values 4409 4410 salq $ 4, %rax // rax = rax *16 ; number of values 4411 leaq (AO, %rax, SIZE), AO 4412 leaq (BO, BI, SIZE), BO 4413 negq BI 4414 negq %rax 4415 ALIGN_4 4416 4417.L1_8_17: 4418 4419 KERNEL8x1_SUB 4420 4421 jl .L1_8_17 4422 ALIGN_4 4423 4424 4425.L1_8_19: 4426 4427 SAVE8x1 4428 4429 4430#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4431 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4432 movq K, %rax 4433 subq KKK, %rax 4434 movq %rax, BI // Index for BO 4435 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4436 leaq (BO, BI, SIZE), BO 4437 salq $ 4, %rax // rax = rax *16 ; number of values 4438 leaq (AO, %rax, SIZE), AO 4439#endif 4440 4441 4442#if defined(TRMMKERNEL) && defined(LEFT) 4443 addq $ 8, KK 4444#endif 4445 4446 addq $ 16 * SIZE, CO1 # coffset += 16 4447 decq I # i -- 4448 jg .L1_8_11 4449 ALIGN_4 4450 4451 4452 4453/**************************************************************************************************/ 4454.L1_4_10: 4455 4456 testq $ 7, M 4457 jz .L999 4458 4459 testq $ 4, M 4460 jz .L1_4_20 4461 4462 4463.L1_4_11: 4464 4465#if !defined(TRMMKERNEL) || \ 4466 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4467 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4468 leaq BUFFER1, BO // first buffer to BO 4469 addq $ 4 * SIZE, BO 4470#else 4471 movq KK, %rax 4472 leaq BUFFER1, BO // first buffer to BO 4473 addq $ 4 * SIZE, BO 4474 movq %rax, BI // Index for BO 4475 leaq (,BI,2), BI // BI = BI * 2 ; number of values 4476 leaq (BO, BI, SIZE), BO 4477 salq $ 3, %rax // rax = rax * 8 ; number of values 4478 leaq (AO, %rax, SIZE), AO 4479#endif 4480 4481 vzeroall 4482 4483#ifndef TRMMKERNEL 4484 movq K, %rax 4485#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4486 movq K, %rax 4487 subq KK, %rax 4488 movq %rax, KKK 4489#else 4490 movq KK, %rax 4491#ifdef LEFT 4492 addq $ 4, %rax // number of values in AO 4493#else 4494 addq $ 1, %rax // number of values in BO 4495#endif 4496 movq %rax, KKK 4497#endif 4498 4499 4500 andq $ -8, %rax // K = K - ( K % 8 ) 4501 je .L1_4_16 4502 movq %rax, BI // Index for BO 4503 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4504 4505 salq $ 3, %rax // rax = rax * 8 ; number of values 4506 leaq (AO, %rax, SIZE), AO 4507 leaq (BO, BI, SIZE), BO 4508 negq BI 4509 negq %rax 4510 ALIGN_4 4511 4512.L1_4_12: 4513 4514 prefetcht0 A_PR1(AO,%rax,SIZE) 4515 prefetcht0 B_PR1(BO,BI,SIZE) 4516 KERNEL4x1_SUB 4517 KERNEL4x1_SUB 4518 prefetcht0 A_PR1(AO,%rax,SIZE) 4519 KERNEL4x1_SUB 4520 KERNEL4x1_SUB 4521 4522 prefetcht0 A_PR1(AO,%rax,SIZE) 4523 KERNEL4x1_SUB 4524 KERNEL4x1_SUB 4525 prefetcht0 A_PR1(AO,%rax,SIZE) 4526 KERNEL4x1_SUB 4527 KERNEL4x1_SUB 4528 4529 je .L1_4_16 4530 4531 prefetcht0 A_PR1(AO,%rax,SIZE) 4532 prefetcht0 B_PR1(BO,BI,SIZE) 4533 KERNEL4x1_SUB 4534 KERNEL4x1_SUB 4535 prefetcht0 A_PR1(AO,%rax,SIZE) 4536 KERNEL4x1_SUB 4537 KERNEL4x1_SUB 4538 4539 prefetcht0 A_PR1(AO,%rax,SIZE) 4540 KERNEL4x1_SUB 4541 KERNEL4x1_SUB 4542 prefetcht0 A_PR1(AO,%rax,SIZE) 4543 KERNEL4x1_SUB 4544 KERNEL4x1_SUB 4545 4546 je .L1_4_16 4547 4548 jmp .L1_4_12 4549 ALIGN_4 4550 4551.L1_4_16: 4552#ifndef TRMMKERNEL 4553 movq K, %rax 4554#else 4555 movq KKK, %rax 4556#endif 4557 4558 andq $ 7, %rax # if (k & 1) 4559 je .L1_4_19 4560 4561 movq %rax, BI // Index for BO 4562 leaq ( ,BI,2), BI // BI = BI * 4 ; number of values 4563 4564 salq $ 3, %rax // rax = rax * 8 ; number of values 4565 leaq (AO, %rax, SIZE), AO 4566 leaq (BO, BI, SIZE), BO 4567 negq BI 4568 negq %rax 4569 ALIGN_4 4570 4571.L1_4_17: 4572 4573 KERNEL4x1_SUB 4574 4575 jl .L1_4_17 4576 ALIGN_4 4577 4578 4579.L1_4_19: 4580 4581 SAVE4x1 4582 4583#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4584 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4585 movq K, %rax 4586 subq KKK, %rax 4587 movq %rax, BI // Index for BO 4588 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4589 leaq (BO, BI, SIZE), BO 4590 salq $ 3, %rax // rax = rax * 8 ; number of values 4591 leaq (AO, %rax, SIZE), AO 4592#endif 4593 4594 4595#if defined(TRMMKERNEL) && defined(LEFT) 4596 addq $ 4, KK 4597#endif 4598 4599 addq $ 8 * SIZE, CO1 # coffset += 8 4600 ALIGN_4 4601 4602 4603 4604/************************************************************************** 4605* Rest of M 4606***************************************************************************/ 4607 4608.L1_4_20: 4609 4610 testq $ 2, M 4611 jz .L1_4_40 4612 ALIGN_4 4613 4614.L1_4_21: 4615 4616#if !defined(TRMMKERNEL) || \ 4617 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4618 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4619 leaq BUFFER1, BO // first buffer to BO 4620 addq $ 4 * SIZE, BO 4621#else 4622 movq KK, %rax 4623 leaq BUFFER1, BO // first buffer to BO 4624 addq $ 4 * SIZE, BO 4625 movq %rax, BI // Index for BO 4626 leaq (,BI,2), BI // BI = BI * 2 ; number of values 4627 leaq (BO, BI, SIZE), BO 4628 salq $ 2, %rax // rax = rax * 4 ; number of values 4629 leaq (AO, %rax, SIZE), AO 4630#endif 4631 4632 vzeroall 4633 4634#ifndef TRMMKERNEL 4635 movq K, %rax 4636#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4637 movq K, %rax 4638 subq KK, %rax 4639 movq %rax, KKK 4640#else 4641 movq KK, %rax 4642#ifdef LEFT 4643 addq $ 2, %rax // number of values in AO 4644#else 4645 addq $ 1, %rax // number of values in BO 4646#endif 4647 movq %rax, KKK 4648#endif 4649 4650 4651 andq $ -8, %rax // K = K - ( K % 8 ) 4652 je .L1_4_26 4653 movq %rax, BI // Index for BO 4654 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4655 4656 salq $ 2, %rax // rax = rax * 4 ; number of values 4657 leaq (AO, %rax, SIZE), AO 4658 leaq (BO, BI, SIZE), BO 4659 negq BI 4660 negq %rax 4661 ALIGN_4 4662 4663.L1_4_22: 4664 4665 prefetcht0 A_PR1(AO,%rax,SIZE) 4666 prefetcht0 B_PR1(BO,BI,SIZE) 4667 KERNEL2x1_SUB 4668 KERNEL2x1_SUB 4669 KERNEL2x1_SUB 4670 KERNEL2x1_SUB 4671 4672 prefetcht0 A_PR1(AO,%rax,SIZE) 4673 KERNEL2x1_SUB 4674 KERNEL2x1_SUB 4675 KERNEL2x1_SUB 4676 KERNEL2x1_SUB 4677 4678 je .L1_4_26 4679 4680 prefetcht0 A_PR1(AO,%rax,SIZE) 4681 prefetcht0 B_PR1(BO,BI,SIZE) 4682 KERNEL2x1_SUB 4683 KERNEL2x1_SUB 4684 KERNEL2x1_SUB 4685 KERNEL2x1_SUB 4686 4687 prefetcht0 A_PR1(AO,%rax,SIZE) 4688 KERNEL2x1_SUB 4689 KERNEL2x1_SUB 4690 KERNEL2x1_SUB 4691 KERNEL2x1_SUB 4692 4693 je .L1_4_26 4694 4695 jmp .L1_4_22 4696 ALIGN_4 4697 4698.L1_4_26: 4699#ifndef TRMMKERNEL 4700 movq K, %rax 4701#else 4702 movq KKK, %rax 4703#endif 4704 4705 andq $ 7, %rax # if (k & 1) 4706 je .L1_4_29 4707 4708 movq %rax, BI // Index for BO 4709 leaq ( ,BI,2), BI // BI = BI * 2; number of values 4710 4711 salq $ 2, %rax // rax = rax * 4 ; number of values 4712 leaq (AO, %rax, SIZE), AO 4713 leaq (BO, BI, SIZE), BO 4714 negq BI 4715 negq %rax 4716 ALIGN_4 4717 4718.L1_4_27: 4719 4720 KERNEL2x1_SUB 4721 4722 jl .L1_4_27 4723 ALIGN_4 4724 4725 4726.L1_4_29: 4727 4728 SAVE2x1 4729 4730#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4731 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4732 movq K, %rax 4733 subq KKK, %rax 4734 movq %rax, BI // Index for BO 4735 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4736 leaq (BO, BI, SIZE), BO 4737 salq $ 2, %rax // rax = rax * 4 ; number of values 4738 leaq (AO, %rax, SIZE), AO 4739#endif 4740 4741 4742#if defined(TRMMKERNEL) && defined(LEFT) 4743 addq $ 2, KK 4744#endif 4745 4746 addq $ 4 * SIZE, CO1 # coffset += 4 4747 ALIGN_4 4748 4749 4750 4751/**************************************************************************/ 4752.L1_4_40: 4753 testq $ 1, M 4754 jz .L999 // to next 2 lines of N 4755 4756 ALIGN_4 4757 4758.L1_4_41: 4759 4760#if !defined(TRMMKERNEL) || \ 4761 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4762 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4763 leaq BUFFER1, BO // first buffer to BO 4764 addq $ 4 * SIZE, BO 4765#else 4766 movq KK, %rax 4767 leaq BUFFER1, BO // first buffer to BO 4768 addq $ 4 * SIZE, BO 4769 movq %rax, BI // Index for BO 4770 leaq (,BI,2), BI // BI = BI * 2 ; number of values 4771 leaq (BO, BI, SIZE), BO 4772 salq $ 1, %rax // rax = rax * 2 ; number of values 4773 leaq (AO, %rax, SIZE), AO 4774#endif 4775 4776 vzeroall 4777 4778#ifndef TRMMKERNEL 4779 movq K, %rax 4780#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4781 movq K, %rax 4782 subq KK, %rax 4783 movq %rax, KKK 4784#else 4785 movq KK, %rax 4786#ifdef LEFT 4787 addq $ 1, %rax // number of values in AO 4788#else 4789 addq $ 1, %rax // number of values in BO 4790#endif 4791 movq %rax, KKK 4792#endif 4793 4794 4795 andq $ -8, %rax // K = K - ( K % 8 ) 4796 je .L1_4_46 4797 movq %rax, BI // Index for BO 4798 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4799 4800 salq $ 1, %rax // rax = rax * 2 ; number of values 4801 leaq (AO, %rax, SIZE), AO 4802 leaq (BO, BI, SIZE), BO 4803 negq BI 4804 negq %rax 4805 ALIGN_4 4806 4807.L1_4_42: 4808 4809 prefetcht0 A_PR1(AO,%rax,SIZE) 4810 prefetcht0 B_PR1(BO,BI,SIZE) 4811 KERNEL1x1_SUB 4812 KERNEL1x1_SUB 4813 KERNEL1x1_SUB 4814 KERNEL1x1_SUB 4815 4816 KERNEL1x1_SUB 4817 KERNEL1x1_SUB 4818 KERNEL1x1_SUB 4819 KERNEL1x1_SUB 4820 4821 je .L1_4_46 4822 4823 prefetcht0 A_PR1(AO,%rax,SIZE) 4824 prefetcht0 B_PR1(BO,BI,SIZE) 4825 KERNEL1x1_SUB 4826 KERNEL1x1_SUB 4827 KERNEL1x1_SUB 4828 KERNEL1x1_SUB 4829 4830 KERNEL1x1_SUB 4831 KERNEL1x1_SUB 4832 KERNEL1x1_SUB 4833 KERNEL1x1_SUB 4834 4835 je .L1_4_46 4836 4837 jmp .L1_4_42 4838 ALIGN_4 4839 4840.L1_4_46: 4841#ifndef TRMMKERNEL 4842 movq K, %rax 4843#else 4844 movq KKK, %rax 4845#endif 4846 4847 andq $ 7, %rax # if (k & 1) 4848 je .L1_4_49 4849 4850 movq %rax, BI // Index for BO 4851 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4852 4853 salq $ 1, %rax // rax = rax * 2 ; number of values 4854 leaq (AO, %rax, SIZE), AO 4855 leaq (BO, BI, SIZE), BO 4856 negq BI 4857 negq %rax 4858 ALIGN_4 4859 4860.L1_4_47: 4861 4862 KERNEL1x1_SUB 4863 4864 jl .L1_4_47 4865 ALIGN_4 4866 4867 4868.L1_4_49: 4869 4870 SAVE1x1 4871 4872 4873 4874#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4875 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4876 movq K, %rax 4877 subq KKK, %rax 4878 movq %rax, BI // Index for BO 4879 leaq ( ,BI,2), BI // BI = BI * 2 ; number of values 4880 leaq (BO, BI, SIZE), BO 4881 salq $ 1, %rax // rax = rax * 2 ; number of values 4882 leaq (AO, %rax, SIZE), AO 4883#endif 4884 4885 4886#if defined(TRMMKERNEL) && defined(LEFT) 4887 addq $ 1, KK 4888#endif 4889 4890 addq $ 2 * SIZE, CO1 # coffset += 2 4891 ALIGN_4 4892 4893 4894.L999: 4895 vzeroupper 4896 4897 movq SP, %rsp 4898 movq (%rsp), %rbx 4899 movq 8(%rsp), %rbp 4900 movq 16(%rsp), %r12 4901 movq 24(%rsp), %r13 4902 movq 32(%rsp), %r14 4903 movq 40(%rsp), %r15 4904 4905#ifdef WINDOWS_ABI 4906 movq 48(%rsp), %rdi 4907 movq 56(%rsp), %rsi 4908 vmovups 64(%rsp), %xmm6 4909 vmovups 80(%rsp), %xmm7 4910 vmovups 96(%rsp), %xmm8 4911 vmovups 112(%rsp), %xmm9 4912 vmovups 128(%rsp), %xmm10 4913 vmovups 144(%rsp), %xmm11 4914 vmovups 160(%rsp), %xmm12 4915 vmovups 176(%rsp), %xmm13 4916 vmovups 192(%rsp), %xmm14 4917 vmovups 208(%rsp), %xmm15 4918#endif 4919 4920 addq $ STACKSIZE, %rsp 4921 ret 4922 4923 EPILOGUE 4924 4925 4926#endif 4927 4928