/*****************************************************************************
Copyright (c) 2011-2014, The OpenBLAS Project
All rights reserved.

Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:

   1. Redistributions of source code must retain the above copyright
      notice, this list of conditions and the following disclaimer.

   2. Redistributions in binary form must reproduce the above copyright
      notice, this list of conditions and the following disclaimer in
      the documentation and/or other materials provided with the
      distribution.
   3. Neither the name of the OpenBLAS project nor the names of
      its contributors may be used to endorse or promote products
      derived from this software without specific prior written
      permission.

THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31 32 **********************************************************************************/ 33 34#define ASSEMBLER 35#include "common.h" 36 37#define old_bm %rdi 38#define old_bn %rsi 39#define old_bk %rdx 40 41#define bm %r13 42#define bn %r14 43#define bk %r15 44 45#define ALPHA %xmm0 46#define ba %rcx 47#define bb %r8 48#define C %r9 49#define ldc %r10 50 51#define i %r11 52#define k %rax 53 54#define ptrba %rdi 55#define ptrbb %rsi 56#define C0 %rbx 57#define C1 %rbp 58 59#define prebb %r12 60 61#ifndef WINDOWS_ABI 62 63#define STACKSIZE 128 64 65#define old_ldc 8+STACKSIZE(%rsp) 66#define old_offset 16+STACKSIZE(%rsp) 67#define MEMALPHA 48(%rsp) 68#define j 56(%rsp) 69#define OFFSET 64(%rsp) 70#define kk 72(%rsp) 71#define kkk 80(%rsp) 72 73#else 74 75#define STACKSIZE 512 76 77#define OLD_A 40 + STACKSIZE(%rsp) 78#define OLD_B 48 + STACKSIZE(%rsp) 79#define OLD_C 56 + STACKSIZE(%rsp) 80#define old_ldc 64 + STACKSIZE(%rsp) 81#define old_offset 72 + STACKSIZE(%rsp) 82 83#define MEMALPHA 224(%rsp) 84#define j 232(%rsp) 85#define OFFSET 240(%rsp) 86#define kk 248(%rsp) 87#define kkk 256(%rsp) 88 89#endif 90 91#define PREFETCH0 prefetcht0 92#define PREFETCH1 prefetcht0 93#define PREFETCH2 prefetcht2 94 95#define xvec0 %xmm0 96#define xvec1 %xmm1 97#define xvec2 %xmm2 98#define xvec3 %xmm3 99#define xvec4 %xmm4 100#define xvec5 %xmm5 101#define xvec6 %xmm6 102#define xvec7 %xmm7 103#define xvec8 %xmm8 104#define xvec9 %xmm9 105#define xvec10 %xmm10 106#define xvec11 %xmm11 107#define xvec12 %xmm12 108#define xvec13 %xmm13 109#define xvec14 %xmm14 110#define xvec15 %xmm15 111 112#define yvec0 %ymm0 113#define yvec1 %ymm1 114#define yvec2 %ymm2 115#define yvec3 %ymm3 116#define yvec4 %ymm4 117#define yvec5 %ymm5 118#define yvec6 %ymm6 119#define yvec7 %ymm7 120#define yvec8 %ymm8 121#define yvec9 %ymm9 122#define yvec10 %ymm10 123#define yvec11 %ymm11 124#define yvec12 %ymm12 125#define yvec13 %ymm13 126#define yvec14 %ymm14 127#define yvec15 %ymm15 128 129#define LEAQ 
leaq 130#define ADDQ addq 131#define MULQ imulq 132#define SARQ sarq 133#define SALQ salq 134#define ANDQ andq 135#define SUBQ subq 136#define DECQ decq 137#define JG jg 138#define JLE jle 139#define TEST testq 140#define OR orq 141#define JNE jne 142#define NOP 143#define XOR xorpd 144#undef MOVQ 145#define MOVQ movq 146 147#define XOR_DY vxorpd 148#define XOR_DX vxorpd 149 150#define LD_DY vmovapd 151#define LD_DX vmovapd 152#define LDL_DX vmovlpd 153#define LDL_DY vmovlpd 154#define LDH_DX vmovhpd 155#define LDH_DY vmovhpd 156 157#define ST_DY vmovapd 158#define ST_DX vmovapd 159#define STL_DX vmovlpd 160#define STL_DY vmovlpd 161#define STH_DX vmovhpd 162#define STH_DY vmovhpd 163 164#define EDUP_DY vmovddup 165 166#define ADD_DY vaddpd 167#define ADD_DX vaddpd 168 169#define ADD1_DY vaddpd 170#define ADD2_DY vaddpd 171#define ADDSUB_DY vaddsubpd 172 173#define MUL_DY vmulpd 174#define MUL_DX vmulpd 175 176#define SHUF_DY vperm2f128 177#define SHUF_DX vpshufd 178 179#define VPERMILP_DY vpermilpd 180 181#define BROAD_DY vbroadcastsd 182#define BROAD_DX vmovddup 183 184#define MOV_DY vmovapd 185#define MOV_DX vmovapd 186 187#define REVS_DY vshufpd 188#define REVS_DX vmovsd 189 190#define EXTRA_DY vextractf128 191 192PROLOGUE 193 194subq $STACKSIZE, %rsp; 195movq %rbx, 0(%rsp); 196movq %rbp, 8(%rsp); 197movq %r12, 16(%rsp); 198movq %r13, 24(%rsp); 199movq %r14, 32(%rsp); 200movq %r15, 40(%rsp); 201 202#ifdef WINDOWS_ABI 203 movq %rdi, 48(%rsp) 204 movq %rsi, 56(%rsp) 205 movups %xmm6, 64(%rsp) 206 movups %xmm7, 80(%rsp) 207 movups %xmm8, 96(%rsp) 208 movups %xmm9, 112(%rsp) 209 movups %xmm10, 128(%rsp) 210 movups %xmm11, 144(%rsp) 211 movups %xmm12, 160(%rsp) 212 movups %xmm13, 176(%rsp) 213 movups %xmm14, 192(%rsp) 214 movups %xmm15, 208(%rsp) 215 216 movq ARG1, old_bm 217 movq ARG2, old_bn 218 movq ARG3, old_bk 219 movq OLD_A, ba 220 movq OLD_B, bb 221 movq OLD_C, C 222 movq old_ldc, ldc 223#ifdef TRMMKERNEL 224 movq old_offset, %r11 225#endif 226 movaps %xmm3, 
%xmm0 227#else 228 229movq old_ldc, ldc 230#ifdef TRMMKERNEL 231movq old_offset, %r11 232#endif 233#endif 234 235vzeroupper 236 237vmovlps ALPHA, MEMALPHA 238movq old_bm, bm 239movq old_bn, bn 240movq old_bk, bk 241leaq (, ldc, SIZE), ldc 242#ifdef TRMMKERNEL 243movq %r11, OFFSET 244#ifndef LEFT 245negq %r11; 246#endif 247movq %r11, kk 248#endif 249 250MOVQ bn,j; 251SARQ $2,j; # Rn = 4 252JLE .L0_loopE; 253ALIGN_5; 254.L0_bodyB:; 255#if defined(TRMMKERNEL) && defined(LEFT) 256MOVQ OFFSET, %rax; 257MOVQ %rax, kk; 258#endif 259 260MOVQ C,C0; 261LEAQ (C,ldc,2),C1; 262MOVQ bk, k; 263SALQ $5, k; 264LEAQ (bb, k, 1), prebb; 265MOVQ ba,ptrba; 266MOVQ bm,i; 267SARQ $3,i; # Rm = 8 268JLE .L1_loopE; 269ALIGN_5; 270.L1_bodyB:; 271#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 272MOVQ bb, ptrbb; 273#else 274MOVQ bb, ptrbb; 275MOVQ kk, %rax; 276LEAQ (, %rax, SIZE), %rax; 277LEAQ (ptrba, %rax, 8), ptrba; 278LEAQ (ptrbb, %rax, 4), ptrbb; 279#endif 280//#### Initial Results Register #### 281PREFETCH2 0*SIZE(prebb); 282XOR_DY yvec15, yvec15, yvec15; 283PREFETCH2 8*SIZE(prebb); 284XOR_DY yvec14, yvec14, yvec14; 285XOR_DY yvec13, yvec13, yvec13; 286ADDQ $16*SIZE, prebb 287XOR_DY yvec12, yvec12, yvec12; 288PREFETCH0 3*SIZE(C0) 289LD_DY 0*SIZE(ptrbb), yvec2; 290PREFETCH0 3*SIZE(C0, ldc, 1) 291XOR_DY yvec11, yvec11, yvec11; 292PREFETCH0 3*SIZE(C1) 293XOR_DY yvec10, yvec10, yvec10; 294PREFETCH0 3*SIZE(C1, ldc, 1) 295LD_DY 0*SIZE(ptrba), yvec0; 296XOR_DY yvec9, yvec9, yvec9; 297XOR_DY yvec8, yvec8, yvec8; 298VPERMILP_DY $0x05, yvec2, yvec3; 299#ifndef TRMMKERNEL 300MOVQ bk,k; 301#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 302MOVQ bk, %rax; 303SUBQ kk, %rax; 304MOVQ %rax, kkk; 305#else 306MOVQ kk, %rax; 307#ifdef LEFT 308ADDQ $8, %rax; 309#else 310ADDQ $4, %rax; 311#endif 312MOVQ %rax, kkk; 313#endif 314SARQ $2,k; 315JLE .L2_loopE; 316ALIGN_5; 317.L2_bodyB:; 318# 
Computing kernel 319 320//#### Unroll times 1 #### 321LD_DY 4*SIZE(ptrba), yvec1; 322MUL_DY yvec0, yvec2, yvec6; 323SHUF_DY $0x03, yvec2, yvec2, yvec4; 324MUL_DY yvec0, yvec3, yvec7; 325SHUF_DY $0x03, yvec3, yvec3, yvec5; 326ADD_DY yvec15, yvec6, yvec15; 327ADD_DY yvec13, yvec7, yvec13; 328 329PREFETCH0 64*SIZE(ptrba) 330MUL_DY yvec1, yvec2, yvec6; 331LD_DY 4*SIZE(ptrbb), yvec2; 332MUL_DY yvec1, yvec3, yvec7; 333VPERMILP_DY $0x05, yvec2, yvec3; 334ADD_DY yvec14, yvec6, yvec14; 335ADD_DY yvec12, yvec7, yvec12; 336 337MUL_DY yvec0, yvec4, yvec6; 338MUL_DY yvec0, yvec5, yvec7; 339LD_DY 8*SIZE(ptrba), yvec0; 340ADD_DY yvec11, yvec6, yvec11; 341ADD_DY yvec9, yvec7, yvec9; 342 343MUL_DY yvec1, yvec4, yvec6; 344MUL_DY yvec1, yvec5, yvec7; 345ADD_DY yvec10, yvec6, yvec10; 346ADD_DY yvec8, yvec7, yvec8; 347 348//#### Unroll times 2 #### 349LD_DY 12*SIZE(ptrba), yvec1; 350MUL_DY yvec0, yvec2, yvec6; 351SHUF_DY $0x03, yvec2, yvec2, yvec4; 352MUL_DY yvec0, yvec3, yvec7; 353SHUF_DY $0x03, yvec3, yvec3, yvec5; 354ADD_DY yvec15, yvec6, yvec15; 355ADD_DY yvec13, yvec7, yvec13; 356 357PREFETCH0 72*SIZE(ptrba) 358MUL_DY yvec1, yvec2, yvec6; 359LD_DY 8*SIZE(ptrbb), yvec2; 360MUL_DY yvec1, yvec3, yvec7; 361VPERMILP_DY $0x05, yvec2, yvec3; 362ADD_DY yvec14, yvec6, yvec14; 363ADD_DY yvec12, yvec7, yvec12; 364 365MUL_DY yvec0, yvec4, yvec6; 366MUL_DY yvec0, yvec5, yvec7; 367LD_DY 16*SIZE(ptrba), yvec0; 368ADD_DY yvec11, yvec6, yvec11; 369ADD_DY yvec9, yvec7, yvec9; 370 371MUL_DY yvec1, yvec4, yvec6; 372MUL_DY yvec1, yvec5, yvec7; 373ADD_DY yvec10, yvec6, yvec10; 374ADD_DY yvec8, yvec7, yvec8; 375 376//#### Unroll times 3 #### 377LD_DY 20*SIZE(ptrba), yvec1; 378MUL_DY yvec0, yvec2, yvec6; 379SHUF_DY $0x03, yvec2, yvec2, yvec4; 380MUL_DY yvec0, yvec3, yvec7; 381SHUF_DY $0x03, yvec3, yvec3, yvec5; 382ADD_DY yvec15, yvec6, yvec15; 383ADD_DY yvec13, yvec7, yvec13; 384 385PREFETCH0 80*SIZE(ptrba) 386MUL_DY yvec1, yvec2, yvec6; 387LD_DY 12*SIZE(ptrbb), yvec2; 388ADDQ $16*SIZE, ptrbb; 389MUL_DY 
yvec1, yvec3, yvec7; 390VPERMILP_DY $0x05, yvec2, yvec3; 391ADD_DY yvec14, yvec6, yvec14; 392ADD_DY yvec12, yvec7, yvec12; 393 394MUL_DY yvec0, yvec4, yvec6; 395MUL_DY yvec0, yvec5, yvec7; 396LD_DY 24*SIZE(ptrba), yvec0; 397ADD_DY yvec11, yvec6, yvec11; 398ADD_DY yvec9, yvec7, yvec9; 399 400MUL_DY yvec1, yvec4, yvec6; 401MUL_DY yvec1, yvec5, yvec7; 402ADD_DY yvec10, yvec6, yvec10; 403ADD_DY yvec8, yvec7, yvec8; 404 405//#### Unroll times 4 #### 406LD_DY 28*SIZE(ptrba), yvec1; 407MUL_DY yvec0, yvec2, yvec6; 408SHUF_DY $0x03, yvec2, yvec2, yvec4; 409MUL_DY yvec0, yvec3, yvec7; 410SHUF_DY $0x03, yvec3, yvec3, yvec5; 411ADDQ $32*SIZE, ptrba; 412ADD_DY yvec15, yvec6, yvec15; 413ADD_DY yvec13, yvec7, yvec13; 414 415PREFETCH0 88*SIZE(ptrba) 416MUL_DY yvec1, yvec2, yvec6; 417LD_DY 0*SIZE(ptrbb), yvec2; 418MUL_DY yvec1, yvec3, yvec7; 419VPERMILP_DY $0x05, yvec2, yvec3; 420ADD_DY yvec14, yvec6, yvec14; 421ADD_DY yvec12, yvec7, yvec12; 422 423MUL_DY yvec0, yvec4, yvec6; 424MUL_DY yvec0, yvec5, yvec7; 425LD_DY 0*SIZE(ptrba), yvec0; 426ADD_DY yvec11, yvec6, yvec11; 427ADD_DY yvec9, yvec7, yvec9; 428 429MUL_DY yvec1, yvec4, yvec6; 430MUL_DY yvec1, yvec5, yvec7; 431ADD_DY yvec10, yvec6, yvec10; 432ADD_DY yvec8, yvec7, yvec8; 433.L2_bodyE:; 434DECQ k; 435JG .L2_bodyB; 436ALIGN_5 437.L2_loopE:; 438PREFETCH2 0*SIZE(prebb); 439ADDQ $8*SIZE, prebb; 440#ifndef TRMMKERNEL 441TEST $2, bk; 442#else 443MOVQ kkk, %rax; 444TEST $2, %rax; 445#endif 446JLE .L3_loopE; 447ALIGN_5 448.L3_bodyB: 449//#### Unroll times 1 #### 450PREFETCH0 64*SIZE(ptrba) 451LD_DY 4*SIZE(ptrba), yvec1; 452MUL_DY yvec0, yvec2, yvec6; 453SHUF_DY $0x03, yvec2, yvec2, yvec4; 454MUL_DY yvec0, yvec3, yvec7; 455SHUF_DY $0x03, yvec3, yvec3, yvec5; 456ADD_DY yvec15, yvec6, yvec15; 457ADD_DY yvec13, yvec7, yvec13; 458 459MUL_DY yvec1, yvec2, yvec6; 460LD_DY 4*SIZE(ptrbb), yvec2; 461ADDQ $8*SIZE, ptrbb; 462MUL_DY yvec1, yvec3, yvec7; 463VPERMILP_DY $0x05, yvec2, yvec3; 464ADD_DY yvec14, yvec6, yvec14; 465ADD_DY yvec12, yvec7, 
yvec12; 466 467MUL_DY yvec0, yvec4, yvec6; 468MUL_DY yvec0, yvec5, yvec7; 469LD_DY 8*SIZE(ptrba), yvec0; 470ADD_DY yvec11, yvec6, yvec11; 471ADD_DY yvec9, yvec7, yvec9; 472 473MUL_DY yvec1, yvec4, yvec6; 474MUL_DY yvec1, yvec5, yvec7; 475ADD_DY yvec10, yvec6, yvec10; 476ADD_DY yvec8, yvec7, yvec8; 477 478//#### Unroll times 2 #### 479PREFETCH0 72*SIZE(ptrba) 480LD_DY 12*SIZE(ptrba), yvec1; 481MUL_DY yvec0, yvec2, yvec6; 482SHUF_DY $0x03, yvec2, yvec2, yvec4; 483MUL_DY yvec0, yvec3, yvec7; 484SHUF_DY $0x03, yvec3, yvec3, yvec5; 485ADDQ $16*SIZE, ptrba; 486ADD_DY yvec15, yvec6, yvec15; 487ADD_DY yvec13, yvec7, yvec13; 488 489MUL_DY yvec1, yvec2, yvec6; 490LD_DY 0*SIZE(ptrbb), yvec2; 491MUL_DY yvec1, yvec3, yvec7; 492VPERMILP_DY $0x05, yvec2, yvec3; 493ADD_DY yvec14, yvec6, yvec14; 494ADD_DY yvec12, yvec7, yvec12; 495 496MUL_DY yvec0, yvec4, yvec6; 497MUL_DY yvec0, yvec5, yvec7; 498LD_DY 0*SIZE(ptrba), yvec0; 499ADD_DY yvec11, yvec6, yvec11; 500ADD_DY yvec9, yvec7, yvec9; 501 502MUL_DY yvec1, yvec4, yvec6; 503MUL_DY yvec1, yvec5, yvec7; 504ADD_DY yvec10, yvec6, yvec10; 505ADD_DY yvec8, yvec7, yvec8; 506 507.L3_loopE: 508PREFETCH2 0*SIZE(prebb); 509ADDQ $8*SIZE, prebb 510#ifndef TRMMKERNEL 511TEST $1, bk; 512#else 513MOVQ kkk, %rax; 514TEST $1, %rax; 515#endif 516JLE .L4_loopE; 517ALIGN_5 518.L4_bodyB:; 519//#### Unroll times 1 #### 520PREFETCH0 64*SIZE(ptrba) 521LD_DY 4*SIZE(ptrba), yvec1; 522MUL_DY yvec0, yvec2, yvec6; 523SHUF_DY $0x03, yvec2, yvec2, yvec4; 524MUL_DY yvec0, yvec3, yvec7; 525SHUF_DY $0x03, yvec3, yvec3, yvec5; 526ADDQ $8*SIZE, ptrba; 527ADD_DY yvec15, yvec6, yvec15; 528ADD_DY yvec13, yvec7, yvec13; 529 530MUL_DY yvec1, yvec2, yvec6; 531MUL_DY yvec1, yvec3, yvec7; 532ADDQ $4*SIZE, ptrbb; 533ADD_DY yvec14, yvec6, yvec14; 534ADD_DY yvec12, yvec7, yvec12; 535 536MUL_DY yvec0, yvec4, yvec6; 537MUL_DY yvec0, yvec5, yvec7; 538ADD_DY yvec11, yvec6, yvec11; 539ADD_DY yvec9, yvec7, yvec9; 540 541MUL_DY yvec1, yvec4, yvec6; 542MUL_DY yvec1, yvec5, yvec7; 
543ADD_DY yvec10, yvec6, yvec10; 544ADD_DY yvec8, yvec7, yvec8; 545 546.L4_loopE:; 547//#### Load Alpha #### 548BROAD_DY MEMALPHA,yvec7; 549//#### Multiply Alpha #### 550MUL_DY yvec7,yvec15,yvec15; 551MUL_DY yvec7,yvec14,yvec14; 552MUL_DY yvec7,yvec13,yvec13; 553MUL_DY yvec7,yvec12,yvec12; 554MUL_DY yvec7,yvec11,yvec11; 555MUL_DY yvec7,yvec10,yvec10; 556MUL_DY yvec7,yvec9,yvec9; 557MUL_DY yvec7,yvec8,yvec8; 558//#### Reverse the Results #### 559MOV_DY yvec15,yvec7; 560REVS_DY $0x0a,yvec13,yvec15,yvec15; 561REVS_DY $0x0a,yvec7,yvec13,yvec13; 562MOV_DY yvec14,yvec7; 563REVS_DY $0x0a,yvec12,yvec14,yvec14; 564REVS_DY $0x0a,yvec7,yvec12,yvec12; 565MOV_DY yvec11,yvec7; 566REVS_DY $0x0a,yvec9,yvec11,yvec11; 567REVS_DY $0x0a,yvec7,yvec9,yvec9; 568MOV_DY yvec10,yvec7; 569REVS_DY $0x0a,yvec8,yvec10,yvec10; 570REVS_DY $0x0a,yvec7,yvec8,yvec8; 571//#### Testing alignment #### 572MOVQ C0, %rax; 573OR ldc, %rax; 574TEST $15, %rax; 575JNE .L4_loopEx; # Unalign part write back 576ALIGN_5 577//#### Writing Back #### 578EXTRA_DY $1,yvec15,xvec7; 579EXTRA_DY $1,yvec14,xvec6; 580EXTRA_DY $1,yvec13,xvec5; 581EXTRA_DY $1,yvec12,xvec4; 582EXTRA_DY $1,yvec11,xvec3; 583EXTRA_DY $1,yvec10,xvec2; 584EXTRA_DY $1,yvec9,xvec1; 585EXTRA_DY $1,yvec8,xvec0; 586#ifndef TRMMKERNEL 587ADD_DY 0*SIZE(C0),xvec15,xvec15; 588ADD_DY 2*SIZE(C1),xvec7,xvec7; 589ADD_DY 4*SIZE(C0),xvec14,xvec14; 590ADD_DY 6*SIZE(C1),xvec6,xvec6; 591ADD_DY 0*SIZE(C0,ldc,1),xvec13,xvec13; 592ADD_DY 2*SIZE(C1,ldc,1),xvec5,xvec5; 593ADD_DY 4*SIZE(C0,ldc,1),xvec12,xvec12; 594ADD_DY 6*SIZE(C1,ldc,1),xvec4,xvec4; 595ADD_DY 0*SIZE(C1),xvec11,xvec11; 596ADD_DY 2*SIZE(C0),xvec3,xvec3; 597ADD_DY 4*SIZE(C1),xvec10,xvec10; 598ADD_DY 6*SIZE(C0),xvec2,xvec2; 599ADD_DY 0*SIZE(C1,ldc,1),xvec9,xvec9; 600ADD_DY 2*SIZE(C0,ldc,1),xvec1,xvec1; 601ADD_DY 4*SIZE(C1,ldc,1),xvec8,xvec8; 602ADD_DY 6*SIZE(C0,ldc,1),xvec0,xvec0; 603#endif 604ST_DY xvec15, 0*SIZE(C0); 605ST_DY xvec7, 2*SIZE(C1); 606ST_DY xvec14, 4*SIZE(C0); 607ST_DY xvec6, 6*SIZE(C1); 
608ST_DY xvec13, 0*SIZE(C0,ldc,1); 609ST_DY xvec5, 2*SIZE(C1,ldc,1); 610ST_DY xvec12, 4*SIZE(C0,ldc,1); 611ST_DY xvec4, 6*SIZE(C1,ldc,1); 612ST_DY xvec11, 0*SIZE(C1); 613ST_DY xvec3, 2*SIZE(C0); 614ST_DY xvec10, 4*SIZE(C1); 615ST_DY xvec2, 6*SIZE(C0); 616ST_DY xvec9, 0*SIZE(C1,ldc,1); 617ST_DY xvec1, 2*SIZE(C0,ldc,1); 618ST_DY xvec8, 4*SIZE(C1,ldc,1); 619ST_DY xvec0, 6*SIZE(C0,ldc,1); 620#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 621MOVQ bk, %rax; 622SUBQ kkk, %rax; 623LEAQ (, %rax, SIZE), %rax; 624LEAQ (ptrba, %rax, 8), ptrba; 625LEAQ (ptrbb, %rax, 4), ptrbb; 626#endif 627#if defined(TRMMKERNEL) && defined(LEFT) 628ADDQ $8, kk 629#endif 630ADDQ $8*SIZE,C0; 631ADDQ $8*SIZE,C1; 632.L1_bodyE:; 633DECQ i; 634JG .L1_bodyB; 635JMP .L1_loopE; 636ALIGN_5; 637.L4_loopEx:; 638EXTRA_DY $1, yvec15, xvec7; 639#ifndef TRMMKERNEL 640LDL_DY 0*SIZE(C0), xvec6, xvec6; 641LDH_DY 1*SIZE(C0), xvec6, xvec6; 642ADD_DY xvec6, xvec15, xvec15; 643LDL_DY 2*SIZE(C1), xvec5, xvec5; 644LDH_DY 3*SIZE(C1), xvec5, xvec5; 645ADD_DY xvec5, xvec7, xvec7; 646#endif 647STL_DY xvec15, 0*SIZE(C0); 648STH_DY xvec15, 1*SIZE(C0); 649STL_DY xvec7, 2*SIZE(C1); 650STH_DY xvec7, 3*SIZE(C1); 651 652EXTRA_DY $1, yvec14, xvec4; 653#ifndef TRMMKERNEL 654LDL_DY 4*SIZE(C0), xvec3, xvec3; 655LDH_DY 5*SIZE(C0), xvec3, xvec3; 656ADD_DY xvec3, xvec14, xvec14; 657LDL_DY 6*SIZE(C1), xvec2, xvec2; 658LDH_DY 7*SIZE(C1), xvec2, xvec2; 659ADD_DY xvec2, xvec4, xvec4; 660#endif 661STL_DY xvec14, 4*SIZE(C0); 662STH_DY xvec14, 5*SIZE(C0); 663STL_DY xvec4, 6*SIZE(C1); 664STH_DY xvec4, 7*SIZE(C1); 665 666EXTRA_DY $1, yvec13, xvec7; 667#ifndef TRMMKERNEL 668LDL_DY 0*SIZE(C0, ldc, 1), xvec6, xvec6; 669LDH_DY 1*SIZE(C0, ldc, 1), xvec6, xvec6; 670ADD_DY xvec6, xvec13, xvec13; 671LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; 672LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; 673ADD_DY xvec5, xvec7, xvec7; 674#endif 675STL_DY xvec13, 0*SIZE(C0, ldc, 1); 676STH_DY 
xvec13, 1*SIZE(C0, ldc, 1); 677STL_DY xvec7, 2*SIZE(C1, ldc, 1); 678STH_DY xvec7, 3*SIZE(C1, ldc, 1); 679 680EXTRA_DY $1, yvec12, xvec4; 681#ifndef TRMMKERNEL 682LDL_DY 4*SIZE(C0, ldc, 1), xvec3, xvec3; 683LDH_DY 5*SIZE(C0, ldc, 1), xvec3, xvec3; 684ADD_DY xvec3, xvec12, xvec12; 685LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; 686LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; 687ADD_DY xvec2, xvec4, xvec4; 688#endif 689STL_DY xvec12, 4*SIZE(C0, ldc, 1); 690STH_DY xvec12, 5*SIZE(C0, ldc ,1); 691STL_DY xvec4, 6*SIZE(C1, ldc, 1); 692STH_DY xvec4, 7*SIZE(C1, ldc, 1); 693 694EXTRA_DY $1, yvec11, xvec7; 695#ifndef TRMMKERNEL 696LDL_DY 0*SIZE(C1), xvec6, xvec6; 697LDH_DY 1*SIZE(C1), xvec6, xvec6; 698ADD_DY xvec6, xvec11, xvec11; 699LDL_DY 2*SIZE(C0), xvec5, xvec5; 700LDH_DY 3*SIZE(C0), xvec5, xvec5; 701ADD_DY xvec5, xvec7, xvec7; 702#endif 703STL_DY xvec11, 0*SIZE(C1); 704STH_DY xvec11, 1*SIZE(C1); 705STL_DY xvec7, 2*SIZE(C0); 706STH_DY xvec7, 3*SIZE(C0); 707 708EXTRA_DY $1, yvec10, xvec4; 709#ifndef TRMMKERNEL 710LDL_DY 4*SIZE(C1), xvec3, xvec3; 711LDH_DY 5*SIZE(C1), xvec3, xvec3; 712ADD_DY xvec3, xvec10, xvec10; 713LDL_DY 6*SIZE(C0), xvec2, xvec2; 714LDH_DY 7*SIZE(C0), xvec2, xvec2; 715ADD_DY xvec2, xvec4, xvec4; 716#endif 717STL_DY xvec10, 4*SIZE(C1); 718STH_DY xvec10, 5*SIZE(C1); 719STL_DY xvec4, 6*SIZE(C0); 720STH_DY xvec4, 7*SIZE(C0); 721 722EXTRA_DY $1, yvec9, xvec7; 723#ifndef TRMMKERNEL 724LDL_DY 0*SIZE(C1, ldc, 1), xvec6, xvec6; 725LDH_DY 1*SIZE(C1, ldc, 1), xvec6, xvec6; 726ADD_DY xvec6, xvec9, xvec9; 727LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; 728LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; 729ADD_DY xvec5, xvec7, xvec7; 730#endif 731STL_DY xvec9, 0*SIZE(C1, ldc, 1); 732STH_DY xvec9, 1*SIZE(C1, ldc, 1); 733STL_DY xvec7, 2*SIZE(C0, ldc, 1); 734STH_DY xvec7, 3*SIZE(C0, ldc, 1); 735 736EXTRA_DY $1, yvec8, xvec4; 737#ifndef TRMMKERNEL 738LDL_DY 4*SIZE(C1, ldc, 1), xvec3, xvec3; 739LDH_DY 5*SIZE(C1, ldc, 1), xvec3, xvec3; 740ADD_DY xvec3, xvec8, xvec8; 741LDL_DY 6*SIZE(C0, ldc, 
1), xvec2, xvec2; 742LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; 743ADD_DY xvec2, xvec4, xvec4; 744#endif 745STL_DY xvec8, 4*SIZE(C1, ldc, 1); 746STH_DY xvec8, 5*SIZE(C1, ldc, 1); 747STL_DY xvec4, 6*SIZE(C0, ldc, 1); 748STH_DY xvec4, 7*SIZE(C0, ldc, 1); 749#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 750MOVQ bk, %rax; 751SUBQ kkk, %rax; 752LEAQ (, %rax, SIZE), %rax; 753LEAQ (ptrba, %rax, 8), ptrba; 754LEAQ (ptrbb, %rax, 4), ptrbb; 755#endif 756#if defined(TRMMKERNEL) && defined(LEFT) 757ADDQ $8, kk 758#endif 759 760ADDQ $8*SIZE, C0; 761ADDQ $8*SIZE, C1; 762DECQ i; 763JG .L1_bodyB; 764ALIGN_5 765.L1_loopE:; 766TEST $4, bm; # Rm = 4 767JLE .L5_loopE; 768ALIGN_5 769.L5_bodyB:; 770#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 771MOVQ bb, ptrbb; 772#else 773MOVQ bb, ptrbb; 774MOVQ kk, %rax; 775LEAQ (, %rax, SIZE), %rax; 776LEAQ (ptrba, %rax, 4), ptrba; 777LEAQ (ptrbb, %rax, 4), ptrbb; 778#endif 779//#### Initial Results Register #### 780XOR_DY yvec15, yvec15, yvec15; 781XOR_DY yvec13, yvec13, yvec13; 782LD_DY 0*SIZE(ptrbb), yvec2; 783XOR_DY yvec11, yvec11, yvec11; 784XOR_DY yvec9, yvec9, yvec9; 785LD_DY 0*SIZE(ptrba), yvec0; 786VPERMILP_DY $0x05, yvec2, yvec3; 787#ifndef TRMMKERNEL 788MOVQ bk, k; 789#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 790MOVQ bk, %rax; 791SUBQ kk, %rax; 792MOVQ %rax, kkk; 793#else 794MOVQ kk, %rax; 795#ifdef LEFT 796ADDQ $4, %rax; 797#else 798ADDQ $4, %rax; 799#endif 800MOVQ %rax, kkk; 801#endif 802SARQ $2, k; 803JLE .L6_loopE; 804ALIGN_5; 805.L6_bodyB:; 806# Computing kernel 807 808//#### Untoll time 1 #### 809LD_DY 4*SIZE(ptrba), yvec1; 810MUL_DY yvec0, yvec2, yvec6; 811ADD_DY yvec15, yvec6, yvec15; 812SHUF_DY $0x03, yvec2, yvec2, yvec4; 813MUL_DY yvec0, yvec3, yvec7; 814ADD_DY yvec13, yvec7, yvec13; 815SHUF_DY $0x03, yvec3, yvec3, yvec5; 816 
817LD_DY 4*SIZE(ptrbb), yvec2; 818MUL_DY yvec0, yvec4, yvec6; 819ADD_DY yvec11, yvec6, yvec11; 820VPERMILP_DY $0x05, yvec2, yvec3; 821MUL_DY yvec0, yvec5, yvec7; 822ADD_DY yvec9, yvec7, yvec9; 823 824//#### Untoll time 2 #### 825LD_DY 8*SIZE(ptrba), yvec0; 826MUL_DY yvec1, yvec2, yvec6; 827ADD_DY yvec15, yvec6, yvec15; 828SHUF_DY $0x03, yvec2, yvec2, yvec4; 829MUL_DY yvec1, yvec3, yvec7; 830ADD_DY yvec13, yvec7, yvec13; 831SHUF_DY $0x03, yvec3, yvec3, yvec5; 832 833LD_DY 8*SIZE(ptrbb), yvec2; 834MUL_DY yvec1, yvec4, yvec6; 835ADD_DY yvec11, yvec6, yvec11; 836VPERMILP_DY $0x05, yvec2, yvec3; 837MUL_DY yvec1, yvec5, yvec7; 838ADD_DY yvec9, yvec7, yvec9; 839 840//#### Untoll time 3 #### 841LD_DY 12*SIZE(ptrba), yvec1; 842MUL_DY yvec0, yvec2, yvec6; 843ADD_DY yvec15, yvec6, yvec15; 844SHUF_DY $0x03, yvec2, yvec2, yvec4; 845ADDQ $16*SIZE, ptrba; 846MUL_DY yvec0, yvec3, yvec7; 847ADD_DY yvec13, yvec7, yvec13; 848SHUF_DY $0x03, yvec3, yvec3, yvec5; 849 850LD_DY 12*SIZE(ptrbb), yvec2; 851MUL_DY yvec0, yvec4, yvec6; 852ADD_DY yvec11, yvec6, yvec11; 853VPERMILP_DY $0x05, yvec2, yvec3; 854ADDQ $16*SIZE, ptrbb; 855MUL_DY yvec0, yvec5, yvec7; 856ADD_DY yvec9, yvec7, yvec9; 857 858//#### Untoll time 4 #### 859LD_DY 0*SIZE(ptrba), yvec0; 860MUL_DY yvec1, yvec2, yvec6; 861ADD_DY yvec15, yvec6, yvec15; 862SHUF_DY $0x03, yvec2, yvec2, yvec4; 863MUL_DY yvec1, yvec3, yvec7; 864ADD_DY yvec13, yvec7, yvec13; 865SHUF_DY $0x03, yvec3, yvec3, yvec5; 866 867LD_DY 0*SIZE(ptrbb), yvec2; 868MUL_DY yvec1, yvec4, yvec6; 869ADD_DY yvec11, yvec6, yvec11; 870VPERMILP_DY $0x05, yvec2, yvec3; 871MUL_DY yvec1, yvec5, yvec7; 872ADD_DY yvec9, yvec7, yvec9; 873DECQ k; 874JG .L6_bodyB; 875ALIGN_5 876.L6_loopE:; 877#ifndef TRMMKERNEL 878TEST $2, bk; 879#else 880MOVQ kkk, %rax; 881TEST $2, %rax; 882#endif 883JLE .L7_loopE; 884ALIGN_5 885.L7_bodyB:; 886//#### Untoll time 1 #### 887LD_DY 4*SIZE(ptrba), yvec1; 888MUL_DY yvec0, yvec2, yvec6; 889ADD_DY yvec15, yvec6, yvec15; 890SHUF_DY $0x03, yvec2, yvec2, 
yvec4; 891ADDQ $8*SIZE, ptrba; 892MUL_DY yvec0, yvec3, yvec7; 893ADD_DY yvec13, yvec7, yvec13; 894SHUF_DY $0x03, yvec3, yvec3, yvec5; 895 896LD_DY 4*SIZE(ptrbb), yvec2; 897MUL_DY yvec0, yvec4, yvec6; 898ADD_DY yvec11, yvec6, yvec11; 899VPERMILP_DY $0x05, yvec2, yvec3; 900ADDQ $8*SIZE, ptrbb; 901MUL_DY yvec0, yvec5, yvec7; 902ADD_DY yvec9, yvec7, yvec9; 903 904//#### Untoll time 2 #### 905LD_DY 0*SIZE(ptrba), yvec0; 906MUL_DY yvec1, yvec2, yvec6; 907ADD_DY yvec15, yvec6, yvec15; 908SHUF_DY $0x03, yvec2, yvec2, yvec4; 909MUL_DY yvec1, yvec3, yvec7; 910ADD_DY yvec13, yvec7, yvec13; 911SHUF_DY $0x03, yvec3, yvec3, yvec5; 912 913LD_DY 0*SIZE(ptrbb), yvec2; 914MUL_DY yvec1, yvec4, yvec6; 915ADD_DY yvec11, yvec6, yvec11; 916VPERMILP_DY $0x05, yvec2, yvec3; 917MUL_DY yvec1, yvec5, yvec7; 918ADD_DY yvec9, yvec7, yvec9; 919 920.L7_loopE:; 921#ifndef TRMMKERNEL 922TEST $1, bk 923#else 924MOVQ kkk, %rax; 925TEST $1, %rax; 926#endif 927JLE .L8_loopE; 928ALIGN_5 929.L8_bodyB:; 930//#### Untoll time 1 #### 931MUL_DY yvec0, yvec2, yvec6; 932ADD_DY yvec15, yvec6, yvec15; 933SHUF_DY $0x03, yvec2, yvec2, yvec4; 934ADDQ $4*SIZE, ptrba; 935MUL_DY yvec0, yvec3, yvec7; 936ADD_DY yvec13, yvec7, yvec13; 937SHUF_DY $0x03, yvec3, yvec3, yvec5; 938 939MUL_DY yvec0, yvec4, yvec6; 940ADD_DY yvec11, yvec6, yvec11; 941ADDQ $4*SIZE, ptrbb; 942MUL_DY yvec0, yvec5, yvec7; 943ADD_DY yvec9, yvec7, yvec9; 944 945.L8_loopE:; 946//#### Load Alpha #### 947BROAD_DY MEMALPHA, yvec7; 948//#### Multiply Alpha #### 949MUL_DY yvec7,yvec15,yvec15; 950MUL_DY yvec7,yvec13,yvec13; 951MUL_DY yvec7,yvec11,yvec11; 952MUL_DY yvec7,yvec9,yvec9; 953//#### Reverse the Results #### 954MOV_DY yvec15, yvec7; 955REVS_DY $0x0a,yvec13,yvec15,yvec15; 956REVS_DY $0x0a,yvec7,yvec13,yvec13; 957MOV_DY yvec11,yvec7; 958REVS_DY $0x0a,yvec9,yvec11,yvec11; 959REVS_DY $0x0a,yvec7,yvec9,yvec9; 960//#### Testing alignment #### 961MOVQ C0, %rax; 962OR ldc, %rax; 963TEST $15, %rax; 964JNE .L8_loopEx; # Unalign part write back 965ALIGN_5 
966//#### Writing Back #### 967EXTRA_DY $1,yvec15,xvec7; 968EXTRA_DY $1,yvec13,xvec5; 969EXTRA_DY $1,yvec11,xvec3; 970EXTRA_DY $1,yvec9,xvec1; 971#ifndef TRMMKERNEL 972ADD_DX 0*SIZE(C0), xvec15, xvec15; 973ADD_DX 2*SIZE(C1), xvec7, xvec7; 974ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; 975ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; 976ADD_DX 0*SIZE(C1), xvec11, xvec11; 977ADD_DX 2*SIZE(C0), xvec3, xvec3; 978ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; 979ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; 980#endif 981ST_DX xvec15, 0*SIZE(C0); 982ST_DX xvec7, 2*SIZE(C1); 983ST_DX xvec13, 0*SIZE(C0,ldc,1); 984ST_DX xvec5, 2*SIZE(C1,ldc,1); 985ST_DX xvec11, 0*SIZE(C1); 986ST_DX xvec3, 2*SIZE(C0); 987ST_DX xvec9, 0*SIZE(C1,ldc,1); 988ST_DX xvec1, 2*SIZE(C0,ldc,1); 989#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 990MOVQ bk, %rax; 991SUBQ kkk, %rax; 992LEAQ (, %rax, SIZE), %rax; 993LEAQ (ptrba, %rax, 4), ptrba; 994LEAQ (ptrbb, %rax, 4), ptrbb; 995#endif 996#if defined(TRMMKERNEL)&&defined(LEFT) 997ADDQ $4, kk 998#endif 999ADDQ $4*SIZE, C0; 1000ADDQ $4*SIZE, C1; 1001JMP .L5_loopE; 1002ALIGN_5 1003.L8_loopEx:; 1004EXTRA_DY $1,yvec15,xvec7; 1005EXTRA_DY $1,yvec13,xvec5; 1006EXTRA_DY $1,yvec11,xvec3; 1007EXTRA_DY $1,yvec9,xvec1; 1008#ifndef TRMMKERNEL 1009LDL_DX 0*SIZE(C0), xvec14, xvec14; 1010LDH_DX 1*SIZE(C0), xvec14, xvec14; 1011LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; 1012LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; 1013LDL_DX 0*SIZE(C1), xvec10, xvec10; 1014LDH_DX 1*SIZE(C1), xvec10, xvec10; 1015LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; 1016LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; 1017ADD_DX xvec14, xvec15, xvec15; 1018ADD_DX xvec12, xvec13, xvec13; 1019ADD_DX xvec10, xvec11, xvec11; 1020ADD_DX xvec8, xvec9, xvec9; 1021#endif 1022STL_DX xvec15, 0*SIZE(C0); 1023STH_DX xvec15, 1*SIZE(C0); 1024STL_DX xvec13, 0*SIZE(C0, ldc, 1); 1025STH_DX xvec13, 1*SIZE(C0, ldc, 1); 1026STL_DX xvec11, 0*SIZE(C1); 1027STH_DX xvec11, 
1*SIZE(C1); 1028STL_DX xvec9, 0*SIZE(C1, ldc, 1); 1029STH_DX xvec9, 1*SIZE(C1, ldc, 1); 1030#ifndef TRMMKERNEL 1031LDL_DX 2*SIZE(C0), xvec0, xvec0; 1032LDH_DX 3*SIZE(C0), xvec0, xvec0; 1033LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; 1034LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; 1035LDL_DX 2*SIZE(C1), xvec4, xvec4; 1036LDH_DX 3*SIZE(C1), xvec4, xvec4; 1037LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; 1038LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; 1039ADD_DX xvec0, xvec3, xvec3; 1040ADD_DX xvec2, xvec1, xvec1; 1041ADD_DX xvec4, xvec7, xvec7; 1042ADD_DX xvec6, xvec5, xvec5; 1043#endif 1044STL_DX xvec3, 2*SIZE(C0); 1045STH_DX xvec3, 3*SIZE(C0); 1046STL_DX xvec1, 2*SIZE(C0, ldc, 1); 1047STH_DX xvec1, 3*SIZE(C0, ldc, 1); 1048STL_DX xvec7, 2*SIZE(C1); 1049STH_DX xvec7, 3*SIZE(C1); 1050STL_DX xvec5, 2*SIZE(C1, ldc, 1); 1051STH_DX xvec5, 3*SIZE(C1, ldc, 1); 1052#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1053MOVQ bk, %rax; 1054SUBQ kkk, %rax; 1055LEAQ (, %rax, SIZE), %rax; 1056LEAQ (ptrba, %rax, 4), ptrba; 1057LEAQ (ptrbb, %rax, 4), ptrbb; 1058#endif 1059#if defined(TRMMKERNEL)&&defined(LEFT) 1060ADDQ $4, kk 1061#endif 1062 1063ADDQ $4*SIZE, C0; 1064ADDQ $4*SIZE, C1; 1065.L5_loopE:; 1066TEST $2, bm; 1067JLE .L9_loopE; 1068ALIGN_5 1069.L9_bodyB:; 1070#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1071MOVQ bb, ptrbb; 1072#else 1073MOVQ bb, ptrbb; 1074MOVQ kk, %rax; 1075LEAQ (, %rax, SIZE), %rax; 1076LEAQ (ptrba, %rax, 2), ptrba; 1077LEAQ (ptrbb, %rax, 4), ptrbb 1078#endif 1079//#### Initial Results Register #### 1080LD_DX 0*SIZE(ptrbb), xvec2; 1081XOR_DY yvec15, yvec15, yvec15; 1082LD_DX 2*SIZE(ptrbb), xvec3; 1083XOR_DY yvec13, yvec13, yvec13; 1084LD_DX 0*SIZE(ptrba), xvec0; 1085XOR_DY yvec11, yvec11, yvec11; 1086SHUF_DX $0x4e, xvec2, xvec4; 1087XOR_DY yvec9, yvec9, yvec9; 1088#ifndef TRMMKERNEL 1089MOVQ bk, k; 1090#elif 
(defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 1091MOVQ bk, %rax; 1092SUBQ kk, %rax; 1093MOVQ %rax, kkk; 1094#else 1095MOVQ kk, %rax; 1096#ifdef LEFT 1097ADDQ $2, %rax; 1098#else 1099ADDQ $4, %rax; 1100#endif 1101MOVQ %rax, kkk; 1102#endif 1103SARQ $2, k; 1104JLE .L10_loopE; 1105ALIGN_5; 1106.L10_bodyB:; 1107# Computing kernel 1108 1109//#### Unroll time 1 #### 1110LD_DX 4*SIZE(ptrbb), xvec6; 1111SHUF_DX $0x4e, xvec3, xvec5; 1112MUL_DX xvec0, xvec2, xvec2; 1113ADD_DX xvec2, xvec15, xvec15; 1114 1115LD_DX 6*SIZE(ptrbb), xvec7; 1116MUL_DX xvec0, xvec3, xvec3; 1117ADD_DX xvec3, xvec11, xvec11; 1118 1119LD_DX 2*SIZE(ptrba), xvec1; 1120MUL_DX xvec0, xvec4, xvec4; 1121ADD_DX xvec4, xvec13, xvec13; 1122SHUF_DX $0x4e, xvec6, xvec4; 1123MUL_DX xvec0, xvec5, xvec5; 1124ADD_DX xvec5, xvec9, xvec9; 1125 1126//#### Unroll time 2 #### 1127LD_DX 8*SIZE(ptrbb), xvec2; 1128SHUF_DX $0x4e, xvec7, xvec5; 1129MUL_DX xvec1, xvec6, xvec6; 1130ADD_DX xvec6, xvec15, xvec15; 1131 1132LD_DX 10*SIZE(ptrbb), xvec3; 1133MUL_DX xvec1, xvec7, xvec7; 1134ADD_DX xvec7, xvec11, xvec11; 1135 1136LD_DX 4*SIZE(ptrba), xvec0; 1137MUL_DX xvec1, xvec4, xvec4; 1138ADD_DX xvec4, xvec13, xvec13; 1139SHUF_DX $0x4e, xvec2, xvec4; 1140MUL_DX xvec1, xvec5, xvec5; 1141ADD_DX xvec5, xvec9, xvec9; 1142 1143//#### Unroll time 3 #### 1144LD_DX 12*SIZE(ptrbb), xvec6; 1145SHUF_DX $0x4e, xvec3, xvec5; 1146MUL_DX xvec0, xvec2, xvec2; 1147ADD_DX xvec2, xvec15, xvec15; 1148 1149LD_DX 14*SIZE(ptrbb), xvec7; 1150MUL_DX xvec0, xvec3, xvec3; 1151ADD_DX xvec3, xvec11, xvec11; 1152ADDQ $16*SIZE, ptrbb; 1153 1154LD_DX 6*SIZE(ptrba), xvec1; 1155MUL_DX xvec0, xvec4, xvec4; 1156ADD_DX xvec4, xvec13, xvec13; 1157SHUF_DX $0x4e, xvec6, xvec4; 1158ADDQ $8*SIZE, ptrba; 1159MUL_DX xvec0, xvec5, xvec5; 1160ADD_DX xvec5, xvec9, xvec9; 1161 1162//#### Unroll time 4 #### 1163LD_DX 0*SIZE(ptrbb), xvec2; 1164SHUF_DX $0x4e, xvec7, xvec5; 1165MUL_DX xvec1, xvec6, xvec6; 1166ADD_DX xvec6, xvec15, xvec15; 1167 1168LD_DX 
2*SIZE(ptrbb), xvec3; 1169MUL_DX xvec1, xvec7, xvec7; 1170ADD_DX xvec7, xvec11, xvec11; 1171 1172LD_DX 0*SIZE(ptrba), xvec0; 1173MUL_DX xvec1, xvec4, xvec4; 1174ADD_DX xvec4, xvec13, xvec13; 1175SHUF_DX $0x4e, xvec2, xvec4; 1176MUL_DX xvec1, xvec5, xvec5; 1177ADD_DX xvec5, xvec9, xvec9; 1178DECQ k; 1179JG .L10_bodyB; 1180ALIGN_5 1181.L10_loopE:; 1182#ifndef TRMMKERNEL 1183TEST $2, bk 1184#else 1185MOVQ kkk, %rax; 1186TEST $2, %rax; 1187#endif 1188JLE .L11_loopE; 1189ALIGN_5 1190.L11_bodyB:; 1191//#### Unroll time 1 #### 1192LD_DX 4*SIZE(ptrbb), xvec6; 1193SHUF_DX $0x4e, xvec3, xvec5; 1194MUL_DX xvec0, xvec2, xvec2; 1195ADD_DX xvec2, xvec15, xvec15; 1196 1197LD_DX 6*SIZE(ptrbb), xvec7; 1198MUL_DX xvec0, xvec3, xvec3; 1199ADD_DX xvec3, xvec11, xvec11; 1200ADDQ $8*SIZE, ptrbb; 1201 1202LD_DX 2*SIZE(ptrba), xvec1; 1203MUL_DX xvec0, xvec4, xvec4; 1204ADD_DX xvec4, xvec13, xvec13; 1205SHUF_DX $0x4e, xvec6, xvec4; 1206ADDQ $4*SIZE, ptrba; 1207 1208MUL_DX xvec0, xvec5, xvec5; 1209ADD_DX xvec5, xvec9, xvec9; 1210 1211//#### Unroll time 2 #### 1212LD_DX 0*SIZE(ptrbb), xvec2; 1213SHUF_DX $0x4e, xvec7, xvec5; 1214MUL_DX xvec1, xvec6, xvec6; 1215ADD_DX xvec6, xvec15, xvec15; 1216 1217LD_DX 2*SIZE(ptrbb), xvec3; 1218MUL_DX xvec1, xvec7, xvec7; 1219ADD_DX xvec7, xvec11, xvec11; 1220 1221LD_DX 0*SIZE(ptrba), xvec0; 1222MUL_DX xvec1, xvec4, xvec4; 1223ADD_DX xvec4, xvec13, xvec13; 1224SHUF_DX $0x4e, xvec2, xvec4; 1225MUL_DX xvec1, xvec5, xvec5; 1226ADD_DX xvec5, xvec9, xvec9; 1227 1228.L11_loopE:; 1229#ifndef TRMMKERNEL 1230TEST $1, bk 1231#else 1232MOVQ kkk, %rax; 1233TEST $1, %rax; 1234#endif 1235JLE .L12_loopE; 1236ALIGN_5 1237.L12_bodyB:; 1238SHUF_DX $0x4e, xvec3, xvec5; 1239MUL_DX xvec0, xvec2, xvec2; 1240ADD_DX xvec2, xvec15, xvec15; 1241ADDQ $4*SIZE, ptrbb; 1242 1243MUL_DX xvec0, xvec3, xvec3; 1244ADD_DX xvec3, xvec11, xvec11; 1245ADDQ $2*SIZE, ptrba; 1246 1247MUL_DX xvec0, xvec4, xvec4; 1248ADD_DX xvec4, xvec13, xvec13; 1249 1250MUL_DX xvec0, xvec5, xvec5; 1251ADD_DX xvec5, 
xvec9, xvec9; 1252 1253.L12_loopE:; 1254//#### Load Alpha #### 1255BROAD_DX MEMALPHA, xvec7; 1256//#### Multiply Alpha #### 1257MUL_DX xvec7, xvec15, xvec15; 1258MUL_DX xvec7, xvec13, xvec13; 1259MUL_DX xvec7, xvec11, xvec11; 1260MUL_DX xvec7, xvec9, xvec9; 1261//#### Reverse the Results #### 1262MOV_DX xvec15, xvec6; 1263REVS_DX xvec13, xvec15, xvec15; 1264REVS_DX xvec6, xvec13, xvec13; 1265MOV_DX xvec11, xvec6; 1266REVS_DX xvec9, xvec11, xvec11; 1267REVS_DX xvec6, xvec9, xvec9; 1268//#### Testing Alignment #### 1269MOVQ C0, %rax; 1270OR ldc, %rax; 1271TEST $15, %rax; 1272JNE .L12_loopEx; 1273ALIGN_5 1274//#### Writing Back #### 1275#ifndef TRMMKERNEL 1276ADD_DX 0*SIZE(C0), xvec13, xvec13; 1277ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; 1278ADD_DX 0*SIZE(C1), xvec9, xvec9; 1279ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; 1280#endif 1281ST_DX xvec13, 0*SIZE(C0); 1282ST_DX xvec15, 0*SIZE(C0, ldc, 1); 1283ST_DX xvec9, 0*SIZE(C1); 1284ST_DX xvec11, 0*SIZE(C1, ldc, 1); 1285#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1286MOVQ bk, %rax; 1287SUBQ kkk, %rax; 1288LEAQ (,%rax, SIZE), %rax; 1289LEAQ (ptrba, %rax, 2), ptrba; 1290LEAQ (ptrbb, %rax, 4), ptrbb; 1291#endif 1292#if defined(TRMMKERNEL) && defined(LEFT) 1293ADDQ $2, kk 1294#endif 1295ADDQ $2*SIZE, C0 1296ADDQ $2*SIZE, C1 1297JMP .L9_loopE; 1298ALIGN_5 1299.L12_loopEx: 1300#ifndef TRMMKERNEL 1301LDL_DX 0*SIZE(C0), xvec14, xvec14; 1302LDH_DX 1*SIZE(C0), xvec14, xvec14; 1303LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; 1304LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; 1305LDL_DX 0*SIZE(C1), xvec10, xvec10; 1306LDH_DX 1*SIZE(C1), xvec10, xvec10; 1307LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; 1308LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; 1309ADD_DX xvec14, xvec13, xvec13; 1310ADD_DX xvec12, xvec15, xvec15; 1311ADD_DX xvec10, xvec9, xvec9; 1312ADD_DX xvec8, xvec11, xvec11; 1313#endif 1314STL_DX xvec13, 0*SIZE(C0); 1315STH_DX xvec13, 1*SIZE(C0); 1316STL_DX xvec15, 
0*SIZE(C0, ldc, 1); 1317STH_DX xvec15, 1*SIZE(C0, ldc, 1); 1318STL_DX xvec9, 0*SIZE(C1); 1319STH_DX xvec9, 1*SIZE(C1); 1320STL_DX xvec11, 0*SIZE(C1, ldc, 1); 1321STH_DX xvec11, 1*SIZE(C1, ldc, 1); 1322#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1323MOVQ bk, %rax; 1324SUBQ kkk, %rax; 1325LEAQ (,%rax, SIZE), %rax; 1326LEAQ (ptrba, %rax, 2), ptrba; 1327LEAQ (ptrbb, %rax, 4), ptrbb; 1328#endif 1329#if defined(TRMMKERNEL) && defined(LEFT) 1330ADDQ $2, kk 1331#endif 1332ADDQ $2*SIZE, C0; 1333ADDQ $2*SIZE, C1; 1334.L9_loopE:; 1335TEST $1, bm 1336JLE .L13_loopE; 1337ALIGN_5 1338.L13_bodyB:; 1339#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1340MOVQ bb, ptrbb; 1341#else 1342MOVQ bb, ptrbb; 1343MOVQ kk, %rax; 1344LEAQ (,%rax, SIZE), %rax; 1345ADDQ %rax, ptrba; 1346LEAQ (ptrbb, %rax, 4), ptrbb; 1347#endif 1348//#### Initial Results Register #### 1349XOR_DY yvec15, yvec15, yvec15; 1350#ifndef TRMMKERNEL 1351MOVQ bk, k; 1352#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 1353MOVQ bk, %rax; 1354SUBQ kk, %rax; 1355MOVQ %rax, kkk; 1356#else 1357MOVQ kk, %rax; 1358#ifdef LEFT 1359ADDQ $1, %rax; 1360#else 1361ADDQ $4, %rax; 1362#endif 1363MOVQ %rax, kkk; 1364#endif 1365SARQ $2, k; 1366JLE .L14_loopE; 1367ALIGN_5 1368.L14_bodyB:; 1369BROAD_DY 0*SIZE(ptrba), yvec0; 1370LD_DY 0*SIZE(ptrbb), yvec2; 1371MUL_DY yvec0, yvec2, yvec6; 1372ADD_DY yvec15, yvec6, yvec15; 1373 1374BROAD_DY 1*SIZE(ptrba), yvec1; 1375LD_DY 4*SIZE(ptrbb), yvec3; 1376MUL_DY yvec1, yvec3, yvec7; 1377ADD_DY yvec15, yvec7, yvec15; 1378 1379BROAD_DY 2*SIZE(ptrba), yvec0; 1380LD_DY 8*SIZE(ptrbb), yvec2; 1381MUL_DY yvec0, yvec2, yvec6; 1382ADD_DY yvec15, yvec6, yvec15; 1383 1384BROAD_DY 3*SIZE(ptrba), yvec1; 1385LD_DY 12*SIZE(ptrbb), yvec3; 1386MUL_DY yvec1, yvec3, yvec7; 1387ADD_DY yvec15, yvec7, yvec15; 1388ADDQ $4*SIZE, ptrba; 
1389ADDQ $16*SIZE, ptrbb; 1390DECQ k; 1391JG .L14_bodyB; 1392ALIGN_5 1393.L14_loopE: 1394#ifndef TRMMKERNEL 1395TEST $2, bk; 1396#else 1397MOVQ kkk, %rax; 1398TEST $2, %rax; 1399#endif 1400JLE .L15_loopE; 1401ALIGN_5 1402.L15_bodyB: 1403BROAD_DY 0*SIZE(ptrba), yvec0; 1404LD_DY 0*SIZE(ptrbb), yvec2; 1405MUL_DY yvec0, yvec2, yvec6; 1406ADD_DY yvec15, yvec6, yvec15; 1407 1408BROAD_DY 1*SIZE(ptrba), yvec1; 1409LD_DY 4*SIZE(ptrbb), yvec3; 1410MUL_DY yvec1, yvec3, yvec7; 1411ADD_DY yvec15, yvec7, yvec15; 1412ADDQ $2*SIZE, ptrba; 1413ADDQ $8*SIZE, ptrbb; 1414.L15_loopE:; 1415#ifndef TRMMKERNEL 1416TEST $1, bk; 1417#else 1418MOVQ kkk, %rax; 1419TEST $1, %rax; 1420#endif 1421JLE .L16_loopE; 1422ALIGN_5 1423.L16_bodyB:; 1424BROAD_DY 0*SIZE(ptrba), yvec0; 1425LD_DY 0*SIZE(ptrbb), yvec2; 1426MUL_DY yvec0, yvec2, yvec6; 1427ADD_DY yvec15, yvec6, yvec15; 1428ADDQ $1*SIZE, ptrba; 1429ADDQ $4*SIZE, ptrbb; 1430 1431.L16_loopE: 1432//#### Load Alpha #### 1433BROAD_DY MEMALPHA, yvec7; 1434//#### Multiply Alpha #### 1435MUL_DY yvec15, yvec7, yvec15; 1436//#### Writing Back #### 1437EXTRA_DY $1, yvec15, xvec7; 1438#ifndef TRMMKERNEL 1439LDL_DX 0*SIZE(C0), xvec0, xvec0; 1440LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; 1441LDL_DX 0*SIZE(C1), xvec1, xvec1; 1442LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; 1443ADD_DX xvec0, xvec15, xvec15; 1444ADD_DX xvec1, xvec7, xvec7; 1445#endif 1446STL_DX xvec15, 0*SIZE(C0); 1447STH_DX xvec15, 0*SIZE(C0, ldc, 1); 1448STL_DX xvec7, 0*SIZE(C1); 1449STH_DX xvec7, 0*SIZE(C1, ldc, 1); 1450#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1451MOVQ bk, %rax; 1452SUBQ kkk, %rax; 1453LEAQ (,%rax, SIZE), %rax; 1454ADDQ %rax, ptrba; 1455LEAQ (ptrbb, %rax, 4), ptrbb; 1456#endif 1457#if defined(TRMMKERNEL)&&defined(LEFT) 1458ADDQ $1, kk 1459#endif 1460ADDQ $1*SIZE, C0 1461ADDQ $1*SIZE, C1 1462.L13_loopE:; 1463#if defined(TRMMKERNEL)&&!defined(LEFT) 1464ADDQ $4, kk 1465#endif 1466MOVQ bk,k; 1467SALQ $5,k; 
1468ADDQ k,bb; 1469LEAQ (C,ldc,4),C; 1470.L0_bodyE:; 1471DECQ j; 1472JG .L0_bodyB; 1473ALIGN_5; 1474.L0_loopE:; 1475TEST $2, bn; 1476JLE .L20_loopE; 1477ALIGN_5; 1478.L20_loopB:; 1479#if defined(TRMMKERNEL) && defined(LEFT) 1480MOVQ OFFSET, %rax; 1481MOVQ %rax, kk 1482#endif 1483MOVQ C, C0; 1484LEAQ (C, ldc, 1), C1; 1485MOVQ ba, ptrba; 1486MOVQ bm, i; 1487SARQ $3, i; # Rm = 8 1488JLE .L21_loopE; 1489ALIGN_5; 1490.L21_bodyB:; 1491#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1492MOVQ bb, ptrbb; 1493#else 1494MOVQ bb, ptrbb; 1495MOVQ kk, %rax; 1496LEAQ (, %rax, SIZE), %rax; 1497LEAQ (ptrba, %rax, 8), ptrba; 1498LEAQ (ptrbb, %rax, 2), ptrbb; 1499#endif 1500//#### Initial Results Register #### 1501XOR_DY yvec15, yvec15, yvec15; 1502XOR_DY yvec14, yvec14, yvec14; 1503XOR_DY yvec13, yvec13, yvec13; 1504XOR_DY yvec12, yvec12, yvec12; 1505XOR_DY yvec11, yvec11, yvec11; 1506XOR_DY yvec10, yvec10, yvec10; 1507XOR_DY yvec9, yvec9, yvec9; 1508XOR_DY yvec8, yvec8, yvec8; 1509#ifndef TRMMKERNEL 1510MOVQ bk, k; 1511#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 1512MOVQ bk, %rax; 1513SUBQ kk, %rax; 1514MOVQ %rax, kkk; 1515#else 1516MOVQ kk, %rax; 1517#ifdef LEFT 1518ADDQ $8, %rax; 1519#else 1520ADDQ $2, %rax; 1521#endif 1522MOVQ %rax, kkk; 1523#endif 1524SARQ $2, k; 1525JLE .L211_loopE; 1526ALIGN_5; 1527.L211_bodyB: 1528# Computing kernel 1529//#### Unroll time 1 #### 1530LD_DX 0*SIZE(ptrba), xvec0; 1531LD_DX 0*SIZE(ptrbb), xvec4; 1532MOV_DX xvec4, xvec5; 1533MUL_DX xvec0, xvec4, xvec4; 1534ADD_DX xvec4, xvec15, xvec15; 1535 1536LD_DX 2*SIZE(ptrba), xvec1; 1537MOV_DX xvec5, xvec6; 1538MUL_DX xvec1, xvec5, xvec5; 1539ADD_DX xvec5, xvec14, xvec14; 1540 1541LD_DX 4*SIZE(ptrba), xvec2; 1542MOV_DX xvec6, xvec7; 1543MUL_DX xvec2, xvec6, xvec6; 1544ADD_DX xvec6, xvec13, xvec13; 1545 1546LD_DX 6*SIZE(ptrba), xvec3; 1547SHUF_DX $0x4e, xvec7, xvec4; 1548MUL_DX xvec3, 
xvec7, xvec7; 1549ADD_DX xvec7, xvec12, xvec12; 1550 1551MOV_DX xvec4, xvec5; 1552MUL_DX xvec0, xvec4, xvec4; 1553ADD_DX xvec4, xvec11, xvec11; 1554 1555MOV_DX xvec5, xvec6; 1556MUL_DX xvec1, xvec5, xvec5; 1557ADD_DX xvec5, xvec10, xvec10; 1558 1559MOV_DX xvec6, xvec7; 1560MUL_DX xvec2, xvec6, xvec6; 1561ADD_DX xvec6, xvec9, xvec9; 1562 1563MUL_DX xvec3, xvec7, xvec7; 1564ADD_DX xvec7, xvec8, xvec8; 1565 1566//#### Unroll time 2 #### 1567LD_DX 8*SIZE(ptrba), xvec0; 1568LD_DX 2*SIZE(ptrbb), xvec4; 1569MOV_DX xvec4, xvec5; 1570MUL_DX xvec0, xvec4, xvec4; 1571ADD_DX xvec4, xvec15, xvec15; 1572 1573LD_DX 10*SIZE(ptrba), xvec1; 1574MOV_DX xvec5, xvec6; 1575MUL_DX xvec1, xvec5, xvec5; 1576ADD_DX xvec5, xvec14, xvec14; 1577 1578LD_DX 12*SIZE(ptrba), xvec2; 1579MOV_DX xvec6, xvec7; 1580MUL_DX xvec2, xvec6, xvec6; 1581ADD_DX xvec6, xvec13, xvec13; 1582 1583LD_DX 14*SIZE(ptrba), xvec3; 1584SHUF_DX $0x4e, xvec7, xvec4; 1585MUL_DX xvec3, xvec7, xvec7; 1586ADD_DX xvec7, xvec12, xvec12; 1587 1588MOV_DX xvec4, xvec5; 1589MUL_DX xvec0, xvec4, xvec4; 1590ADD_DX xvec4, xvec11, xvec11; 1591 1592MOV_DX xvec5, xvec6; 1593MUL_DX xvec1, xvec5, xvec5; 1594ADD_DX xvec5, xvec10, xvec10; 1595 1596MOV_DX xvec6, xvec7; 1597MUL_DX xvec2, xvec6, xvec6; 1598ADD_DX xvec6, xvec9, xvec9; 1599 1600MUL_DX xvec3, xvec7, xvec7; 1601ADD_DX xvec7, xvec8, xvec8; 1602 1603//#### Unroll time 3 #### 1604LD_DX 16*SIZE(ptrba), xvec0; 1605LD_DX 4*SIZE(ptrbb), xvec4; 1606MOV_DX xvec4, xvec5; 1607MUL_DX xvec0, xvec4, xvec4; 1608ADD_DX xvec4, xvec15, xvec15; 1609 1610LD_DX 18*SIZE(ptrba), xvec1; 1611MOV_DX xvec5, xvec6; 1612MUL_DX xvec1, xvec5, xvec5; 1613ADD_DX xvec5, xvec14, xvec14; 1614 1615LD_DX 20*SIZE(ptrba), xvec2; 1616MOV_DX xvec6, xvec7; 1617MUL_DX xvec2, xvec6, xvec6; 1618ADD_DX xvec6, xvec13, xvec13; 1619 1620LD_DX 22*SIZE(ptrba), xvec3; 1621SHUF_DX $0x4e, xvec7, xvec4; 1622MUL_DX xvec3, xvec7, xvec7; 1623ADD_DX xvec7, xvec12, xvec12; 1624 1625MOV_DX xvec4, xvec5; 1626MUL_DX xvec0, xvec4, xvec4; 
1627ADD_DX xvec4, xvec11, xvec11; 1628 1629MOV_DX xvec5, xvec6; 1630MUL_DX xvec1, xvec5, xvec5; 1631ADD_DX xvec5, xvec10, xvec10; 1632 1633MOV_DX xvec6, xvec7; 1634MUL_DX xvec2, xvec6, xvec6; 1635ADD_DX xvec6, xvec9, xvec9; 1636 1637MUL_DX xvec3, xvec7, xvec7; 1638ADD_DX xvec7, xvec8, xvec8; 1639 1640//#### Unroll time 4 #### 1641LD_DX 24*SIZE(ptrba), xvec0; 1642LD_DX 6*SIZE(ptrbb), xvec4; 1643MOV_DX xvec4, xvec5; 1644MUL_DX xvec0, xvec4, xvec4; 1645ADD_DX xvec4, xvec15, xvec15; 1646ADDQ $8*SIZE, ptrbb; 1647 1648LD_DX 26*SIZE(ptrba), xvec1; 1649MOV_DX xvec5, xvec6; 1650MUL_DX xvec1, xvec5, xvec5; 1651ADD_DX xvec5, xvec14, xvec14; 1652 1653LD_DX 28*SIZE(ptrba), xvec2; 1654MOV_DX xvec6, xvec7; 1655MUL_DX xvec2, xvec6, xvec6; 1656ADD_DX xvec6, xvec13, xvec13; 1657 1658LD_DX 30*SIZE(ptrba), xvec3; 1659SHUF_DX $0x4e, xvec7, xvec4; 1660MUL_DX xvec3, xvec7, xvec7; 1661ADD_DX xvec7, xvec12, xvec12; 1662ADDQ $32*SIZE, ptrba; 1663 1664MOV_DX xvec4, xvec5; 1665MUL_DX xvec0, xvec4, xvec4; 1666ADD_DX xvec4, xvec11, xvec11; 1667 1668MOV_DX xvec5, xvec6; 1669MUL_DX xvec1, xvec5, xvec5; 1670ADD_DX xvec5, xvec10, xvec10; 1671 1672MOV_DX xvec6, xvec7; 1673MUL_DX xvec2, xvec6, xvec6; 1674ADD_DX xvec6, xvec9, xvec9; 1675 1676MUL_DX xvec3, xvec7, xvec7; 1677ADD_DX xvec7, xvec8, xvec8; 1678DECQ k; 1679JG .L211_bodyB; 1680ALIGN_5 1681.L211_loopE: 1682#ifndef TRMMKERNEL 1683TEST $2, bk; 1684#else 1685MOVQ kkk, %rax; 1686TEST $2, %rax; 1687#endif 1688JLE .L212_loopE; 1689ALIGN_5; 1690.L212_bodyB: 1691# Computing kernel 1692//#### Unroll time 1 #### 1693LD_DX 0*SIZE(ptrba), xvec0; 1694LD_DX 0*SIZE(ptrbb), xvec4; 1695MOV_DX xvec4, xvec5; 1696MUL_DX xvec0, xvec4, xvec4; 1697ADD_DX xvec4, xvec15, xvec15; 1698 1699LD_DX 2*SIZE(ptrba), xvec1; 1700MOV_DX xvec5, xvec6; 1701MUL_DX xvec1, xvec5, xvec5; 1702ADD_DX xvec5, xvec14, xvec14; 1703 1704LD_DX 4*SIZE(ptrba), xvec2; 1705MOV_DX xvec6, xvec7; 1706MUL_DX xvec2, xvec6, xvec6; 1707ADD_DX xvec6, xvec13, xvec13; 1708 1709LD_DX 6*SIZE(ptrba), xvec3; 
1710SHUF_DX $0x4e, xvec7, xvec4; 1711MUL_DX xvec3, xvec7, xvec7; 1712ADD_DX xvec7, xvec12, xvec12; 1713 1714MOV_DX xvec4, xvec5; 1715MUL_DX xvec0, xvec4, xvec4; 1716ADD_DX xvec4, xvec11, xvec11; 1717 1718MOV_DX xvec5, xvec6; 1719MUL_DX xvec1, xvec5, xvec5; 1720ADD_DX xvec5, xvec10, xvec10; 1721 1722MOV_DX xvec6, xvec7; 1723MUL_DX xvec2, xvec6, xvec6; 1724ADD_DX xvec6, xvec9, xvec9; 1725 1726MUL_DX xvec3, xvec7, xvec7; 1727ADD_DX xvec7, xvec8, xvec8; 1728 1729//#### Unroll time 2 #### 1730LD_DX 8*SIZE(ptrba), xvec0; 1731LD_DX 2*SIZE(ptrbb), xvec4; 1732MOV_DX xvec4, xvec5; 1733MUL_DX xvec0, xvec4, xvec4; 1734ADD_DX xvec4, xvec15, xvec15; 1735ADDQ $4*SIZE, ptrbb; 1736 1737LD_DX 10*SIZE(ptrba), xvec1; 1738MOV_DX xvec5, xvec6; 1739MUL_DX xvec1, xvec5, xvec5; 1740ADD_DX xvec5, xvec14, xvec14; 1741 1742LD_DX 12*SIZE(ptrba), xvec2; 1743MOV_DX xvec6, xvec7; 1744MUL_DX xvec2, xvec6, xvec6; 1745ADD_DX xvec6, xvec13, xvec13; 1746 1747LD_DX 14*SIZE(ptrba), xvec3; 1748SHUF_DX $0x4e, xvec7, xvec4; 1749MUL_DX xvec3, xvec7, xvec7; 1750ADD_DX xvec7, xvec12, xvec12; 1751ADDQ $16*SIZE, ptrba; 1752 1753MOV_DX xvec4, xvec5; 1754MUL_DX xvec0, xvec4, xvec4; 1755ADD_DX xvec4, xvec11, xvec11; 1756 1757MOV_DX xvec5, xvec6; 1758MUL_DX xvec1, xvec5, xvec5; 1759ADD_DX xvec5, xvec10, xvec10; 1760 1761MOV_DX xvec6, xvec7; 1762MUL_DX xvec2, xvec6, xvec6; 1763ADD_DX xvec6, xvec9, xvec9; 1764 1765MUL_DX xvec3, xvec7, xvec7; 1766ADD_DX xvec7, xvec8, xvec8; 1767 1768.L212_loopE: 1769#ifndef TRMMKERNEL 1770TEST $1, bk; 1771#else 1772MOVQ kkk, %rax; 1773TEST $1, %rax; 1774#endif 1775JLE .L213_loopE; 1776ALIGN_5 1777.L213_bodyB: 1778//#### Unroll time 1 #### 1779LD_DX 0*SIZE(ptrba), xvec0; 1780LD_DX 0*SIZE(ptrbb), xvec4; 1781MOV_DX xvec4, xvec5; 1782MUL_DX xvec0, xvec4, xvec4; 1783ADD_DX xvec4, xvec15, xvec15; 1784ADDQ $2*SIZE, ptrbb; 1785 1786LD_DX 2*SIZE(ptrba), xvec1; 1787MOV_DX xvec5, xvec6; 1788MUL_DX xvec1, xvec5, xvec5; 1789ADD_DX xvec5, xvec14, xvec14; 1790 1791LD_DX 4*SIZE(ptrba), xvec2; 
1792MOV_DX xvec6, xvec7; 1793MUL_DX xvec2, xvec6, xvec6; 1794ADD_DX xvec6, xvec13, xvec13; 1795 1796LD_DX 6*SIZE(ptrba), xvec3; 1797SHUF_DX $0x4e, xvec7, xvec4; 1798MUL_DX xvec3, xvec7, xvec7; 1799ADD_DX xvec7, xvec12, xvec12; 1800ADDQ $8*SIZE, ptrba; 1801 1802MOV_DX xvec4, xvec5; 1803MUL_DX xvec0, xvec4, xvec4; 1804ADD_DX xvec4, xvec11, xvec11; 1805 1806MOV_DX xvec5, xvec6; 1807MUL_DX xvec1, xvec5, xvec5; 1808ADD_DX xvec5, xvec10, xvec10; 1809 1810MOV_DX xvec6, xvec7; 1811MUL_DX xvec2, xvec6, xvec6; 1812ADD_DX xvec6, xvec9, xvec9; 1813 1814MUL_DX xvec3, xvec7, xvec7; 1815ADD_DX xvec7, xvec8, xvec8; 1816 1817.L213_loopE: 1818//#### Multiply Alpha #### 1819BROAD_DX MEMALPHA, xvec7; 1820MUL_DX xvec7, xvec15, xvec15; 1821MUL_DX xvec7, xvec14, xvec14; 1822MUL_DX xvec7, xvec13, xvec13; 1823MUL_DX xvec7, xvec12, xvec12; 1824MUL_DX xvec7, xvec11, xvec11; 1825MUL_DX xvec7, xvec10, xvec10; 1826MUL_DX xvec7, xvec9, xvec9; 1827MUL_DX xvec7, xvec8, xvec8; 1828//#### Reverse #### 1829MOV_DX xvec15, xvec6; 1830REVS_DX xvec11, xvec15, xvec15; 1831REVS_DX xvec6, xvec11, xvec11; 1832MOV_DX xvec14, xvec6; 1833REVS_DX xvec10, xvec14, xvec14; 1834REVS_DX xvec6, xvec10, xvec10; 1835MOV_DX xvec13, xvec6; 1836REVS_DX xvec9, xvec13, xvec13; 1837REVS_DX xvec6, xvec9, xvec9; 1838MOV_DX xvec12, xvec6; 1839REVS_DX xvec8, xvec12, xvec12; 1840REVS_DX xvec6, xvec8, xvec8; 1841//#### Testing Alignment #### 1842MOVQ C0, %rax; 1843OR ldc, %rax; 1844TEST $15, %rax; 1845JNE .L213_loopEx; 1846ALIGN_5 1847//#### Writing Back #### 1848#ifndef TRMMKERNEL 1849ADD_DX 0*SIZE(C0), xvec11, xvec11; 1850ADD_DX 2*SIZE(C0), xvec10, xvec10; 1851ADD_DX 4*SIZE(C0), xvec9, xvec9; 1852ADD_DX 6*SIZE(C0), xvec8, xvec8; 1853ADD_DX 0*SIZE(C1), xvec15, xvec15; 1854ADD_DX 2*SIZE(C1), xvec14, xvec14; 1855ADD_DX 4*SIZE(C1), xvec13, xvec13; 1856ADD_DX 6*SIZE(C1), xvec12, xvec12; 1857#endif 1858ST_DX xvec11, 0*SIZE(C0); 1859ST_DX xvec10, 2*SIZE(C0); 1860ST_DX xvec9, 4*SIZE(C0); 1861ST_DX xvec8, 6*SIZE(C0); 1862ST_DX xvec15, 
0*SIZE(C1); 1863ST_DX xvec14, 2*SIZE(C1); 1864ST_DX xvec13, 4*SIZE(C1); 1865ST_DX xvec12, 6*SIZE(C1); 1866#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1867MOVQ bk, %rax; 1868SUBQ kkk, %rax; 1869LEAQ (,%rax, SIZE), %rax; 1870LEAQ (ptrba, %rax, 8), ptrba; 1871LEAQ (ptrbb, %rax, 2), ptrbb; 1872#endif 1873#if defined(TRMMKERNEL) && defined(LEFT) 1874ADDQ $8, kk 1875#endif 1876ADDQ $8*SIZE, C0; 1877ADDQ $8*SIZE, C1; 1878DECQ i; 1879JG .L21_bodyB; 1880JMP .L21_loopE; 1881ALIGN_5 1882.L213_loopEx:; 1883#ifndef TRMMKERNEL 1884LDL_DX 0*SIZE(C0), xvec0, xvec0; 1885LDH_DX 1*SIZE(C0), xvec0, xvec0; 1886LDL_DX 2*SIZE(C0), xvec1, xvec1; 1887LDH_DX 3*SIZE(C0), xvec1, xvec1; 1888LDL_DX 4*SIZE(C0), xvec2, xvec2; 1889LDH_DX 5*SIZE(C0), xvec2, xvec2; 1890LDL_DX 6*SIZE(C0), xvec3, xvec3; 1891LDH_DX 7*SIZE(C0), xvec3, xvec3; 1892ADD_DX xvec0, xvec11, xvec11; 1893ADD_DX xvec1, xvec10, xvec10; 1894ADD_DX xvec2, xvec9, xvec9; 1895ADD_DX xvec3, xvec8, xvec8; 1896#endif 1897STL_DX xvec11, 0*SIZE(C0); 1898STH_DX xvec11, 1*SIZE(C0); 1899STL_DX xvec10, 2*SIZE(C0); 1900STH_DX xvec10, 3*SIZE(C0); 1901STL_DX xvec9, 4*SIZE(C0); 1902STH_DX xvec9, 5*SIZE(C0); 1903STL_DX xvec8, 6*SIZE(C0); 1904STH_DX xvec8, 7*SIZE(C0); 1905#ifndef TRMMKERNEL 1906LDL_DX 0*SIZE(C1), xvec4, xvec4; 1907LDH_DX 1*SIZE(C1), xvec4, xvec4; 1908LDL_DX 2*SIZE(C1), xvec5, xvec5; 1909LDH_DX 3*SIZE(C1), xvec5, xvec5; 1910LDL_DX 4*SIZE(C1), xvec6, xvec6; 1911LDH_DX 5*SIZE(C1), xvec6, xvec6; 1912LDL_DX 6*SIZE(C1), xvec7, xvec7; 1913LDH_DX 7*SIZE(C1), xvec7, xvec7; 1914ADD_DX xvec4, xvec15, xvec15; 1915ADD_DX xvec5, xvec14, xvec14; 1916ADD_DX xvec6, xvec13, xvec13; 1917ADD_DX xvec7, xvec12, xvec12; 1918#endif 1919STL_DX xvec15, 0*SIZE(C1); 1920STH_DX xvec15, 1*SIZE(C1); 1921STL_DX xvec14, 2*SIZE(C1); 1922STH_DX xvec14, 3*SIZE(C1); 1923STL_DX xvec13, 4*SIZE(C1); 1924STH_DX xvec13, 5*SIZE(C1); 1925STL_DX xvec12, 6*SIZE(C1); 1926STH_DX xvec12, 7*SIZE(C1); 1927#if 
(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1928MOVQ bk, %rax; 1929SUBQ kkk, %rax; 1930LEAQ (,%rax, SIZE), %rax; 1931LEAQ (ptrba, %rax, 8), ptrba; 1932LEAQ (ptrbb, %rax, 2), ptrbb; 1933#endif 1934#if defined(TRMMKERNEL) && defined(LEFT) 1935ADDQ $8, kk 1936#endif 1937ADDQ $8*SIZE, C0; 1938ADDQ $8*SIZE, C1; 1939DECQ i; 1940JG .L21_bodyB; 1941.L21_loopE:; 1942TEST $4, bm; # Rm = 4 1943JLE .L22_loopE; 1944ALIGN_5; 1945.L22_bodyB:; 1946#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 1947MOVQ bb, ptrbb; 1948#else 1949MOVQ bb, ptrbb; 1950MOVQ kk, %rax; 1951LEAQ (,%rax, SIZE), %rax; 1952LEAQ (ptrba, %rax, 4), ptrba; 1953LEAQ (ptrbb, %rax, 2), ptrbb; 1954#endif 1955//#### Initial Results Register #### 1956XOR_DY yvec15, yvec15, yvec15; 1957XOR_DY yvec14, yvec14, yvec14; 1958XOR_DY yvec11, yvec11, yvec11; 1959XOR_DY yvec10, yvec10, yvec10; 1960#ifndef TRMMKERNEL 1961MOVQ bk, k; 1962#elif (defined(LEFT) && !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 1963MOVQ bk, %rax; 1964SUBQ kk, %rax; 1965MOVQ %rax, kkk; 1966#else 1967MOVQ kk, %rax; 1968#ifdef LEFT 1969ADDQ $4, %rax; 1970#else 1971ADDQ $2, %rax; 1972#endif 1973MOVQ %rax, kkk; 1974#endif 1975SARQ $2, k; 1976JLE .L221_loopE; 1977ALIGN_5 1978.L221_bodyB:; 1979# Computing kernel 1980//#### Unroll time 1 #### 1981LD_DX 0*SIZE(ptrba), xvec0; 1982LD_DX 0*SIZE(ptrbb), xvec4; 1983MOV_DX xvec4, xvec5; 1984MUL_DX xvec0, xvec4, xvec4; 1985ADD_DX xvec4, xvec15, xvec15; 1986 1987LD_DX 2*SIZE(ptrba), xvec1; 1988SHUF_DX $0x4e, xvec5, xvec4; 1989MUL_DX xvec1, xvec5, xvec5; 1990ADD_DX xvec5, xvec14, xvec14; 1991 1992MOV_DX xvec4, xvec5; 1993MUL_DX xvec0, xvec4, xvec4; 1994ADD_DX xvec4, xvec11, xvec11; 1995 1996MUL_DX xvec1, xvec5, xvec5; 1997ADD_DX xvec5, xvec10, xvec10; 1998 1999//#### Unroll time 2 #### 2000LD_DX 4*SIZE(ptrba), xvec0; 2001LD_DX 2*SIZE(ptrbb), xvec4; 2002MOV_DX 
xvec4, xvec5; 2003MUL_DX xvec0, xvec4, xvec4; 2004ADD_DX xvec4, xvec15, xvec15; 2005 2006LD_DX 6*SIZE(ptrba), xvec1; 2007SHUF_DX $0x4e, xvec5, xvec4; 2008MUL_DX xvec1, xvec5, xvec5; 2009ADD_DX xvec5, xvec14, xvec14; 2010 2011MOV_DX xvec4, xvec5; 2012MUL_DX xvec0, xvec4, xvec4; 2013ADD_DX xvec4, xvec11, xvec11; 2014 2015MUL_DX xvec1, xvec5, xvec5; 2016ADD_DX xvec5, xvec10, xvec10; 2017 2018//#### Unroll time 3 #### 2019LD_DX 8*SIZE(ptrba), xvec0; 2020LD_DX 4*SIZE(ptrbb), xvec4; 2021MOV_DX xvec4, xvec5; 2022MUL_DX xvec0, xvec4, xvec4; 2023ADD_DX xvec4, xvec15, xvec15; 2024 2025LD_DX 10*SIZE(ptrba), xvec1; 2026SHUF_DX $0x4e, xvec5, xvec4; 2027MUL_DX xvec1, xvec5, xvec5; 2028ADD_DX xvec5, xvec14, xvec14; 2029 2030MOV_DX xvec4, xvec5; 2031MUL_DX xvec0, xvec4, xvec4; 2032ADD_DX xvec4, xvec11, xvec11; 2033 2034MUL_DX xvec1, xvec5, xvec5; 2035ADD_DX xvec5, xvec10, xvec10; 2036 2037//#### Unroll time 4 #### 2038LD_DX 12*SIZE(ptrba), xvec0; 2039LD_DX 6*SIZE(ptrbb), xvec4; 2040MOV_DX xvec4, xvec5; 2041MUL_DX xvec0, xvec4, xvec4; 2042ADD_DX xvec4, xvec15, xvec15; 2043ADDQ $8*SIZE, ptrbb; 2044 2045LD_DX 14*SIZE(ptrba), xvec1; 2046SHUF_DX $0x4e, xvec5, xvec4; 2047MUL_DX xvec1, xvec5, xvec5; 2048ADD_DX xvec5, xvec14, xvec14; 2049ADDQ $16*SIZE, ptrba; 2050 2051MOV_DX xvec4, xvec5; 2052MUL_DX xvec0, xvec4, xvec4; 2053ADD_DX xvec4, xvec11, xvec11; 2054 2055MUL_DX xvec1, xvec5, xvec5; 2056ADD_DX xvec5, xvec10, xvec10; 2057DECQ k; 2058JG .L221_bodyB; 2059ALIGN_5 2060.L221_loopE:; 2061#ifndef TRMMKERNEL 2062TEST $2, bk; 2063#else 2064MOVQ kkk, %rax; 2065TEST $2, %rax; 2066#endif 2067JLE .L222_loopE; 2068ALIGN_5 2069.L222_bodyB: 2070//#### Unroll time 1 #### 2071LD_DX 0*SIZE(ptrba), xvec0; 2072LD_DX 0*SIZE(ptrbb), xvec4; 2073MOV_DX xvec4, xvec5; 2074MUL_DX xvec0, xvec4, xvec4; 2075ADD_DX xvec4, xvec15, xvec15; 2076 2077LD_DX 2*SIZE(ptrba), xvec1; 2078SHUF_DX $0x4e, xvec5, xvec4; 2079MUL_DX xvec1, xvec5, xvec5; 2080ADD_DX xvec5, xvec14, xvec14; 2081 2082MOV_DX xvec4, xvec5; 2083MUL_DX 
xvec0, xvec4, xvec4; 2084ADD_DX xvec4, xvec11, xvec11; 2085 2086MUL_DX xvec1, xvec5, xvec5; 2087ADD_DX xvec5, xvec10, xvec10; 2088 2089//#### Unroll time 2 #### 2090LD_DX 4*SIZE(ptrba), xvec0; 2091LD_DX 2*SIZE(ptrbb), xvec4; 2092MOV_DX xvec4, xvec5; 2093MUL_DX xvec0, xvec4, xvec4; 2094ADD_DX xvec4, xvec15, xvec15; 2095ADDQ $4*SIZE, ptrbb; 2096 2097LD_DX 6*SIZE(ptrba), xvec1; 2098SHUF_DX $0x4e, xvec5, xvec4; 2099MUL_DX xvec1, xvec5, xvec5; 2100ADD_DX xvec5, xvec14, xvec14; 2101ADDQ $8*SIZE, ptrba; 2102MOV_DX xvec4, xvec5; 2103MUL_DX xvec0, xvec4, xvec4; 2104ADD_DX xvec4, xvec11, xvec11; 2105 2106MUL_DX xvec1, xvec5, xvec5; 2107ADD_DX xvec5, xvec10, xvec10; 2108 2109.L222_loopE: 2110#ifndef TRMMKERNEL 2111TEST $1, bk 2112#else 2113MOVQ kkk, %rax; 2114TEST $1, %rax; 2115#endif 2116JLE .L223_loopE; 2117ALIGN_5 2118.L223_bodyB: 2119//#### Unroll time 1 #### 2120LD_DX 0*SIZE(ptrba), xvec0; 2121LD_DX 0*SIZE(ptrbb), xvec4; 2122MOV_DX xvec4, xvec5; 2123MUL_DX xvec0, xvec4, xvec4; 2124ADD_DX xvec4, xvec15, xvec15; 2125ADDQ $2*SIZE, ptrbb; 2126 2127LD_DX 2*SIZE(ptrba), xvec1; 2128SHUF_DX $0x4e, xvec5, xvec4; 2129MUL_DX xvec1, xvec5, xvec5; 2130ADD_DX xvec5, xvec14, xvec14; 2131ADDQ $4*SIZE, ptrba; 2132 2133MOV_DX xvec4, xvec5; 2134MUL_DX xvec0, xvec4, xvec4; 2135ADD_DX xvec4, xvec11, xvec11; 2136 2137MUL_DX xvec1, xvec5, xvec5; 2138ADD_DX xvec5, xvec10, xvec10; 2139 2140.L223_loopE: 2141//#### Multiply Alpha #### 2142BROAD_DX MEMALPHA, xvec7; 2143MUL_DX xvec7, xvec15, xvec15; 2144MUL_DX xvec7, xvec14, xvec14; 2145MUL_DX xvec7, xvec11, xvec11; 2146MUL_DX xvec7, xvec10, xvec10; 2147//#### Reverse #### 2148MOV_DX xvec15, xvec6; 2149REVS_DX xvec11, xvec15, xvec15; 2150REVS_DX xvec6, xvec11, xvec11; 2151MOV_DX xvec14, xvec6; 2152REVS_DX xvec10, xvec14, xvec14; 2153REVS_DX xvec6, xvec10, xvec10; 2154//#### Testing Alignment #### 2155MOVQ C0, %rax; 2156OR ldc, %rax; 2157TEST $15, %rax; 2158JNE .L223_loopEx; 2159ALIGN_5 2160//#### Writing Back #### 2161#ifndef TRMMKERNEL 2162ADD_DX 
0*SIZE(C0), xvec11, xvec11; 2163ADD_DX 2*SIZE(C0), xvec10, xvec10; 2164ADD_DX 0*SIZE(C1), xvec15, xvec15; 2165ADD_DX 2*SIZE(C1), xvec14, xvec14; 2166#endif 2167ST_DX xvec11, 0*SIZE(C0); 2168ST_DX xvec10, 2*SIZE(C0); 2169ST_DX xvec15, 0*SIZE(C1); 2170ST_DX xvec14, 2*SIZE(C1); 2171#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) 2172MOVQ bk, %rax; 2173SUBQ kkk, %rax; 2174LEAQ (,%rax, SIZE), %rax; 2175LEAQ (ptrba, %rax, 4), ptrba; 2176LEAQ (ptrbb, %rax, 2), ptrbb; 2177#endif 2178#if defined(TRMMKERNEL) && defined(LEFT) 2179ADDQ $4, kk 2180#endif 2181ADDQ $4*SIZE, C0; 2182ADDQ $4*SIZE, C1; 2183JMP .L22_loopE; 2184ALIGN_5 2185.L223_loopEx:; 2186#ifndef TRMMKERNEL 2187LDL_DX 0*SIZE(C0), xvec0, xvec0; 2188LDH_DX 1*SIZE(C0), xvec0, xvec0; 2189LDL_DX 2*SIZE(C0), xvec1, xvec1; 2190LDH_DX 3*SIZE(C0), xvec1, xvec1; 2191ADD_DX xvec0, xvec11, xvec11; 2192ADD_DX xvec1, xvec10, xvec10; 2193#endif 2194STL_DX xvec11, 0*SIZE(C0); 2195STH_DX xvec11, 1*SIZE(C0); 2196STL_DX xvec10, 2*SIZE(C0); 2197STH_DX xvec10, 3*SIZE(C0); 2198#ifndef TRMMKERNEL 2199LDL_DX 0*SIZE(C1), xvec4, xvec4; 2200LDH_DX 1*SIZE(C1), xvec4, xvec4; 2201LDL_DX 2*SIZE(C1), xvec5, xvec5; 2202LDH_DX 3*SIZE(C1), xvec5, xvec5; 2203ADD_DX xvec4, xvec15, xvec15; 2204ADD_DX xvec5, xvec14, xvec14; 2205#endif 2206STL_DX xvec15, 0*SIZE(C1); 2207STH_DX xvec15, 1*SIZE(C1); 2208STL_DX xvec14, 2*SIZE(C1); 2209STH_DX xvec14, 3*SIZE(C1); 2210#if (defined(TRMMKERNEL)&& defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&& !defined(TRANSA)) 2211MOVQ bk, %rax; 2212SUBQ kkk, %rax; 2213LEAQ (,%rax, SIZE), %rax; 2214LEAQ (ptrba, %rax, 4), ptrba; 2215LEAQ (ptrbb, %rax, 2), ptrbb; 2216#endif 2217#if defined(TRMMKERNEL) && defined(LEFT) 2218ADDQ $4, kk 2219#endif 2220ADDQ $4*SIZE, C0; 2221ADDQ $4*SIZE, C1; 2222.L22_loopE:; 2223TEST $2, bm; // Rm = 2 2224JLE .L23_loopE; 2225ALIGN_5; 2226.L23_bodyB: 2227#if 
!defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 2228MOVQ bb, ptrbb; 2229#else 2230MOVQ bb, ptrbb; 2231MOVQ kk, %rax; 2232LEAQ (,%rax, SIZE), %rax; 2233LEAQ (ptrba, %rax, 2), ptrba; 2234LEAQ (ptrbb, %rax, 2), ptrbb; 2235#endif 2236XOR_DY yvec15, yvec15, yvec15; 2237XOR_DY yvec11, yvec11, yvec11; 2238#ifndef TRMMKERNEL 2239MOVQ bk, k; 2240#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 2241MOVQ bk, %rax; 2242SUBQ kk, %rax; 2243MOVQ %rax, kkk; 2244#else 2245MOVQ kk, %rax; 2246#ifdef LEFT 2247ADDQ $2, %rax; 2248#else 2249ADDQ $2, %rax; 2250#endif 2251MOVQ %rax, kkk; 2252#endif 2253SARQ $2, k; 2254JLE .L231_loopE; 2255ALIGN_5 2256.L231_bodyB: 2257# Computing kernel 2258//#### Unroll time 1 #### 2259LD_DX 0*SIZE(ptrba), xvec0; 2260LD_DX 0*SIZE(ptrbb), xvec4; 2261SHUF_DX $0x4e, xvec4, xvec5; 2262MUL_DX xvec0, xvec4, xvec4; 2263ADD_DX xvec4, xvec15, xvec15; 2264 2265MUL_DX xvec0, xvec5, xvec5; 2266ADD_DX xvec5, xvec11, xvec11; 2267//#### Unroll time 2 #### 2268LD_DX 2*SIZE(ptrba), xvec0; 2269LD_DX 2*SIZE(ptrbb), xvec4; 2270SHUF_DX $0x4e, xvec4, xvec5; 2271MUL_DX xvec0, xvec4, xvec4; 2272ADD_DX xvec4, xvec15, xvec15; 2273 2274MUL_DX xvec0, xvec5, xvec5; 2275ADD_DX xvec5, xvec11, xvec11; 2276//#### Unroll time 3 #### 2277LD_DX 4*SIZE(ptrba), xvec0; 2278LD_DX 4*SIZE(ptrbb), xvec4; 2279SHUF_DX $0x4e, xvec4, xvec5; 2280MUL_DX xvec0, xvec4, xvec4; 2281ADD_DX xvec4, xvec15, xvec15; 2282 2283MUL_DX xvec0, xvec5, xvec5; 2284ADD_DX xvec5, xvec11, xvec11; 2285//#### Unroll time 4 #### 2286LD_DX 6*SIZE(ptrba), xvec0; 2287LD_DX 6*SIZE(ptrbb), xvec4; 2288SHUF_DX $0x4e, xvec4, xvec5; 2289MUL_DX xvec0, xvec4, xvec4; 2290ADD_DX xvec4, xvec15, xvec15; 2291ADDQ $8*SIZE, ptrba; 2292MUL_DX xvec0, xvec5, xvec5; 2293ADD_DX xvec5, xvec11, xvec11; 2294ADDQ $8*SIZE, ptrbb; 2295DECQ k; 2296JG .L231_bodyB; 2297ALIGN_5 2298.L231_loopE: 2299#ifndef TRMMKERNEL 2300TEST $2, bk; 2301#else 
2302MOVQ kkk, %rax; 2303TEST $2, %rax; 2304#endif 2305JLE .L232_loopE; 2306ALIGN_5 2307.L232_bodyB: 2308//#### Unroll time 1 #### 2309LD_DX 0*SIZE(ptrba), xvec0; 2310LD_DX 0*SIZE(ptrbb), xvec4; 2311SHUF_DX $0x4e, xvec4, xvec5; 2312MUL_DX xvec0, xvec4, xvec4; 2313ADD_DX xvec4, xvec15, xvec15; 2314 2315MUL_DX xvec0, xvec5, xvec5; 2316ADD_DX xvec5, xvec11, xvec11; 2317//#### Unroll time 2 #### 2318LD_DX 2*SIZE(ptrba), xvec0; 2319LD_DX 2*SIZE(ptrbb), xvec4; 2320SHUF_DX $0x4e, xvec4, xvec5; 2321MUL_DX xvec0, xvec4, xvec4; 2322ADD_DX xvec4, xvec15, xvec15; 2323ADDQ $4*SIZE, ptrba; 2324MUL_DX xvec0, xvec5, xvec5; 2325ADD_DX xvec5, xvec11, xvec11; 2326ADDQ $4*SIZE, ptrbb; 2327.L232_loopE: 2328#ifndef TRMMKERNEL 2329TEST $1, bk; 2330#else 2331MOVQ kkk, %rax; 2332TEST $1, %rax; 2333#endif 2334JLE .L233_loopE; 2335ALIGN_5 2336.L233_bodyB: 2337//#### Unroll time 1 #### 2338LD_DX 0*SIZE(ptrba), xvec0; 2339LD_DX 0*SIZE(ptrbb), xvec4; 2340SHUF_DX $0x4e, xvec4, xvec5; 2341MUL_DX xvec0, xvec4, xvec4; 2342ADD_DX xvec4, xvec15, xvec15; 2343ADDQ $2*SIZE, ptrba; 2344MUL_DX xvec0, xvec5, xvec5; 2345ADD_DX xvec5, xvec11, xvec11; 2346ADDQ $2*SIZE, ptrbb; 2347.L233_loopE: 2348//#### Multiply Alpha #### 2349BROAD_DX MEMALPHA, xvec7; 2350MUL_DX xvec7, xvec15, xvec15; 2351MUL_DX xvec7, xvec11, xvec11; 2352//#### Reverse #### 2353MOV_DX xvec15, xvec6; 2354REVS_DX xvec11, xvec15, xvec15; 2355REVS_DX xvec6, xvec11, xvec11; 2356//#### Testing Alignment #### 2357MOVQ C0, %rax; 2358OR ldc, %rax; 2359TEST $15, %rax; 2360JNE .L233_loopEx; 2361ALIGN_5 2362//#### Writing Back #### 2363#ifndef TRMMKERNEL 2364ADD_DX 0*SIZE(C0), xvec11, xvec11; 2365ADD_DX 0*SIZE(C1), xvec15, xvec15; 2366#endif 2367ST_DX xvec11, 0*SIZE(C0); 2368ST_DX xvec15, 0*SIZE(C1); 2369#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 2370MOVQ bk, %rax; 2371SUBQ kkk, %rax; 2372LEAQ (,%rax, SIZE), %rax; 2373LEAQ (ptrba, %rax, 2), ptrba; 2374LEAQ (ptrbb, %rax, 2), ptrbb; 
2375#endif 2376#if defined(TRMMKERNEL) && defined(LEFT) 2377ADDQ $2, kk; 2378#endif 2379ADDQ $2*SIZE, C0; 2380ADDQ $2*SIZE, C1; 2381JMP .L23_loopE; 2382ALIGN_5 2383.L233_loopEx:; 2384#ifndef TRMMKERNEL 2385LDL_DX 0*SIZE(C0), xvec0, xvec0; 2386LDH_DX 1*SIZE(C0), xvec0, xvec0; 2387ADD_DX xvec0, xvec11, xvec11; 2388#endif 2389STL_DX xvec11, 0*SIZE(C0); 2390STH_DX xvec11, 1*SIZE(C0); 2391#ifndef TRMMKERNEL 2392LDL_DX 0*SIZE(C1), xvec4, xvec4; 2393LDH_DX 1*SIZE(C1), xvec4, xvec4; 2394ADD_DX xvec4, xvec15, xvec15; 2395#endif 2396STL_DX xvec15, 0*SIZE(C1); 2397STH_DX xvec15, 1*SIZE(C1); 2398#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 2399MOVQ bk, %rax; 2400SUBQ kkk, %rax; 2401LEAQ (,%rax, SIZE), %rax; 2402LEAQ (ptrba, %rax, 2), ptrba; 2403LEAQ (ptrbb, %rax, 2), ptrbb; 2404#endif 2405#if defined(TRMMKERNEL) && defined(LEFT) 2406ADDQ $2, kk; 2407#endif 2408ADDQ $2*SIZE, C0; 2409ADDQ $2*SIZE, C1; 2410.L23_loopE: 2411TEST $1, bm; // Rm = 1 2412JLE .L24_loopE; 2413ALIGN_5; 2414.L24_bodyB: 2415#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) 2416MOVQ bb, ptrbb; 2417#else 2418MOVQ bb, ptrbb; 2419MOVQ kk, %rax; 2420LEAQ (, %rax, SIZE), %rax; 2421ADDQ %rax, ptrba; 2422LEAQ (ptrbb, %rax, 2), ptrbb; 2423#endif 2424XOR_DY yvec15, yvec15, yvec15; 2425#ifndef TRMMKERNEL 2426MOVQ bk, k; 2427#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA)) 2428MOVQ bk, %rax; 2429SUBQ kk, %rax; 2430MOVQ %rax, kkk; 2431#else 2432MOVQ kk, %rax; 2433#ifdef LEFT 2434ADDQ $1, %rax; 2435#else 2436ADDQ $2, %rax; 2437#endif 2438MOVQ %rax, kkk; 2439#endif 2440SARQ $2, k; 2441JLE .L241_loopE; 2442ALIGN_5 2443.L241_bodyB: 2444BROAD_DX 0*SIZE(ptrba), xvec0; 2445LD_DX 0*SIZE(ptrbb), xvec2; 2446MUL_DX xvec0, xvec2, xvec2; 2447ADD_DX xvec2, xvec15, xvec15; 2448 2449BROAD_DX 1*SIZE(ptrba), xvec1; 2450LD_DX 2*SIZE(ptrbb), xvec3; 2451MUL_DX 
xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;

BROAD_DX 2*SIZE(ptrba), xvec0;
LD_DX 4*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

BROAD_DX 3*SIZE(ptrba), xvec1;
LD_DX 6*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ $4*SIZE, ptrba;
ADDQ $8*SIZE, ptrbb;
DECQ k;
JG .L241_bodyB;
ALIGN_5
.L241_loopE:
// K remainder: two leftover iterations of the 1x2 micro-tile.
// NOTE(review): LD_DX/BROAD_DX/MUL_DX/ADD_DX are assumed to be the
// double-precision SSE/AVX wrappers from common.h — confirm there.
#ifndef TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L242_loopE;
ALIGN_5
.L242_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

BROAD_DX 1*SIZE(ptrba), xvec1;
LD_DX 2*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ $2*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
.L242_loopE:
// K remainder: one leftover iteration of the 1x2 micro-tile.
#ifndef TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L243_loopE;
ALIGN_5
.L243_bodyB:
BROAD_DX 0*SIZE(ptrba), xvec0;
LD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;
ADDQ $1*SIZE, ptrba;
ADDQ $2*SIZE, ptrbb;

.L243_loopE:
// Scale the 1x2 accumulator by alpha and write one element to each
// of the two C columns (C0, C1); non-TRMM adds the existing C values.
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec0, xvec0;
LDH_DX 0*SIZE(C1), xvec0, xvec0;
ADD_DX xvec0, xvec15, xvec15;
#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 0*SIZE(C1);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
// TRMM: skip the untouched triangle part (ptrba += k, ptrbb += 2*k doubles).
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
ADDQ %rax, ptrba;
LEAQ (ptrbb, %rax, 2), ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $1, kk;
#endif
ADDQ $1*SIZE, C0;
ADDQ $1*SIZE, C1;
.L24_loopE:
#if defined(TRMMKERNEL) && !defined(LEFT)
ADDQ $2, kk;
#endif
// Advance B past the two packed columns just consumed (bk*2*SIZE = bk<<4
// bytes for doubles) and move C two columns to the right.
MOVQ bk, k;
SALQ $4, k;
ADDQ k, bb;
LEAQ (C, ldc, 2), C;
.L20_loopE:
TEST $1, bn; // Rn = 1: one remaining column of C
JLE .L30_loopE;
ALIGN_5
.L30_bodyB:
#if defined(TRMMKERNEL)&&defined(LEFT)
MOVQ OFFSET, %rax;
MOVQ %rax, kk;
#endif
MOVQ C, C0;
MOVQ ba, ptrba;
// M loop, 8 rows at a time (8x1 micro-tile).
MOVQ bm, i;
SARQ $3, i;
JLE .L31_loopE;
ALIGN_5
.L31_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
// TRMM: start kk elements into the packed panels (ptrba += 8*kk, ptrbb += kk).
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
XOR_DY yvec14, yvec14, yvec14;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $8, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
#endif
// K loop unrolled 4x: each unroll does an 8x1 rank-1 update
// (two 4-double ymm loads of A, one broadcast of B).
SARQ $2, k;
JLE .L311_loopE;
ALIGN_5
.L311_bodyB:
//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec0, yvec0;
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;

//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
MUL_DY yvec5, yvec3, yvec3;
ADD_DY yvec3, yvec15, yvec15;
MUL_DY yvec5, yvec4, yvec4;
ADD_DY yvec4, yvec14, yvec14;

//#### Unroll time 3 ####
LD_DY 16*SIZE(ptrba), yvec0;
LD_DY 20*SIZE(ptrba), yvec1;
BROAD_DY 2*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec0, yvec0;
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;

//#### Unroll time 4 ####
LD_DY 24*SIZE(ptrba), yvec3;
LD_DY 28*SIZE(ptrba), yvec4;
BROAD_DY 3*SIZE(ptrbb), yvec5;
MUL_DY yvec5, yvec3, yvec3;
ADD_DY yvec3, yvec15, yvec15;
ADDQ $32*SIZE, ptrba;
MUL_DY yvec5, yvec4, yvec4;
ADD_DY yvec4, yvec14, yvec14;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L311_bodyB;
ALIGN_5
.L311_loopE:
// K remainder: two leftover iterations of the 8x1 micro-tile.
#ifndef TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L312_loopE;
ALIGN_5
.L312_bodyB:
//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec0, yvec0;
ADD_DY yvec0, yvec15, yvec15;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;

//#### Unroll time 2 ####
LD_DY 8*SIZE(ptrba), yvec3;
LD_DY 12*SIZE(ptrba), yvec4;
BROAD_DY 1*SIZE(ptrbb), yvec5;
MUL_DY yvec5, yvec3, yvec3;
ADD_DY yvec3, yvec15, yvec15;
ADDQ $16*SIZE, ptrba;
MUL_DY yvec5, yvec4, yvec4;
ADD_DY yvec4, yvec14, yvec14;
ADDQ $2*SIZE, ptrbb;

.L312_loopE:
// K remainder: one leftover iteration of the 8x1 micro-tile.
#ifndef TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L313_loopE;
ALIGN_5
.L313_bodyB:
//#### Unroll time 1 ####
LD_DY 0*SIZE(ptrba), yvec0;
LD_DY 4*SIZE(ptrba), yvec1;
BROAD_DY 0*SIZE(ptrbb), yvec2;
MUL_DY yvec2, yvec0, yvec0;
ADD_DY yvec0, yvec15, yvec15;
ADDQ $8*SIZE, ptrba;
MUL_DY yvec2, yvec1, yvec1;
ADD_DY yvec1, yvec14, yvec14;
ADDQ $1*SIZE, ptrbb;

.L313_loopE:
//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
MUL_DY yvec7, yvec14, yvec14;
//#### Testing Alignment ####
// If C0 (and ldc) are 16-byte aligned, use the aligned 128-bit path;
// otherwise fall through to the unaligned low/high-half path below.
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L313_loopEx;
ALIGN_5
//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
ADD_DX 2*SIZE(C0), xvec13, xvec13;
ADD_DX 4*SIZE(C0), xvec14, xvec14;
ADD_DX 6*SIZE(C0), xvec12, xvec12;
#endif
ST_DX xvec15, 0*SIZE(C0);
ST_DX xvec13, 2*SIZE(C0);
ST_DX xvec14, 4*SIZE(C0);
ST_DX xvec12, 6*SIZE(C0);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $8*SIZE, C0;
DECQ i;
JG .L31_bodyB;
JMP .L31_loopE;
ALIGN_5
.L313_loopEx:
// Unaligned write-back path: same 8 results, stored as low/high halves.
EXTRA_DY $1, yvec15, xvec13;
EXTRA_DY $1, yvec14, xvec12;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec11, xvec11;
LDH_DX 1*SIZE(C0), xvec11, xvec11;
LDL_DX 2*SIZE(C0), xvec10, xvec10;
LDH_DX 3*SIZE(C0), xvec10, xvec10;
LDL_DX 4*SIZE(C0), xvec9, xvec9;
LDH_DX 5*SIZE(C0), xvec9, xvec9;
LDL_DX 6*SIZE(C0), xvec8, xvec8;
LDH_DX 7*SIZE(C0), xvec8, xvec8;
ADD_DX xvec11, xvec15, xvec15;
ADD_DX xvec10, xvec13, xvec13;
ADD_DX xvec9, xvec14, xvec14;
ADD_DX xvec8, xvec12, xvec12;
#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 1*SIZE(C0);
STL_DX xvec13, 2*SIZE(C0);
STH_DX xvec13, 3*SIZE(C0);
STL_DX xvec14, 4*SIZE(C0);
STH_DX xvec14, 5*SIZE(C0);
STL_DX xvec12, 6*SIZE(C0);
STH_DX xvec12, 7*SIZE(C0);
#if (defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 8), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL)&&defined(LEFT)
ADDQ $8, kk;
#endif
ADDQ $8*SIZE, C0;
DECQ i;
JG .L31_bodyB;
.L31_loopE:
// M remainder: 4 rows (4x1 micro-tile).
TEST $4, bm;
JLE .L32_loopE;
ALIGN_5
.L32_bodyB:
#if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
//#### Initial Results Register ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $4, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L321_loopE;
ALIGN_5
.L321_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY yvec0, yvec1, yvec1;
ADD_DY yvec1, yvec15, yvec15;

LD_DY 4*SIZE(ptrba), yvec2;
BROAD_DY 1*SIZE(ptrbb), yvec3;
MUL_DY yvec2, yvec3, yvec3;
ADD_DY yvec3, yvec15, yvec15;

LD_DY 8*SIZE(ptrba), yvec4;
BROAD_DY 2*SIZE(ptrbb), yvec5;
MUL_DY yvec4, yvec5, yvec5;
ADD_DY yvec5, yvec15, yvec15;

LD_DY 12*SIZE(ptrba), yvec6;
BROAD_DY 3*SIZE(ptrbb), yvec7;
MUL_DY yvec6, yvec7, yvec7;
ADD_DY yvec7, yvec15, yvec15;
ADDQ $16*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L321_bodyB;
ALIGN_5
.L321_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L322_loopE;
ALIGN_5
.L322_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY yvec0, yvec1, yvec1;
ADD_DY yvec1, yvec15, yvec15;

LD_DY 4*SIZE(ptrba), yvec2;
BROAD_DY 1*SIZE(ptrbb), yvec3;
MUL_DY yvec2, yvec3, yvec3;
ADD_DY yvec3, yvec15, yvec15;
ADDQ $8*SIZE, ptrba;
ADDQ $2*SIZE, ptrbb;

.L322_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L323_loopE;
ALIGN_5
.L323_bodyB:
LD_DY 0*SIZE(ptrba), yvec0;
BROAD_DY 0*SIZE(ptrbb), yvec1;
MUL_DY yvec0, yvec1, yvec1;
ADD_DY yvec1, yvec15, yvec15;
ADDQ $4*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;

.L323_loopE:
//#### Multiply Alpha ####
BROAD_DY MEMALPHA, yvec7;
MUL_DY yvec7, yvec15, yvec15;
//#### Testing Alignment ####
MOVQ C0, %rax;
OR ldc, %rax;
TEST $15, %rax;
JNE .L323_loopEx;
ALIGN_5
//#### Writing Back ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
ADD_DX 0*SIZE(C0), xvec15, xvec15;
ADD_DX 2*SIZE(C0), xvec14, xvec14;
#endif
ST_DX xvec15, 0*SIZE(C0);
ST_DX xvec14, 2*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $4*SIZE, C0;
JMP .L32_loopE;
ALIGN_5
.L323_loopEx:
//#### Writing Back (unaligned C) ####
EXTRA_DY $1, yvec15, xvec14;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec13, xvec13;
LDH_DX 1*SIZE(C0), xvec13, xvec13;
LDL_DX 2*SIZE(C0), xvec12, xvec12;
LDH_DX 3*SIZE(C0), xvec12, xvec12;
ADD_DX xvec13, xvec15, xvec15;
ADD_DX xvec12, xvec14, xvec14;
#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 1*SIZE(C0);
STL_DX xvec14, 2*SIZE(C0);
STH_DX xvec14, 3*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 4), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
ADDQ $4, kk;
#endif
ADDQ $4*SIZE, C0;
.L32_loopE:
// M remainder: 2 rows (2x1 micro-tile, SSE-width xmm accumulators).
TEST $2, bm;
JLE .L33_loopE;
ALIGN_5
.L33_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
//#### Initial Result ####
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&&!defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $2, %rax;
#else
ADDQ $1, %rax;
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L331_loopE;
ALIGN_5
.L331_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX 2*SIZE(ptrba), xvec1;
BROAD_DX 1*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;

LD_DX 4*SIZE(ptrba), xvec4;
BROAD_DX 2*SIZE(ptrbb), xvec5;
MUL_DX xvec4, xvec5, xvec5;
ADD_DX xvec5, xvec15, xvec15;

LD_DX 6*SIZE(ptrba), xvec6;
BROAD_DX 3*SIZE(ptrbb), xvec7;
MUL_DX xvec6, xvec7, xvec7;
ADD_DX xvec7, xvec15, xvec15;
ADDQ $8*SIZE, ptrba;
ADDQ $4*SIZE, ptrbb;
DECQ k;
JG .L331_bodyB;
ALIGN_5
.L331_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L332_loopE;
ALIGN_5
.L332_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;

LD_DX 2*SIZE(ptrba), xvec1;
BROAD_DX 1*SIZE(ptrbb), xvec3;
MUL_DX xvec1, xvec3, xvec3;
ADD_DX xvec3, xvec15, xvec15;
ADDQ $4*SIZE, ptrba;
ADDQ $2*SIZE, ptrbb;
.L332_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L333_loopE;
ALIGN_5
.L333_bodyB:
LD_DX 0*SIZE(ptrba), xvec0;
BROAD_DX 0*SIZE(ptrbb), xvec2;
MUL_DX xvec0, xvec2, xvec2;
ADD_DX xvec2, xvec15, xvec15;
ADDQ $2*SIZE, ptrba;
ADDQ $1*SIZE, ptrbb;
.L333_loopE:
//#### Multiply Alpha ####
BROAD_DX MEMALPHA, xvec7;
MUL_DX xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
LDL_DX 0*SIZE(C0), xvec14, xvec14;
LDH_DX 1*SIZE(C0), xvec14, xvec14;
ADD_DX xvec14, xvec15, xvec15;
#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 1*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
LEAQ (ptrba, %rax, 2), ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $2, kk;
#endif
ADDQ $2*SIZE, C0;
.L33_loopE:
// M remainder: final single row (1x1 scalar micro-tile).
TEST $1, bm;
JLE .L34_loopE;
ALIGN_5
.L34_bodyB:
#if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bb, ptrbb;
#else
MOVQ bb, ptrbb;
MOVQ kk, %rax;
LEAQ (, %rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
XOR_DY yvec15, yvec15, yvec15;
#ifndef TRMMKERNEL
MOVQ bk, k;
#elif (defined(LEFT)&& !defined(TRANSA))||(!defined(LEFT)&&defined(TRANSA))
MOVQ bk, %rax;
SUBQ kk, %rax;
MOVQ %rax, kkk;
#else
MOVQ kk, %rax;
#ifdef LEFT
ADDQ $1, %rax;   // mr == 1 here, so both branches add 1
#else
ADDQ $1, %rax;   // nr == 1 here
#endif
MOVQ %rax, kkk;
#endif
SARQ $2, k;
JLE .L341_loopE;
ALIGN_5
.L341_bodyB:
vmovsd 0*SIZE(ptrba), xvec0;
vmovsd 0*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;

vmovsd 1*SIZE(ptrba), xvec0;
vmovsd 1*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;

vmovsd 2*SIZE(ptrba), xvec0;
vmovsd 2*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;

vmovsd 3*SIZE(ptrba), xvec0;
vmovsd 3*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;
addq $4*SIZE, ptrba;
addq $4*SIZE, ptrbb;
decq k;
JG .L341_bodyB;
ALIGN_5
.L341_loopE:
#ifndef TRMMKERNEL
TEST $2, bk;
#else
MOVQ kkk, %rax;
TEST $2, %rax;
#endif
JLE .L342_loopE;
ALIGN_5
.L342_bodyB:
vmovsd 0*SIZE(ptrba), xvec0;
vmovsd 0*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;

vmovsd 1*SIZE(ptrba), xvec0;
vmovsd 1*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;
addq $2*SIZE, ptrba;
addq $2*SIZE, ptrbb;

.L342_loopE:
#ifndef TRMMKERNEL
TEST $1, bk;
#else
MOVQ kkk, %rax;
TEST $1, %rax;
#endif
JLE .L343_loopE;
ALIGN_5
.L343_bodyB:
vmovsd 0*SIZE(ptrba), xvec0;
vmovsd 0*SIZE(ptrbb), xvec1;
vmulsd xvec0, xvec1, xvec1;
vaddsd xvec1, xvec15, xvec15;
addq $1*SIZE, ptrba;
addq $1*SIZE, ptrbb;

.L343_loopE:
//#### Writing Back ####
vmovsd MEMALPHA, xvec7;
vmulsd xvec7, xvec15, xvec15;
#ifndef TRMMKERNEL
vmovsd 0*SIZE(C0), xvec0;
vaddsd xvec0, xvec15, xvec15;
#endif
// VEX store (was legacy-SSE movsd): same semantics, avoids an
// SSE/AVX transition penalty while the ymm upper state is dirty.
vmovsd xvec15, 0*SIZE(C0);
#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
MOVQ bk, %rax;
SUBQ kkk, %rax;
LEAQ (,%rax, SIZE), %rax;
ADDQ %rax, ptrba;
ADDQ %rax, ptrbb;
#endif
#if defined(TRMMKERNEL) && defined(LEFT)
addq $1, kk;
#endif
addq $1*SIZE, C0;
.L34_loopE:
// Advance B past the single packed column (bk*SIZE = bk<<3 bytes) and
// move C one column right.
MOVQ bk, k;
SALQ $3, k;
ADDQ k, bb;
LEAQ (C, ldc, 1), C;

.L30_loopE:
// Epilogue: restore callee-saved registers from the save area, clear the
// dirty ymm upper state before returning to SSE/C code, and unwind.
movq 0(%rsp), %rbx;
movq 8(%rsp), %rbp;
movq 16(%rsp), %r12;
movq 24(%rsp), %r13;
movq 32(%rsp), %r14;
movq 40(%rsp), %r15;

vzeroupper

#ifdef WINDOWS_ABI
	movq 48(%rsp), %rdi
	movq 56(%rsp), %rsi
	movups 64(%rsp), %xmm6
	movups 80(%rsp), %xmm7
	movups 96(%rsp), %xmm8
	movups 112(%rsp), %xmm9
	movups 128(%rsp), %xmm10
	movups 144(%rsp), %xmm11
	movups 160(%rsp), %xmm12
	movups 176(%rsp), %xmm13
	movups 192(%rsp), %xmm14
	movups 208(%rsp), %xmm15
#endif
addq $STACKSIZE, %rsp;
ret

EPILOGUE