1/*********************************************************************/ 2/* Copyright 2009, 2010 The University of Texas at Austin. */ 3/* All rights reserved. */ 4/* */ 5/* Redistribution and use in source and binary forms, with or */ 6/* without modification, are permitted provided that the following */ 7/* conditions are met: */ 8/* */ 9/* 1. Redistributions of source code must retain the above */ 10/* copyright notice, this list of conditions and the following */ 11/* disclaimer. */ 12/* */ 13/* 2. Redistributions in binary form must reproduce the above */ 14/* copyright notice, this list of conditions and the following */ 15/* disclaimer in the documentation and/or other materials */ 16/* provided with the distribution. */ 17/* */ 18/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ 19/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ 20/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ 21/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ 22/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ 23/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ 24/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ 25/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ 26/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ 27/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ 28/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ 29/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ 30/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ 31/* POSSIBILITY OF SUCH DAMAGE. */ 32/* */ 33/* The views and conclusions contained in the software and */ 34/* documentation are those of the authors and should not be */ 35/* interpreted as representing official policies, either expressed */ 36/* or implied, of The University of Texas at Austin. */ 37/*********************************************************************/ 38 39/********************************************************************* 40* 2013/10/20 Saar 41* BLASTEST : OK 42* CTEST : OK 43* TEST : OK 44 45* 46* 47* 2013/10/20 Saar 48* Parameter: 49* DGEMM_DEFAULT_UNROLL_N 2 50* DGEMM_DEFAULT_UNROLL_M 16 51* DGEMM_DEFAULT_P 192 52* DGEMM_DEFAULT_Q 128 53* A_PR1 512 54* 55* 56* Performance without prefetch of B: 57* 1 thread: 45.8 GFLOPS (MKL: 45) 58* 2 threads: 80.0 GFLOPS (MKL: 91) 59* 4 threads: 135.0 GFLOPS (MKL: 135) 60*********************************************************************/ 61 62 63#define ASSEMBLER 64#include "common.h" 65 66#define OLD_M %rdi 67#define OLD_N %rsi 68#define M %r13 69#define J %r14 70#define OLD_K %rdx 71 72#define A %rcx 73#define B %r8 74#define C %r9 75#define LDC %r10 76 77#define I %r11 78#define AO %rdi 79#define BO %rsi 80#define CO1 %r15 81#define K %r12 82#define BI %rbp 83#define SP %rbx 84 85#define BO1 %rdi 86#define BO2 %r15 87 88#ifndef WINDOWS_ABI 89 90#define STACKSIZE 96 91 92#else 93 94#define STACKSIZE 256 95 96#define OLD_A 40 + STACKSIZE(%rsp) 97#define OLD_B 48 + STACKSIZE(%rsp) 98#define OLD_C 56 + STACKSIZE(%rsp) 99#define OLD_LDC 64 + STACKSIZE(%rsp) 100#define OLD_OFFSET 72 + STACKSIZE(%rsp) 101 102#endif 103 104#define L_BUFFER_SIZE 512*8*4 105#define LB2_OFFSET 512*8*2 106 107#define Ndiv6 24(%rsp) 108#define Nmod6 32(%rsp) 109#define N 40(%rsp) 110#define ALPHA 48(%rsp) 111#define OFFSET 56(%rsp) 112#define KK 64(%rsp) 113#define KKK 72(%rsp) 114#define BUFFER1 128(%rsp) 115#define BUFFER2 LB2_OFFSET+128(%rsp) 116 117#if defined(OS_WINDOWS) 118#if L_BUFFER_SIZE > 16384 119#define STACK_TOUCH \ 120 movl $0, 4096 * 4(%rsp);\ 121 movl $0, 4096 * 3(%rsp);\ 122 movl $0, 4096 * 2(%rsp);\ 123 movl $0, 4096 * 1(%rsp); 124#elif L_BUFFER_SIZE > 12288 125#define STACK_TOUCH \ 126 movl $0, 4096 * 3(%rsp);\ 127 movl $0, 4096 * 2(%rsp);\ 128 movl $0, 4096 * 1(%rsp); 129#elif L_BUFFER_SIZE > 8192 130#define STACK_TOUCH \ 131 movl $0, 4096 * 2(%rsp);\ 132 movl $0, 4096 * 1(%rsp); 133#elif L_BUFFER_SIZE > 4096 134#define STACK_TOUCH \ 135 movl $0, 4096 * 1(%rsp); 136#else 137#define STACK_TOUCH 138#endif 139#else 140#define STACK_TOUCH 141#endif 142 143#if defined(BULLDOZER) 144 145.macro VFMADD231PD_ y0,y1,y2 146 vfmaddpd \y0,\y1,\y2,\y0 147.endm 148 149.macro VFMADD231SD_ x0,x1,x2 150 vfmaddsd \x0,\x1,\x2,\x0 151.endm 152 153#else 154 155.macro VFMADD231PD_ y0,y1,y2 156 vfmadd231pd \y2,\y1,\y0 157.endm 158 159.macro VFMADD231SD_ x0,x1,x2 160 vfmadd231sd \x2,\x1,\x0 161.endm 162 163#endif 164 165 166#define A_PR1 1024 167#define B_PR1 256 168 169/******************************************************************************************* 170* 3 lines of N 171*******************************************************************************************/ 172 173.macro KERNEL16x3_SUBN 174 vbroadcastsd -12 * SIZE(BO), %zmm1 175 vbroadcastsd -11 * SIZE(BO), %zmm2 176 vbroadcastsd -10 * SIZE(BO), %zmm3 177 178 vmovaps -16 * SIZE(AO), %zmm0 179 VFMADD231PD_ %zmm4,%zmm1,%zmm0 180 VFMADD231PD_ %zmm5,%zmm2,%zmm0 181 VFMADD231PD_ %zmm6,%zmm3,%zmm0 182 183 vmovaps -8 * SIZE(AO), %zmm9 184 VFMADD231PD_ %zmm10,%zmm1,%zmm9 185 VFMADD231PD_ %zmm11,%zmm2,%zmm9 186 VFMADD231PD_ %zmm12,%zmm3,%zmm9 187 addq $ 3*SIZE , BO 188 addq $ 16*SIZE, AO 189.endm 190 191 192.macro KERNEL8x3_SUBN 193 vbroadcastsd -12 * SIZE(BO), %ymm1 194 vmovaps -16 * SIZE(AO), %ymm0 195 VFMADD231PD_ %ymm4,%ymm1,%ymm0 196 vbroadcastsd -11 * SIZE(BO), %ymm2 197 VFMADD231PD_ %ymm5,%ymm2,%ymm0 198 vbroadcastsd -10 * SIZE(BO), %ymm3 199 VFMADD231PD_ %ymm6,%ymm3,%ymm0 200 vmovaps -12 * SIZE(AO), %ymm0 201 VFMADD231PD_ %ymm7,%ymm1,%ymm0 202 VFMADD231PD_ %ymm8,%ymm2,%ymm0 203 VFMADD231PD_ %ymm9,%ymm3,%ymm0 204 prefetcht0 B_PR1(BO) 205 addq $ 3*SIZE , BO 206 addq $ 8*SIZE, AO 207.endm 208 209.macro KERNEL4x3_SUBN 210 vbroadcastsd -12 * SIZE(BO), %ymm1 211 vmovaps -16 * SIZE(AO), %ymm0 212 VFMADD231PD_ %ymm4,%ymm1,%ymm0 213 vbroadcastsd -11 * SIZE(BO), %ymm2 214 VFMADD231PD_ %ymm5,%ymm2,%ymm0 215 vbroadcastsd -10 * SIZE(BO), %ymm3 216 VFMADD231PD_ %ymm6,%ymm3,%ymm0 217 addq $ 3*SIZE , BO 218 addq $ 4*SIZE, AO 219.endm 220 221.macro KERNEL2x3_SUBN 222 vmovsd -12 * SIZE(BO), %xmm1 223 vmovsd -16 * SIZE(AO), %xmm0 224 VFMADD231SD_ %xmm4,%xmm1,%xmm0 225 vmovsd -11 * SIZE(BO), %xmm2 226 VFMADD231SD_ %xmm5,%xmm2,%xmm0 227 vmovsd -10 * SIZE(BO), %xmm3 228 VFMADD231SD_ %xmm6,%xmm3,%xmm0 229 vmovsd -15 * SIZE(AO), %xmm0 230 VFMADD231SD_ %xmm8,%xmm1,%xmm0 231 VFMADD231SD_ %xmm10,%xmm2,%xmm0 232 VFMADD231SD_ %xmm12,%xmm3,%xmm0 233 addq $ 3*SIZE , BO 234 addq $ 2*SIZE, AO 235.endm 236 237.macro KERNEL1x3_SUBN 238 vmovsd -12 * SIZE(BO), %xmm1 239 vmovsd -16 * SIZE(AO), %xmm0 240 VFMADD231SD_ %xmm4,%xmm1,%xmm0 241 vmovsd -11 * SIZE(BO), %xmm2 242 VFMADD231SD_ %xmm5,%xmm2,%xmm0 243 vmovsd -10 * SIZE(BO), %xmm3 244 VFMADD231SD_ %xmm6,%xmm3,%xmm0 245 addq $ 3*SIZE , BO 246 addq $ 1*SIZE, AO 247.endm 248 249 250 251 252 253 254/******************************************************************************************/ 255 256.macro KERNEL16x3_1 257 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 258 vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 259 VFMADD231PD_ %zmm4,%zmm1,%zmm0 260 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 261 VFMADD231PD_ %zmm5,%zmm2,%zmm0 262 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 263 VFMADD231PD_ %zmm6,%zmm3,%zmm0 264 vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 265 VFMADD231PD_ %zmm10,%zmm1,%zmm0 266 VFMADD231PD_ %zmm11,%zmm2,%zmm0 267 VFMADD231PD_ %zmm12,%zmm3,%zmm0 268 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %zmm1 269 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %zmm2 270.endm 271 272 273 274 275.macro KERNEL16x3_2 276 vmovups -16 * SIZE(AO, %rax, SIZE), %zmm0 277 VFMADD231PD_ %zmm4,%zmm1,%zmm0 278 VFMADD231PD_ %zmm5,%zmm2,%zmm0 279 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %zmm3 280 VFMADD231PD_ %zmm6,%zmm3,%zmm0 281 vmovups -8 * SIZE(AO, %rax, SIZE), %zmm0 282 VFMADD231PD_ %zmm10,%zmm1,%zmm0 283 VFMADD231PD_ %zmm11,%zmm2,%zmm0 284 VFMADD231PD_ %zmm12,%zmm3,%zmm0 285 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %zmm1 286 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %zmm2 287.endm 288 289.macro KERNEL16x3_3 290 vmovups 0 * SIZE(AO, %rax, SIZE), %zmm0 291 VFMADD231PD_ %zmm4,%zmm1,%zmm0 292 VFMADD231PD_ %zmm5,%zmm2,%zmm0 293 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %zmm3 294 VFMADD231PD_ %zmm6,%zmm3,%zmm0 295 vmovups 8 * SIZE(AO, %rax, SIZE), %zmm0 296 VFMADD231PD_ %zmm10,%zmm1,%zmm0 297 VFMADD231PD_ %zmm11,%zmm2,%zmm0 298 VFMADD231PD_ %zmm12,%zmm3,%zmm0 299 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %zmm1 300 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %zmm2 301.endm 302 303.macro KERNEL16x3_4 304 vmovups 16 * SIZE(AO, %rax, SIZE), %zmm0 305 VFMADD231PD_ %zmm4,%zmm1,%zmm0 306 VFMADD231PD_ %zmm5,%zmm2,%zmm0 307 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %zmm3 308 VFMADD231PD_ %zmm6,%zmm3,%zmm0 309 vmovups 24 * SIZE(AO, %rax, SIZE), %zmm0 310 VFMADD231PD_ %zmm10,%zmm1,%zmm0 311 VFMADD231PD_ %zmm11,%zmm2,%zmm0 312 addq $12, BI 313 VFMADD231PD_ %zmm12,%zmm3,%zmm0 314 addq $64, %rax 315.endm 316 317.macro KERNEL16x3_SUB 318 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %zmm1 319 vmovups -32 * SIZE(AO, %rax, SIZE), %zmm0 320 VFMADD231PD_ %zmm4,%zmm1,%zmm0 321 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %zmm2 322 VFMADD231PD_ %zmm5,%zmm2,%zmm0 323 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %zmm3 324 VFMADD231PD_ %zmm6,%zmm3,%zmm0 325 vmovups -24 * SIZE(AO, %rax, SIZE), %zmm0 326 VFMADD231PD_ %zmm10,%zmm1,%zmm0 327 VFMADD231PD_ %zmm11,%zmm2,%zmm0 328 VFMADD231PD_ %zmm12,%zmm3,%zmm0 329 addq $3 , BI 330 addq $16, %rax 331.endm 332 333.macro SAVE16x3 334 335 vbroadcastsd ALPHA, %zmm0 336 337 vmulpd %zmm0 , %zmm4 , %zmm4 338 vmulpd %zmm0 , %zmm10, %zmm10 339 340 vmulpd %zmm0 , %zmm5 , %zmm5 341 vmulpd %zmm0 , %zmm11, %zmm11 342 343 vmulpd %zmm0 , %zmm6 , %zmm6 344 vmulpd %zmm0 , %zmm12, %zmm12 345 346#if !defined(TRMMKERNEL) 347 348 vaddpd (CO1), %zmm4,%zmm4 349 vaddpd 8 * SIZE(CO1), %zmm10,%zmm10 350 351 vaddpd (CO1, LDC), %zmm5,%zmm5 352 vaddpd 8 * SIZE(CO1, LDC), %zmm11,%zmm11 353 354 vaddpd (CO1, LDC, 2), %zmm6,%zmm6 355 vaddpd 8 * SIZE(CO1, LDC, 2), %zmm12,%zmm12 356 357#endif 358 359 vmovups %zmm4 , (CO1) 360 vmovups %zmm10, 8 * SIZE(CO1) 361 362 vmovups %zmm5 , (CO1, LDC) 363 vmovups %zmm11, 8 * SIZE(CO1, LDC) 364 365 vmovups %zmm6 , (CO1, LDC, 2) 366 vmovups %zmm12, 8 * SIZE(CO1, LDC, 2) 367 368.endm 369 370 371 372/*******************************************************************************************/ 373 374.macro KERNEL8x3_1 375 prefetcht0 A_PR1(AO, %rax, SIZE) 376 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 377 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 378 VFMADD231PD_ %ymm4,%ymm1,%ymm0 379 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 380 VFMADD231PD_ %ymm5,%ymm2,%ymm0 381 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 382 VFMADD231PD_ %ymm6,%ymm3,%ymm0 383 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 384 VFMADD231PD_ %ymm7,%ymm1,%ymm0 385 VFMADD231PD_ %ymm8,%ymm2,%ymm0 386 VFMADD231PD_ %ymm9,%ymm3,%ymm0 387.endm 388 389.macro KERNEL8x3_2 390 prefetcht0 64+A_PR1(AO, %rax, SIZE) 391 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 392 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 393 VFMADD231PD_ %ymm4,%ymm1,%ymm0 394 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 395 VFMADD231PD_ %ymm5,%ymm2,%ymm0 396 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 397 VFMADD231PD_ %ymm6,%ymm3,%ymm0 398 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 399 VFMADD231PD_ %ymm7,%ymm1,%ymm0 400 VFMADD231PD_ %ymm8,%ymm2,%ymm0 401 VFMADD231PD_ %ymm9,%ymm3,%ymm0 402.endm 403 404.macro KERNEL8x3_3 405 prefetcht0 128+A_PR1(AO, %rax, SIZE) 406 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 407 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 408 VFMADD231PD_ %ymm4,%ymm1,%ymm0 409 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 410 VFMADD231PD_ %ymm5,%ymm2,%ymm0 411 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 412 VFMADD231PD_ %ymm6,%ymm3,%ymm0 413 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 414 VFMADD231PD_ %ymm7,%ymm1,%ymm0 415 VFMADD231PD_ %ymm8,%ymm2,%ymm0 416 VFMADD231PD_ %ymm9,%ymm3,%ymm0 417.endm 418 419.macro KERNEL8x3_4 420 prefetcht0 192+A_PR1(AO, %rax, SIZE) 421 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 422 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 423 VFMADD231PD_ %ymm4,%ymm1,%ymm0 424 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 425 VFMADD231PD_ %ymm5,%ymm2,%ymm0 426 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 427 VFMADD231PD_ %ymm6,%ymm3,%ymm0 428 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 429 VFMADD231PD_ %ymm7,%ymm1,%ymm0 430 VFMADD231PD_ %ymm8,%ymm2,%ymm0 431 VFMADD231PD_ %ymm9,%ymm3,%ymm0 432 addq $12, BI 433 addq $32, %rax 434.endm 435 436.macro KERNEL8x3_SUB 437 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 438 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 439 VFMADD231PD_ %ymm4,%ymm1,%ymm0 440 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 441 VFMADD231PD_ %ymm5,%ymm2,%ymm0 442 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 443 VFMADD231PD_ %ymm6,%ymm3,%ymm0 444 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 445 VFMADD231PD_ %ymm7,%ymm1,%ymm0 446 VFMADD231PD_ %ymm8,%ymm2,%ymm0 447 VFMADD231PD_ %ymm9,%ymm3,%ymm0 448 addq $3 , BI 449 addq $8 , %rax 450.endm 451 452.macro SAVE8x3 453 454 vbroadcastsd ALPHA, %ymm0 455 456 vmulpd %ymm0 , %ymm4 , %ymm4 457 vmulpd %ymm0 , %ymm7 , %ymm7 458 459 vmulpd %ymm0 , %ymm5 , %ymm5 460 vmulpd %ymm0 , %ymm8 , %ymm8 461 462 vmulpd %ymm0 , %ymm6 , %ymm6 463 vmulpd %ymm0 , %ymm9 , %ymm9 464 465#if !defined(TRMMKERNEL) 466 467 vaddpd (CO1), %ymm4,%ymm4 468 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 469 470 vaddpd (CO1, LDC), %ymm5,%ymm5 471 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 472 473 vaddpd (CO1, LDC, 2), %ymm6,%ymm6 474 vaddpd 4 * SIZE(CO1, LDC, 2), %ymm9,%ymm9 475 476#endif 477 478 vmovups %ymm4 , (CO1) 479 vmovups %ymm7 , 4 * SIZE(CO1) 480 481 vmovups %ymm5 , (CO1, LDC) 482 vmovups %ymm8 , 4 * SIZE(CO1, LDC) 483 484 vmovups %ymm6 , (CO1, LDC, 2) 485 vmovups %ymm9 , 4 * SIZE(CO1, LDC, 2) 486 487.endm 488 489 490 491/*******************************************************************************************/ 492 493.macro KERNEL4x3_1 494 prefetcht0 A_PR1(AO, %rax, SIZE) 495 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 496 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 497 VFMADD231PD_ %ymm4,%ymm1,%ymm0 498 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 499 VFMADD231PD_ %ymm5,%ymm2,%ymm0 500 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 501 VFMADD231PD_ %ymm6,%ymm3,%ymm0 502.endm 503 504.macro KERNEL4x3_2 505 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm1 506 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 507 VFMADD231PD_ %ymm4,%ymm1,%ymm0 508 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm2 509 VFMADD231PD_ %ymm5,%ymm2,%ymm0 510 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm3 511 VFMADD231PD_ %ymm6,%ymm3,%ymm0 512.endm 513 514.macro KERNEL4x3_3 515 prefetcht0 A_PR1(AO, %rax, SIZE) 516 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 517 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 518 VFMADD231PD_ %ymm4,%ymm1,%ymm0 519 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 520 VFMADD231PD_ %ymm5,%ymm2,%ymm0 521 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm3 522 VFMADD231PD_ %ymm6,%ymm3,%ymm0 523.endm 524 525.macro KERNEL4x3_4 526 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm1 527 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 528 VFMADD231PD_ %ymm4,%ymm1,%ymm0 529 vbroadcastsd 4 * SIZE(BO, BI, SIZE), %ymm2 530 VFMADD231PD_ %ymm5,%ymm2,%ymm0 531 vbroadcastsd 5 * SIZE(BO, BI, SIZE), %ymm3 532 VFMADD231PD_ %ymm6,%ymm3,%ymm0 533 addq $12, BI 534 addq $16, %rax 535.endm 536 537.macro KERNEL4x3_SUB 538 vbroadcastsd -6 * SIZE(BO, BI, SIZE), %ymm1 539 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 540 VFMADD231PD_ %ymm4,%ymm1,%ymm0 541 vbroadcastsd -5 * SIZE(BO, BI, SIZE), %ymm2 542 VFMADD231PD_ %ymm5,%ymm2,%ymm0 543 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm3 544 VFMADD231PD_ %ymm6,%ymm3,%ymm0 545 addq $3 , BI 546 addq $4 , %rax 547.endm 548 549.macro SAVE4x3 550 551 vbroadcastsd ALPHA, %ymm0 552 553 vmulpd %ymm0 , %ymm4 , %ymm4 554 vmulpd %ymm0 , %ymm5 , %ymm5 555 vmulpd %ymm0 , %ymm6 , %ymm6 556 557#if !defined(TRMMKERNEL) 558 559 vaddpd (CO1), %ymm4,%ymm4 560 vaddpd (CO1, LDC), %ymm5,%ymm5 561 vaddpd (CO1, LDC, 2), %ymm6,%ymm6 562 563#endif 564 565 vmovups %ymm4 , (CO1) 566 vmovups %ymm5 , (CO1, LDC) 567 vmovups %ymm6 , (CO1, LDC, 2) 568 569.endm 570 571 572/*******************************************************************************************/ 573 574.macro KERNEL2x3_1 575 prefetcht0 A_PR1(AO, %rax, SIZE) 576 vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 577 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 578 VFMADD231SD_ %xmm4,%xmm1,%xmm0 579 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 580 VFMADD231SD_ %xmm5,%xmm2,%xmm0 581 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 582 VFMADD231SD_ %xmm6,%xmm3,%xmm0 583 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 584 VFMADD231SD_ %xmm8,%xmm1,%xmm0 585 VFMADD231SD_ %xmm10,%xmm2,%xmm0 586 VFMADD231SD_ %xmm12,%xmm3,%xmm0 587.endm 588 589.macro KERNEL2x3_2 590 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 591 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 592 VFMADD231SD_ %xmm4,%xmm1,%xmm0 593 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 594 VFMADD231SD_ %xmm5,%xmm2,%xmm0 595 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 596 VFMADD231SD_ %xmm6,%xmm3,%xmm0 597 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 598 VFMADD231SD_ %xmm8,%xmm1,%xmm0 599 VFMADD231SD_ %xmm10,%xmm2,%xmm0 600 VFMADD231SD_ %xmm12,%xmm3,%xmm0 601.endm 602 603.macro KERNEL2x3_3 604 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 605 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 606 VFMADD231SD_ %xmm4,%xmm1,%xmm0 607 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 608 VFMADD231SD_ %xmm5,%xmm2,%xmm0 609 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 610 VFMADD231SD_ %xmm6,%xmm3,%xmm0 611 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 612 VFMADD231SD_ %xmm8,%xmm1,%xmm0 613 VFMADD231SD_ %xmm10,%xmm2,%xmm0 614 VFMADD231SD_ %xmm12,%xmm3,%xmm0 615.endm 616 617.macro KERNEL2x3_4 618 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 619 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 620 VFMADD231SD_ %xmm4,%xmm1,%xmm0 621 vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 622 VFMADD231SD_ %xmm5,%xmm2,%xmm0 623 vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 624 VFMADD231SD_ %xmm6,%xmm3,%xmm0 625 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 626 VFMADD231SD_ %xmm8,%xmm1,%xmm0 627 VFMADD231SD_ %xmm10,%xmm2,%xmm0 628 VFMADD231SD_ %xmm12,%xmm3,%xmm0 629 addq $12, BI 630 addq $8, %rax 631.endm 632 633.macro KERNEL2x3_SUB 634 vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 635 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 636 VFMADD231SD_ %xmm4,%xmm1,%xmm0 637 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 638 VFMADD231SD_ %xmm5,%xmm2,%xmm0 639 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 640 VFMADD231SD_ %xmm6,%xmm3,%xmm0 641 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 642 VFMADD231SD_ %xmm8,%xmm1,%xmm0 643 VFMADD231SD_ %xmm10,%xmm2,%xmm0 644 VFMADD231SD_ %xmm12,%xmm3,%xmm0 645 addq $3 , BI 646 addq $2 , %rax 647.endm 648 649.macro SAVE2x3 650 651 vmovsd ALPHA, %xmm0 652 653 vmulsd %xmm0 , %xmm4 , %xmm4 654 vmulsd %xmm0 , %xmm8 , %xmm8 655 vmulsd %xmm0 , %xmm5 , %xmm5 656 vmulsd %xmm0 , %xmm10, %xmm10 657 vmulsd %xmm0 , %xmm6 , %xmm6 658 vmulsd %xmm0 , %xmm12, %xmm12 659 660#if !defined(TRMMKERNEL) 661 662 vaddsd (CO1), %xmm4,%xmm4 663 vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 664 vaddsd (CO1, LDC), %xmm5,%xmm5 665 vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 666 vaddsd (CO1, LDC, 2), %xmm6,%xmm6 667 vaddsd 1 * SIZE(CO1, LDC, 2), %xmm12,%xmm12 668 669#endif 670 671 vmovsd %xmm4 , (CO1) 672 vmovsd %xmm8 , 1 * SIZE(CO1) 673 vmovsd %xmm5 , (CO1, LDC) 674 vmovsd %xmm10, 1 * SIZE(CO1, LDC) 675 vmovsd %xmm6 , (CO1, LDC, 2) 676 vmovsd %xmm12, 1 * SIZE(CO1, LDC, 2) 677 678.endm 679 680/*******************************************************************************************/ 681 682.macro KERNEL1x3_1 683 vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 684 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 685 VFMADD231SD_ %xmm4,%xmm1,%xmm0 686 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 687 VFMADD231SD_ %xmm5,%xmm2,%xmm0 688 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 689 VFMADD231SD_ %xmm6,%xmm3,%xmm0 690.endm 691 692.macro KERNEL1x3_2 693 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm1 694 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 695 VFMADD231SD_ %xmm4,%xmm1,%xmm0 696 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm2 697 VFMADD231SD_ %xmm5,%xmm2,%xmm0 698 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm3 699 VFMADD231SD_ %xmm6,%xmm3,%xmm0 700.endm 701 702.macro KERNEL1x3_3 703 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 704 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 705 VFMADD231SD_ %xmm4,%xmm1,%xmm0 706 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 707 VFMADD231SD_ %xmm5,%xmm2,%xmm0 708 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm3 709 VFMADD231SD_ %xmm6,%xmm3,%xmm0 710.endm 711 712.macro KERNEL1x3_4 713 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm1 714 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 715 VFMADD231SD_ %xmm4,%xmm1,%xmm0 716 vmovsd 4 * SIZE(BO, BI, SIZE), %xmm2 717 VFMADD231SD_ %xmm5,%xmm2,%xmm0 718 vmovsd 5 * SIZE(BO, BI, SIZE), %xmm3 719 VFMADD231SD_ %xmm6,%xmm3,%xmm0 720 addq $12, BI 721 addq $4, %rax 722.endm 723 724.macro KERNEL1x3_SUB 725 vmovsd -6 * SIZE(BO, BI, SIZE), %xmm1 726 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 727 VFMADD231SD_ %xmm4,%xmm1,%xmm0 728 vmovsd -5 * SIZE(BO, BI, SIZE), %xmm2 729 VFMADD231SD_ %xmm5,%xmm2,%xmm0 730 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm3 731 VFMADD231SD_ %xmm6,%xmm3,%xmm0 732 addq $3 , BI 733 addq $1 , %rax 734.endm 735 736.macro SAVE1x3 737 738 vmovsd ALPHA, %xmm0 739 740 vmulsd %xmm0 , %xmm4 , %xmm4 741 vmulsd %xmm0 , %xmm5 , %xmm5 742 vmulsd %xmm0 , %xmm6 , %xmm6 743 744#if !defined(TRMMKERNEL) 745 746 vaddsd (CO1), %xmm4,%xmm4 747 vaddsd (CO1, LDC), %xmm5,%xmm5 748 vaddsd (CO1, LDC, 2), %xmm6,%xmm6 749 750#endif 751 752 vmovsd %xmm4 , (CO1) 753 vmovsd %xmm5 , (CO1, LDC) 754 vmovsd %xmm6 , (CO1, LDC, 2) 755 756.endm 757 758 759/*******************************************************************************************/ 760 761/******************************************************************************************* 762* 2 lines of N 763*******************************************************************************************/ 764 765.macro KERNEL16x2_1 766 prefetcht0 A_PR1(AO, %rax, SIZE) 767 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 768 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 769 VFMADD231PD_ %ymm4,%ymm1,%ymm0 770 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 771 VFMADD231PD_ %ymm5,%ymm2,%ymm0 772 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 773 VFMADD231PD_ %ymm7,%ymm1,%ymm0 774 prefetcht0 64+A_PR1(AO, %rax, SIZE) 775 VFMADD231PD_ %ymm8,%ymm2,%ymm0 776 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 777 VFMADD231PD_ %ymm10,%ymm1,%ymm0 778 VFMADD231PD_ %ymm11,%ymm2,%ymm0 779 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 780 VFMADD231PD_ %ymm13,%ymm1,%ymm0 781 VFMADD231PD_ %ymm14,%ymm2,%ymm0 782.endm 783 784.macro KERNEL16x2_2 785 prefetcht0 128+A_PR1(AO, %rax, SIZE) 786 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 787 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 788 VFMADD231PD_ %ymm4,%ymm1,%ymm0 789 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 790 VFMADD231PD_ %ymm5,%ymm2,%ymm0 791 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 792 VFMADD231PD_ %ymm7,%ymm1,%ymm0 793 prefetcht0 192+A_PR1(AO, %rax, SIZE) 794 VFMADD231PD_ %ymm8,%ymm2,%ymm0 795 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 796 VFMADD231PD_ %ymm10,%ymm1,%ymm0 797 VFMADD231PD_ %ymm11,%ymm2,%ymm0 798 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 799 VFMADD231PD_ %ymm13,%ymm1,%ymm0 800 VFMADD231PD_ %ymm14,%ymm2,%ymm0 801.endm 802 803.macro KERNEL16x2_3 804 prefetcht0 256+A_PR1(AO, %rax, SIZE) 805 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 806 vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 807 VFMADD231PD_ %ymm4,%ymm1,%ymm0 808 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 809 VFMADD231PD_ %ymm5,%ymm2,%ymm0 810 vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 811 VFMADD231PD_ %ymm7,%ymm1,%ymm0 812 prefetcht0 320+A_PR1(AO, %rax, SIZE) 813 VFMADD231PD_ %ymm8,%ymm2,%ymm0 814 vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 815 VFMADD231PD_ %ymm10,%ymm1,%ymm0 816 VFMADD231PD_ %ymm11,%ymm2,%ymm0 817 vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 818 VFMADD231PD_ %ymm13,%ymm1,%ymm0 819 VFMADD231PD_ %ymm14,%ymm2,%ymm0 820.endm 821 822.macro KERNEL16x2_4 823 prefetcht0 384+A_PR1(AO, %rax, SIZE) 824 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 825 vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 826 VFMADD231PD_ %ymm4,%ymm1,%ymm0 827 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 828 VFMADD231PD_ %ymm5,%ymm2,%ymm0 829 vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 830 VFMADD231PD_ %ymm7,%ymm1,%ymm0 831 prefetcht0 448+A_PR1(AO, %rax, SIZE) 832 VFMADD231PD_ %ymm8,%ymm2,%ymm0 833 vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 834 VFMADD231PD_ %ymm10,%ymm1,%ymm0 835 VFMADD231PD_ %ymm11,%ymm2,%ymm0 836 vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 837 VFMADD231PD_ %ymm13,%ymm1,%ymm0 838 VFMADD231PD_ %ymm14,%ymm2,%ymm0 839 addq $8, BI 840 addq $64, %rax 841.endm 842 843.macro KERNEL16x2_SUB 844 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 845 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 846 VFMADD231PD_ %ymm4,%ymm1,%ymm0 847 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 848 VFMADD231PD_ %ymm5,%ymm2,%ymm0 849 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 850 VFMADD231PD_ %ymm7,%ymm1,%ymm0 851 VFMADD231PD_ %ymm8,%ymm2,%ymm0 852 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 853 VFMADD231PD_ %ymm10,%ymm1,%ymm0 854 VFMADD231PD_ %ymm11,%ymm2,%ymm0 855 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 856 VFMADD231PD_ %ymm13,%ymm1,%ymm0 857 VFMADD231PD_ %ymm14,%ymm2,%ymm0 858 addq $2, BI 859 addq $16, %rax 860.endm 861 862.macro SAVE16x2 863 864 vbroadcastsd ALPHA, %ymm0 865 866 vmulpd %ymm0 , %ymm4 , %ymm4 867 vmulpd %ymm0 , %ymm7 , %ymm7 868 vmulpd %ymm0 , %ymm10, %ymm10 869 vmulpd %ymm0 , %ymm13, %ymm13 870 871 vmulpd %ymm0 , %ymm5 , %ymm5 872 vmulpd %ymm0 , %ymm8 , %ymm8 873 vmulpd %ymm0 , %ymm11, %ymm11 874 vmulpd %ymm0 , %ymm14, %ymm14 875 876#if !defined(TRMMKERNEL) 877 878 vaddpd (CO1), %ymm4,%ymm4 879 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 880 vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 881 vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 882 883 vaddpd (CO1, LDC), %ymm5,%ymm5 884 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 885 vaddpd 8 * SIZE(CO1, LDC), %ymm11,%ymm11 886 vaddpd 12 * SIZE(CO1, LDC), %ymm14,%ymm14 887 888#endif 889 890 vmovups %ymm4 , (CO1) 891 vmovups %ymm7 , 4 * SIZE(CO1) 892 vmovups %ymm10, 8 * SIZE(CO1) 893 vmovups %ymm13,12 * SIZE(CO1) 894 895 vmovups %ymm5 , (CO1, LDC) 896 vmovups %ymm8 , 4 * SIZE(CO1, LDC) 897 vmovups %ymm11, 8 * SIZE(CO1, LDC) 898 vmovups %ymm14,12 * SIZE(CO1, LDC) 899 900.endm 901 902 903 904/*******************************************************************************************/ 905 906.macro KERNEL8x2_1 907 prefetcht0 A_PR1(AO, %rax, SIZE) 908 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 909 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 910 VFMADD231PD_ %ymm4,%ymm1,%ymm0 911 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 912 VFMADD231PD_ %ymm5,%ymm2,%ymm0 913 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 914 VFMADD231PD_ %ymm7,%ymm1,%ymm0 915 VFMADD231PD_ %ymm8,%ymm2,%ymm0 916.endm 917 918.macro KERNEL8x2_2 919 prefetcht0 64+A_PR1(AO, %rax, SIZE) 920 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 921 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 922 VFMADD231PD_ %ymm4,%ymm1,%ymm0 923 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 924 VFMADD231PD_ %ymm5,%ymm2,%ymm0 925 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 926 VFMADD231PD_ %ymm7,%ymm1,%ymm0 927 VFMADD231PD_ %ymm8,%ymm2,%ymm0 928.endm 929 930.macro KERNEL8x2_3 931 prefetcht0 128+A_PR1(AO, %rax, SIZE) 932 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 933 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 934 VFMADD231PD_ %ymm4,%ymm1,%ymm0 935 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 936 VFMADD231PD_ %ymm5,%ymm2,%ymm0 937 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 938 VFMADD231PD_ %ymm7,%ymm1,%ymm0 939 VFMADD231PD_ %ymm8,%ymm2,%ymm0 940.endm 941 942.macro KERNEL8x2_4 943 prefetcht0 192+A_PR1(AO, %rax, SIZE) 944 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 945 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 946 VFMADD231PD_ %ymm4,%ymm1,%ymm0 947 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 948 VFMADD231PD_ %ymm5,%ymm2,%ymm0 949 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 950 VFMADD231PD_ %ymm7,%ymm1,%ymm0 951 VFMADD231PD_ %ymm8,%ymm2,%ymm0 952 addq $8, BI 953 addq $32, %rax 954.endm 955 956.macro KERNEL8x2_SUB 957 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 958 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 959 VFMADD231PD_ %ymm4,%ymm1,%ymm0 960 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 961 VFMADD231PD_ %ymm5,%ymm2,%ymm0 962 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 963 VFMADD231PD_ %ymm7,%ymm1,%ymm0 964 VFMADD231PD_ %ymm8,%ymm2,%ymm0 965 addq $2, BI 966 addq $8 , %rax 967.endm 968 969.macro SAVE8x2 970 971 vbroadcastsd ALPHA, %ymm0 972 973 vmulpd %ymm0 , %ymm4 , %ymm4 974 vmulpd %ymm0 , %ymm7 , %ymm7 975 976 vmulpd %ymm0 , %ymm5 , %ymm5 977 vmulpd %ymm0 , %ymm8 , %ymm8 978 979#if !defined(TRMMKERNEL) 980 981 vaddpd (CO1), %ymm4,%ymm4 982 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 983 984 vaddpd (CO1, LDC), %ymm5,%ymm5 985 vaddpd 4 * SIZE(CO1, LDC), %ymm8,%ymm8 986 987#endif 988 989 vmovups %ymm4 , (CO1) 990 vmovups %ymm7 , 4 * SIZE(CO1) 991 992 vmovups %ymm5 , (CO1, LDC) 993 vmovups %ymm8 , 4 * SIZE(CO1, LDC) 994 995.endm 996 997 998 999/*******************************************************************************************/ 1000 1001.macro KERNEL4x2_1 1002 prefetcht0 A_PR1(AO, %rax, SIZE) 1003 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 1004 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1005 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1006 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 1007 VFMADD231PD_ %ymm5,%ymm2,%ymm0 1008.endm 1009 1010.macro KERNEL4x2_2 1011 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1012 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1013 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1014 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm2 1015 VFMADD231PD_ %ymm5,%ymm2,%ymm0 1016.endm 1017 1018.macro KERNEL4x2_3 1019 prefetcht0 64+A_PR1(AO, %rax, SIZE) 1020 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 1021 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 1022 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1023 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm2 1024 VFMADD231PD_ %ymm5,%ymm2,%ymm0 1025.endm 1026 1027.macro KERNEL4x2_4 1028 vbroadcastsd 2 * SIZE(BO, BI, SIZE), %ymm1 1029 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 1030 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1031 vbroadcastsd 3 * SIZE(BO, BI, SIZE), %ymm2 1032 VFMADD231PD_ %ymm5,%ymm2,%ymm0 1033 addq $8, BI 1034 addq $16, %rax 1035.endm 1036 1037.macro KERNEL4x2_SUB 1038 vbroadcastsd -4 * SIZE(BO, BI, SIZE), %ymm1 1039 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1040 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1041 vbroadcastsd -3 * SIZE(BO, BI, SIZE), %ymm2 1042 VFMADD231PD_ %ymm5,%ymm2,%ymm0 1043 addq $2, BI 1044 addq $4 , %rax 1045.endm 1046 1047.macro SAVE4x2 1048 1049 vbroadcastsd ALPHA, %ymm0 1050 1051 vmulpd %ymm0 , %ymm4 , %ymm4 1052 vmulpd %ymm0 , %ymm5 , %ymm5 1053 1054#if !defined(TRMMKERNEL) 1055 1056 vaddpd (CO1), %ymm4,%ymm4 1057 vaddpd (CO1, LDC), %ymm5,%ymm5 1058 1059#endif 1060 1061 vmovups %ymm4 , (CO1) 1062 vmovups %ymm5 , (CO1, LDC) 1063 1064.endm 1065 1066 1067/*******************************************************************************************/ 1068 1069.macro KERNEL2x2_1 1070 prefetcht0 A_PR1(AO, %rax, SIZE) 1071 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 1072 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1073 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1074 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 1075 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1076 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1077 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1078 VFMADD231SD_ %xmm10,%xmm2,%xmm0 1079.endm 1080 1081.macro KERNEL2x2_2 1082 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1083 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 1084 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1085 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 1086 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1087 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 1088 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1089 VFMADD231SD_ %xmm10,%xmm2,%xmm0 1090.endm 1091 1092.macro KERNEL2x2_3 1093 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 1094 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 1095 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1096 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 1097 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1098 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 1099 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1100 VFMADD231SD_ %xmm10,%xmm2,%xmm0 1101.endm 1102 1103.macro KERNEL2x2_4 1104 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 1105 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 1106 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1107 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 1108 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1109 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 1110 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1111 VFMADD231SD_ %xmm10,%xmm2,%xmm0 1112 addq $8, BI 1113 addq $8, %rax 1114.endm 1115 1116.macro KERNEL2x2_SUB 1117 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 1118 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1119 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1120 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 1121 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1122 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1123 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1124 VFMADD231SD_ %xmm10,%xmm2,%xmm0 1125 addq $2, BI 1126 addq $2, %rax 1127.endm 1128 1129.macro SAVE2x2 1130 1131 vmovsd ALPHA, %xmm0 1132 1133 vmulsd %xmm0 , %xmm4 , %xmm4 1134 vmulsd %xmm0 , %xmm8 , %xmm8 1135 vmulsd %xmm0 , %xmm5 , %xmm5 1136 vmulsd %xmm0 , %xmm10, %xmm10 1137 1138#if !defined(TRMMKERNEL) 1139 1140 vaddsd (CO1), %xmm4,%xmm4 1141 vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 1142 vaddsd (CO1, LDC), %xmm5,%xmm5 1143 vaddsd 1 * SIZE(CO1, LDC), %xmm10,%xmm10 1144 1145#endif 1146 1147 vmovsd %xmm4 , (CO1) 1148 vmovsd %xmm8 , 1 * SIZE(CO1) 1149 vmovsd %xmm5 , (CO1, LDC) 1150 vmovsd %xmm10, 1 * SIZE(CO1, LDC) 1151 1152.endm 1153 1154 1155/*******************************************************************************************/ 1156 1157.macro KERNEL1x2_1 1158 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 1159 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1160 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1161 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 1162 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1163.endm 1164 1165.macro KERNEL1x2_2 1166 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1167 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1168 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1169 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm2 1170 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1171.endm 1172 1173.macro KERNEL1x2_3 1174 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 1175 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 1176 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1177 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm2 1178 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1179.endm 1180 1181.macro KERNEL1x2_4 1182 vmovsd 2 * SIZE(BO, BI, SIZE), %xmm1 1183 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 1184 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1185 vmovsd 3 * SIZE(BO, BI, SIZE), %xmm2 1186 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1187 addq $8, BI 1188 addq $4, %rax 1189.endm 1190 1191.macro KERNEL1x2_SUB 1192 vmovsd -4 * SIZE(BO, BI, SIZE), %xmm1 1193 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1194 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1195 vmovsd -3 * SIZE(BO, BI, SIZE), %xmm2 1196 VFMADD231SD_ %xmm5,%xmm2,%xmm0 1197 addq $2, BI 1198 addq $1, %rax 1199.endm 1200 1201.macro SAVE1x2 1202 1203 vmovsd ALPHA, %xmm0 1204 1205 vmulsd %xmm0 , %xmm4 , %xmm4 1206 vmulsd %xmm0 , %xmm5 , %xmm5 1207 1208#if !defined(TRMMKERNEL) 1209 1210 vaddsd (CO1), %xmm4,%xmm4 1211 vaddsd (CO1, LDC), %xmm5,%xmm5 1212 1213#endif 1214 1215 vmovsd %xmm4 , (CO1) 1216 vmovsd %xmm5 , (CO1, LDC) 1217 1218.endm 1219 1220 1221/*******************************************************************************************/ 1222 1223/******************************************************************************************* 1224* 1 line of N 1225*******************************************************************************************/ 1226 1227.macro KERNEL16x1_1 1228 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1229 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1230 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1231 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1232 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1233 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 1234 VFMADD231PD_ %ymm10,%ymm1,%ymm0 1235 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 1236 VFMADD231PD_ %ymm13,%ymm1,%ymm0 1237.endm 1238 1239.macro KERNEL16x1_2 1240 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 1241 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 1242 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1243 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 1244 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1245 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 1246 VFMADD231PD_ %ymm10,%ymm1,%ymm0 1247 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 1248 VFMADD231PD_ %ymm13,%ymm1,%ymm0 1249.endm 1250 1251.macro KERNEL16x1_3 1252 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 1253 vmovups 0 * SIZE(AO, %rax, SIZE), %ymm0 1254 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1255 vmovups 4 * SIZE(AO, %rax, SIZE), %ymm0 1256 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1257 vmovups 8 * SIZE(AO, %rax, SIZE), %ymm0 1258 VFMADD231PD_ %ymm10,%ymm1,%ymm0 1259 vmovups 12 * SIZE(AO, %rax, SIZE), %ymm0 1260 VFMADD231PD_ %ymm13,%ymm1,%ymm0 1261.endm 1262 1263.macro KERNEL16x1_4 1264 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 1265 vmovups 16 * SIZE(AO, %rax, SIZE), %ymm0 1266 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1267 vmovups 20 * SIZE(AO, %rax, SIZE), %ymm0 1268 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1269 vmovups 24 * SIZE(AO, %rax, SIZE), %ymm0 1270 VFMADD231PD_ %ymm10,%ymm1,%ymm0 1271 vmovups 28 * SIZE(AO, %rax, SIZE), %ymm0 1272 VFMADD231PD_ %ymm13,%ymm1,%ymm0 1273 addq $4, BI 1274 addq $64, %rax 1275.endm 1276 1277.macro KERNEL16x1_SUB 1278 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1279 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1280 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1281 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1282 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1283 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 1284 VFMADD231PD_ %ymm10,%ymm1,%ymm0 1285 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 1286 VFMADD231PD_ %ymm13,%ymm1,%ymm0 1287 addq $1, BI 1288 addq $16, %rax 1289.endm 1290 1291.macro SAVE16x1 1292 1293 vbroadcastsd ALPHA, %ymm0 1294 1295 vmulpd %ymm0 , %ymm4 , %ymm4 1296 vmulpd %ymm0 , %ymm7 , %ymm7 1297 vmulpd %ymm0 , %ymm10, %ymm10 1298 vmulpd %ymm0 , %ymm13, %ymm13 1299 1300#if !defined(TRMMKERNEL) 1301 1302 vaddpd (CO1), %ymm4,%ymm4 1303 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 1304 vaddpd 8 * SIZE(CO1), %ymm10,%ymm10 1305 vaddpd 12 * SIZE(CO1), %ymm13,%ymm13 1306 1307#endif 1308 1309 vmovups %ymm4 , (CO1) 1310 vmovups %ymm7 , 4 * SIZE(CO1) 1311 vmovups %ymm10, 8 * SIZE(CO1) 1312 vmovups %ymm13,12 * SIZE(CO1) 1313 1314.endm 1315 1316 1317 1318/*******************************************************************************************/ 1319 1320.macro KERNEL8x1_1 1321 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1322 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1323 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1324 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1325 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1326.endm 1327 1328.macro KERNEL8x1_2 1329 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 1330 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 1331 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1332 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 1333 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1334.endm 1335 1336.macro KERNEL8x1_3 1337 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 1338 vmovups -16 * SIZE(AO, %rax, SIZE), %ymm0 1339 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1340 vmovups -12 * SIZE(AO, %rax, SIZE), %ymm0 1341 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1342.endm 1343 1344.macro KERNEL8x1_4 1345 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 1346 vmovups -8 * SIZE(AO, %rax, SIZE), %ymm0 1347 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1348 vmovups -4 * SIZE(AO, %rax, SIZE), %ymm0 1349 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1350 addq $4, BI 1351 addq $32, %rax 1352.endm 1353 1354.macro KERNEL8x1_SUB 1355 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1356 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1357 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1358 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1359 VFMADD231PD_ %ymm7,%ymm1,%ymm0 1360 addq $1, BI 1361 addq $8 , %rax 1362.endm 1363 1364.macro SAVE8x1 1365 1366 vbroadcastsd ALPHA, %ymm0 1367 1368 vmulpd %ymm0 , %ymm4 , %ymm4 1369 vmulpd %ymm0 , %ymm7 , %ymm7 1370 1371#if !defined(TRMMKERNEL) 1372 1373 vaddpd (CO1), %ymm4,%ymm4 1374 vaddpd 4 * SIZE(CO1), %ymm7,%ymm7 1375 1376#endif 1377 1378 vmovups %ymm4 , (CO1) 1379 vmovups %ymm7 , 4 * SIZE(CO1) 1380 1381.endm 1382 1383 1384 1385/*******************************************************************************************/ 1386 1387.macro KERNEL4x1_1 1388 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1389 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1390 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1391.endm 1392 1393.macro KERNEL4x1_2 1394 vbroadcastsd -1 * SIZE(BO, BI, SIZE), %ymm1 1395 vmovups -28 * SIZE(AO, %rax, SIZE), %ymm0 1396 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1397.endm 1398 1399.macro KERNEL4x1_3 1400 vbroadcastsd 0 * SIZE(BO, BI, SIZE), %ymm1 1401 vmovups -24 * SIZE(AO, %rax, SIZE), %ymm0 1402 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1403.endm 1404 1405.macro KERNEL4x1_4 1406 vbroadcastsd 1 * SIZE(BO, BI, SIZE), %ymm1 1407 vmovups -20 * SIZE(AO, %rax, SIZE), %ymm0 1408 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1409 addq $4, BI 1410 addq $16, %rax 1411.endm 1412 1413.macro KERNEL4x1_SUB 1414 vbroadcastsd -2 * SIZE(BO, BI, SIZE), %ymm1 1415 vmovups -32 * SIZE(AO, %rax, SIZE), %ymm0 1416 VFMADD231PD_ %ymm4,%ymm1,%ymm0 1417 addq $1, BI 1418 addq $4 , %rax 1419.endm 1420 1421.macro SAVE4x1 1422 1423 vbroadcastsd ALPHA, %ymm0 1424 1425 vmulpd %ymm0 , %ymm4 , %ymm4 1426 1427#if !defined(TRMMKERNEL) 1428 1429 vaddpd (CO1), %ymm4,%ymm4 1430 1431#endif 1432 1433 vmovups %ymm4 , (CO1) 1434 1435.endm 1436 1437 1438/*******************************************************************************************/ 1439 1440.macro KERNEL2x1_1 1441 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1442 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1443 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1444 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1445 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1446.endm 1447 1448.macro KERNEL2x1_2 1449 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 1450 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 1451 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1452 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 1453 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1454.endm 1455 1456.macro KERNEL2x1_3 1457 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 1458 vmovsd -28 * SIZE(AO, %rax, SIZE), %xmm0 1459 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1460 vmovsd -27 * SIZE(AO, %rax, SIZE), %xmm0 1461 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1462.endm 1463 1464.macro KERNEL2x1_4 1465 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 1466 vmovsd -26 * SIZE(AO, %rax, SIZE), %xmm0 1467 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1468 vmovsd -25 * SIZE(AO, %rax, SIZE), %xmm0 1469 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1470 addq $4, BI 1471 addq $8, %rax 1472.endm 1473 1474.macro KERNEL2x1_SUB 1475 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1476 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1477 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1478 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1479 VFMADD231SD_ %xmm8,%xmm1,%xmm0 1480 addq $1, BI 1481 addq $2 , %rax 1482.endm 1483 1484.macro SAVE2x1 1485 1486 vmovsd ALPHA, %xmm0 1487 1488 vmulsd %xmm0 , %xmm4 , %xmm4 1489 vmulsd %xmm0 , %xmm8 , %xmm8 1490 1491#if !defined(TRMMKERNEL) 1492 1493 vaddsd (CO1), %xmm4,%xmm4 1494 vaddsd 1 * SIZE(CO1), %xmm8,%xmm8 1495 1496#endif 1497 1498 vmovsd %xmm4 , (CO1) 1499 vmovsd %xmm8 , 1 * SIZE(CO1) 1500 1501.endm 1502 1503 1504/*******************************************************************************************/ 1505 1506.macro KERNEL1x1_1 1507 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1508 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1509 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1510.endm 1511 1512.macro KERNEL1x1_2 1513 vmovsd -1 * SIZE(BO, BI, SIZE), %xmm1 1514 vmovsd -31 * SIZE(AO, %rax, SIZE), %xmm0 1515 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1516.endm 1517 1518.macro KERNEL1x1_3 1519 vmovsd 0 * SIZE(BO, BI, SIZE), %xmm1 1520 vmovsd -30 * SIZE(AO, %rax, SIZE), %xmm0 1521 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1522.endm 1523 1524.macro KERNEL1x1_4 1525 vmovsd 1 * SIZE(BO, BI, SIZE), %xmm1 1526 vmovsd -29 * SIZE(AO, %rax, SIZE), %xmm0 1527 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1528 addq $ 4, BI 1529 addq $ 4, %rax 1530.endm 1531 1532.macro KERNEL1x1_SUB 1533 vmovsd -2 * SIZE(BO, BI, SIZE), %xmm1 1534 vmovsd -32 * SIZE(AO, %rax, SIZE), %xmm0 1535 VFMADD231SD_ %xmm4,%xmm1,%xmm0 1536 addq $ 1, BI 1537 addq $ 1 , %rax 1538.endm 1539 1540.macro SAVE1x1 1541 1542 vmovsd ALPHA, %xmm0 1543 1544 vmulsd %xmm0 , %xmm4 , %xmm4 1545 1546#if !defined(TRMMKERNEL) 1547 1548 vaddsd (CO1), %xmm4,%xmm4 1549 1550#endif 1551 1552 vmovsd %xmm4 , (CO1) 1553 1554.endm 1555 1556 1557/*******************************************************************************************/ 1558 1559#if !defined(TRMMKERNEL) 1560 1561 1562 PROLOGUE 1563 PROFCODE 1564 1565 subq $STACKSIZE, %rsp 1566 movq %rbx, (%rsp) 1567 movq %rbp, 8(%rsp) 1568 movq %r12, 16(%rsp) 1569 movq %r13, 24(%rsp) 1570 movq %r14, 32(%rsp) 1571 movq %r15, 40(%rsp) 1572 1573 vzeroupper 1574 1575#ifdef WINDOWS_ABI 1576 movq %rdi, 48(%rsp) 1577 movq %rsi, 56(%rsp) 1578 movups %xmm6, 64(%rsp) 1579 movups %xmm7, 80(%rsp) 1580 movups %xmm8, 96(%rsp) 1581 movups %xmm9, 112(%rsp) 1582 movups %xmm10, 128(%rsp) 1583 movups %xmm11, 144(%rsp) 1584 movups %xmm12, 160(%rsp) 1585 movups %xmm13, 176(%rsp) 1586 movups %xmm14, 192(%rsp) 1587 movups %xmm15, 208(%rsp) 1588 1589 movq ARG1, OLD_M 1590 movq ARG2, OLD_N 1591 movq ARG3, OLD_K 1592 movq OLD_A, A 1593 movq OLD_B, B 1594 movq OLD_C, C 1595 movq OLD_LDC, LDC 1596 1597 vmovaps %xmm3, %xmm0 1598 1599#else 1600 movq STACKSIZE + 8(%rsp), LDC 1601 1602#endif 1603 1604 movq %rsp, SP # save old stack 1605 subq $128 + L_BUFFER_SIZE, %rsp 1606 andq $-4096, %rsp # align stack 1607 1608 STACK_TOUCH 1609 1610 cmpq $0, OLD_M 1611 je .L999 1612 1613 cmpq $0, OLD_N 1614 je .L999 1615 1616 cmpq $0, OLD_K 1617 je .L999 1618 1619 movq OLD_M, M 1620 movq OLD_N, N 1621 movq OLD_K, K 1622 1623 vmovsd %xmm0, ALPHA 1624 1625 salq $BASE_SHIFT, LDC 1626 1627 movq N, %rax 1628 xorq %rdx, %rdx 1629 movq $6, %rdi 1630 divq %rdi // N / 6 1631 movq %rax, Ndiv6 // N / 6 1632 movq %rdx, Nmod6 // N % 6 1633 1634 1635 movq Ndiv6, J 1636 cmpq $0, J 1637 je .L2_0 1638 ALIGN_4 1639 1640.L6_01: 1641 // copy to sub buffer 1642 movq K, %rax 1643 salq $1,%rax // K * 2 ; read 2 values 1644 movq B, BO1 1645 leaq (B,%rax, SIZE), BO2 // next offset to BO2 1646 leaq BUFFER1, BO // first buffer to BO 1647 movq K, %rax 1648 sarq $3 , %rax // K / 8 1649 jz .L6_01a_2 1650 ALIGN_4 1651 1652.L6_01a_1: 1653 1654 prefetcht0 512(BO1) 1655 prefetcht0 512(BO2) 1656 prefetchw 512(BO) 1657 1658 1659 vmovups 0 * SIZE(BO1), %xmm0 1660 vmovups 2 * SIZE(BO1), %xmm2 1661 vmovups 4 * SIZE(BO1), %xmm4 1662 vmovups 6 * SIZE(BO1), %xmm6 1663 vmovsd 0 * SIZE(BO2), %xmm1 1664 vmovsd 2 * SIZE(BO2), %xmm3 1665 vmovsd 4 * SIZE(BO2), %xmm5 1666 vmovsd 6 * SIZE(BO2), %xmm7 1667 vmovups %xmm0, 0*SIZE(BO) 1668 vmovsd %xmm1, 2*SIZE(BO) 1669 vmovups %xmm2, 3*SIZE(BO) 1670 vmovsd %xmm3, 5*SIZE(BO) 1671 vmovups %xmm4, 6*SIZE(BO) 1672 vmovsd %xmm5, 8*SIZE(BO) 1673 vmovups %xmm6, 9*SIZE(BO) 1674 vmovsd %xmm7,11*SIZE(BO) 1675 addq $ 8*SIZE,BO1 1676 addq $ 8*SIZE,BO2 1677 addq $ 12*SIZE,BO 1678 1679 vmovups 0 * SIZE(BO1), %xmm0 1680 vmovups 2 * SIZE(BO1), %xmm2 1681 vmovups 4 * SIZE(BO1), %xmm4 1682 vmovups 6 * SIZE(BO1), %xmm6 1683 vmovsd 0 * SIZE(BO2), %xmm1 1684 vmovsd 2 * SIZE(BO2), %xmm3 1685 vmovsd 4 * SIZE(BO2), %xmm5 1686 vmovsd 6 * SIZE(BO2), %xmm7 1687 vmovups %xmm0, 0*SIZE(BO) 1688 vmovsd %xmm1, 2*SIZE(BO) 1689 vmovups %xmm2, 3*SIZE(BO) 1690 vmovsd %xmm3, 5*SIZE(BO) 1691 vmovups %xmm4, 6*SIZE(BO) 1692 vmovsd %xmm5, 8*SIZE(BO) 1693 vmovups %xmm6, 9*SIZE(BO) 1694 vmovsd %xmm7,11*SIZE(BO) 1695 addq $ 8*SIZE,BO1 1696 addq $ 8*SIZE,BO2 1697 addq $ 12*SIZE,BO 1698 1699 decq %rax 1700 jnz .L6_01a_1 1701 1702 1703 1704.L6_01a_2: 1705 1706 movq K, %rax 1707 andq $7, %rax // K % 8 1708 jz .L6_02c 1709 ALIGN_4 1710 1711 1712.L6_02b: 1713 1714 vmovups 0 * SIZE(BO1), %xmm0 1715 vmovsd 0 * SIZE(BO2), %xmm2 1716 vmovups %xmm0, 0*SIZE(BO) 1717 vmovsd %xmm2, 2*SIZE(BO) 1718 addq $ 2*SIZE,BO1 1719 addq $ 2*SIZE,BO2 1720 addq $ 3*SIZE,BO 1721 decq %rax 1722 jnz .L6_02b 1723 1724.L6_02c: 1725 1726 movq K, %rax 1727 salq $1,%rax // K * 2 1728 leaq (B,%rax, SIZE), BO1 // next offset to BO1 1729 leaq (BO1,%rax, SIZE), BO2 // next offset to BO2 1730 leaq BUFFER2, BO // second buffer to BO 1731 movq K, %rax 1732 sarq $3 , %rax // K / 8 1733 jz .L6_02c_2 1734 ALIGN_4 1735 1736.L6_02c_1: 1737 1738 prefetcht0 512(BO2) 1739 prefetchw 512(BO) 1740 1741 vmovups 0 * SIZE(BO2), %xmm0 1742 vmovups 2 * SIZE(BO2), %xmm2 1743 vmovups 4 * SIZE(BO2), %xmm4 1744 vmovups 6 * SIZE(BO2), %xmm6 1745 vmovsd 1 * SIZE(BO1), %xmm1 1746 vmovsd 3 * SIZE(BO1), %xmm3 1747 vmovsd 5 * SIZE(BO1), %xmm5 1748 vmovsd 7 * SIZE(BO1), %xmm7 1749 vmovsd %xmm1, 0*SIZE(BO) 1750 vmovups %xmm0, 1*SIZE(BO) 1751 vmovsd %xmm3, 3*SIZE(BO) 1752 vmovups %xmm2, 4*SIZE(BO) 1753 vmovsd %xmm5, 6*SIZE(BO) 1754 vmovups %xmm4, 7*SIZE(BO) 1755 vmovsd %xmm7, 9*SIZE(BO) 1756 vmovups %xmm6,10*SIZE(BO) 1757 addq $8*SIZE,BO1 1758 addq $8*SIZE,BO2 1759 addq $12*SIZE,BO 1760 1761 1762 vmovups 0 * SIZE(BO2), %xmm0 1763 vmovups 2 * SIZE(BO2), %xmm2 1764 vmovups 4 * SIZE(BO2), %xmm4 1765 vmovups 6 * SIZE(BO2), %xmm6 1766 vmovsd 1 * SIZE(BO1), %xmm1 1767 vmovsd 3 * SIZE(BO1), %xmm3 1768 vmovsd 5 * SIZE(BO1), %xmm5 1769 vmovsd 7 * SIZE(BO1), %xmm7 1770 vmovsd %xmm1, 0*SIZE(BO) 1771 vmovups %xmm0, 1*SIZE(BO) 1772 vmovsd %xmm3, 3*SIZE(BO) 1773 vmovups %xmm2, 4*SIZE(BO) 1774 vmovsd %xmm5, 6*SIZE(BO) 1775 vmovups %xmm4, 7*SIZE(BO) 1776 vmovsd %xmm7, 9*SIZE(BO) 1777 vmovups %xmm6,10*SIZE(BO) 1778 addq $8*SIZE,BO1 1779 addq $8*SIZE,BO2 1780 addq $12*SIZE,BO 1781 1782 decq %rax 1783 jnz .L6_02c_1 1784 1785 1786.L6_02c_2: 1787 1788 movq K, %rax 1789 andq $7, %rax // K % 8 1790 jz .L6_03c 1791 ALIGN_4 1792 1793.L6_03b: 1794 1795 vmovsd 1*SIZE(BO1), %xmm0 1796 vmovups 0*SIZE(BO2), %xmm1 1797 vmovsd %xmm0, 0*SIZE(BO) 1798 vmovups %xmm1, 1*SIZE(BO) 1799 addq $2*SIZE,BO1 1800 addq $2*SIZE,BO2 1801 addq $3*SIZE,BO 1802 decq %rax 1803 jnz .L6_03b 1804 1805 1806.L6_03c: 1807 1808 movq BO2, B // next offset of B 1809 1810.L6_10: 1811 movq C, CO1 1812 leaq (C, LDC, 2), C 1813 leaq (C, LDC, 1), C // c += 3 * ldc 1814 1815 1816 movq A, AO // aoffset = a 1817 addq $16 * SIZE, AO 1818 1819 movq M, I 1820 sarq $4, I // i = (m >> 4) 1821 je .L6_20 1822 1823 ALIGN_4 1824 1825.L6_11: 1826 leaq BUFFER1, BO // first buffer to BO 1827 addq $12 * SIZE, BO 1828 1829 prefetcht0 (CO1) 1830 prefetcht0 (CO1,LDC,1) 1831 prefetcht0 (CO1,LDC,2) 1832 prefetcht0 64(CO1) 1833 prefetcht0 64(CO1,LDC,1) 1834 prefetcht0 64(CO1,LDC,2) 1835 1836 vzeroall 1837 1838 movq K, %rax 1839 1840 sarq $1, %rax // K / 8 1841 je .L6_16 1842 1843 ALIGN_5 1844 1845.L6_12: 1846/* 1847 prefetcht0 B_PR1(BO) 1848 prefetcht0 B_PR1+64(BO) 1849 prefetcht0 B_PR1+128(BO) 1850*/ 1851 KERNEL16x3_SUBN 1852 KERNEL16x3_SUBN 1853/* 1854 KERNEL16x3_SUBN 1855 KERNEL16x3_SUBN 1856 1857 KERNEL16x3_SUBN 1858 KERNEL16x3_SUBN 1859 KERNEL16x3_SUBN 1860 KERNEL16x3_SUBN 1861*/ 1862 dec %rax 1863 jne .L6_12 1864 1865.L6_16: 1866 movq K, %rax 1867 1868 andq $1, %rax # if (k & 1) 1869 je .L6_19 1870 1871 ALIGN_4 1872 1873.L6_17: 1874 1875 KERNEL16x3_SUBN 1876 1877 dec %rax 1878 jne .L6_17 1879 ALIGN_4 1880 1881 1882.L6_19: 1883 1884 SAVE16x3 1885 1886 addq $16 * SIZE, CO1 # coffset += 16 1887 decq I # i -- 1888 jg .L6_11 1889 ALIGN_4 1890 1891/************************************************************************** 1892* Rest of M 1893***************************************************************************/ 1894.L6_20: 1895 // Test rest of M 1896 1897 testq $15, M 1898 jz .L7_10 // to next 3 lines of N 1899 1900 testq $8, M 1901 jz .L6_21pre 1902 ALIGN_4 1903 1904/**************************************************************************/ 1905 1906.L6_20_1: 1907 leaq BUFFER1, BO // first buffer to BO 1908 addq $12 * SIZE, BO 1909 1910 vzeroall 1911 1912 movq K, %rax 1913 1914 sarq $3, %rax 1915 je .L6_20_6 1916 1917 ALIGN_4 1918 1919.L6_20_2: 1920 1921 KERNEL8x3_SUBN 1922 KERNEL8x3_SUBN 1923 KERNEL8x3_SUBN 1924 KERNEL8x3_SUBN 1925 1926 KERNEL8x3_SUBN 1927 KERNEL8x3_SUBN 1928 KERNEL8x3_SUBN 1929 KERNEL8x3_SUBN 1930 dec %rax 1931 jne .L6_20_2 1932 ALIGN_4 1933 1934.L6_20_6: 1935 movq K, %rax 1936 1937 andq $7, %rax # if (k & 1) 1938 je .L6_20_9 1939 1940 1941 ALIGN_4 1942 1943.L6_20_7: 1944 1945 KERNEL8x3_SUBN 1946 1947 dec %rax 1948 jne .L6_20_7 1949 ALIGN_4 1950 1951 1952.L6_20_9: 1953 1954 SAVE8x3 1955 1956 addq $8 * SIZE, CO1 # coffset += 8 1957 ALIGN_4 1958 1959 1960 1961/**************************************************************************/ 1962 1963.L6_21pre: 1964 1965 testq $4, M 1966 jz .L6_30 1967 ALIGN_4 1968 1969.L6_21: 1970 leaq BUFFER1, BO // first buffer to BO 1971 addq $12 * SIZE, BO 1972 1973 vzeroall 1974 1975 movq K, %rax 1976 1977 sarq $3, %rax 1978 je .L6_26 1979 1980 ALIGN_4 1981 1982.L6_22: 1983 1984 KERNEL4x3_SUBN 1985 KERNEL4x3_SUBN 1986 KERNEL4x3_SUBN 1987 KERNEL4x3_SUBN 1988 1989 KERNEL4x3_SUBN 1990 KERNEL4x3_SUBN 1991 KERNEL4x3_SUBN 1992 KERNEL4x3_SUBN 1993 dec %rax 1994 jne .L6_22 1995 ALIGN_4 1996 1997.L6_26: 1998 movq K, %rax 1999 2000 andq $7, %rax # if (k & 1) 2001 je .L6_29 2002 2003 ALIGN_4 2004 2005.L6_27: 2006 2007 KERNEL4x3_SUBN 2008 2009 dec %rax 2010 jne .L6_27 2011 ALIGN_4 2012 2013 2014.L6_29: 2015 2016 SAVE4x3 2017 2018 addq $4 * SIZE, CO1 # coffset += 4 2019 ALIGN_4 2020 2021 2022.L6_30: 2023 testq $2, M 2024 jz .L6_40 2025 2026 ALIGN_4 2027 2028.L6_31: 2029 leaq BUFFER1, BO // first buffer to BO 2030 addq $12 * SIZE, BO 2031 2032 vzeroall 2033 2034 movq K, %rax 2035 2036 sarq $3, %rax 2037 je .L6_36 2038 ALIGN_4 2039 2040.L6_32: 2041 2042 KERNEL2x3_SUBN 2043 KERNEL2x3_SUBN 2044 KERNEL2x3_SUBN 2045 KERNEL2x3_SUBN 2046 2047 KERNEL2x3_SUBN 2048 KERNEL2x3_SUBN 2049 KERNEL2x3_SUBN 2050 KERNEL2x3_SUBN 2051 dec %rax 2052 jne .L6_32 2053 ALIGN_4 2054 2055.L6_36: 2056 movq K, %rax 2057 2058 andq $7, %rax # if (k & 1) 2059 je .L6_39 2060 2061 ALIGN_4 2062 2063.L6_37: 2064 2065 KERNEL2x3_SUBN 2066 2067 dec %rax 2068 jne .L6_37 2069 ALIGN_4 2070 2071 2072.L6_39: 2073 2074 SAVE2x3 2075 2076 addq $2 * SIZE, CO1 # coffset += 2 2077 ALIGN_4 2078 2079.L6_40: 2080 testq $1, M 2081 jz .L7_10 // to next 3 lines of N 2082 2083 ALIGN_4 2084 2085.L6_41: 2086 leaq BUFFER1, BO // first buffer to BO 2087 addq $12 * SIZE, BO 2088 2089 vzeroall 2090 2091 movq K, %rax 2092 2093 sarq $3,%rax 2094 je .L6_46 2095 2096 ALIGN_4 2097 2098.L6_42: 2099 2100 KERNEL1x3_SUBN 2101 KERNEL1x3_SUBN 2102 KERNEL1x3_SUBN 2103 KERNEL1x3_SUBN 2104 2105 KERNEL1x3_SUBN 2106 KERNEL1x3_SUBN 2107 KERNEL1x3_SUBN 2108 KERNEL1x3_SUBN 2109 2110 dec %rax 2111 jne .L6_42 2112 ALIGN_4 2113 2114.L6_46: 2115 movq K, %rax 2116 2117 andq $7, %rax # if (k & 1) 2118 je .L6_49 2119 2120 ALIGN_4 2121 2122.L6_47: 2123 2124 KERNEL1x3_SUBN 2125 2126 dec %rax 2127 jne .L6_47 2128 ALIGN_4 2129 2130 2131.L6_49: 2132 2133 SAVE1x3 2134 2135 addq $1 * SIZE, CO1 # coffset += 1 2136 ALIGN_4 2137 2138 2139 2140 2141/***************************************************************************************************************/ 2142 2143.L7_10: 2144 movq C, CO1 2145 leaq (C, LDC, 2), C 2146 leaq (C, LDC, 1), C // c += 3 * ldc 2147 2148 2149 movq A, AO // aoffset = a 2150 addq $16 * SIZE, AO 2151 2152 movq M, I 2153 sarq $4, I // i = (m >> 4) 2154 je .L7_20 2155 2156 ALIGN_4 2157 2158.L7_11: 2159 leaq BUFFER2, BO // second buffer to BO 2160 addq $12 * SIZE, BO 2161 2162 prefetcht0 (CO1) 2163 prefetcht0 (CO1,LDC,1) 2164 prefetcht0 (CO1,LDC,2) 2165 prefetcht0 64(CO1) 2166 prefetcht0 64(CO1,LDC,1) 2167 prefetcht0 64(CO1,LDC,2) 2168 2169 vzeroall 2170 2171 movq K, %rax 2172 2173 sarq $3, %rax // K / 8 2174 je .L7_16 2175 ALIGN_5 2176 2177.L7_12: 2178/* 2179 prefetcht0 B_PR1(BO) 2180 prefetcht0 B_PR1+64(BO) 2181 prefetcht0 B_PR1+128(BO) 2182*/ 2183 KERNEL16x3_SUBN 2184 KERNEL16x3_SUBN 2185 KERNEL16x3_SUBN 2186 KERNEL16x3_SUBN 2187 2188 KERNEL16x3_SUBN 2189 KERNEL16x3_SUBN 2190 KERNEL16x3_SUBN 2191 KERNEL16x3_SUBN 2192 dec %rax 2193 jne .L7_12 2194 ALIGN_4 2195 2196.L7_16: 2197 movq K, %rax 2198 2199 andq $7, %rax # if (k & 1) 2200 je .L7_19 2201 2202 ALIGN_5 2203 2204.L7_17: 2205 2206 KERNEL16x3_SUBN 2207 2208 dec %rax 2209 jne .L7_17 2210 2211 2212.L7_19: 2213 2214 SAVE16x3 2215 2216 addq $16 * SIZE, CO1 # coffset += 16 2217 decq I # i -- 2218 jg .L7_11 2219 ALIGN_4 2220 2221/************************************************************************** 2222* Rest of M 2223***************************************************************************/ 2224.L7_20: 2225 // Test rest of M 2226 2227 testq $15, M 2228 jz .L7_60 // to next 3 lines of N 2229 2230 testq $8, M 2231 jz .L7_21pre 2232 ALIGN_4 2233 2234/**************************************************************************/ 2235 2236.L7_20_1: 2237 leaq BUFFER2, BO // first buffer to BO 2238 addq $12 * SIZE, BO 2239 2240 vzeroall 2241 2242 movq K, %rax 2243 2244 sarq $3, %rax 2245 je .L7_20_6 2246 2247 ALIGN_4 2248 2249.L7_20_2: 2250 2251 KERNEL8x3_SUBN 2252 KERNEL8x3_SUBN 2253 KERNEL8x3_SUBN 2254 KERNEL8x3_SUBN 2255 2256 KERNEL8x3_SUBN 2257 KERNEL8x3_SUBN 2258 KERNEL8x3_SUBN 2259 KERNEL8x3_SUBN 2260 2261 dec %rax 2262 jne .L7_20_2 2263 ALIGN_4 2264 2265.L7_20_6: 2266 movq K, %rax 2267 2268 andq $7, %rax # if (k & 1) 2269 je .L7_20_9 2270 2271 ALIGN_4 2272 2273.L7_20_7: 2274 2275 KERNEL8x3_SUBN 2276 2277 dec %rax 2278 jne .L7_20_7 2279 ALIGN_4 2280 2281.L7_20_9: 2282 2283 SAVE8x3 2284 2285 addq $8 * SIZE, CO1 # coffset += 8 2286 ALIGN_4 2287 2288 2289 2290/**************************************************************************/ 2291 2292.L7_21pre: 2293 2294 testq $4, M 2295 jz .L7_30 2296 ALIGN_4 2297 2298.L7_21: 2299 leaq BUFFER2, BO // second buffer to BO 2300 addq $12 * SIZE, BO 2301 2302 vzeroall 2303 2304 movq K, %rax 2305 2306 sarq $3, %rax 2307 je .L7_26 2308 2309 ALIGN_4 2310 2311.L7_22: 2312 2313 KERNEL4x3_SUBN 2314 KERNEL4x3_SUBN 2315 KERNEL4x3_SUBN 2316 KERNEL4x3_SUBN 2317 2318 KERNEL4x3_SUBN 2319 KERNEL4x3_SUBN 2320 KERNEL4x3_SUBN 2321 KERNEL4x3_SUBN 2322 2323 dec %rax 2324 jne .L7_22 2325 ALIGN_4 2326 2327.L7_26: 2328 movq K, %rax 2329 2330 andq $7, %rax # if (k & 1) 2331 je .L7_29 2332 2333 ALIGN_4 2334 2335.L7_27: 2336 2337 KERNEL4x3_SUBN 2338 2339 dec %rax 2340 jne .L7_27 2341 ALIGN_4 2342 2343 2344.L7_29: 2345 2346 SAVE4x3 2347 2348 addq $4 * SIZE, CO1 # coffset += 4 2349 ALIGN_4 2350 2351 2352.L7_30: 2353 testq $2, M 2354 jz .L7_40 2355 2356 ALIGN_4 2357 2358.L7_31: 2359 leaq BUFFER2, BO // second buffer to BO 2360 addq $12 * SIZE, BO 2361 2362 vzeroall 2363 2364 movq K, %rax 2365 2366 sarq $3, %rax 2367 je .L7_36 2368 2369 ALIGN_4 2370 2371.L7_32: 2372 2373 KERNEL2x3_SUBN 2374 KERNEL2x3_SUBN 2375 KERNEL2x3_SUBN 2376 KERNEL2x3_SUBN 2377 2378 KERNEL2x3_SUBN 2379 KERNEL2x3_SUBN 2380 KERNEL2x3_SUBN 2381 KERNEL2x3_SUBN 2382 2383 dec %rax 2384 jne .L7_32 2385 ALIGN_4 2386 2387.L7_36: 2388 movq K, %rax 2389 2390 andq $7, %rax # if (k & 1) 2391 je .L7_39 2392 2393 ALIGN_4 2394 2395.L7_37: 2396 2397 KERNEL2x3_SUBN 2398 2399 dec %rax 2400 jne .L7_37 2401 ALIGN_4 2402 2403 2404.L7_39: 2405 2406 SAVE2x3 2407 2408 addq $2 * SIZE, CO1 # coffset += 2 2409 ALIGN_4 2410 2411.L7_40: 2412 testq $1, M 2413 jz .L7_60 // to next 3 lines of N 2414 2415 ALIGN_4 2416 2417.L7_41: 2418 leaq BUFFER2, BO // second buffer to BO 2419 addq $12 * SIZE, BO 2420 2421 vzeroall 2422 2423 movq K, %rax 2424 2425 sarq $3, %rax 2426 je .L7_46 2427 2428 ALIGN_4 2429 2430.L7_42: 2431 KERNEL1x3_SUBN 2432 KERNEL1x3_SUBN 2433 KERNEL1x3_SUBN 2434 KERNEL1x3_SUBN 2435 2436 KERNEL1x3_SUBN 2437 KERNEL1x3_SUBN 2438 KERNEL1x3_SUBN 2439 KERNEL1x3_SUBN 2440 2441 dec %rax 2442 jne .L7_42 2443 ALIGN_4 2444 2445.L7_46: 2446 movq K, %rax 2447 2448 andq $7, %rax # if (k & 1) 2449 je .L7_49 2450 2451 ALIGN_4 2452 2453.L7_47: 2454 2455 KERNEL1x3_SUBN 2456 2457 dec %rax 2458 jne .L7_47 2459 ALIGN_4 2460 2461 2462.L7_49: 2463 2464 SAVE1x3 2465 2466 addq $1 * SIZE, CO1 # coffset += 1 2467 ALIGN_4 2468 2469 2470 2471.L7_60: 2472 2473 decq J // j -- 2474 jg .L6_01 2475 2476 2477.L2_0: 2478 cmpq $0, Nmod6 // N % 6 == 0 2479 je .L999 2480 2481/************************************************************************************************ 2482* Loop for Nmod6 / 2 > 0 2483*************************************************************************************************/ 2484 2485 movq Nmod6, J 2486 sarq $1, J // j = j / 2 2487 je .L1_0 2488 ALIGN_4 2489 2490.L2_01: 2491 // copy to sub buffer 2492 movq B, BO1 2493 leaq BUFFER1, BO // first buffer to BO 2494 movq K, %rax 2495 sarq $2, %rax // K / 4 2496 jz .L2_01b 2497 ALIGN_4 2498 2499.L2_01a: 2500 prefetcht0 512(BO1) 2501 prefetchw 512(BO) 2502 2503 vmovups (BO1), %xmm0 2504 vmovups 2*SIZE(BO1), %xmm1 2505 vmovups 4*SIZE(BO1), %xmm2 2506 vmovups 6*SIZE(BO1), %xmm3 2507 2508 vmovups %xmm0, (BO) 2509 vmovups %xmm1, 2*SIZE(BO) 2510 vmovups %xmm2, 4*SIZE(BO) 2511 vmovups %xmm3, 6*SIZE(BO) 2512 2513 addq $8*SIZE,BO1 2514 addq $8*SIZE,BO 2515 decq %rax 2516 jnz .L2_01a 2517 2518 2519.L2_01b: 2520 2521 movq K, %rax 2522 andq $3, %rax // K % 4 2523 jz .L2_02d 2524 ALIGN_4 2525 2526.L2_02c: 2527 2528 vmovups (BO1), %xmm0 2529 vmovups %xmm0, (BO) 2530 addq $2*SIZE,BO1 2531 addq $2*SIZE,BO 2532 decq %rax 2533 jnz .L2_02c 2534 2535.L2_02d: 2536 2537 movq BO1, B // next offset of B 2538 2539.L2_10: 2540 movq C, CO1 2541 leaq (C, LDC, 2), C // c += 2 * ldc 2542 2543 2544 movq A, AO // aoffset = a 2545 addq $32 * SIZE, AO 2546 2547 movq M, I 2548 sarq $4, I // i = (m >> 4) 2549 je .L2_20 2550 2551 ALIGN_4 2552 2553.L2_11: 2554 leaq BUFFER1, BO // first buffer to BO 2555 addq $4 * SIZE, BO 2556 2557 vzeroall 2558 2559 movq K, %rax 2560 2561 andq $-8, %rax // K = K - ( K % 8 ) 2562 je .L2_16 2563 movq %rax, BI // Index for BO 2564 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2565 2566 salq $4, %rax // rax = rax * 16 ; number of values 2567 leaq (AO, %rax, SIZE), AO 2568 leaq (BO, BI, SIZE), BO 2569 negq BI 2570 negq %rax 2571 ALIGN_4 2572 2573.L2_12: 2574 2575 prefetcht0 B_PR1(BO,BI,8) 2576 KERNEL16x2_1 2577 KERNEL16x2_2 2578 KERNEL16x2_3 2579 KERNEL16x2_4 2580 2581 prefetcht0 B_PR1(BO,BI,8) 2582 KERNEL16x2_1 2583 KERNEL16x2_2 2584 KERNEL16x2_3 2585 KERNEL16x2_4 2586 2587 je .L2_16 2588 2589 prefetcht0 B_PR1(BO,BI,8) 2590 KERNEL16x2_1 2591 KERNEL16x2_2 2592 KERNEL16x2_3 2593 KERNEL16x2_4 2594 2595 prefetcht0 B_PR1(BO,BI,8) 2596 KERNEL16x2_1 2597 KERNEL16x2_2 2598 KERNEL16x2_3 2599 KERNEL16x2_4 2600 2601 je .L2_16 2602 2603 jmp .L2_12 2604 ALIGN_4 2605 2606.L2_16: 2607 movq K, %rax 2608 2609 andq $7, %rax # if (k & 1) 2610 je .L2_19 2611 2612 movq %rax, BI // Index for BO 2613 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2614 2615 salq $4, %rax // rax = rax * 16 ; number of values 2616 leaq (AO, %rax, SIZE), AO 2617 leaq (BO, BI, SIZE), BO 2618 negq BI 2619 negq %rax 2620 ALIGN_4 2621 2622.L2_17: 2623 2624 KERNEL16x2_SUB 2625 2626 jl .L2_17 2627 ALIGN_4 2628 2629 2630.L2_19: 2631 2632 SAVE16x2 2633 2634 addq $16 * SIZE, CO1 # coffset += 16 2635 decq I # i -- 2636 jg .L2_11 2637 ALIGN_4 2638 2639/************************************************************************** 2640* Rest of M 2641***************************************************************************/ 2642.L2_20: 2643 // Test rest of M 2644 2645 testq $15, M 2646 jz .L2_60 // to next 3 lines of N 2647 2648 testq $8, M 2649 jz .L2_21pre 2650 ALIGN_4 2651 2652/**************************************************************************/ 2653 2654.L2_20_1: 2655 leaq BUFFER1, BO // first buffer to BO 2656 addq $4 * SIZE, BO 2657 2658 vzeroall 2659 2660 movq K, %rax 2661 2662 andq $-8, %rax 2663 je .L2_20_6 2664 movq %rax, BI // Index for BO 2665 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2666 2667 salq $3, %rax // rax = rax * 8 ; number of values 2668 leaq (AO, %rax, SIZE), AO 2669 leaq (BO, BI, SIZE), BO 2670 negq BI 2671 negq %rax 2672 ALIGN_4 2673 2674.L2_20_2: 2675 2676 prefetcht0 B_PR1(BO,BI,8) 2677 KERNEL8x2_1 2678 KERNEL8x2_2 2679 KERNEL8x2_3 2680 KERNEL8x2_4 2681 2682 prefetcht0 B_PR1(BO,BI,8) 2683 KERNEL8x2_1 2684 KERNEL8x2_2 2685 KERNEL8x2_3 2686 KERNEL8x2_4 2687 2688 je .L2_20_6 2689 2690 prefetcht0 B_PR1(BO,BI,8) 2691 KERNEL8x2_1 2692 KERNEL8x2_2 2693 KERNEL8x2_3 2694 KERNEL8x2_4 2695 2696 prefetcht0 B_PR1(BO,BI,8) 2697 KERNEL8x2_1 2698 KERNEL8x2_2 2699 KERNEL8x2_3 2700 KERNEL8x2_4 2701 2702 je .L2_20_6 2703 2704 jmp .L2_20_2 2705 ALIGN_4 2706 2707.L2_20_6: 2708 movq K, %rax 2709 2710 andq $7, %rax # if (k & 1) 2711 je .L2_20_9 2712 2713 movq %rax, BI // Index for BO 2714 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2715 2716 salq $3, %rax // rax = rax * 8 ; number of values 2717 leaq (AO, %rax, SIZE), AO 2718 leaq (BO, BI, SIZE), BO 2719 negq BI 2720 negq %rax 2721 ALIGN_4 2722 2723.L2_20_7: 2724 2725 KERNEL8x2_SUB 2726 2727 jl .L2_20_7 2728 ALIGN_4 2729 2730 2731.L2_20_9: 2732 2733 SAVE8x2 2734 2735 addq $8 * SIZE, CO1 # coffset += 8 2736 ALIGN_4 2737 2738 2739 2740/**************************************************************************/ 2741 2742.L2_21pre: 2743 2744 testq $4, M 2745 jz .L2_30 2746 ALIGN_4 2747 2748.L2_21: 2749 leaq BUFFER1, BO // first buffer to BO 2750 addq $4 * SIZE, BO 2751 2752 vzeroall 2753 2754 movq K, %rax 2755 2756 andq $-8, %rax 2757 je .L2_26 2758 movq %rax, BI // Index for BO 2759 leaq (BI,BI,1), BI // BI = BI * 1 ; number of values 2760 2761 salq $2, %rax // rax = rax * 4 ; number of values 2762 leaq (AO, %rax, SIZE), AO 2763 leaq (BO, BI, SIZE), BO 2764 negq BI 2765 negq %rax 2766 ALIGN_4 2767 2768.L2_22: 2769 2770 prefetcht0 B_PR1(BO,BI,8) 2771 KERNEL4x2_1 2772 KERNEL4x2_2 2773 KERNEL4x2_3 2774 KERNEL4x2_4 2775 2776 prefetcht0 B_PR1(BO,BI,8) 2777 KERNEL4x2_1 2778 KERNEL4x2_2 2779 KERNEL4x2_3 2780 KERNEL4x2_4 2781 2782 je .L2_26 2783 2784 prefetcht0 B_PR1(BO,BI,8) 2785 KERNEL4x2_1 2786 KERNEL4x2_2 2787 KERNEL4x2_3 2788 KERNEL4x2_4 2789 2790 prefetcht0 B_PR1(BO,BI,8) 2791 KERNEL4x2_1 2792 KERNEL4x2_2 2793 KERNEL4x2_3 2794 KERNEL4x2_4 2795 2796 je .L2_26 2797 2798 jmp .L2_22 2799 ALIGN_4 2800 2801.L2_26: 2802 movq K, %rax 2803 2804 andq $7, %rax # if (k & 1) 2805 je .L2_29 2806 2807 movq %rax, BI // Index for BO 2808 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2809 2810 salq $2, %rax // rax = rax * 4 ; number of values 2811 leaq (AO, %rax, SIZE), AO 2812 leaq (BO, BI, SIZE), BO 2813 negq BI 2814 negq %rax 2815 ALIGN_4 2816 2817.L2_27: 2818 2819 KERNEL4x2_SUB 2820 2821 jl .L2_27 2822 ALIGN_4 2823 2824 2825.L2_29: 2826 2827 SAVE4x2 2828 2829 addq $4 * SIZE, CO1 # coffset += 4 2830 ALIGN_4 2831 2832 2833.L2_30: 2834 testq $2, M 2835 jz .L2_40 2836 2837 ALIGN_4 2838 2839.L2_31: 2840 leaq BUFFER1, BO // first buffer to BO 2841 addq $4 * SIZE, BO 2842 2843 vzeroall 2844 2845 movq K, %rax 2846 2847 andq $-8, %rax 2848 je .L2_36 2849 movq %rax, BI // Index for BO 2850 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2851 2852 salq $1, %rax // rax = rax *2 ; number of values 2853 leaq (AO, %rax, SIZE), AO 2854 leaq (BO, BI, SIZE), BO 2855 negq BI 2856 negq %rax 2857 ALIGN_4 2858 2859.L2_32: 2860 2861 KERNEL2x2_1 2862 KERNEL2x2_2 2863 KERNEL2x2_3 2864 KERNEL2x2_4 2865 2866 KERNEL2x2_1 2867 KERNEL2x2_2 2868 KERNEL2x2_3 2869 KERNEL2x2_4 2870 2871 je .L2_36 2872 2873 KERNEL2x2_1 2874 KERNEL2x2_2 2875 KERNEL2x2_3 2876 KERNEL2x2_4 2877 2878 KERNEL2x2_1 2879 KERNEL2x2_2 2880 KERNEL2x2_3 2881 KERNEL2x2_4 2882 2883 je .L2_36 2884 2885 jmp .L2_32 2886 ALIGN_4 2887 2888.L2_36: 2889 movq K, %rax 2890 2891 andq $7, %rax # if (k & 1) 2892 je .L2_39 2893 2894 movq %rax, BI // Index for BO 2895 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2896 2897 salq $1, %rax // rax = rax *2 ; number of values 2898 leaq (AO, %rax, SIZE), AO 2899 leaq (BO, BI, SIZE), BO 2900 negq BI 2901 negq %rax 2902 ALIGN_4 2903 2904.L2_37: 2905 2906 KERNEL2x2_SUB 2907 2908 jl .L2_37 2909 ALIGN_4 2910 2911 2912.L2_39: 2913 2914 SAVE2x2 2915 2916 addq $2 * SIZE, CO1 # coffset += 2 2917 ALIGN_4 2918 2919.L2_40: 2920 testq $1, M 2921 jz .L2_60 // to next 2 lines of N 2922 2923 ALIGN_4 2924 2925.L2_41: 2926 leaq BUFFER1, BO // first buffer to BO 2927 addq $4 * SIZE, BO 2928 2929 vzeroall 2930 2931 movq K, %rax 2932 2933 andq $-8, %rax 2934 je .L2_46 2935 movq %rax, BI // Index for BO 2936 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2937 2938 leaq (AO, %rax, SIZE), AO 2939 leaq (BO, BI, SIZE), BO 2940 negq BI 2941 negq %rax 2942 ALIGN_4 2943 2944.L2_42: 2945 2946 KERNEL1x2_1 2947 KERNEL1x2_2 2948 KERNEL1x2_3 2949 KERNEL1x2_4 2950 2951 KERNEL1x2_1 2952 KERNEL1x2_2 2953 KERNEL1x2_3 2954 KERNEL1x2_4 2955 2956 je .L2_46 2957 2958 KERNEL1x2_1 2959 KERNEL1x2_2 2960 KERNEL1x2_3 2961 KERNEL1x2_4 2962 2963 KERNEL1x2_1 2964 KERNEL1x2_2 2965 KERNEL1x2_3 2966 KERNEL1x2_4 2967 2968 je .L2_46 2969 2970 jmp .L2_42 2971 ALIGN_4 2972 2973.L2_46: 2974 movq K, %rax 2975 2976 andq $7, %rax # if (k & 1) 2977 je .L2_49 2978 2979 movq %rax, BI // Index for BO 2980 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 2981 2982 leaq (AO, %rax, SIZE), AO 2983 leaq (BO, BI, SIZE), BO 2984 negq BI 2985 negq %rax 2986 ALIGN_4 2987 2988.L2_47: 2989 2990 KERNEL1x2_SUB 2991 2992 jl .L2_47 2993 ALIGN_4 2994 2995 2996.L2_49: 2997 2998 SAVE1x2 2999 3000 addq $1 * SIZE, CO1 # coffset += 1 3001 ALIGN_4 3002 3003.L2_60: 3004 3005 decq J // j -- 3006 jg .L2_01 // next 2 lines of N 3007 3008 3009 3010.L1_0: 3011 3012/************************************************************************************************ 3013* Loop for Nmod6 % 2 > 0 3014*************************************************************************************************/ 3015 3016 movq Nmod6, J 3017 andq $1, J // j % 2 3018 je .L999 3019 ALIGN_4 3020 3021.L1_01: 3022 // copy to sub buffer 3023 movq B, BO1 3024 leaq BUFFER1, BO // first buffer to BO 3025 movq K, %rax 3026 ALIGN_4 3027 3028.L1_02b: 3029 3030 vmovsd (BO1), %xmm0 3031 vmovsd %xmm0, (BO) 3032 addq $1*SIZE,BO1 3033 addq $1*SIZE,BO 3034 decq %rax 3035 jnz .L1_02b 3036 3037.L1_02c: 3038 3039 movq BO1, B // next offset of B 3040 3041.L1_10: 3042 movq C, CO1 3043 leaq (C, LDC, 1), C // c += 1 * ldc 3044 3045 3046 movq A, AO // aoffset = a 3047 addq $32 * SIZE, AO 3048 3049 movq M, I 3050 sarq $4, I // i = (m >> 4) 3051 je .L1_20 3052 3053 ALIGN_4 3054 3055.L1_11: 3056 leaq BUFFER1, BO // first buffer to BO 3057 addq $2 * SIZE, BO 3058 3059 vzeroall 3060 3061 movq K, %rax 3062 3063 andq $-8, %rax // K = K - ( K % 8 ) 3064 je .L1_16 3065 movq %rax, BI // Index for BO 3066 3067 salq $4, %rax // rax = rax * 16 ; number of values 3068 leaq (AO, %rax, SIZE), AO 3069 leaq (BO, BI, SIZE), BO 3070 negq BI 3071 negq %rax 3072 ALIGN_4 3073 3074.L1_12: 3075 3076 prefetcht0 B_PR1(BO,BI,8) 3077 KERNEL16x1_1 3078 KERNEL16x1_2 3079 KERNEL16x1_3 3080 KERNEL16x1_4 3081 3082 KERNEL16x1_1 3083 KERNEL16x1_2 3084 KERNEL16x1_3 3085 KERNEL16x1_4 3086 3087 je .L1_16 3088 3089 prefetcht0 B_PR1(BO,BI,8) 3090 KERNEL16x1_1 3091 KERNEL16x1_2 3092 KERNEL16x1_3 3093 KERNEL16x1_4 3094 3095 KERNEL16x1_1 3096 KERNEL16x1_2 3097 KERNEL16x1_3 3098 KERNEL16x1_4 3099 3100 je .L1_16 3101 3102 jmp .L1_12 3103 ALIGN_4 3104 3105.L1_16: 3106 movq K, %rax 3107 3108 andq $7, %rax # if (k & 1) 3109 je .L1_19 3110 3111 movq %rax, BI // Index for BO 3112 3113 salq $4, %rax // rax = rax * 16 ; number of values 3114 leaq (AO, %rax, SIZE), AO 3115 leaq (BO, BI, SIZE), BO 3116 negq BI 3117 negq %rax 3118 ALIGN_4 3119 3120.L1_17: 3121 3122 KERNEL16x1_SUB 3123 3124 jl .L1_17 3125 ALIGN_4 3126 3127 3128.L1_19: 3129 3130 SAVE16x1 3131 3132 addq $16 * SIZE, CO1 # coffset += 16 3133 decq I # i -- 3134 jg .L1_11 3135 ALIGN_4 3136 3137/************************************************************************** 3138* Rest of M 3139***************************************************************************/ 3140.L1_20: 3141 // Test rest of M 3142 3143 testq $15, M 3144 jz .L999 3145 3146 testq $8, M 3147 jz .L1_21pre 3148 ALIGN_4 3149 3150/**************************************************************************/ 3151 3152.L1_20_1: 3153 leaq BUFFER1, BO // first buffer to BO 3154 addq $2 * SIZE, BO 3155 3156 vzeroall 3157 3158 movq K, %rax 3159 3160 andq $-8, %rax 3161 je .L1_20_6 3162 movq %rax, BI // Index for BO 3163 3164 salq $3, %rax // rax = rax * 8 ; number of values 3165 leaq (AO, %rax, SIZE), AO 3166 leaq (BO, BI, SIZE), BO 3167 negq BI 3168 negq %rax 3169 ALIGN_4 3170 3171.L1_20_2: 3172 3173 prefetcht0 B_PR1(BO,BI,8) 3174 KERNEL8x1_1 3175 KERNEL8x1_2 3176 KERNEL8x1_3 3177 KERNEL8x1_4 3178 3179 KERNEL8x1_1 3180 KERNEL8x1_2 3181 KERNEL8x1_3 3182 KERNEL8x1_4 3183 3184 je .L1_20_6 3185 3186 prefetcht0 B_PR1(BO,BI,8) 3187 KERNEL8x1_1 3188 KERNEL8x1_2 3189 KERNEL8x1_3 3190 KERNEL8x1_4 3191 3192 KERNEL8x1_1 3193 KERNEL8x1_2 3194 KERNEL8x1_3 3195 KERNEL8x1_4 3196 3197 je .L1_20_6 3198 3199 jmp .L1_20_2 3200 ALIGN_4 3201 3202.L1_20_6: 3203 movq K, %rax 3204 3205 andq $7, %rax # if (k & 1) 3206 je .L1_20_9 3207 3208 movq %rax, BI // Index for BO 3209 3210 salq $3, %rax // rax = rax * 8 ; number of values 3211 leaq (AO, %rax, SIZE), AO 3212 leaq (BO, BI, SIZE), BO 3213 negq BI 3214 negq %rax 3215 ALIGN_4 3216 3217.L1_20_7: 3218 3219 KERNEL8x1_SUB 3220 3221 jl .L1_20_7 3222 ALIGN_4 3223 3224 3225.L1_20_9: 3226 3227 SAVE8x1 3228 3229 addq $8 * SIZE, CO1 # coffset += 8 3230 ALIGN_4 3231 3232 3233 3234/**************************************************************************/ 3235 3236.L1_21pre: 3237 3238 testq $4, M 3239 jz .L1_30 3240 ALIGN_4 3241 3242.L1_21: 3243 leaq BUFFER1, BO // first buffer to BO 3244 addq $2 * SIZE, BO 3245 3246 vzeroall 3247 3248 movq K, %rax 3249 3250 andq $-8, %rax 3251 je .L1_26 3252 movq %rax, BI // Index for BO 3253 3254 salq $2, %rax // rax = rax * 4 ; number of values 3255 leaq (AO, %rax, SIZE), AO 3256 leaq (BO, BI, SIZE), BO 3257 negq BI 3258 negq %rax 3259 ALIGN_4 3260 3261.L1_22: 3262 3263 prefetcht0 B_PR1(BO,BI,8) 3264 KERNEL4x1_1 3265 KERNEL4x1_2 3266 KERNEL4x1_3 3267 KERNEL4x1_4 3268 3269 KERNEL4x1_1 3270 KERNEL4x1_2 3271 KERNEL4x1_3 3272 KERNEL4x1_4 3273 3274 je .L1_26 3275 3276 prefetcht0 B_PR1(BO,BI,8) 3277 KERNEL4x1_1 3278 KERNEL4x1_2 3279 KERNEL4x1_3 3280 KERNEL4x1_4 3281 3282 KERNEL4x1_1 3283 KERNEL4x1_2 3284 KERNEL4x1_3 3285 KERNEL4x1_4 3286 3287 je .L1_26 3288 3289 jmp .L1_22 3290 ALIGN_4 3291 3292.L1_26: 3293 movq K, %rax 3294 3295 andq $7, %rax # if (k & 1) 3296 je .L1_29 3297 3298 movq %rax, BI // Index for BO 3299 3300 salq $2, %rax // rax = rax * 4 ; number of values 3301 leaq (AO, %rax, SIZE), AO 3302 leaq (BO, BI, SIZE), BO 3303 negq BI 3304 negq %rax 3305 ALIGN_4 3306 3307.L1_27: 3308 3309 KERNEL4x1_SUB 3310 3311 jl .L1_27 3312 ALIGN_4 3313 3314 3315.L1_29: 3316 3317 SAVE4x1 3318 3319 addq $4 * SIZE, CO1 # coffset += 4 3320 ALIGN_4 3321 3322 3323.L1_30: 3324 testq $2, M 3325 jz .L1_40 3326 3327 ALIGN_4 3328 3329.L1_31: 3330 leaq BUFFER1, BO // first buffer to BO 3331 addq $2 * SIZE, BO 3332 3333 vzeroall 3334 3335 movq K, %rax 3336 3337 andq $-8, %rax 3338 je .L1_36 3339 movq %rax, BI // Index for BO 3340 3341 salq $1, %rax // rax = rax *2 ; number of values 3342 leaq (AO, %rax, SIZE), AO 3343 leaq (BO, BI, SIZE), BO 3344 negq BI 3345 negq %rax 3346 ALIGN_4 3347 3348.L1_32: 3349 3350 KERNEL2x1_1 3351 KERNEL2x1_2 3352 KERNEL2x1_3 3353 KERNEL2x1_4 3354 3355 KERNEL2x1_1 3356 KERNEL2x1_2 3357 KERNEL2x1_3 3358 KERNEL2x1_4 3359 3360 je .L1_36 3361 3362 KERNEL2x1_1 3363 KERNEL2x1_2 3364 KERNEL2x1_3 3365 KERNEL2x1_4 3366 3367 KERNEL2x1_1 3368 KERNEL2x1_2 3369 KERNEL2x1_3 3370 KERNEL2x1_4 3371 3372 je .L1_36 3373 3374 jmp .L1_32 3375 ALIGN_4 3376 3377.L1_36: 3378 movq K, %rax 3379 3380 andq $7, %rax # if (k & 1) 3381 je .L1_39 3382 3383 movq %rax, BI // Index for BO 3384 3385 salq $1, %rax // rax = rax *2 ; number of values 3386 leaq (AO, %rax, SIZE), AO 3387 leaq (BO, BI, SIZE), BO 3388 negq BI 3389 negq %rax 3390 ALIGN_4 3391 3392.L1_37: 3393 3394 KERNEL2x1_SUB 3395 3396 jl .L1_37 3397 ALIGN_4 3398 3399 3400.L1_39: 3401 3402 SAVE2x1 3403 3404 addq $2 * SIZE, CO1 # coffset += 2 3405 ALIGN_4 3406 3407.L1_40: 3408 testq $1, M 3409 jz .L999 3410 3411 ALIGN_4 3412 3413.L1_41: 3414 leaq BUFFER1, BO // first buffer to BO 3415 addq $2 * SIZE, BO 3416 3417 vzeroall 3418 3419 movq K, %rax 3420 3421 andq $-8, %rax 3422 je .L1_46 3423 movq %rax, BI // Index for BO 3424 3425 leaq (AO, %rax, SIZE), AO 3426 leaq (BO, BI, SIZE), BO 3427 negq BI 3428 negq %rax 3429 ALIGN_4 3430 3431.L1_42: 3432 3433 KERNEL1x1_1 3434 KERNEL1x1_2 3435 KERNEL1x1_3 3436 KERNEL1x1_4 3437 3438 KERNEL1x1_1 3439 KERNEL1x1_2 3440 KERNEL1x1_3 3441 KERNEL1x1_4 3442 3443 je .L1_46 3444 3445 KERNEL1x1_1 3446 KERNEL1x1_2 3447 KERNEL1x1_3 3448 KERNEL1x1_4 3449 3450 KERNEL1x1_1 3451 KERNEL1x1_2 3452 KERNEL1x1_3 3453 KERNEL1x1_4 3454 3455 je .L1_46 3456 3457 jmp .L1_42 3458 ALIGN_4 3459 3460.L1_46: 3461 movq K, %rax 3462 3463 andq $7, %rax # if (k & 1) 3464 je .L1_49 3465 3466 movq %rax, BI // Index for BO 3467 3468 leaq (AO, %rax, SIZE), AO 3469 leaq (BO, BI, SIZE), BO 3470 negq BI 3471 negq %rax 3472 ALIGN_4 3473 3474.L1_47: 3475 3476 KERNEL1x1_SUB 3477 3478 jl .L1_47 3479 ALIGN_4 3480 3481 3482.L1_49: 3483 3484 SAVE1x1 3485 3486 addq $1 * SIZE, CO1 # coffset += 1 3487 ALIGN_4 3488 3489 3490.L999: 3491 movq SP, %rsp 3492 movq (%rsp), %rbx 3493 movq 8(%rsp), %rbp 3494 movq 16(%rsp), %r12 3495 movq 24(%rsp), %r13 3496 movq 32(%rsp), %r14 3497 movq 40(%rsp), %r15 3498 3499#ifdef WINDOWS_ABI 3500 movq 48(%rsp), %rdi 3501 movq 56(%rsp), %rsi 3502 movups 64(%rsp), %xmm6 3503 movups 80(%rsp), %xmm7 3504 movups 96(%rsp), %xmm8 3505 movups 112(%rsp), %xmm9 3506 movups 128(%rsp), %xmm10 3507 movups 144(%rsp), %xmm11 3508 movups 160(%rsp), %xmm12 3509 movups 176(%rsp), %xmm13 3510 movups 192(%rsp), %xmm14 3511 movups 208(%rsp), %xmm15 3512#endif 3513 3514 addq $STACKSIZE, %rsp 3515 ret 3516 3517 EPILOGUE 3518 3519 3520#else 3521/************************************************************************************* 3522* TRMM Kernel 3523*************************************************************************************/ 3524 3525 3526 PROLOGUE 3527 PROFCODE 3528 3529 subq $STACKSIZE, %rsp 3530 movq %rbx, (%rsp) 3531 movq %rbp, 8(%rsp) 3532 movq %r12, 16(%rsp) 3533 movq %r13, 24(%rsp) 3534 movq %r14, 32(%rsp) 3535 movq %r15, 40(%rsp) 3536 3537 vzeroupper 3538 3539#ifdef WINDOWS_ABI 3540 movq %rdi, 48(%rsp) 3541 movq %rsi, 56(%rsp) 3542 movups %xmm6, 64(%rsp) 3543 movups %xmm7, 80(%rsp) 3544 movups %xmm8, 96(%rsp) 3545 movups %xmm9, 112(%rsp) 3546 movups %xmm10, 128(%rsp) 3547 movups %xmm11, 144(%rsp) 3548 movups %xmm12, 160(%rsp) 3549 movups %xmm13, 176(%rsp) 3550 movups %xmm14, 192(%rsp) 3551 movups %xmm15, 208(%rsp) 3552 3553 movq ARG1, OLD_M 3554 movq ARG2, OLD_N 3555 movq ARG3, OLD_K 3556 movq OLD_A, A 3557 movq OLD_B, B 3558 movq OLD_C, C 3559 movq OLD_LDC, LDC 3560#ifdef TRMMKERNEL 3561 movsd OLD_OFFSET, %xmm12 3562#endif 3563 vmovaps %xmm3, %xmm0 3564 3565#else 3566 movq STACKSIZE + 8(%rsp), LDC 3567#ifdef TRMMKERNEL 3568 movsd STACKSIZE + 16(%rsp), %xmm12 3569#endif 3570 3571#endif 3572 3573 movq %rsp, SP # save old stack 3574 subq $128 + L_BUFFER_SIZE, %rsp 3575 andq $-4096, %rsp # align stack 3576 3577 STACK_TOUCH 3578 3579 cmpq $0, OLD_M 3580 je .L999 3581 3582 cmpq $0, OLD_N 3583 je .L999 3584 3585 cmpq $0, OLD_K 3586 je .L999 3587 3588 movq OLD_M, M 3589 movq OLD_N, N 3590 movq OLD_K, K 3591 3592 vmovsd %xmm0, ALPHA 3593 3594 salq $BASE_SHIFT, LDC 3595 3596 movq N, %rax 3597 xorq %rdx, %rdx 3598 movq $2, %rdi 3599 divq %rdi // N / 6 3600 movq %rax, Ndiv6 // N / 6 3601 movq %rdx, Nmod6 // N % 6 3602 3603 3604 3605#ifdef TRMMKERNEL 3606 vmovsd %xmm12, OFFSET 3607 vmovsd %xmm12, KK 3608#ifndef LEFT 3609 negq KK 3610#endif 3611#endif 3612 3613 movq Ndiv6, J 3614 cmpq $0, J 3615 je .L1_0 3616 ALIGN_4 3617 3618.L2_01: 3619 // copy to sub buffer 3620 movq B, BO1 3621 leaq BUFFER1, BO // first buffer to BO 3622 movq K, %rax 3623 sarq $2, %rax // K / 4 3624 jz .L2_01b 3625 ALIGN_4 3626 3627.L2_01a: 3628 prefetcht0 512(BO1) 3629 prefetchw 512(BO) 3630 3631 vmovups (BO1), %xmm0 3632 vmovups 2*SIZE(BO1), %xmm1 3633 vmovups 4*SIZE(BO1), %xmm2 3634 vmovups 6*SIZE(BO1), %xmm3 3635 3636 vmovups %xmm0, (BO) 3637 vmovups %xmm1, 2*SIZE(BO) 3638 vmovups %xmm2, 4*SIZE(BO) 3639 vmovups %xmm3, 6*SIZE(BO) 3640 3641 addq $8*SIZE,BO1 3642 addq $8*SIZE,BO 3643 decq %rax 3644 jnz .L2_01a 3645 3646 3647.L2_01b: 3648 3649 movq K, %rax 3650 andq $3, %rax // K % 4 3651 jz .L2_02d 3652 ALIGN_4 3653 3654.L2_02c: 3655 3656 vmovups (BO1), %xmm0 3657 vmovups %xmm0, (BO) 3658 addq $2*SIZE,BO1 3659 addq $2*SIZE,BO 3660 decq %rax 3661 jnz .L2_02c 3662 3663.L2_02d: 3664 3665 movq BO1, B // next offset of B 3666 3667.L2_10: 3668 movq C, CO1 3669 leaq (C, LDC, 2), C // c += 2 * ldc 3670 3671#if defined(TRMMKERNEL) && defined(LEFT) 3672 movq OFFSET, %rax 3673 movq %rax, KK 3674#endif 3675 3676 movq A, AO // aoffset = a 3677 addq $32 * SIZE, AO 3678 3679 movq M, I 3680 sarq $4, I // i = (m >> 4) 3681 je .L2_20 3682 3683 ALIGN_4 3684 3685.L2_11: 3686#if !defined(TRMMKERNEL) || \ 3687 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3688 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3689 leaq BUFFER1, BO // first buffer to BO 3690 addq $4 * SIZE, BO 3691#else 3692 movq KK, %rax 3693 leaq BUFFER1, BO // first buffer to BO 3694 addq $4 * SIZE, BO 3695 movq %rax, BI // Index for BO 3696 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3697 leaq (BO, BI, SIZE), BO 3698 salq $4, %rax // rax = rax * 16 ; number of values 3699 leaq (AO, %rax, SIZE), AO 3700#endif 3701 3702 3703 vzeroall 3704 3705#ifndef TRMMKERNEL 3706 movq K, %rax 3707#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3708 movq K, %rax 3709 subq KK, %rax 3710 movq %rax, KKK 3711#else 3712 movq KK, %rax 3713#ifdef LEFT 3714 addq $16, %rax // number of values in AO 3715#else 3716 addq $2, %rax // number of values in BO 3717#endif 3718 movq %rax, KKK 3719#endif 3720 3721 andq $-8, %rax // K = K - ( K % 8 ) 3722 je .L2_16 3723 movq %rax, BI // Index for BO 3724 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3725 3726 salq $4, %rax // rax = rax * 16 ; number of values 3727 leaq (AO, %rax, SIZE), AO 3728 leaq (BO, BI, SIZE), BO 3729 negq BI 3730 negq %rax 3731 ALIGN_4 3732 3733.L2_12: 3734 3735 prefetcht0 B_PR1(BO,BI,8) 3736 KERNEL16x2_1 3737 KERNEL16x2_2 3738 KERNEL16x2_3 3739 KERNEL16x2_4 3740 3741 prefetcht0 B_PR1(BO,BI,8) 3742 KERNEL16x2_1 3743 KERNEL16x2_2 3744 KERNEL16x2_3 3745 KERNEL16x2_4 3746 3747 je .L2_16 3748 3749 prefetcht0 B_PR1(BO,BI,8) 3750 KERNEL16x2_1 3751 KERNEL16x2_2 3752 KERNEL16x2_3 3753 KERNEL16x2_4 3754 3755 prefetcht0 B_PR1(BO,BI,8) 3756 KERNEL16x2_1 3757 KERNEL16x2_2 3758 KERNEL16x2_3 3759 KERNEL16x2_4 3760 3761 je .L2_16 3762 3763 jmp .L2_12 3764 ALIGN_4 3765 3766.L2_16: 3767#ifndef TRMMKERNEL 3768 movq K, %rax 3769#else 3770 movq KKK, %rax 3771#endif 3772 3773 andq $7, %rax # if (k & 1) 3774 je .L2_19 3775 3776 movq %rax, BI // Index for BO 3777 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3778 3779 salq $4, %rax // rax = rax * 16 ; number of values 3780 leaq (AO, %rax, SIZE), AO 3781 leaq (BO, BI, SIZE), BO 3782 negq BI 3783 negq %rax 3784 ALIGN_4 3785 3786.L2_17: 3787 3788 KERNEL16x2_SUB 3789 3790 jl .L2_17 3791 ALIGN_4 3792 3793 3794.L2_19: 3795 3796 SAVE16x2 3797 3798#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3799 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3800 movq K, %rax 3801 subq KKK, %rax 3802 movq %rax, BI // Index for BO 3803 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3804 leaq (BO, BI, SIZE), BO 3805 salq $4, %rax // rax = rax * 16 ; number of values 3806 leaq (AO, %rax, SIZE), AO 3807#endif 3808 3809 3810#if defined(TRMMKERNEL) && defined(LEFT) 3811 addq $16, KK 3812#endif 3813 3814 addq $16 * SIZE, CO1 # coffset += 16 3815 decq I # i -- 3816 jg .L2_11 3817 ALIGN_4 3818 3819/************************************************************************** 3820* Rest of M 3821***************************************************************************/ 3822.L2_20: 3823 // Test rest of M 3824 3825 testq $15, M 3826 jz .L2_60 // to next 3 lines of N 3827 3828 testq $8, M 3829 jz .L2_21pre 3830 ALIGN_4 3831 3832/**************************************************************************/ 3833 3834.L2_20_1: 3835#if !defined(TRMMKERNEL) || \ 3836 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3837 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3838 leaq BUFFER1, BO // first buffer to BO 3839 addq $4 * SIZE, BO 3840#else 3841 movq KK, %rax 3842 leaq BUFFER1, BO // first buffer to BO 3843 addq $4 * SIZE, BO 3844 movq %rax, BI // Index for BO 3845 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3846 leaq (BO, BI, SIZE), BO 3847 salq $3, %rax // rax = rax * 8 ; number of values 3848 leaq (AO, %rax, SIZE), AO 3849#endif 3850 3851 3852 vzeroall 3853 3854#ifndef TRMMKERNEL 3855 movq K, %rax 3856#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 3857 movq K, %rax 3858 subq KK, %rax 3859 movq %rax, KKK 3860#else 3861 movq KK, %rax 3862#ifdef LEFT 3863 addq $8, %rax // number of values in A 3864#else 3865 addq $2, %rax // number of values in BO 3866#endif 3867 movq %rax, KKK 3868#endif 3869 3870 3871 andq $-8, %rax 3872 je .L2_20_6 3873 movq %rax, BI // Index for BO 3874 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3875 3876 salq $3, %rax // rax = rax * 8 ; number of values 3877 leaq (AO, %rax, SIZE), AO 3878 leaq (BO, BI, SIZE), BO 3879 negq BI 3880 negq %rax 3881 ALIGN_4 3882 3883.L2_20_2: 3884 3885 prefetcht0 B_PR1(BO,BI,8) 3886 KERNEL8x2_1 3887 KERNEL8x2_2 3888 KERNEL8x2_3 3889 KERNEL8x2_4 3890 3891 prefetcht0 B_PR1(BO,BI,8) 3892 KERNEL8x2_1 3893 KERNEL8x2_2 3894 KERNEL8x2_3 3895 KERNEL8x2_4 3896 3897 je .L2_20_6 3898 3899 prefetcht0 B_PR1(BO,BI,8) 3900 KERNEL8x2_1 3901 KERNEL8x2_2 3902 KERNEL8x2_3 3903 KERNEL8x2_4 3904 3905 prefetcht0 B_PR1(BO,BI,8) 3906 KERNEL8x2_1 3907 KERNEL8x2_2 3908 KERNEL8x2_3 3909 KERNEL8x2_4 3910 3911 je .L2_20_6 3912 3913 jmp .L2_20_2 3914 ALIGN_4 3915 3916.L2_20_6: 3917#ifndef TRMMKERNEL 3918 movq K, %rax 3919#else 3920 movq KKK, %rax 3921#endif 3922 3923 andq $7, %rax # if (k & 1) 3924 je .L2_20_9 3925 3926 movq %rax, BI // Index for BO 3927 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3928 3929 salq $3, %rax // rax = rax * 8 ; number of values 3930 leaq (AO, %rax, SIZE), AO 3931 leaq (BO, BI, SIZE), BO 3932 negq BI 3933 negq %rax 3934 ALIGN_4 3935 3936.L2_20_7: 3937 3938 KERNEL8x2_SUB 3939 3940 jl .L2_20_7 3941 ALIGN_4 3942 3943 3944.L2_20_9: 3945 3946 SAVE8x2 3947 3948#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3949 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3950 movq K, %rax 3951 subq KKK, %rax 3952 movq %rax, BI // Index for BO 3953 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3954 leaq (BO, BI, SIZE), BO 3955 salq $3, %rax // rax = rax * 8 ; number of values 3956 leaq (AO, %rax, SIZE), AO 3957#endif 3958 3959 3960#if defined(TRMMKERNEL) && defined(LEFT) 3961 addq $8, KK 3962#endif 3963 3964 addq $8 * SIZE, CO1 # coffset += 8 3965 ALIGN_4 3966 3967 3968 3969/**************************************************************************/ 3970 3971.L2_21pre: 3972 3973 testq $4, M 3974 jz .L2_30 3975 ALIGN_4 3976 3977.L2_21: 3978#if !defined(TRMMKERNEL) || \ 3979 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 3980 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 3981 leaq BUFFER1, BO // first buffer to BO 3982 addq $4 * SIZE, BO 3983#else 3984 movq KK, %rax 3985 leaq BUFFER1, BO // first buffer to BO 3986 addq $4 * SIZE, BO 3987 movq %rax, BI // Index for BO 3988 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 3989 leaq (BO, BI, SIZE), BO 3990 salq $2, %rax // rax = rax * 4 ; number of values 3991 leaq (AO, %rax, SIZE), AO 3992#endif 3993 3994 3995 vzeroall 3996 3997#ifndef TRMMKERNEL 3998 movq K, %rax 3999#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4000 movq K, %rax 4001 subq KK, %rax 4002 movq %rax, KKK 4003#else 4004 movq KK, %rax 4005#ifdef LEFT 4006 addq $4, %rax // number of values in A 4007#else 4008 addq $2, %rax // number of values in BO 4009#endif 4010 movq %rax, KKK 4011#endif 4012 4013 4014 andq $-8, %rax 4015 je .L2_26 4016 movq %rax, BI // Index for BO 4017 leaq (BI,BI,1), BI // BI = BI * 1 ; number of values 4018 4019 salq $2, %rax // rax = rax * 4 ; number of values 4020 leaq (AO, %rax, SIZE), AO 4021 leaq (BO, BI, SIZE), BO 4022 negq BI 4023 negq %rax 4024 ALIGN_4 4025 4026.L2_22: 4027 4028 prefetcht0 B_PR1(BO,BI,8) 4029 KERNEL4x2_1 4030 KERNEL4x2_2 4031 KERNEL4x2_3 4032 KERNEL4x2_4 4033 4034 prefetcht0 B_PR1(BO,BI,8) 4035 KERNEL4x2_1 4036 KERNEL4x2_2 4037 KERNEL4x2_3 4038 KERNEL4x2_4 4039 4040 je .L2_26 4041 4042 prefetcht0 B_PR1(BO,BI,8) 4043 KERNEL4x2_1 4044 KERNEL4x2_2 4045 KERNEL4x2_3 4046 KERNEL4x2_4 4047 4048 prefetcht0 B_PR1(BO,BI,8) 4049 KERNEL4x2_1 4050 KERNEL4x2_2 4051 KERNEL4x2_3 4052 KERNEL4x2_4 4053 4054 je .L2_26 4055 4056 jmp .L2_22 4057 ALIGN_4 4058 4059.L2_26: 4060#ifndef TRMMKERNEL 4061 movq K, %rax 4062#else 4063 movq KKK, %rax 4064#endif 4065 4066 andq $7, %rax # if (k & 1) 4067 je .L2_29 4068 4069 movq %rax, BI // Index for BO 4070 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4071 4072 salq $2, %rax // rax = rax * 4 ; number of values 4073 leaq (AO, %rax, SIZE), AO 4074 leaq (BO, BI, SIZE), BO 4075 negq BI 4076 negq %rax 4077 ALIGN_4 4078 4079.L2_27: 4080 4081 KERNEL4x2_SUB 4082 4083 jl .L2_27 4084 ALIGN_4 4085 4086 4087.L2_29: 4088 4089 SAVE4x2 4090 4091#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4092 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4093 movq K, %rax 4094 subq KKK, %rax 4095 movq %rax, BI // Index for BO 4096 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4097 leaq (BO, BI, SIZE), BO 4098 salq $2, %rax // rax = rax * 4 ; number of values 4099 leaq (AO, %rax, SIZE), AO 4100#endif 4101 4102 4103#if defined(TRMMKERNEL) && defined(LEFT) 4104 addq $4, KK 4105#endif 4106 4107 addq $4 * SIZE, CO1 # coffset += 4 4108 ALIGN_4 4109 4110 4111.L2_30: 4112 testq $2, M 4113 jz .L2_40 4114 4115 ALIGN_4 4116 4117.L2_31: 4118#if !defined(TRMMKERNEL) || \ 4119 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4120 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4121 leaq BUFFER1, BO // first buffer to BO 4122 addq $4 * SIZE, BO 4123#else 4124 movq KK, %rax 4125 leaq BUFFER1, BO // first buffer to BO 4126 addq $4 * SIZE, BO 4127 movq %rax, BI // Index for BO 4128 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4129 leaq (BO, BI, SIZE), BO 4130 salq $1, %rax // rax = rax * 2 ; number of values 4131 leaq (AO, %rax, SIZE), AO 4132#endif 4133 4134 4135 vzeroall 4136 4137#ifndef TRMMKERNEL 4138 movq K, %rax 4139#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4140 movq K, %rax 4141 subq KK, %rax 4142 movq %rax, KKK 4143#else 4144 movq KK, %rax 4145#ifdef LEFT 4146 addq $2, %rax // number of values in AO 4147#else 4148 addq $2, %rax // number of values in BO 4149#endif 4150 movq %rax, KKK 4151#endif 4152 4153 4154 andq $-8, %rax 4155 je .L2_36 4156 movq %rax, BI // Index for BO 4157 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4158 4159 salq $1, %rax // rax = rax *2 ; number of values 4160 leaq (AO, %rax, SIZE), AO 4161 leaq (BO, BI, SIZE), BO 4162 negq BI 4163 negq %rax 4164 ALIGN_4 4165 4166.L2_32: 4167 4168 KERNEL2x2_1 4169 KERNEL2x2_2 4170 KERNEL2x2_3 4171 KERNEL2x2_4 4172 4173 KERNEL2x2_1 4174 KERNEL2x2_2 4175 KERNEL2x2_3 4176 KERNEL2x2_4 4177 4178 je .L2_36 4179 4180 KERNEL2x2_1 4181 KERNEL2x2_2 4182 KERNEL2x2_3 4183 KERNEL2x2_4 4184 4185 KERNEL2x2_1 4186 KERNEL2x2_2 4187 KERNEL2x2_3 4188 KERNEL2x2_4 4189 4190 je .L2_36 4191 4192 jmp .L2_32 4193 ALIGN_4 4194 4195.L2_36: 4196#ifndef TRMMKERNEL 4197 movq K, %rax 4198#else 4199 movq KKK, %rax 4200#endif 4201 4202 andq $7, %rax # if (k & 1) 4203 je .L2_39 4204 4205 movq %rax, BI // Index for BO 4206 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4207 4208 salq $1, %rax // rax = rax *2 ; number of values 4209 leaq (AO, %rax, SIZE), AO 4210 leaq (BO, BI, SIZE), BO 4211 negq BI 4212 negq %rax 4213 ALIGN_4 4214 4215.L2_37: 4216 4217 KERNEL2x2_SUB 4218 4219 jl .L2_37 4220 ALIGN_4 4221 4222 4223.L2_39: 4224 4225 SAVE2x2 4226 4227#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4228 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4229 movq K, %rax 4230 subq KKK, %rax 4231 movq %rax, BI // Index for BO 4232 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4233 leaq (BO, BI, SIZE), BO 4234 salq $1, %rax // rax = rax * 2 ; number of values 4235 leaq (AO, %rax, SIZE), AO 4236#endif 4237 4238 4239#if defined(TRMMKERNEL) && defined(LEFT) 4240 addq $2, KK 4241#endif 4242 4243 addq $2 * SIZE, CO1 # coffset += 2 4244 ALIGN_4 4245 4246.L2_40: 4247 testq $1, M 4248 jz .L2_60 // to next 2 lines of N 4249 4250 ALIGN_4 4251 4252.L2_41: 4253#if !defined(TRMMKERNEL) || \ 4254 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4255 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4256 leaq BUFFER1, BO // first buffer to BO 4257 addq $4 * SIZE, BO 4258#else 4259 movq KK, %rax 4260 leaq BUFFER1, BO // first buffer to BO 4261 addq $4 * SIZE, BO 4262 movq %rax, BI // Index for BO 4263 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4264 leaq (BO, BI, SIZE), BO 4265 leaq (AO, %rax, SIZE), AO 4266#endif 4267 4268 4269 vzeroall 4270 4271#ifndef TRMMKERNEL 4272 movq K, %rax 4273#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4274 movq K, %rax 4275 subq KK, %rax 4276 movq %rax, KKK 4277#else 4278 movq KK, %rax 4279#ifdef LEFT 4280 addq $1, %rax // number of values in AO 4281#else 4282 addq $2, %rax // number of values in BO 4283#endif 4284 movq %rax, KKK 4285#endif 4286 4287 andq $-8, %rax 4288 je .L2_46 4289 movq %rax, BI // Index for BO 4290 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4291 4292 leaq (AO, %rax, SIZE), AO 4293 leaq (BO, BI, SIZE), BO 4294 negq BI 4295 negq %rax 4296 ALIGN_4 4297 4298.L2_42: 4299 4300 KERNEL1x2_1 4301 KERNEL1x2_2 4302 KERNEL1x2_3 4303 KERNEL1x2_4 4304 4305 KERNEL1x2_1 4306 KERNEL1x2_2 4307 KERNEL1x2_3 4308 KERNEL1x2_4 4309 4310 je .L2_46 4311 4312 KERNEL1x2_1 4313 KERNEL1x2_2 4314 KERNEL1x2_3 4315 KERNEL1x2_4 4316 4317 KERNEL1x2_1 4318 KERNEL1x2_2 4319 KERNEL1x2_3 4320 KERNEL1x2_4 4321 4322 je .L2_46 4323 4324 jmp .L2_42 4325 ALIGN_4 4326 4327.L2_46: 4328#ifndef TRMMKERNEL 4329 movq K, %rax 4330#else 4331 movq KKK, %rax 4332#endif 4333 4334 andq $7, %rax # if (k & 1) 4335 je .L2_49 4336 4337 movq %rax, BI // Index for BO 4338 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4339 4340 leaq (AO, %rax, SIZE), AO 4341 leaq (BO, BI, SIZE), BO 4342 negq BI 4343 negq %rax 4344 ALIGN_4 4345 4346.L2_47: 4347 4348 KERNEL1x2_SUB 4349 4350 jl .L2_47 4351 ALIGN_4 4352 4353 4354.L2_49: 4355 4356 SAVE1x2 4357 4358#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4359 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4360 movq K, %rax 4361 subq KKK, %rax 4362 movq %rax, BI // Index for BO 4363 leaq (BI,BI,1), BI // BI = BI * 2 ; number of values 4364 leaq (BO, BI, SIZE), BO 4365 leaq (AO, %rax, SIZE), AO 4366#endif 4367 4368 4369#if defined(TRMMKERNEL) && defined(LEFT) 4370 addq $1, KK 4371#endif 4372 4373 addq $1 * SIZE, CO1 # coffset += 1 4374 ALIGN_4 4375 4376 4377 4378 4379 4380.L2_60: 4381#if defined(TRMMKERNEL) && !defined(LEFT) 4382 addq $2, KK 4383#endif 4384 4385 decq J // j -- 4386 jg .L2_01 // next 2 lines of N 4387 4388 4389 4390.L1_0: 4391 4392/************************************************************************************************ 4393* Loop for Nmod6 % 2 > 0 4394*************************************************************************************************/ 4395 4396 movq Nmod6, J 4397 andq $1, J // j % 2 4398 je .L999 4399 ALIGN_4 4400 4401.L1_01: 4402 // copy to sub buffer 4403 movq B, BO1 4404 leaq BUFFER1, BO // first buffer to BO 4405 movq K, %rax 4406 ALIGN_4 4407 4408.L1_02b: 4409 4410 vmovsd (BO1), %xmm0 4411 vmovsd %xmm0, (BO) 4412 addq $1*SIZE,BO1 4413 addq $1*SIZE,BO 4414 decq %rax 4415 jnz .L1_02b 4416 4417.L1_02c: 4418 4419 movq BO1, B // next offset of B 4420 4421.L1_10: 4422 movq C, CO1 4423 leaq (C, LDC, 1), C // c += 1 * ldc 4424 4425#if defined(TRMMKERNEL) && defined(LEFT) 4426 movq OFFSET, %rax 4427 movq %rax, KK 4428#endif 4429 4430 movq A, AO // aoffset = a 4431 addq $32 * SIZE, AO 4432 4433 movq M, I 4434 sarq $4, I // i = (m >> 4) 4435 je .L1_20 4436 4437 ALIGN_4 4438 4439.L1_11: 4440#if !defined(TRMMKERNEL) || \ 4441 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4442 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4443 leaq BUFFER1, BO // first buffer to BO 4444 addq $2 * SIZE, BO 4445#else 4446 movq KK, %rax 4447 leaq BUFFER1, BO // first buffer to BO 4448 addq $2 * SIZE, BO 4449 movq %rax, BI // Index for BO 4450 leaq (BO, BI, SIZE), BO 4451 salq $4, %rax // rax = rax * 16 ; number of values 4452 leaq (AO, %rax, SIZE), AO 4453#endif 4454 4455 4456 vzeroall 4457 4458#ifndef TRMMKERNEL 4459 movq K, %rax 4460#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4461 movq K, %rax 4462 subq KK, %rax 4463 movq %rax, KKK 4464#else 4465 movq KK, %rax 4466#ifdef LEFT 4467 addq $16, %rax // number of values in AO 4468#else 4469 addq $1, %rax // number of values in BO 4470#endif 4471 movq %rax, KKK 4472#endif 4473 4474 andq $-8, %rax // K = K - ( K % 8 ) 4475 je .L1_16 4476 movq %rax, BI // Index for BO 4477 4478 salq $4, %rax // rax = rax * 16 ; number of values 4479 leaq (AO, %rax, SIZE), AO 4480 leaq (BO, BI, SIZE), BO 4481 negq BI 4482 negq %rax 4483 ALIGN_4 4484 4485.L1_12: 4486 4487 prefetcht0 B_PR1(BO,BI,8) 4488 KERNEL16x1_1 4489 KERNEL16x1_2 4490 KERNEL16x1_3 4491 KERNEL16x1_4 4492 4493 KERNEL16x1_1 4494 KERNEL16x1_2 4495 KERNEL16x1_3 4496 KERNEL16x1_4 4497 4498 je .L1_16 4499 4500 prefetcht0 B_PR1(BO,BI,8) 4501 KERNEL16x1_1 4502 KERNEL16x1_2 4503 KERNEL16x1_3 4504 KERNEL16x1_4 4505 4506 KERNEL16x1_1 4507 KERNEL16x1_2 4508 KERNEL16x1_3 4509 KERNEL16x1_4 4510 4511 je .L1_16 4512 4513 jmp .L1_12 4514 ALIGN_4 4515 4516.L1_16: 4517#ifndef TRMMKERNEL 4518 movq K, %rax 4519#else 4520 movq KKK, %rax 4521#endif 4522 4523 andq $7, %rax # if (k & 1) 4524 je .L1_19 4525 4526 movq %rax, BI // Index for BO 4527 4528 salq $4, %rax // rax = rax * 16 ; number of values 4529 leaq (AO, %rax, SIZE), AO 4530 leaq (BO, BI, SIZE), BO 4531 negq BI 4532 negq %rax 4533 ALIGN_4 4534 4535.L1_17: 4536 4537 KERNEL16x1_SUB 4538 4539 jl .L1_17 4540 ALIGN_4 4541 4542 4543.L1_19: 4544 4545 SAVE16x1 4546 4547#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4548 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4549 movq K, %rax 4550 subq KKK, %rax 4551 movq %rax, BI // Index for BO 4552 leaq (BO, BI, SIZE), BO 4553 salq $4, %rax // rax = rax * 16 ; number of values 4554 leaq (AO, %rax, SIZE), AO 4555#endif 4556 4557 4558#if defined(TRMMKERNEL) && defined(LEFT) 4559 addq $16, KK 4560#endif 4561 4562 addq $16 * SIZE, CO1 # coffset += 16 4563 decq I # i -- 4564 jg .L1_11 4565 ALIGN_4 4566 4567/************************************************************************** 4568* Rest of M 4569***************************************************************************/ 4570.L1_20: 4571 // Test rest of M 4572 4573 testq $15, M 4574 jz .L999 4575 4576 testq $8, M 4577 jz .L1_21pre 4578 ALIGN_4 4579 4580/**************************************************************************/ 4581 4582.L1_20_1: 4583#if !defined(TRMMKERNEL) || \ 4584 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4585 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4586 leaq BUFFER1, BO // first buffer to BO 4587 addq $2 * SIZE, BO 4588#else 4589 movq KK, %rax 4590 leaq BUFFER1, BO // first buffer to BO 4591 addq $2 * SIZE, BO 4592 movq %rax, BI // Index for BO 4593 leaq (BO, BI, SIZE), BO 4594 salq $3, %rax // rax = rax * 8 ; number of values 4595 leaq (AO, %rax, SIZE), AO 4596#endif 4597 4598 4599 vzeroall 4600 4601#ifndef TRMMKERNEL 4602 movq K, %rax 4603#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4604 movq K, %rax 4605 subq KK, %rax 4606 movq %rax, KKK 4607#else 4608 movq KK, %rax 4609#ifdef LEFT 4610 addq $8, %rax // number of values in A 4611#else 4612 addq $1, %rax // number of values in BO 4613#endif 4614 movq %rax, KKK 4615#endif 4616 4617 4618 andq $-8, %rax 4619 je .L1_20_6 4620 movq %rax, BI // Index for BO 4621 4622 salq $3, %rax // rax = rax * 8 ; number of values 4623 leaq (AO, %rax, SIZE), AO 4624 leaq (BO, BI, SIZE), BO 4625 negq BI 4626 negq %rax 4627 ALIGN_4 4628 4629.L1_20_2: 4630 4631 prefetcht0 B_PR1(BO,BI,8) 4632 KERNEL8x1_1 4633 KERNEL8x1_2 4634 KERNEL8x1_3 4635 KERNEL8x1_4 4636 4637 KERNEL8x1_1 4638 KERNEL8x1_2 4639 KERNEL8x1_3 4640 KERNEL8x1_4 4641 4642 je .L1_20_6 4643 4644 prefetcht0 B_PR1(BO,BI,8) 4645 KERNEL8x1_1 4646 KERNEL8x1_2 4647 KERNEL8x1_3 4648 KERNEL8x1_4 4649 4650 KERNEL8x1_1 4651 KERNEL8x1_2 4652 KERNEL8x1_3 4653 KERNEL8x1_4 4654 4655 je .L1_20_6 4656 4657 jmp .L1_20_2 4658 ALIGN_4 4659 4660.L1_20_6: 4661#ifndef TRMMKERNEL 4662 movq K, %rax 4663#else 4664 movq KKK, %rax 4665#endif 4666 4667 andq $7, %rax # if (k & 1) 4668 je .L1_20_9 4669 4670 movq %rax, BI // Index for BO 4671 4672 salq $3, %rax // rax = rax * 8 ; number of values 4673 leaq (AO, %rax, SIZE), AO 4674 leaq (BO, BI, SIZE), BO 4675 negq BI 4676 negq %rax 4677 ALIGN_4 4678 4679.L1_20_7: 4680 4681 KERNEL8x1_SUB 4682 4683 jl .L1_20_7 4684 ALIGN_4 4685 4686 4687.L1_20_9: 4688 4689 SAVE8x1 4690 4691#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4692 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4693 movq K, %rax 4694 subq KKK, %rax 4695 movq %rax, BI // Index for BO 4696 leaq (BO, BI, SIZE), BO 4697 salq $3, %rax // rax = rax * 8 ; number of values 4698 leaq (AO, %rax, SIZE), AO 4699#endif 4700 4701 4702#if defined(TRMMKERNEL) && defined(LEFT) 4703 addq $8, KK 4704#endif 4705 4706 addq $8 * SIZE, CO1 # coffset += 8 4707 ALIGN_4 4708 4709 4710 4711/**************************************************************************/ 4712 4713.L1_21pre: 4714 4715 testq $4, M 4716 jz .L1_30 4717 ALIGN_4 4718 4719.L1_21: 4720#if !defined(TRMMKERNEL) || \ 4721 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4722 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4723 leaq BUFFER1, BO // first buffer to BO 4724 addq $2 * SIZE, BO 4725#else 4726 movq KK, %rax 4727 leaq BUFFER1, BO // first buffer to BO 4728 addq $2 * SIZE, BO 4729 movq %rax, BI // Index for BO 4730 leaq (BO, BI, SIZE), BO 4731 salq $2, %rax // rax = rax * 4 ; number of values 4732 leaq (AO, %rax, SIZE), AO 4733#endif 4734 4735 4736 vzeroall 4737 4738#ifndef TRMMKERNEL 4739 movq K, %rax 4740#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4741 movq K, %rax 4742 subq KK, %rax 4743 movq %rax, KKK 4744#else 4745 movq KK, %rax 4746#ifdef LEFT 4747 addq $4, %rax // number of values in A 4748#else 4749 addq $1, %rax // number of values in BO 4750#endif 4751 movq %rax, KKK 4752#endif 4753 4754 4755 andq $-8, %rax 4756 je .L1_26 4757 movq %rax, BI // Index for BO 4758 4759 salq $2, %rax // rax = rax * 4 ; number of values 4760 leaq (AO, %rax, SIZE), AO 4761 leaq (BO, BI, SIZE), BO 4762 negq BI 4763 negq %rax 4764 ALIGN_4 4765 4766.L1_22: 4767 4768 prefetcht0 B_PR1(BO,BI,8) 4769 KERNEL4x1_1 4770 KERNEL4x1_2 4771 KERNEL4x1_3 4772 KERNEL4x1_4 4773 4774 KERNEL4x1_1 4775 KERNEL4x1_2 4776 KERNEL4x1_3 4777 KERNEL4x1_4 4778 4779 je .L1_26 4780 4781 prefetcht0 B_PR1(BO,BI,8) 4782 KERNEL4x1_1 4783 KERNEL4x1_2 4784 KERNEL4x1_3 4785 KERNEL4x1_4 4786 4787 KERNEL4x1_1 4788 KERNEL4x1_2 4789 KERNEL4x1_3 4790 KERNEL4x1_4 4791 4792 je .L1_26 4793 4794 jmp .L1_22 4795 ALIGN_4 4796 4797.L1_26: 4798#ifndef TRMMKERNEL 4799 movq K, %rax 4800#else 4801 movq KKK, %rax 4802#endif 4803 4804 andq $7, %rax # if (k & 1) 4805 je .L1_29 4806 4807 movq %rax, BI // Index for BO 4808 4809 salq $2, %rax // rax = rax * 4 ; number of values 4810 leaq (AO, %rax, SIZE), AO 4811 leaq (BO, BI, SIZE), BO 4812 negq BI 4813 negq %rax 4814 ALIGN_4 4815 4816.L1_27: 4817 4818 KERNEL4x1_SUB 4819 4820 jl .L1_27 4821 ALIGN_4 4822 4823 4824.L1_29: 4825 4826 SAVE4x1 4827 4828#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4829 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4830 movq K, %rax 4831 subq KKK, %rax 4832 movq %rax, BI // Index for BO 4833 leaq (BO, BI, SIZE), BO 4834 salq $2, %rax // rax = rax * 4 ; number of values 4835 leaq (AO, %rax, SIZE), AO 4836#endif 4837 4838 4839#if defined(TRMMKERNEL) && defined(LEFT) 4840 addq $4, KK 4841#endif 4842 4843 addq $4 * SIZE, CO1 # coffset += 4 4844 ALIGN_4 4845 4846 4847.L1_30: 4848 testq $2, M 4849 jz .L1_40 4850 4851 ALIGN_4 4852 4853.L1_31: 4854#if !defined(TRMMKERNEL) || \ 4855 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4856 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4857 leaq BUFFER1, BO // first buffer to BO 4858 addq $2 * SIZE, BO 4859#else 4860 movq KK, %rax 4861 leaq BUFFER1, BO // first buffer to BO 4862 addq $2 * SIZE, BO 4863 movq %rax, BI // Index for BO 4864 leaq (BO, BI, SIZE), BO 4865 salq $1, %rax // rax = rax * 2 ; number of values 4866 leaq (AO, %rax, SIZE), AO 4867#endif 4868 4869 4870 vzeroall 4871 4872#ifndef TRMMKERNEL 4873 movq K, %rax 4874#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 4875 movq K, %rax 4876 subq KK, %rax 4877 movq %rax, KKK 4878#else 4879 movq KK, %rax 4880#ifdef LEFT 4881 addq $2, %rax // number of values in AO 4882#else 4883 addq $1, %rax // number of values in BO 4884#endif 4885 movq %rax, KKK 4886#endif 4887 4888 4889 andq $-8, %rax 4890 je .L1_36 4891 movq %rax, BI // Index for BO 4892 4893 salq $1, %rax // rax = rax *2 ; number of values 4894 leaq (AO, %rax, SIZE), AO 4895 leaq (BO, BI, SIZE), BO 4896 negq BI 4897 negq %rax 4898 ALIGN_4 4899 4900.L1_32: 4901 4902 KERNEL2x1_1 4903 KERNEL2x1_2 4904 KERNEL2x1_3 4905 KERNEL2x1_4 4906 4907 KERNEL2x1_1 4908 KERNEL2x1_2 4909 KERNEL2x1_3 4910 KERNEL2x1_4 4911 4912 je .L1_36 4913 4914 KERNEL2x1_1 4915 KERNEL2x1_2 4916 KERNEL2x1_3 4917 KERNEL2x1_4 4918 4919 KERNEL2x1_1 4920 KERNEL2x1_2 4921 KERNEL2x1_3 4922 KERNEL2x1_4 4923 4924 je .L1_36 4925 4926 jmp .L1_32 4927 ALIGN_4 4928 4929.L1_36: 4930#ifndef TRMMKERNEL 4931 movq K, %rax 4932#else 4933 movq KKK, %rax 4934#endif 4935 4936 andq $7, %rax # if (k & 1) 4937 je .L1_39 4938 4939 movq %rax, BI // Index for BO 4940 4941 salq $1, %rax // rax = rax *2 ; number of values 4942 leaq (AO, %rax, SIZE), AO 4943 leaq (BO, BI, SIZE), BO 4944 negq BI 4945 negq %rax 4946 ALIGN_4 4947 4948.L1_37: 4949 4950 KERNEL2x1_SUB 4951 4952 jl .L1_37 4953 ALIGN_4 4954 4955 4956.L1_39: 4957 4958 SAVE2x1 4959 4960#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4961 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4962 movq K, %rax 4963 subq KKK, %rax 4964 movq %rax, BI // Index for BO 4965 leaq (BO, BI, SIZE), BO 4966 salq $1, %rax // rax = rax * 2 ; number of values 4967 leaq (AO, %rax, SIZE), AO 4968#endif 4969 4970 4971#if defined(TRMMKERNEL) && defined(LEFT) 4972 addq $2, KK 4973#endif 4974 4975 addq $2 * SIZE, CO1 # coffset += 2 4976 ALIGN_4 4977 4978.L1_40: 4979 testq $1, M 4980 jz .L999 4981 4982 ALIGN_4 4983 4984.L1_41: 4985#if !defined(TRMMKERNEL) || \ 4986 (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 4987 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 4988 leaq BUFFER1, BO // first buffer to BO 4989 addq $2 * SIZE, BO 4990#else 4991 movq KK, %rax 4992 leaq BUFFER1, BO // first buffer to BO 4993 addq $2 * SIZE, BO 4994 movq %rax, BI // Index for BO 4995 leaq (BO, BI, SIZE), BO 4996 leaq (AO, %rax, SIZE), AO 4997#endif 4998 4999 5000 vzeroall 5001 5002#ifndef TRMMKERNEL 5003 movq K, %rax 5004#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) 5005 movq K, %rax 5006 subq KK, %rax 5007 movq %rax, KKK 5008#else 5009 movq KK, %rax 5010#ifdef LEFT 5011 addq $1, %rax // number of values in AO 5012#else 5013 addq $1, %rax // number of values in BO 5014#endif 5015 movq %rax, KKK 5016#endif 5017 5018 andq $-8, %rax 5019 je .L1_46 5020 movq %rax, BI // Index for BO 5021 5022 leaq (AO, %rax, SIZE), AO 5023 leaq (BO, BI, SIZE), BO 5024 negq BI 5025 negq %rax 5026 ALIGN_4 5027 5028.L1_42: 5029 5030 KERNEL1x1_1 5031 KERNEL1x1_2 5032 KERNEL1x1_3 5033 KERNEL1x1_4 5034 5035 KERNEL1x1_1 5036 KERNEL1x1_2 5037 KERNEL1x1_3 5038 KERNEL1x1_4 5039 5040 je .L1_46 5041 5042 KERNEL1x1_1 5043 KERNEL1x1_2 5044 KERNEL1x1_3 5045 KERNEL1x1_4 5046 5047 KERNEL1x1_1 5048 KERNEL1x1_2 5049 KERNEL1x1_3 5050 KERNEL1x1_4 5051 5052 je .L1_46 5053 5054 jmp .L1_42 5055 ALIGN_4 5056 5057.L1_46: 5058#ifndef TRMMKERNEL 5059 movq K, %rax 5060#else 5061 movq KKK, %rax 5062#endif 5063 5064 andq $7, %rax # if (k & 1) 5065 je .L1_49 5066 5067 movq %rax, BI // Index for BO 5068 5069 leaq (AO, %rax, SIZE), AO 5070 leaq (BO, BI, SIZE), BO 5071 negq BI 5072 negq %rax 5073 ALIGN_4 5074 5075.L1_47: 5076 5077 KERNEL1x1_SUB 5078 5079 jl .L1_47 5080 ALIGN_4 5081 5082 5083.L1_49: 5084 5085 SAVE1x1 5086 5087#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ 5088 (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) 5089 movq K, %rax 5090 subq KKK, %rax 5091 movq %rax, BI // Index for BO 5092 leaq (BO, BI, SIZE), BO 5093 leaq (AO, %rax, SIZE), AO 5094#endif 5095 5096 5097#if defined(TRMMKERNEL) && defined(LEFT) 5098 addq $1, KK 5099#endif 5100 5101 addq $1 * SIZE, CO1 # coffset += 1 5102 ALIGN_4 5103 5104 5105.L999: 5106 movq SP, %rsp 5107 movq (%rsp), %rbx 5108 movq 8(%rsp), %rbp 5109 movq 16(%rsp), %r12 5110 movq 24(%rsp), %r13 5111 movq 32(%rsp), %r14 5112 movq 40(%rsp), %r15 5113 5114#ifdef WINDOWS_ABI 5115 movq 48(%rsp), %rdi 5116 movq 56(%rsp), %rsi 5117 movups 64(%rsp), %xmm6 5118 movups 80(%rsp), %xmm7 5119 movups 96(%rsp), %xmm8 5120 movups 112(%rsp), %xmm9 5121 movups 128(%rsp), %xmm10 5122 movups 144(%rsp), %xmm11 5123 movups 160(%rsp), %xmm12 5124 movups 176(%rsp), %xmm13 5125 movups 192(%rsp), %xmm14 5126 movups 208(%rsp), %xmm15 5127#endif 5128 5129 addq $STACKSIZE, %rsp 5130 ret 5131 5132 EPILOGUE 5133 5134 5135 5136 5137 5138#endif 5139