1/************************************************************************************************** 2* * 3* This file is part of BLASFEO. * 4* * 5* BLASFEO -- BLAS For Embedded Optimization. * 6* Copyright (C) 2019 by Gianluca Frison. * 7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. * 8* All rights reserved. * 9* * 10* The 2-Clause BSD License * 11* * 12* Redistribution and use in source and binary forms, with or without * 13* modification, are permitted provided that the following conditions are met: * 14* * 15* 1. Redistributions of source code must retain the above copyright notice, this * 16* list of conditions and the following disclaimer. * 17* 2. Redistributions in binary form must reproduce the above copyright notice, * 18* this list of conditions and the following disclaimer in the documentation * 19* and/or other materials provided with the distribution. * 20* * 21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * 22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * 23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * 24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * 25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * 27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * 28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * 31* * 32* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de * 33* * 34**************************************************************************************************/ 35 36#if defined(OS_LINUX) | defined(OS_MAC) 37 38//#define STACKSIZE 96 39#define STACKSIZE 64 40#define ARG1 %rdi 41#define ARG2 %rsi 42#define ARG3 %rdx 43#define ARG4 %rcx 44#define ARG5 %r8 45#define ARG6 %r9 46#define ARG7 STACKSIZE + 8(%rsp) 47#define ARG8 STACKSIZE + 16(%rsp) 48#define ARG9 STACKSIZE + 24(%rsp) 49#define ARG10 STACKSIZE + 32(%rsp) 50#define ARG11 STACKSIZE + 40(%rsp) 51#define ARG12 STACKSIZE + 48(%rsp) 52#define ARG13 STACKSIZE + 56(%rsp) 53#define ARG14 STACKSIZE + 64(%rsp) 54#define ARG15 STACKSIZE + 72(%rsp) 55#define ARG16 STACKSIZE + 80(%rsp) 56#define ARG17 STACKSIZE + 88(%rsp) 57#define ARG18 STACKSIZE + 96(%rsp) 58#define PROLOGUE \ 59 subq $STACKSIZE, %rsp; \ 60 movq %rbx, (%rsp); \ 61 movq %rbp, 8(%rsp); \ 62 movq %r12, 16(%rsp); \ 63 movq %r13, 24(%rsp); \ 64 movq %r14, 32(%rsp); \ 65 movq %r15, 40(%rsp); \ 66 vzeroupper; 67#define EPILOGUE \ 68 vzeroupper; \ 69 movq (%rsp), %rbx; \ 70 movq 8(%rsp), %rbp; \ 71 movq 16(%rsp), %r12; \ 72 movq 24(%rsp), %r13; \ 73 movq 32(%rsp), %r14; \ 74 movq 40(%rsp), %r15; \ 75 addq $STACKSIZE, %rsp; 76 77#elif defined(OS_WINDOWS) 78 79#define STACKSIZE 256 80#define ARG1 %rcx 81#define ARG2 %rdx 82#define ARG3 %r8 83#define ARG4 %r9 84#define ARG5 STACKSIZE + 40(%rsp) 85#define ARG6 STACKSIZE + 48(%rsp) 86#define ARG7 STACKSIZE + 56(%rsp) 87#define ARG8 STACKSIZE + 64(%rsp) 88#define ARG9 STACKSIZE + 72(%rsp) 89#define ARG10 STACKSIZE + 80(%rsp) 90#define ARG11 STACKSIZE + 88(%rsp) 91#define ARG12 STACKSIZE + 96(%rsp) 92#define ARG13 STACKSIZE + 104(%rsp) 93#define ARG14 STACKSIZE + 112(%rsp) 94#define ARG15 STACKSIZE + 120(%rsp) 95#define ARG16 STACKSIZE + 128(%rsp) 96#define ARG17 STACKSIZE + 136(%rsp) 97#define ARG18 STACKSIZE + 144(%rsp) 98#define PROLOGUE \ 99 subq $STACKSIZE, %rsp; \ 100 movq %rbx, (%rsp); \ 101 movq %rbp, 8(%rsp); \ 102 movq %r12, 16(%rsp); \ 103 movq %r13, 24(%rsp); \ 104 movq %r14, 32(%rsp); \ 105 movq %r15, 40(%rsp); \ 106 movq %rdi, 48(%rsp); \ 107 movq %rsi, 56(%rsp); \ 108 vmovups %xmm6, 64(%rsp); \ 109 vmovups %xmm7, 80(%rsp); \ 110 vmovups %xmm8, 96(%rsp); \ 111 vmovups %xmm9, 112(%rsp); \ 112 vmovups %xmm10, 128(%rsp); \ 113 vmovups %xmm11, 144(%rsp); \ 114 vmovups %xmm12, 160(%rsp); \ 115 vmovups %xmm13, 176(%rsp); \ 116 vmovups %xmm14, 192(%rsp); \ 117 vmovups %xmm15, 208(%rsp); \ 118 vzeroupper; 119#define EPILOGUE \ 120 vzeroupper; \ 121 movq (%rsp), %rbx; \ 122 movq 8(%rsp), %rbp; \ 123 movq 16(%rsp), %r12; \ 124 movq 24(%rsp), %r13; \ 125 movq 32(%rsp), %r14; \ 126 movq 40(%rsp), %r15; \ 127 movq 48(%rsp), %rdi; \ 128 movq 56(%rsp), %rsi; \ 129 vmovups 64(%rsp), %xmm6; \ 130 vmovups 80(%rsp), %xmm7; \ 131 vmovups 96(%rsp), %xmm8; \ 132 vmovups 112(%rsp), %xmm9; \ 133 vmovups 128(%rsp), %xmm10; \ 134 vmovups 144(%rsp), %xmm11; \ 135 vmovups 160(%rsp), %xmm12; \ 136 vmovups 176(%rsp), %xmm13; \ 137 vmovups 192(%rsp), %xmm14; \ 138 vmovups 208(%rsp), %xmm15; \ 139 addq $STACKSIZE, %rsp; 140 141#else 142 143#error wrong OS 144 145#endif 146 147 148 149#if defined(OS_LINUX) | defined(OS_WINDOWS) 150 .text 151#elif defined(OS_MAC) 152 .section __TEXT,__text,regular,pure_instructions 153#endif 154 155 156 157 158// ASM Macros 159 160 161 162 163// void inner_kernel_dgemm_add_nn_4x2_lib4 164// common inner routine with file scope 165// 166// input arguments: 167// r10d <- k 168// r11 <- A 169// r12 <- B 170// r13 <- 4*sdb*sizeof(double) 171// r14 <= dirty 172// ymm0 <- [d00 d10 d20 d30] 173// ymm1 <- [d01 d11 d21 d31] 174 175// 176// output arguments: 177// r10d <- 0 178// r11 <- A+4*k*sizeof(double) 179// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4) 180// r13 <- 4*sdb*sizeof(double) 181// r14 <= dirty 182// ymm0 <- [d00 d10 d20 d30] 183// ymm1 <- [d01 d11 d21 d31] 184 185#if MACRO_LEVEL>=2 186 .macro INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4 187#else 188 .p2align 4,,15 189#if defined(OS_LINUX) 190 .type inner_kernel_dgemm_add_nn_4x2_lib4, @function 191inner_kernel_dgemm_add_nn_4x2_lib4: 192#elif defined(OS_MAC) 193_inner_kernel_dgemm_add_nn_4x2_lib4: 194#elif defined(OS_WINDOWS) 195 .def inner_kernel_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef 196inner_kernel_dgemm_add_nn_4x2_lib4: 197#endif 198#endif 199 200 cmpl $0, %r10d 201 jle 2f // return 202 203 // preload 204 vmovapd 0(%r11), %ymm8 // A0[0] 205 206 cmpl $4, %r10d 207 jle 0f // consider clean-up loop 208 209 // main loop 210 .p2align 3 2111: // main loop 212 213 prefetcht0 0(%r12, %r13, 2) // software prefetch B + 2(4*sdb*sizeof(double)) 214 // T0 (temporal data)-prefetch data into all levels of the cache hierarchy. 215 // The prefetch size may be 32, 64, or 128 Bytes. 216 217 // unroll 0 218 // ymm8 = [a01 a10 a20 a30] 219 vbroadcastsd 0(%r12), %ymm12 // B 220 vmulpd %ymm8, %ymm12, %ymm15 221 vaddpd %ymm15, %ymm0, %ymm0 222 223 vmovapd 32(%r11), %ymm10 // load A 224 225 vbroadcastsd 32(%r12), %ymm12 // B + 32 = B+4*sizeof(double) 226 vmulpd %ymm8, %ymm12, %ymm15 227 vaddpd %ymm15, %ymm1, %ymm1 228 subl $4, %r10d 229 230 // unroll 1 231 // ymm10 = [a01 a10 a20 a30] 232 vbroadcastsd 8(%r12), %ymm12 // B 233 vmulpd %ymm10, %ymm12, %ymm15 234 vaddpd %ymm15, %ymm0, %ymm0 235 236 vmovapd 64(%r11), %ymm8 // A0 237 238 vbroadcastsd 40(%r12), %ymm12 // B + 32 = B+4*sizeof(double) 239 vmulpd %ymm10, %ymm12, %ymm15 240 vaddpd %ymm15, %ymm1, %ymm1 241 242 // unroll 2 243 // ymm8 = [a01 a10 a20 a30] 244 vbroadcastsd 16(%r12), %ymm12 // B 245 vmulpd %ymm8, %ymm12, %ymm15 246 vaddpd %ymm15, %ymm0, %ymm0 247 248 vmovapd 96(%r11), %ymm10 // A0 249 250 vbroadcastsd 48(%r12), %ymm12 // B 251 vmulpd %ymm8, %ymm12, %ymm15 252 vaddpd %ymm15, %ymm1, %ymm1 253 254 addq $128, %r11 // Update A 255 256 // unroll 3 257 // A[03 13 23 33] -> ymm10 258 vbroadcastsd 24(%r12), %ymm12 // B 259 vmulpd %ymm10, %ymm12, %ymm15 260 vaddpd %ymm15, %ymm0, %ymm0 261 262 vmovapd 0(%r11), %ymm8 // load A[04 14 24 34] -> ymm8 263 264 vbroadcastsd 56(%r12), %ymm12 // B 265 vmulpd %ymm10, %ymm12, %ymm15 266 vaddpd %ymm15, %ymm1, %ymm1 267 268 addq %r13, %r12 // Update B = B + sdb*ps*8bit 269 270 cmpl $4, %r10d 271 jg 1b // main loop 272 2730: // consider clean4-up 274 275 cmpl $3, %r10d 276 jle 4f // clean1 277 278 // ki==4 279 280 // unroll 0 281 // ymm8 = A[04 14 24 34] 282 vbroadcastsd 0(%r12), %ymm12 // B 283 vmulpd %ymm8, %ymm12, %ymm15 284 vaddpd %ymm15, %ymm0, %ymm0 285 286 vmovapd 32(%r11), %ymm10 // load A 287 288 vbroadcastsd 32(%r12), %ymm12 289 vmulpd %ymm8, %ymm12, %ymm15 290 vaddpd %ymm15, %ymm1, %ymm1 291 vaddpd %ymm15, %ymm7, %ymm7 292 293 subl $4, %r10d 294 295 // unroll 1 296 // ymm10 = A[05 15 25 35] 297 vbroadcastsd 8(%r12), %ymm12 298 vmulpd %ymm10, %ymm12, %ymm15 299 vaddpd %ymm15, %ymm0, %ymm0 300 301 vmovapd 64(%r11), %ymm8 // A0 302 303 vbroadcastsd 40(%r12), %ymm12 304 vmulpd %ymm10, %ymm12, %ymm15 305 vaddpd %ymm15, %ymm1, %ymm1 306 307 // unroll 2 308 // ymm8 309 vbroadcastsd 16(%r12), %ymm12 310 vmulpd %ymm8, %ymm12, %ymm15 311 vaddpd %ymm15, %ymm0, %ymm0 312 313 vmovapd 96(%r11), %ymm10 // A0 314 315 vbroadcastsd 48(%r12), %ymm12 316 vmulpd %ymm8, %ymm12, %ymm15 317 vaddpd %ymm15, %ymm1, %ymm1 318 319 addq $128, %r11 320 321 // unroll 3 322 // ymm10 323 vbroadcastsd 24(%r12), %ymm12 324 vmulpd %ymm10, %ymm12, %ymm15 325 vaddpd %ymm15, %ymm0, %ymm0 326 327// vmovapd 0(%r11), %ymm8 // A0 328 329 vbroadcastsd 56(%r12), %ymm12 330 vmulpd %ymm10, %ymm12, %ymm15 331 vaddpd %ymm15, %ymm1, %ymm1 332 333 addq %r13, %r12 334 335 jmp 2f 336 337 3384: // consider clean1-up loop 339 340 cmpl $0, %r10d 341 jle 2f // return 342 343 // clean-up loop 3443: // clean up loop 345 346 vmovapd 0(%r11), %ymm8 // A0[0] 347 vbroadcastsd 0(%r12), %ymm12 348 vmulpd %ymm8, %ymm12, %ymm15 349 vaddpd %ymm15, %ymm0, %ymm0 350 351 vbroadcastsd 32(%r12), %ymm12 352 vmulpd %ymm8, %ymm12, %ymm15 353 vaddpd %ymm15, %ymm1, %ymm1 354 addq $32, %r11 355 subl $1, %r10d 356 addq $8, %r12 357 358 cmpl $0, %r10d 359 jg 3b // clean up loop 360 3612: // return 362 363#if MACRO_LEVEL>=2 364 .endm 365#else 366 ret 367 368#if defined(OS_LINUX) 369 .size inner_kernel_dgemm_add_nn_4x2_lib4, .-inner_kernel_dgemm_add_nn_4x2_lib4 370#endif 371#endif 372// end 373 374 375 376 377// void inner_kernel_dgemm_add_nn_2x4_lib4 378// common inner routine with file scope 379// 380// input arguments: 381// r10d <- k 382// r11 <- A 383// r12 <- B 384// r13 <- 4*sdb*sizeof(double) 385 386// 387// output arguments: 388// r10d <- 0 389// r11 <- A+4*k*sizeof(double) 390// r12 <- B+(k/4)*sdb*sizeof(double)+(k%4) 391// r13 <- 4*sdb*sizeof(double) 392 393#if MACRO_LEVEL>=2 394 .macro INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4 395#else 396 .p2align 4,,15 397#if defined(OS_LINUX) 398 .type inner_kernel_dgemm_add_nn_2x4_lib4, @function 399inner_kernel_dgemm_add_nn_2x4_lib4: 400#elif defined(OS_MAC) 401_inner_kernel_dgemm_add_nn_2x4_lib4: 402#elif defined(OS_WINDOWS) 403 .def inner_kernel_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef 404inner_kernel_dgemm_add_nn_2x4_lib4: 405#endif 406#endif 407 408 cmpl $0, %r10d 409 jle 5f // return 410 411 // preload 412 vbroadcastf128 0(%r11), %ymm11 // A 413 vbroadcastf128 32(%r11), %ymm12 // A 414 415 cmpl $4, %r10d 416 jle 0f // consider clean-up loop 417 418 // main loop 419 .p2align 3 4201: // main loop 421 422 prefetcht0 0(%r12, %r13, 2) // software prefetch 423 prefetcht0 64(%r12, %r13, 2) // software prefetch 424 425 // unroll 0 1 426 427 // load 428 vmovapd 0(%r12), %ymm13 // B 429 vmovupd 16(%r12), %ymm14 // B 430 431 // mask 432 vblendpd $0x3, %ymm13, %ymm14, %ymm14 // 433 vshufpd $0x0, %ymm14, %ymm14, %ymm13 // 434 435 vmulpd %ymm11, %ymm13, %ymm15 436 vaddpd %ymm0, %ymm15, %ymm0 437 438 vbroadcastf128 64(%r11), %ymm9 // A 439 vshufpd $0xf, %ymm14, %ymm14, %ymm13 440 441 vmulpd %ymm12, %ymm13, %ymm15 442 vaddpd %ymm1, %ymm15, %ymm1 443 444 vmovapd 64(%r12), %ymm13 445 vmovupd 80(%r12), %ymm14 446 447 // load 448 vblendpd $0x3, %ymm13, %ymm14, %ymm14 449 vshufpd $0x0, %ymm14, %ymm14, %ymm13 450 451 vmulpd %ymm11, %ymm13, %ymm15 452 vaddpd %ymm2, %ymm15, %ymm2 453 454 vbroadcastf128 96(%r11), %ymm10 // A 455 vshufpd $0xf, %ymm14, %ymm14, %ymm13 456 457 vmulpd %ymm12, %ymm13, %ymm15 458 vaddpd %ymm3, %ymm15, %ymm3 459 460 // unroll 2 3 461 vmovupd 16(%r12), %ymm13 462 vmovapd 32(%r12), %ymm14 463 addq $128, %r11 464 vblendpd $0x3, %ymm13, %ymm14, %ymm14 465 vshufpd $0x0, %ymm14, %ymm14, %ymm13 466 vmulpd %ymm9, %ymm13, %ymm15 467 vaddpd %ymm0, %ymm15, %ymm0 468 vbroadcastf128 0(%r11), %ymm11 // A 469 vshufpd $0xf, %ymm14, %ymm14, %ymm13 470 vmulpd %ymm10, %ymm13, %ymm15 471 vaddpd %ymm1, %ymm15, %ymm1 472 vmovupd 80(%r12), %ymm13 473 vmovapd 96(%r12), %ymm14 474 vblendpd $0x3, %ymm13, %ymm14, %ymm14 475 vshufpd $0x0, %ymm14, %ymm14, %ymm13 476 vmulpd %ymm9, %ymm13, %ymm15 477 vaddpd %ymm2, %ymm15, %ymm2 478 vbroadcastf128 32(%r11), %ymm12 // A 479 vshufpd $0xf, %ymm14, %ymm14, %ymm13 480 vmulpd %ymm10, %ymm13, %ymm15 481 vaddpd %ymm3, %ymm15, %ymm3 482 483 subl $4, %r10d 484 addq %r13, %r12 485 486 cmpl $4, %r10d 487 jg 1b // main loop 488 489 4900: // consider clean4-up 491 492 cmpl $3, %r10d 493 jle 4f // clean1 494 495 // unroll 0 1 496 vmovapd 0(%r12), %ymm13 497 vmovupd 16(%r12), %ymm14 498 vblendpd $0x3, %ymm13, %ymm14, %ymm14 499 vshufpd $0x0, %ymm14, %ymm14, %ymm13 500 vmulpd %ymm11, %ymm13, %ymm15 501 vaddpd %ymm0, %ymm15, %ymm0 502 vbroadcastf128 64(%r11), %ymm9 // A 503 vshufpd $0xf, %ymm14, %ymm14, %ymm13 504 vmulpd %ymm12, %ymm13, %ymm15 505 vaddpd %ymm1, %ymm15, %ymm1 506 vmovapd 64(%r12), %ymm13 507 vmovupd 80(%r12), %ymm14 508 vblendpd $0x3, %ymm13, %ymm14, %ymm14 509 vshufpd $0x0, %ymm14, %ymm14, %ymm13 510 vmulpd %ymm11, %ymm13, %ymm15 511 vaddpd %ymm2, %ymm15, %ymm2 512 vbroadcastf128 96(%r11), %ymm10 // A 513 vshufpd $0xf, %ymm14, %ymm14, %ymm13 514 vmulpd %ymm12, %ymm13, %ymm15 515 vaddpd %ymm3, %ymm15, %ymm3 516 517 // unroll 2 3 518 vmovupd 16(%r12), %ymm13 519 vmovapd 32(%r12), %ymm14 520 addq $128, %r11 521 vblendpd $0x3, %ymm13, %ymm14, %ymm14 522 vshufpd $0x0, %ymm14, %ymm14, %ymm13 523 vmulpd %ymm9, %ymm13, %ymm15 524 vaddpd %ymm0, %ymm15, %ymm0 525// vbroadcastf128 0(%r11), %ymm11 // A 526 vshufpd $0xf, %ymm14, %ymm14, %ymm13 527 vmulpd %ymm10, %ymm13, %ymm15 528 vaddpd %ymm1, %ymm15, %ymm1 529 vmovupd 80(%r12), %ymm13 530 vmovapd 96(%r12), %ymm14 531 vblendpd $0x3, %ymm13, %ymm14, %ymm14 532 vshufpd $0x0, %ymm14, %ymm14, %ymm13 533 vmulpd %ymm9, %ymm13, %ymm15 534 vaddpd %ymm2, %ymm15, %ymm2 535// vbroadcastf128 32(%r11), %ymm12 // A 536 vshufpd $0xf, %ymm14, %ymm14, %ymm13 537 vmulpd %ymm10, %ymm13, %ymm15 538 vaddpd %ymm3, %ymm15, %ymm3 539 540 subl $4, %r10d 541 addq $128, %r11 542 addq %r13, %r12 543 544 jmp 2f // return 545 546 5474: // consider clean1-up loop 548 549 cmpl $0, %r10d 550 jle 2f // return 551 552 // clean-up loop 5533: // clean up loop 554 555 // unroll 0 556 vbroadcastf128 0(%r11), %ymm11 // A 557 vmovupd 0(%r12), %ymm13 558 vmovupd 16(%r12), %ymm14 559 vblendpd $0x3, %ymm13, %ymm14, %ymm14 560 vshufpd $0x0, %ymm14, %ymm14, %ymm13 561 vmulpd %ymm11, %ymm13, %ymm15 562 vaddpd %ymm0, %ymm15, %ymm0 563 vmovupd 64(%r12), %ymm13 564 vmovupd 80(%r12), %ymm14 565 vblendpd $0x3, %ymm13, %ymm14, %ymm14 566 vshufpd $0x0, %ymm14, %ymm14, %ymm13 567 vmulpd %ymm11, %ymm13, %ymm15 568 vaddpd %ymm2, %ymm15, %ymm2 569 570 addq $32, %r11 571 addq $8, %r12 572 subl $1, %r10d 573 574 cmpl $0, %r10d 575 jg 3b // clean up loop 576 577 5782: // reduce 579 vaddpd %ymm0, %ymm1, %ymm0 580 vextractf128 $0x1, %ymm0, %xmm1 581 vaddpd %ymm2, %ymm3, %ymm2 582 vextractf128 $0x1, %ymm2, %xmm3 583 5845: // return 585 586#if MACRO_LEVEL>=2 587 .endm 588#else 589 ret 590 591#if defined(OS_LINUX) 592 .size inner_kernel_dgemm_add_nn_2x4_lib4, .-inner_kernel_dgemm_add_nn_2x4_lib4 593#endif 594#endif 595// end 596 597 598 599 600// void inner_edge_dgemm_add_nn_4x2_lib4 601// common inner routine with file scope 602// 603// edge for B unaligned 604// 605// input arguments: 606// r10 <- k 607// r11 <- A 608// r12 <- B 609// r13 <- bs*sdb*sizeof(double) 610// r14 <- offB 611 612// 613// output arguments: 614// r10 <- k-(4-offB) 615// r11 <- A+(4-offB)*bs*sizeof(double) 616// r12 <- B-offB+bs*sdb*sizeof(double) 617// r13 <- bs*sdb*sizeof(double) 618// r14 <- offB 619 620 621#if MACRO_LEVEL>=1 622 .macro INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4 623#else 624 .p2align 4,,15 625#if defined(OS_LINUX) 626 .type inner_edge_dgemm_add_nn_4x2_lib4, @function 627inner_edge_dgemm_add_nn_4x2_lib4: 628#elif defined(OS_MAC) 629_inner_edge_dgemm_add_nn_4x2_lib4: 630#elif defined(OS_WINDOWS) 631 .def inner_edge_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef 632inner_edge_dgemm_add_nn_4x2_lib4: 633#endif 634#endif 635 636 cmpl $0, %r14d // offset==0 637 jle 2f // end 638 639 cmpl $0, %r10d // k==0 640 jle 2f // end 641 642 movl $4, %r15d 643 subl %r14d, %r15d // 4-offsetB 644 cmpl %r10d, %r15d 645// jle 0f 646// movl %r10d, %r15d // kend=min(k,4-offsetB) 647//0: 648 cmovgl %r10d, %r15d // kend=min(k,4-offsetB) 649 650 movl %r14d, %eax 651 sall $3, %eax // offsetB*sizeof(double) 652 addq %rax, %r12 // B+offsetB*sizeof(double) 653 6541: 655 vmovapd 0(%r11), %ymm8 656 vbroadcastsd 0(%r12), %ymm12 657 vmulpd %ymm8, %ymm12, %ymm15 658 vaddpd %ymm15, %ymm0, %ymm0 659 vbroadcastsd 32(%r12), %ymm12 660 vmulpd %ymm8, %ymm12, %ymm15 661 vaddpd %ymm15, %ymm1, %ymm1 662 663 subl $1, %r10d // k-1 664 subl $1, %r15d // kend-1 665 addq $32, %r11 // A+1*bs*sizeof(float) 666 addq $8, %r12 // B+1*sizeof(float) 667 668 cmpl $0, %r15d 669 jg 1b 670 671 cmpl $0, %r10d 672 jle 2f // end 673 674 addq %r13, %r12 675 subq $32, %r12 // B+bs*(sdb-1)*sizeof(double) 676 6772: 678 679#if MACRO_LEVEL>=1 680 .endm 681#else 682 ret 683 684#if defined(OS_LINUX) 685 .size inner_edge_dgemm_add_nn_4x2_lib4, .-inner_edge_dgemm_add_nn_4x2_lib4 686#endif 687#endif 688// end 689 690 691 692 693// void inner_edge_dgemm_add_nn_2x4_lib4 694// common inner routine with file scope 695// 696// edge for B unaligned 697// 698// input arguments: 699// r10 <- k 700// r11 <- A 701// r12 <- B 702// r13 <- bs*sdb*sizeof(double) 703// r14 <- offB 704 705// 706// output arguments: 707// r10 <- k-(4-offB) 708// r11 <- A+(4-offB)*bs*sizeof(double) 709// r12 <- B-offB+bs*sdb*sizeof(double) 710// r13 <- bs*sdb*sizeof(double) 711// r14 <- offB 712 713 714#if MACRO_LEVEL>=1 715 .macro INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4 716#else 717 .p2align 4,,15 718#if defined(OS_LINUX) 719 .type inner_edge_dgemm_add_nn_2x4_lib4, @function 720inner_edge_dgemm_add_nn_2x4_lib4: 721#elif defined(OS_MAC) 722_inner_edge_dgemm_add_nn_2x4_lib4: 723#elif defined(OS_WINDOWS) 724 .def inner_edge_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef 725inner_edge_dgemm_add_nn_2x4_lib4: 726#endif 727#endif 728 729 cmpl $0, %r14d // offset==0 730 jle 2f // end 731 732 cmpl $0, %r10d // k==0 733 jle 2f // end 734 735 movl $4, %r15d 736 subl %r14d, %r15d // 4-offsetB 737 cmpl %r10d, %r15d 738// jle 0f 739// movl %r10d, %r15d // kend=min(k,4-offsetB) 740//0: 741 cmovgl %r10d, %r15d // kend=min(k,4-offsetB) 742 743 movl %r14d, %eax 744 sall $3, %eax // offsetB*sizeof(double) 745 addq %rax, %r12 // B+offsetB*sizeof(double) 746 7471: 748 vbroadcastf128 0(%r11), %ymm11 // A 749 vmovupd 0(%r12), %ymm13 750 vmovupd 16(%r12), %ymm14 751 vblendpd $0x3, %ymm13, %ymm14, %ymm14 752 vshufpd $0x0, %ymm14, %ymm14, %ymm13 753 vmulpd %ymm11, %ymm13, %ymm15 754 vaddpd %ymm0, %ymm15, %ymm0 755 vmovupd 64(%r12), %ymm13 756 vmovupd 80(%r12), %ymm14 757 vblendpd $0x3, %ymm13, %ymm14, %ymm14 758 vshufpd $0x0, %ymm14, %ymm14, %ymm13 759 vmulpd %ymm11, %ymm13, %ymm15 760 vaddpd %ymm2, %ymm15, %ymm2 761 762 subl $1, %r10d // k-1 763 subl $1, %r15d // kend-1 764 addq $32, %r11 // A+1*bs*sizeof(float) 765 addq $8, %r12 // B+1*sizeof(float) 766 767 cmpl $0, %r15d 768 jg 1b 769 770 cmpl $0, %r10d 771 jle 2f // end 772 773 addq %r13, %r12 774 subq $32, %r12 // B+bs*(sdb-1)*sizeof(double) 775 7762: 777 778#if MACRO_LEVEL>=1 779 .endm 780#else 781 ret 782 783#if defined(OS_LINUX) 784 .size inner_edge_dgemm_add_nn_2x4_lib4, .-inner_edge_dgemm_add_nn_2x4_lib4 785#endif 786#endif 787// end 788 789 790 791 792// void inner_scale_ab_4x2_lib4 793// common inner routine with file scope 794// 795// scale for generic alpha and beta 796// 797// input arguments: 798// r10 <- alpha 799// r11 <- beta 800// r12 <- C 801// r13 <- 4*sdc*sizeof(double) 802// r15 <- dirty 803// ymm0 <- [d00 d11 d22 d33] 804// ymm1 <- [d01 d10 d23 d32] 805// 806// output arguments: 807// r10 <- alpha 808// r11 <- beta 809// r12 <- C 810// r13 <- 4*sdc*sizeof(double) 811// r15 <- dirty 812// ymm0 <- [d00 d10 d20 d30] 813// ymm1 <- [d01 d11 d21 d31] 814 815#if MACRO_LEVEL>=1 816 .macro INNER_SCALE_AB_4X2_LIB4 817#else 818 .p2align 4,,15 819#if defined(OS_LINUX) 820 .type inner_scale_ab_4x2_lib4, @function 821inner_scale_ab_4x2_lib4: 822#elif defined(OS_MAC) 823_inner_scale_ab_4x2_lib4: 824#elif defined(OS_WINDOWS) 825 .def inner_scale_ab_4x2_lib4; .scl 2; .type 32; .endef 826inner_scale_ab_4x2_lib4: 827#endif 828#endif 829 830 831 // alpha 832 vbroadcastsd 0(%r10), %ymm15 833 834 vmulpd %ymm0, %ymm15, %ymm0 835 vmulpd %ymm1, %ymm15, %ymm1 836 837 // beta 838 vbroadcastsd 0(%r11), %ymm14 839 840 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 841 842 vucomisd %xmm15, %xmm14 // beta==0.0 ? 843 je 0f // end 844 845 // alg==1 846 vmovapd 0(%r12), %ymm15 847 vmulpd %ymm15, %ymm14, %ymm15 848 vaddpd %ymm0, %ymm15, %ymm0 849 vmovapd 32(%r12), %ymm15 850 vmulpd %ymm15, %ymm14, %ymm15 851 vaddpd %ymm1, %ymm15, %ymm1 852 8530: 854 855#if MACRO_LEVEL>=1 856 .endm 857#else 858 ret 859 860#if defined(OS_LINUX) 861 .size inner_scale_ab_4x2_lib4, .-inner_scale_ab_4x2_lib4 862#endif 863#endif 864// end 865 866 867 868 869// void inner_scale_ab_2x4_lib4 870// common inner routine with file scope 871// 872// scale for generic alpha and beta 873// 874// input arguments: 875// r10 <- alpha 876// r11 <- beta 877// r12 <- C 878// ymm0 <- [d00 d11 d22 d33] 879// ymm1 <- [d01 d10 d23 d32] 880// ymm2 <- [d03 d12 d21 d30] 881// ymm3 <- [d02 d13 d20 d31] 882// ymm8 <- dirty 883// ymm9 <- dirty 884// ymm10 <- dirty 885// ymm11 <- dirty 886// ymm15 <- dirty 887// 888// output arguments: 889// r10 <- alpha 890// r11 <- beta 891// r10 <- C 892// ymm0 <- [d00 d10 d20 d30] 893// ymm1 <- [d01 d11 d21 d31] 894// ymm2 <- [d02 d12 d22 d32] 895// ymm3 <- [d03 d13 d23 d33] 896// ymm8 <- dirty 897// ymm9 <- dirty 898// ymm10 <- dirty 899// ymm11 <- dirty 900// ymm15 <- dirty 901 902#if MACRO_LEVEL>=1 903 .macro INNER_SCALE_AB_2X4_LIB4 904#else 905 .p2align 4,,15 906#if defined(OS_LINUX) 907 .type inner_scale_ab_2x4_lib4, @function 908inner_scale_ab_2x4_lib4: 909#elif defined(OS_MAC) 910_inner_scale_ab_2x4_lib4: 911#elif defined(OS_WINDOWS) 912 .def inner_scale_ab_2x4_lib4; .scl 2; .type 32; .endef 913inner_scale_ab_2x4_lib4: 914#endif 915#endif 916 917 // alpha 918 vmovddup 0(%r10), %xmm15 919 920 vmulpd %xmm0, %xmm15, %xmm0 921 vmulpd %xmm1, %xmm15, %xmm1 922 vmulpd %xmm2, %xmm15, %xmm2 923 vmulpd %xmm3, %xmm15, %xmm3 924 925 // beta 926 vmovddup 0(%r11), %xmm14 927 928 vxorpd %xmm15, %xmm15, %xmm15 // 0.0 929 930 vucomisd %xmm15, %xmm14 // beta==0.0 ? 931 je 0f // end 932 933 vmovapd 0(%r12), %xmm15 934 vmulpd %xmm14, %xmm15, %xmm15 935 vaddpd %xmm15, %xmm0, %xmm0 936 vmovapd 32(%r12), %xmm15 937 vmulpd %xmm14, %xmm15, %xmm15 938 vaddpd %xmm15, %xmm1, %xmm1 939 vmovapd 64(%r12), %xmm15 940 vmulpd %xmm14, %xmm15, %xmm15 941 vaddpd %xmm15, %xmm2, %xmm2 942 vmovapd 96(%r12), %xmm15 943 vmulpd %xmm14, %xmm15, %xmm15 944 vaddpd %xmm15, %xmm3, %xmm3 945 9460: 947 948#if MACRO_LEVEL>=1 949 .endm 950#else 951 ret 952 953#if defined(OS_LINUX) 954 .size inner_scale_ab_2x4_lib4, .-inner_scale_ab_2x4_lib4 955#endif 956#endif 957 958// end 959 960 961 962 963// void inner_store_4x2_lib4 964// common inner routine with file scope 965// 966// store n 967// 968// input arguments: 969// r10 <- D 970// r11 <- 4*sdd*sizeof(double) 971// r15 <- dirty 972// ymm0 <- [d00 d10 d20 d30] 973// ymm1 <- [d01 d11 d21 d31] 974// 975// output arguments: 976// r10 <- D 977// r11 <- 4*sdd*sizeof(double) 978// r15 <- dirty 979// ymm0 <- [d00 d10 d20 d30] 980// ymm1 <- [d01 d11 d21 d31] 981 982#if MACRO_LEVEL>=1 983 .macro INNER_STORE_4X2_LIB4 984#else 985 .p2align 4,,15 986#if defined(OS_LINUX) 987 .type inner_store_4x2_lib4, @function 988inner_store_4x2_lib4: 989#elif defined(OS_MAC) 990_inner_store_4x2_lib4: 991#elif defined(OS_WINDOWS) 992 .def inner_store_4x2_lib4; .scl 2; .type 32; .endef 993inner_store_4x2_lib4: 994#endif 995#endif 996 997 vmovapd %ymm0, 0(%r10) 998 vmovapd %ymm1, 32(%r10) 999 1000#if MACRO_LEVEL>=1 1001 .endm 1002#else 1003 ret 1004 1005#if defined(OS_LINUX) 1006 .size inner_store_4x2_lib4, .-inner_store_4x2_lib4 1007#endif 1008#endif 1009// end 1010 1011 1012 1013 1014// void inner_store_2x4_lib4 1015// common inner routine with file scope 1016// 1017// store n 1018// 1019// input arguments: 1020// r10 <- D 1021// ymm0 <- [d00 d10] 1022// ymm1 <- [d01 d11] 1023// ymm2 <- [d02 d12] 1024// ymm3 <- [d03 d13] 1025// 1026// output arguments: 1027// r10 <- D 1028// ymm0 <- [d00 d10] 1029// ymm1 <- [d01 d11] 1030// ymm2 <- [d02 d12] 1031// ymm3 <- [d03 d13] 1032 1033#if MACRO_LEVEL>=1 1034 .macro INNER_STORE_2X4_LIB4 1035#else 1036 .p2align 4,,15 1037#if defined(OS_LINUX) 1038 .type inner_store_2x4_lib4, @function 1039inner_store_2x4_lib4: 1040#elif defined(OS_MAC) 1041_inner_store_2x4_lib4: 1042#elif defined(OS_WINDOWS) 1043 .def inner_store_2x4_lib4; .scl 2; .type 32; .endef 1044inner_store_2x4_lib4: 1045#endif 1046#endif 1047 1048 vmovapd %xmm0, 0(%r10) 1049 vmovapd %xmm1, 32(%r10) 1050 vmovapd %xmm2, 64(%r10) 1051 vmovapd %xmm3, 96(%r10) 1052 1053#if MACRO_LEVEL>=1 1054 .endm 1055#else 1056 ret 1057 1058#if defined(OS_LINUX) 1059 .size inner_store_2x4_lib4, .-inner_store_2x4_lib4 1060#endif 1061#endif 1062// end 1063 1064 1065 1066 1067// void inner_store_2x2_lib4 1068// common inner routine with file scope 1069// 1070// store n 1071// 1072// input arguments: 1073// r10 <- D 1074// ymm0 <- [d00 d10] 1075// ymm1 <- [d01 d11] 1076// ymm2 <- [d02 d12] 1077// ymm3 <- [d03 d13] 1078// 1079// output arguments: 1080// r10 <- D 1081// ymm0 <- [d00 d10] 1082// ymm1 <- [d01 d11] 1083// ymm2 <- [d02 d12] 1084// ymm3 <- [d03 d13] 1085 1086#if MACRO_LEVEL>=1 1087 .macro INNER_STORE_2X2_LIB4 1088#else 1089 .p2align 4,,15 1090#if defined(OS_LINUX) 1091 .type inner_store_2x2_lib4, @function 1092inner_store_2x2_lib4: 1093#elif defined(OS_MAC) 1094_inner_store_2x2_lib4: 1095#elif defined(OS_WINDOWS) 1096 .def inner_store_2x2_lib4; .scl 2; .type 32; .endef 1097inner_store_2x2_lib4: 1098#endif 1099#endif 1100 1101 vmovapd %xmm0, 0(%r10) 1102 vmovapd %xmm1, 32(%r10) 1103 1104#if MACRO_LEVEL>=1 1105 .endm 1106#else 1107 ret 1108 1109#if defined(OS_LINUX) 1110 .size inner_store_2x2_lib4, .-inner_store_2x2_lib4 1111#endif 1112#endif 1113// end 1114 1115 1116 1117 1118// void inner_store_4x2_vs_lib4 1119// common inner routine with file scope 1120// 1121// store n vs 1122// 1123// input arguments: 1124// r10 <- D 1125// r11d <- km 1126// r12d <- kn 1127// ymm0 <- [d00 d10 d20 d30] 1128// ymm1 <- [d01 d11 d21 d31] 1129// 1130// output arguments: 1131// r10 <- D 1132// r11d <- km 1133// r12d <- kn 1134// ymm0 <- [d00 d10 d20 d30] 1135// ymm1 <- [d01 d11 d21 d31] 1136 1137#if MACRO_LEVEL>=1 1138 .macro INNER_STORE_4X2_VS_LIB4 1139#else 1140 .p2align 4,,15 1141#if defined(OS_LINUX) 1142 .type inner_store_4x2_vs_lib4, @function 1143inner_store_4x2_vs_lib4: 1144#elif defined(OS_MAC) 1145_inner_store_4x2_vs_lib4: 1146#elif defined(OS_WINDOWS) 1147 .def inner_store_4x2_vs_lib4; .scl 2; .type 32; .endef 1148inner_store_4x2_vs_lib4: 1149#endif 1150#endif 1151 1152 vcvtsi2sd %r11d, %xmm15, %xmm15 1153#if defined(OS_LINUX) | defined(OS_WINDOWS) 1154 vmovupd .LC02(%rip), %ymm14 1155#elif defined(OS_MAC) 1156 vmovupd LC02(%rip), %ymm14 1157#endif 1158 vmovddup %xmm15, %xmm15 1159 vinsertf128 $1, %xmm15, %ymm15, %ymm15 1160 vsubpd %ymm15, %ymm14, %ymm15 1161 1162 vmaskmovpd %ymm0, %ymm15, 0(%r10) 1163 cmpl $2, %r12d 1164 jl 0f // end 1165 vmaskmovpd %ymm1, %ymm15, 32(%r10) 1166 11670: 1168 1169#if MACRO_LEVEL>=1 1170 .endm 1171#else 1172 ret 1173 1174#if defined(OS_LINUX) 1175 .size inner_store_4x2_vs_lib4, .-inner_store_4x2_vs_lib4 1176#endif 1177#endif 1178// end 1179 1180 1181 1182 1183// void inner_store_2x4_vs_lib4 1184// common inner routine with file scope 1185// 1186// store n 1187// 1188// input arguments: 1189// r10 <- D 1190// r11 <- m1 1191// r12 <- n1 1192// ymm0 <- [d00 d10 d20 d30] 1193// ymm1 <- [d01 d11 d21 d31] 1194// ymm2 <- [d02 d12 d22 d32] 1195// ymm3 <- [d03 d13 d23 d33] 1196// ymm4 <- [d40 d50 d60 d70] 1197// ymm5 <- [d41 d51 d61 d71] 1198// ymm6 <- [d42 d52 d62 d72] 1199// ymm7 <- [d43 d53 d63 d73] 1200// 1201// output arguments: 1202// r10 <- D 1203// r11 <- m1 1204// r12 <- n1 1205// ymm0 <- [d00 d10 d20 d30] 1206// ymm1 <- [d01 d11 d21 d31] 1207// ymm2 <- [d02 d12 d22 d32] 1208// ymm3 <- [d03 d13 d23 d33] 1209// ymm4 <- [d40 d50 d60 d70] 1210// ymm5 <- [d41 d51 d61 d71] 1211// ymm6 <- [d42 d52 d62 d72] 1212// ymm7 <- [d43 d53 d63 d73] 1213 1214#if MACRO_LEVEL>=1 1215 .macro INNER_STORE_2X4_VS_LIB4 1216#else 1217 .p2align 4,,15 1218#if defined(OS_LINUX) 1219 .type inner_store_2x4_vs_lib4, @function 1220inner_store_2x4_vs_lib4: 1221#elif defined(OS_MAC) 1222_inner_store_2x4_vs_lib4: 1223#elif defined(OS_WINDOWS) 1224 .def inner_store_2x4_vs_lib4; .scl 2; .type 32; .endef 1225inner_store_2x4_vs_lib4: 1226#endif 1227#endif 1228 1229 vcvtsi2sd %r11d, %xmm15, %xmm15 1230#if defined(OS_LINUX) | defined(OS_WINDOWS) 1231 vmovupd .LC02(%rip), %ymm14 1232#elif defined(OS_MAC) 1233 vmovupd LC02(%rip), %ymm14 1234#endif 1235 vmovddup %xmm15, %xmm15 1236 vinsertf128 $1, %xmm15, %ymm15, %ymm15 1237 vsubpd %ymm15, %ymm14, %ymm15 1238 1239 cmpl $2, %r12d 1240 vmaskmovpd %xmm0, %xmm15, 0(%r10) 1241 jl 0f // end 1242 cmpl $3, %r12d 1243 vmaskmovpd %xmm1, %xmm15, 32(%r10) 1244 jl 0f // end 1245 vmaskmovpd %xmm2, %xmm15, 64(%r10) 1246 je 0f // end 1247 vmaskmovpd %xmm3, %xmm15, 96(%r10) 1248 12490: 1250 1251#if MACRO_LEVEL>=1 1252 .endm 1253#else 1254 ret 1255 1256#if defined(OS_LINUX) 1257 .size inner_store_2x4_vs_lib4, .-inner_store_2x4_vs_lib4 1258#endif 1259#endif 1260// end 1261 1262 1263 1264 1265// ASM Kernels 1266 1267 1268 1269 1270// void kernel_dgemm_nn_4x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); 1271// 1 2 3 4 5 6 7 8 9 1272 1273 .p2align 4,,15 1274#if defined(OS_LINUX) 1275 .globl kernel_dgemm_nn_4x2_lib4 1276 .type kernel_dgemm_nn_4x2_lib4, @function 1277kernel_dgemm_nn_4x2_lib4: 1278#elif defined(OS_MAC) 1279 .globl _kernel_dgemm_nn_4x2_lib4 1280_kernel_dgemm_nn_4x2_lib4: 1281#elif defined(OS_WINDOWS) 1282 .globl kernel_dgemm_nn_4x2_lib4 1283 .def kernel_dgemm_nn_4x2_lib4; .scl 2; .type 32; .endef 1284kernel_dgemm_nn_4x2_lib4: 1285#endif 1286 1287 PROLOGUE 1288 1289 // zero accumulation registers 1290 1291 vxorpd %ymm0, %ymm0, %ymm0 1292 vmovapd %ymm0, %ymm1 1293 vmovapd %ymm0, %ymm2 1294 vmovapd %ymm0, %ymm3 1295 1296 1297 // call inner dgemm kernel nn 1298 1299 movq ARG1, %r10 // k 1300 movq ARG3, %r11 // A 1301 movq ARG5, %r12 // B 1302 movq ARG6, %r13 // sdb 1303 sall $5, %r13d // 4*sdb*sizeof(double) 1304 movq ARG4, %r14 // offsetB 1305 1306#if MACRO_LEVEL>=1 1307 INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4 1308#else 1309#if defined(OS_LINUX) | defined(OS_WINDOWS) 1310 call inner_edge_dgemm_add_nn_4x2_lib4 1311#elif defined(OS_MAC) 1312 callq _inner_edge_dgemm_add_nn_4x2_lib4 1313#endif 1314#endif 1315 1316#if MACRO_LEVEL>=2 1317 INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4 1318#else 1319#if defined(OS_LINUX) | defined(OS_WINDOWS) 1320 call inner_kernel_dgemm_add_nn_4x2_lib4 1321#elif defined(OS_MAC) 1322 callq _inner_kernel_dgemm_add_nn_4x2_lib4 1323#endif 1324#endif 1325 1326 1327 // call inner blend 1328 1329 movq ARG2, %r10 // alpha 1330 movq ARG7, %r11 // beta 1331 movq ARG8, %r12 // C 1332 1333#if MACRO_LEVEL>=1 1334 INNER_SCALE_AB_4X2_LIB4 1335#else 1336#if defined(OS_LINUX) | defined(OS_WINDOWS) 1337 call inner_scale_ab_4x2_lib4 1338#elif defined(OS_MAC) 1339 callq _inner_scale_ab_4x2_lib4 1340#endif 1341#endif 1342 1343 1344 // store n 1345 1346 movq ARG9, %r10 // D 1347 1348#if MACRO_LEVEL>=1 1349 INNER_STORE_4X2_LIB4 1350#else 1351#if defined(OS_LINUX) | defined(OS_WINDOWS) 1352 call inner_store_4x2_lib4 1353#elif defined(OS_MAC) 1354 callq _inner_store_4x2_lib4 1355#endif 1356#endif 1357 1358 1359 EPILOGUE 1360 1361 ret 1362 1363#if defined(OS_LINUX) 1364 .size kernel_dgemm_nn_4x2_lib4, .-kernel_dgemm_nn_4x2_lib4 1365#endif 1366// end 1367 1368 1369 1370 1371// void kernel_dgemm_nn_4x2_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); 1372// 1 2 3 4 5 6 7 8 9 10 11 1373 1374 .p2align 4,,15 1375#if defined(OS_LINUX) 1376 .globl kernel_dgemm_nn_4x2_vs_lib4 1377 .type kernel_dgemm_nn_4x2_vs_lib4, @function 1378kernel_dgemm_nn_4x2_vs_lib4: 1379#elif defined(OS_MAC) 1380 .globl _kernel_dgemm_nn_4x2_vs_lib4 1381_kernel_dgemm_nn_4x2_vs_lib4: 1382#elif defined(OS_WINDOWS) 1383 .globl kernel_dgemm_nn_4x2_vs_lib4 1384 .def kernel_dgemm_nn_4x2_vs_lib4; .scl 2; .type 32; .endef 1385kernel_dgemm_nn_4x2_vs_lib4: 1386#endif 1387 1388 PROLOGUE 1389 1390 // zero accumulation registers 1391 1392 vxorpd %ymm0, %ymm0, %ymm0 1393 vmovapd %ymm0, %ymm1 1394 vmovapd %ymm0, %ymm2 1395 vmovapd %ymm0, %ymm3 1396 1397 1398 // call inner dgemm kernel nn 1399 1400 movq ARG1, %r10 // k 1401 movq ARG3, %r11 // A 1402 movq ARG5, %r12 // B 1403 movq ARG6, %r13 // sdb 1404 sall $5, %r13d // 4*sdb*sizeof(double) 1405 movq ARG4, %r14 // offsetB 1406 1407#if MACRO_LEVEL>=1 1408 INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4 1409#else 1410#if defined(OS_LINUX) | defined(OS_WINDOWS) 1411 call inner_edge_dgemm_add_nn_4x2_lib4 1412#elif defined(OS_MAC) 1413 callq _inner_edge_dgemm_add_nn_4x2_lib4 1414#endif 1415#endif 1416 1417#if MACRO_LEVEL>=2 1418 INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4 1419#else 1420#if defined(OS_LINUX) | defined(OS_WINDOWS) 1421 call inner_kernel_dgemm_add_nn_4x2_lib4 1422#elif defined(OS_MAC) 1423 callq _inner_kernel_dgemm_add_nn_4x2_lib4 1424#endif 1425#endif 1426 1427 1428 // call inner blend 1429 1430 movq ARG2, %r10 // alpha 1431 movq ARG7, %r11 // beta 1432 movq ARG8, %r12 // C 1433 1434#if MACRO_LEVEL>=1 1435 INNER_SCALE_AB_4X2_LIB4 1436#else 1437#if defined(OS_LINUX) | defined(OS_WINDOWS) 1438 call inner_scale_ab_4x2_lib4 1439#elif defined(OS_MAC) 1440 callq _inner_scale_ab_4x2_lib4 1441#endif 1442#endif 1443 1444 1445 // store n 1446 1447 movq ARG9, %r10 // D 1448 movq ARG10, %r11 // m1 1449 movq ARG11, %r12 // n1 1450 1451#if MACRO_LEVEL>=1 1452 INNER_STORE_4X2_VS_LIB4 1453#else 1454#if defined(OS_LINUX) | defined(OS_WINDOWS) 1455 call inner_store_4x2_vs_lib4 1456#elif defined(OS_MAC) 1457 callq _inner_store_4x2_vs_lib4 1458#endif 1459#endif 1460 1461 1462 EPILOGUE 1463 1464 ret 1465 1466#if defined(OS_LINUX) 1467 .size kernel_dgemm_nn_4x2_vs_lib4, .-kernel_dgemm_nn_4x2_vs_lib4 1468#endif 1469 1470// end 1471 1472 1473 1474 1475// void kernel_dgemm_nn_2x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); 1476// 1 2 3 4 5 6 7 8 9 1477 1478 .p2align 4,,15 1479#if defined(OS_LINUX) 1480 .globl kernel_dgemm_nn_2x4_lib4 1481 .type kernel_dgemm_nn_2x4_lib4, @function 1482kernel_dgemm_nn_2x4_lib4: 1483#elif defined(OS_MAC) 1484 .globl _kernel_dgemm_nn_2x4_lib4 1485_kernel_dgemm_nn_2x4_lib4: 1486#elif defined(OS_WINDOWS) 1487 .globl kernel_dgemm_nn_2x4_lib4 1488 .def kernel_dgemm_nn_2x4_lib4; .scl 2; .type 32; .endef 1489kernel_dgemm_nn_2x4_lib4: 1490#endif 1491 1492 PROLOGUE 1493 1494 // zero accumulation registers 1495 1496 vxorpd %ymm0, %ymm0, %ymm0 1497 vmovapd %ymm0, %ymm1 1498 vmovapd %ymm0, %ymm2 1499 vmovapd %ymm0, %ymm3 1500 vmovapd %ymm0, %ymm4 1501 vmovapd %ymm0, %ymm5 1502 vmovapd %ymm0, %ymm6 1503 vmovapd %ymm0, %ymm7 1504 1505 1506 // call inner dgemm kernel nn 1507 1508 movq ARG1, %r10 // k 1509 movq ARG3, %r11 // A 1510 movq ARG5, %r12 // B 1511 movq ARG6, %r13 // sdb 1512 sall $5, %r13d // 4*sdb*sizeof(double) 1513 movq ARG4, %r14 // offsetB 1514 1515#if MACRO_LEVEL>=1 1516 INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4 1517#else 1518#if defined(OS_LINUX) | defined(OS_WINDOWS) 1519 call inner_edge_dgemm_add_nn_2x4_lib4 1520#elif defined(OS_MAC) 1521 callq _inner_edge_dgemm_add_nn_2x4_lib4 1522#endif 1523#endif 1524 1525#if MACRO_LEVEL>=2 1526 INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4 1527#else 1528#if defined(OS_LINUX) | defined(OS_WINDOWS) 1529 call inner_kernel_dgemm_add_nn_2x4_lib4 1530#elif defined(OS_MAC) 1531 callq _inner_kernel_dgemm_add_nn_2x4_lib4 1532#endif 1533#endif 1534 1535 1536 // call inner blend 1537 1538 movq ARG2, %r10 // alpha 1539 movq ARG7, %r11 // beta 1540 movq ARG8, %r12 // C 1541 1542#if MACRO_LEVEL>=1 1543 INNER_SCALE_AB_2X4_LIB4 1544#else 1545#if defined(OS_LINUX) | defined(OS_WINDOWS) 1546 call inner_scale_ab_2x4_lib4 1547#elif defined(OS_MAC) 1548 callq _inner_scale_ab_2x4_lib4 1549#endif 1550#endif 1551 1552 1553 // store n 1554 1555 movq ARG9, %r10 // D 1556 1557#if MACRO_LEVEL>=1 1558 INNER_STORE_2X4_LIB4 1559#else 1560#if defined(OS_LINUX) | defined(OS_WINDOWS) 1561 call inner_store_2x4_lib4 1562#elif defined(OS_MAC) 1563 callq _inner_store_2x4_lib4 1564#endif 1565#endif 1566 1567 1568 EPILOGUE 1569 1570 ret 1571 1572#if defined(OS_LINUX) 1573 .size kernel_dgemm_nn_2x4_lib4, .-kernel_dgemm_nn_2x4_lib4 1574#endif 1575 1576// end 1577 1578 1579 1580 1581// void kernel_dgemm_nn_2x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); 1582// 1 2 3 4 5 6 7 8 9 10 11 1583 1584 .p2align 4,,15 1585#if defined(OS_LINUX) 1586 .globl kernel_dgemm_nn_2x4_vs_lib4 1587 .type kernel_dgemm_nn_2x4_vs_lib4, @function 1588kernel_dgemm_nn_2x4_vs_lib4: 1589#elif defined(OS_MAC) 1590 .globl _kernel_dgemm_nn_2x4_vs_lib4 1591_kernel_dgemm_nn_2x4_vs_lib4: 1592#elif defined(OS_WINDOWS) 1593 .globl kernel_dgemm_nn_2x4_vs_lib4 1594 .def kernel_dgemm_nn_2x4_vs_lib4; .scl 2; .type 32; .endef 1595kernel_dgemm_nn_2x4_vs_lib4: 1596#endif 1597 1598 PROLOGUE 1599 1600 // zero accumulation registers 1601 1602 vxorpd %ymm0, %ymm0, %ymm0 1603 vmovapd %ymm0, %ymm1 1604 vmovapd %ymm0, %ymm2 1605 vmovapd %ymm0, %ymm3 1606 vmovapd %ymm0, %ymm4 1607 vmovapd %ymm0, %ymm5 1608 vmovapd %ymm0, %ymm6 1609 vmovapd %ymm0, %ymm7 1610 1611 1612 // call inner dgemm kernel nn 1613 1614 movq ARG1, %r10 // k 1615 movq ARG3, %r11 // A 1616 movq ARG5, %r12 // B 1617 movq ARG6, %r13 // sdb 1618 sall $5, %r13d // 4*sdb*sizeof(double) 1619 movq ARG4, %r14 // offsetB 1620 1621#if MACRO_LEVEL>=1 1622 INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4 1623#else 1624#if defined(OS_LINUX) | defined(OS_WINDOWS) 1625 call inner_edge_dgemm_add_nn_2x4_lib4 1626#elif defined(OS_MAC) 1627 callq _inner_edge_dgemm_add_nn_2x4_lib4 1628#endif 1629#endif 1630 1631#if MACRO_LEVEL>=2 1632 INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4 1633#else 1634#if defined(OS_LINUX) | defined(OS_WINDOWS) 1635 call inner_kernel_dgemm_add_nn_2x4_lib4 1636#elif defined(OS_MAC) 1637 callq _inner_kernel_dgemm_add_nn_2x4_lib4 1638#endif 1639#endif 1640 1641 1642 // call inner blend 1643 1644 movq ARG2, %r10 // alpha 1645 movq ARG7, %r11 // beta 1646 movq ARG8, %r12 // C 1647 1648#if MACRO_LEVEL>=1 1649 INNER_SCALE_AB_2X4_LIB4 1650#else 1651#if defined(OS_LINUX) | defined(OS_WINDOWS) 1652 call inner_scale_ab_2x4_lib4 1653#elif defined(OS_MAC) 1654 callq _inner_scale_ab_2x4_lib4 1655#endif 1656#endif 1657 1658 1659 // store n 1660 1661 movq ARG9, %r10 // D 1662 movq ARG10, %r11 // m1 1663 movq ARG11, %r12 // n1 1664 1665#if MACRO_LEVEL>=1 1666 INNER_STORE_2X4_VS_LIB4 1667#else 1668#if defined(OS_LINUX) | defined(OS_WINDOWS) 1669 call inner_store_2x4_vs_lib4 1670#elif defined(OS_MAC) 1671 callq _inner_store_2x4_vs_lib4 1672#endif 1673#endif 1674 1675 1676 EPILOGUE 1677 1678 ret 1679 1680#if defined(OS_LINUX) 1681 .size kernel_dgemm_nn_2x4_vs_lib4, .-kernel_dgemm_nn_2x4_vs_lib4 1682#endif 1683 1684// end 1685 1686 1687 1688 1689// void kernel_dgemm_nn_2x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); 1690// 1 2 3 4 5 6 7 8 9 1691 1692 .p2align 4,,15 1693#if defined(OS_LINUX) 1694 .globl kernel_dgemm_nn_2x2_lib4 1695 .type kernel_dgemm_nn_2x2_lib4, @function 1696kernel_dgemm_nn_2x2_lib4: 1697#elif defined(OS_MAC) 1698 .globl _kernel_dgemm_nn_2x2_lib4 1699_kernel_dgemm_nn_2x2_lib4: 1700#elif defined(OS_WINDOWS) 1701 .globl kernel_dgemm_nn_2x2_lib4 1702 .def kernel_dgemm_nn_2x2_lib4; .scl 2; .type 32; .endef 1703kernel_dgemm_nn_2x2_lib4: 1704#endif 1705 1706 PROLOGUE 1707 1708 // zero accumulation registers 1709 1710 vxorpd %ymm0, %ymm0, %ymm0 1711 vmovapd %ymm0, %ymm1 1712 vmovapd %ymm0, %ymm2 1713 vmovapd %ymm0, %ymm3 1714 1715 1716 // call inner dgemm kernel nn 1717 1718 movq ARG1, %r10 // k 1719 movq ARG3, %r11 // A 1720 movq ARG5, %r12 // B 1721 movq ARG6, %r13 // sdb 1722 sall $5, %r13d // 4*sdb*sizeof(double) 1723 movq ARG4, %r14 // offsetB 1724 1725#if MACRO_LEVEL>=1 1726 INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4 1727#else 1728#if defined(OS_LINUX) | defined(OS_WINDOWS) 1729 call inner_edge_dgemm_add_nn_4x2_lib4 1730#elif defined(OS_MAC) 1731 callq _inner_edge_dgemm_add_nn_4x2_lib4 1732#endif 1733#endif 1734 1735#if MACRO_LEVEL>=2 1736 INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4 1737#else 1738#if defined(OS_LINUX) | defined(OS_WINDOWS) 1739 call inner_kernel_dgemm_add_nn_4x2_lib4 1740#elif defined(OS_MAC) 1741 callq _inner_kernel_dgemm_add_nn_4x2_lib4 1742#endif 1743#endif 1744 1745 1746 // call inner blend 1747 1748 movq ARG2, %r10 // alpha 1749 movq ARG7, %r11 // beta 1750 movq ARG8, %r12 // C 1751 1752#if MACRO_LEVEL>=1 1753 INNER_SCALE_AB_4X2_LIB4 1754#else 1755#if defined(OS_LINUX) | defined(OS_WINDOWS) 1756 call inner_scale_ab_4x2_lib4 1757#elif defined(OS_MAC) 1758 callq _inner_scale_ab_4x2_lib4 1759#endif 1760#endif 1761 1762 1763 // store n 1764 1765 movq ARG9, %r10 // D 1766 1767#if MACRO_LEVEL>=1 1768 INNER_STORE_2X2_LIB4 1769#else 1770#if defined(OS_LINUX) | defined(OS_WINDOWS) 1771 call inner_store_2x2_lib4 1772#elif defined(OS_MAC) 1773 callq _inner_store_2x2_lib4 1774#endif 1775#endif 1776 1777 1778 EPILOGUE 1779 1780 ret 1781 1782#if defined(OS_LINUX) 1783 .size kernel_dgemm_nn_2x2_lib4, .-kernel_dgemm_nn_2x2_lib4 1784#endif 1785// end 1786 1787 1788 1789 1790// Data 1791 1792 1793 1794 1795 // read-only data 1796#if defined(OS_LINUX) 1797 .section .rodata.cst32,"aM",@progbits,32 1798#elif defined(OS_MAC) 1799 .section __TEXT,__const 1800#elif defined(OS_WINDOWS) 1801 .section .rdata,"dr" 1802#endif 1803 1804#if defined(OS_LINUX) | defined(OS_WINDOWS) 1805 .align 32 1806.LC00: // { -1 -1 -1 1 } 1807#elif defined(OS_MAC) 1808LC00: // { -1 -1 -1 1 } 1809 .align 5 1810#endif 1811 .quad -1 1812 .quad -1 1813 .quad -1 1814 .quad 1 1815 1816#if defined(OS_LINUX) | defined(OS_WINDOWS) 1817 .align 32 1818.LC01: // { -1 -1 -1 -1 } 1819#elif defined(OS_MAC) 1820LC01: // { -1 -1 -1 -1 } 1821 .align 5 1822#endif 1823 .quad -1 1824 .quad -1 1825 .quad -1 1826 .quad -1 1827 1828#if defined(OS_LINUX) | defined(OS_WINDOWS) 1829 .align 32 1830.LC02: // { 3.5 2.5 1.5 0.5 } 1831#elif defined(OS_MAC) 1832LC02: // { 3.5 2.5 1.5 0.5 } 1833 .align 5 1834#endif 1835 .long 0 1836 .long 1071644672 1837 .long 0 1838 .long 1073217536 1839 .long 0 1840 .long 1074003968 1841 .long 0 1842 .long 1074528256 1843 1844#if defined(OS_LINUX) | defined(OS_WINDOWS) 1845 .align 32 1846.LC03: // { 7.5 6.5 5.5 4.5 } 1847#elif defined(OS_MAC) 1848LC03: // { 7.5 6.5 5.5 4.5 } 1849 .align 5 1850#endif 1851 .long 0 1852 .long 1074921472 1853 .long 0 1854 .long 1075183616 1855 .long 0 1856 .long 1075445760 1857 .long 0 1858 .long 1075707904 1859 1860#if defined(OS_LINUX) | defined(OS_WINDOWS) 1861 .align 32 1862.LC04: // { 1.0 1.0 1.0 1.0 } 1863#elif defined(OS_MAC) 1864LC04: // { 1.0 1.0 1.0 1.0 } 1865 .align 5 1866#endif 1867 .long 0 1868 .long 1072693248 1869 .long 0 1870 .long 1072693248 1871 .long 0 1872 .long 1072693248 1873 .long 0 1874 .long 1072693248 1875 1876#if defined(OS_LINUX) | defined(OS_WINDOWS) 1877 .align 32 1878.LC05: // { 1.0 1.0 1.0 -1.0 } 1879#elif defined(OS_MAC) 1880 .align 5 1881LC05: // { 1.0 1.0 1.0 -1.0 } 1882#endif 1883 .long 0 1884 .long -1074790400 1885 .long 0 1886 .long 1072693248 1887 .long 0 1888 .long 1072693248 1889 .long 0 1890 .long 1072693248 1891 1892#if defined(OS_LINUX) | defined(OS_WINDOWS) 1893 .align 32 1894.LC06: // { 1.0 1.0 -1.0 -1.0 } 1895#elif defined(OS_MAC) 1896 .align 5 1897LC06: // { 1.0 1.0 -1.0 -1.0 } 1898#endif 1899 .long 0 1900 .long -1074790400 1901 .long 0 1902 .long -1074790400 1903 .long 0 1904 .long 1072693248 1905 .long 0 1906 .long 1072693248 1907 1908#if defined(OS_LINUX) | defined(OS_WINDOWS) 1909 .align 32 1910.LC07: // { 1.0 -1.0 -1.0 -1.0 } 1911#elif defined(OS_MAC) 1912 .align 5 1913LC07: // { 1.0 -1.0 -1.0 -1.0 } 1914#endif 1915 .long 0 1916 .long -1074790400 1917 .long 0 1918 .long -1074790400 1919 .long 0 1920 .long -1074790400 1921 .long 0 1922 .long 1072693248 1923 1924#if defined(OS_LINUX) | defined(OS_WINDOWS) 1925 .align 32 1926.LC08: // { -1.0 -1.0 -1.0 1.0 } 1927#elif defined(OS_MAC) 1928 .align 5 1929LC08: // { -1.0 -1.0 -1.0 1.0 } 1930#endif 1931 .long 0 1932 .long 1072693248 1933 .long 0 1934 .long -1074790400 1935 .long 0 1936 .long -1074790400 1937 .long 0 1938 .long -1074790400 1939 1940#if defined(OS_LINUX) | defined(OS_WINDOWS) 1941 .align 32 1942.LC09: // { -1.0 -1.0 1.0 1.0 } 1943#elif defined(OS_MAC) 1944 .align 5 1945LC09: // { -1.0 -1.0 1.0 1.0 } 1946#endif 1947 .long 0 1948 .long 1072693248 1949 .long 0 1950 .long 1072693248 1951 .long 0 1952 .long -1074790400 1953 .long 0 1954 .long -1074790400 1955 1956#if defined(OS_LINUX) | defined(OS_WINDOWS) 1957 .align 32 1958.LC10: // { -1.0 1.0 1.0 1.0 } 1959#elif defined(OS_MAC) 1960 .align 5 1961LC10: // { -1.0 1.0 1.0 1.0 } 1962#endif 1963 .long 0 1964 .long 1072693248 1965 .long 0 1966 .long 1072693248 1967 .long 0 1968 .long 1072693248 1969 .long 0 1970 .long -1074790400 1971 1972 1973 1974 1975#if defined(OS_LINUX) 1976 .section .note.GNU-stack,"",@progbits 1977#elif defined(OS_MAC) 1978 .subsections_via_symbols 1979#endif 1980