1/************************************************************************************************** 2* * 3* This file is part of BLASFEO. * 4* * 5* BLASFEO -- BLAS For Embedded Optimization. * 6* Copyright (C) 2019 by Gianluca Frison. * 7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. * 8* All rights reserved. * 9* * 10* The 2-Clause BSD License * 11* * 12* Redistribution and use in source and binary forms, with or without * 13* modification, are permitted provided that the following conditions are met: * 14* * 15* 1. Redistributions of source code must retain the above copyright notice, this * 16* list of conditions and the following disclaimer. * 17* 2. Redistributions in binary form must reproduce the above copyright notice, * 18* this list of conditions and the following disclaimer in the documentation * 19* and/or other materials provided with the distribution. * 20* * 21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * 22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * 23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * 24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * 25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * 27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * 28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

// ABI abstraction: argument and prologue/epilogue macros for the two supported
// calling conventions. SysV (Linux/Mac): args in rdi,rsi,rdx,rcx,r8,r9 then stack;
// callee-saved rbx,rbp,r12-r15. Win64: args in rcx,rdx,r8,r9 then stack (after the
// 32-byte shadow space); callee-saved additionally rdi,rsi and xmm6-xmm15.

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1 %rdi
#define ARG2 %rsi
#define ARG3 %rdx
#define ARG4 %rcx
#define ARG5 %r8
#define ARG6 %r9
#define ARG7 STACKSIZE + 8(%rsp)
#define ARG8 STACKSIZE + 16(%rsp)
#define ARG9 STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	addq $STACKSIZE, %rsp;

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1 %rcx
#define ARG2 %rdx
#define ARG3 %r8
#define ARG4 %r9
#define ARG5 STACKSIZE + 40(%rsp)
#define ARG6 STACKSIZE + 48(%rsp)
#define ARG7 STACKSIZE + 56(%rsp)
#define ARG8 STACKSIZE + 64(%rsp)
#define ARG9 STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq $STACKSIZE, %rsp; \
	movq %rbx, (%rsp); \
	movq %rbp, 8(%rsp); \
	movq %r12, 16(%rsp); \
	movq %r13, 24(%rsp); \
	movq %r14, 32(%rsp); \
	movq %r15, 40(%rsp); \
	movq %rdi, 48(%rsp); \
	movq %rsi, 56(%rsp); \
	vmovups %xmm6, 64(%rsp); \
	vmovups %xmm7, 80(%rsp); \
	vmovups %xmm8, 96(%rsp); \
	vmovups %xmm9, 112(%rsp); \
	vmovups %xmm10, 128(%rsp); \
	vmovups %xmm11, 144(%rsp); \
	vmovups %xmm12, 160(%rsp); \
	vmovups %xmm13, 176(%rsp); \
	vmovups %xmm14, 192(%rsp); \
	vmovups %xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq (%rsp), %rbx; \
	movq 8(%rsp), %rbp; \
	movq 16(%rsp), %r12; \
	movq 24(%rsp), %r13; \
	movq 32(%rsp), %r14; \
	movq 40(%rsp), %r15; \
	movq 48(%rsp), %rdi; \
	movq 56(%rsp), %rsi; \
	vmovups 64(%rsp), %xmm6; \
	vmovups 80(%rsp), %xmm7; \
	vmovups 96(%rsp), %xmm8; \
	vmovups 112(%rsp), %xmm9; \
	vmovups 128(%rsp), %xmm10; \
	vmovups 144(%rsp), %xmm11; \
	vmovups 160(%rsp), %xmm12; \
	vmovups 176(%rsp), %xmm13; \
	vmovups 192(%rsp), %xmm14; \
	vmovups 208(%rsp), %xmm15; \
	addq $STACKSIZE, %rsp;

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif




// common inner routine with file scope
//
// 4x2 dgemm nn accumulation kernel: D[4x2] += A[4xk] * B[kx2], panel-major (lib4)
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- B
// r13  <- 4*sdb*sizeof(double)
// r14  <= dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13  <- 4*sdb*sizeof(double)
// r14  <= dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_4x2_lib4, @function
inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		2f // return

	// preload
	vmovapd 0(%r11), %ymm8 // A0[0]

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop (4 k-iterations per pass; A advances by one 4x4 panel,
	// B advances by one panel row via r13)
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch

	// unroll 0
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			32(%r11), %ymm10 // A0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	subl	$4, %r10d

	// unroll 1
	vbroadcastsd	8(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	vbroadcastsd	16(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$128, %r11

	// unroll 3
	vbroadcastsd	24(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			0(%r11), %ymm8 // A0

	vbroadcastsd	56(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	%r13, %r12

	cmpl	$4, %r10d
	jg		1b // main loop

0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// unroll 0
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			32(%r11), %ymm10 // A0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	// BUGFIX: removed stray "vaddpd %ymm15, %ymm7, %ymm7" — ymm7 is not an
	// accumulator of this 4x2 kernel (only ymm0/ymm1 are); the extra add
	// polluted a dirty register and is absent from the identical main-loop unroll.
	subl	$4, %r10d

	// unroll 1
	vbroadcastsd	8(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			64(%r11), %ymm8 // A0

	vbroadcastsd	40(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	// unroll 2
	vbroadcastsd	16(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vmovapd			96(%r11), %ymm10 // A0

	vbroadcastsd	48(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$128, %r11

	// unroll 3
	vbroadcastsd	24(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
//	vmovapd			0(%r11), %ymm8 // A0

	vbroadcastsd	56(%r12), %ymm12
	vmulpd			%ymm10, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	%r13, %r12

	jmp		2f


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	vmovapd			0(%r11), %ymm8 // A0[0]
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0

	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1
	addq	$32, %r11
	subl	$1, %r10d
	addq	$8, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_4x2_lib4, .-inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// 2x4 dgemm nn accumulation kernel; accumulators are kept interleaved in
// ymm0..ymm3 and reduced to xmm0..xmm3 at label 2 before returning.
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- B
// r13  <- 4*sdb*sizeof(double)
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13  <- 4*sdb*sizeof(double)

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_dgemm_add_nn_2x4_lib4, @function
inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_kernel_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_kernel_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl	$0, %r10d
	jle		5f // return

	// preload
	vbroadcastf128	0(%r11), %ymm11 // A
	vbroadcastf128	32(%r11), %ymm12 // A

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	prefetcht0	0(%r12, %r13, 2) // software prefetch
	prefetcht0	64(%r12, %r13, 2) // software prefetch

	// unroll 0 1
	vmovapd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	addq	%r13, %r12

	cmpl	$4, %r10d
	jg		1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// unroll 0 1
	vmovapd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastf128	64(%r11), %ymm9 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovapd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastf128	96(%r11), %ymm10 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm12, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	// unroll 2 3
	vmovupd			16(%r12), %ymm13
	vmovapd			32(%r12), %ymm14
	addq	$128, %r11
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
//	vbroadcastf128	0(%r11), %ymm11 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vmovupd			80(%r12), %ymm13
	vmovapd			96(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm9, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
//	vbroadcastf128	32(%r11), %ymm12 // A
	vshufpd			$0xf, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm10, %ymm13, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3

	subl	$4, %r10d
	// BUGFIX: removed duplicate "addq $128, %r11" — r11 was already advanced
	// by 128 inside "unroll 2 3" above; adding it again left r11 at
	// A+8*k*sizeof(double), violating the documented output r11 = A+4*k*sizeof(double)
	// (compare the 4x2 kernel's clean4-up, which adds 128 exactly once).
	addq	%r13, %r12

	jmp		2f // return
4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: one k-iteration per pass, first column pair only
	// (accumulates into ymm0 and ymm2; ymm1/ymm3 are folded in at label 2)
3: // clean up loop

	// unroll 0
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	addq	$32, %r11
	addq	$8, %r12
	subl	$1, %r10d

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // reduce: fold the interleaved ymm accumulators down to xmm0..xmm3
	vaddpd			%ymm0, %ymm1, %ymm0
	vextractf128	$0x1, %ymm0, %xmm1
	vaddpd			%ymm2, %ymm3, %ymm2
	vextractf128	$0x1, %ymm2, %xmm3

5: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_kernel_dgemm_add_nn_2x4_lib4, .-inner_kernel_dgemm_add_nn_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10 <- k
// r11 <- A
// r12 <- B
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
//
// output arguments:
// r10 <- k-(4-offB)
// r11 <- A+(4-offB)*bs*sizeof(double)
// r12 <- B-offB+bs*sdb*sizeof(double)
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB


#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_4x2_lib4, @function
inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_4x2_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_4x2_lib4:
#endif
#endif

	cmpl	$0, %r14d // offset==0
	jle		2f // end
	cmpl	$0, %r10d // k==0
	jle		2f // end

	movl	$4, %r15d
	subl	%r14d, %r15d // 4-offsetB
	cmpl	%r10d, %r15d
//	jle		0f
//	movl	%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl	%r10d, %r15d // kend=min(k,4-offsetB)

	movl	%r14d, %eax
	sall	$3, %eax // offsetB*sizeof(double)
	addq	%rax, %r12 // B+offsetB*sizeof(double)

	// process the first kend = min(k, 4-offsetB) columns one at a time
1:
	vmovapd			0(%r11), %ymm8
	vbroadcastsd	0(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm0, %ymm0
	vbroadcastsd	32(%r12), %ymm12
	vmulpd			%ymm8, %ymm12, %ymm15
	vaddpd			%ymm15, %ymm1, %ymm1

	subl	$1, %r10d // k-1
	subl	$1, %r15d // kend-1
	addq	$32, %r11 // A+1*bs*sizeof(float)
	addq	$8, %r12 // B+1*sizeof(float)

	cmpl	$0, %r15d
	jg		1b

	cmpl	$0, %r10d
	jle		2f // end

	// advance B to the next panel row (now aligned)
	addq	%r13, %r12
	subq	$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_4x2_lib4, .-inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// edge for B unaligned
//
// input arguments:
// r10 <- k
// r11 <- A
// r12 <- B
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB
//
// output arguments:
// r10 <- k-(4-offB)
// r11 <- A+(4-offB)*bs*sizeof(double)
// r12 <- B-offB+bs*sdb*sizeof(double)
// r13 <- bs*sdb*sizeof(double)
// r14 <- offB


#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_dgemm_add_nn_2x4_lib4, @function
inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_MAC)
_inner_edge_dgemm_add_nn_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_edge_dgemm_add_nn_2x4_lib4; .scl 2; .type 32; .endef
inner_edge_dgemm_add_nn_2x4_lib4:
#endif
#endif

	cmpl	$0, %r14d // offset==0
	jle		2f // end

	cmpl	$0, %r10d // k==0
	jle		2f // end

	movl	$4, %r15d
	subl	%r14d, %r15d // 4-offsetB
	cmpl	%r10d, %r15d
//	jle		0f
//	movl	%r10d, %r15d // kend=min(k,4-offsetB)
//0:
	cmovgl	%r10d, %r15d // kend=min(k,4-offsetB)

	movl	%r14d, %eax
	sall	$3, %eax // offsetB*sizeof(double)
	addq	%rax, %r12 // B+offsetB*sizeof(double)

	// process the first kend = min(k, 4-offsetB) columns one at a time
1:
	vbroadcastf128	0(%r11), %ymm11 // A
	vmovupd			0(%r12), %ymm13
	vmovupd			16(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vmovupd			64(%r12), %ymm13
	vmovupd			80(%r12), %ymm14
	vblendpd		$0x3, %ymm13, %ymm14, %ymm14
	vshufpd			$0x0, %ymm14, %ymm14, %ymm13
	vmulpd			%ymm11, %ymm13, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2

	subl	$1, %r10d // k-1
	subl	$1, %r15d // kend-1
	addq	$32, %r11 // A+1*bs*sizeof(float)
	addq	$8, %r12 // B+1*sizeof(float)

	cmpl	$0, %r15d
	jg		1b

	cmpl	$0, %r10d
	jle		2f // end

	// advance B to the next panel row (now aligned)
	addq	%r13, %r12
	subq	$32, %r12 // B+bs*(sdb-1)*sizeof(double)

2:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_edge_dgemm_add_nn_2x4_lib4, .-inner_edge_dgemm_add_nn_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for generic alpha and beta: acc = alpha*acc + beta*C (beta==0 skips the C load)
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- C
// r13 <- 4*sdc*sizeof(double)
// r15 <- dirty
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- C
// r13 <- 4*sdc*sizeof(double)
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_4x2_lib4, @function
inner_scale_ab_4x2_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_4x2_lib4; .scl 2; .type 32; .endef
inner_scale_ab_4x2_lib4:
#endif
#endif


	// alpha
	vbroadcastsd	0(%r10), %ymm15

	vmulpd		%ymm0, %ymm15, %ymm0
	vmulpd		%ymm1, %ymm15, %ymm1

	// beta
	vbroadcastsd	0(%r11), %ymm14

	vxorpd		%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	// alg==1
	vmovapd		0(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm0, %ymm15, %ymm0
	vmovapd		32(%r12), %ymm15
	vmulpd		%ymm15, %ymm14, %ymm15
	vaddpd		%ymm1, %ymm15, %ymm1

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_4x2_lib4, .-inner_scale_ab_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// scale for generic alpha and beta (2x4 variant, operates on xmm halves)
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- C
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r10 <- C
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_2x4_lib4, @function
inner_scale_ab_2x4_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_scale_ab_2x4_lib4; .scl 2; .type 32; .endef
inner_scale_ab_2x4_lib4:
#endif
#endif

	// alpha
	vmovddup	0(%r10), %xmm15

	vmulpd		%xmm0, %xmm15, %xmm0
	vmulpd		%xmm1, %xmm15, %xmm1
	vmulpd		%xmm2, %xmm15, %xmm2
	vmulpd		%xmm3, %xmm15, %xmm3

	// beta
	vmovddup	0(%r11), %xmm14

	vxorpd		%xmm15, %xmm15, %xmm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je			0f // end

	// acc += beta*C, one 2-wide column at a time (C columns are 32 bytes apart)
	vmovapd		0(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm0, %xmm0
	vmovapd		32(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm1, %xmm1
	vmovapd		64(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm2, %xmm2
	vmovapd		96(%r12), %xmm15
	vmulpd		%xmm14, %xmm15, %xmm15
	vaddpd		%xmm15, %xmm3, %xmm3

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_scale_ab_2x4_lib4, .-inner_scale_ab_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// store n: write the two 4-wide accumulator columns to D
//
// input arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10 <- D
// r11 <- 4*sdd*sizeof(double)
// r15 <- dirty
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_lib4, @function
inner_store_4x2_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_lib4; .scl 2; .type 32; .endef
inner_store_4x2_lib4:
#endif
#endif

	vmovapd		%ymm0, 0(%r10)
	vmovapd		%ymm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_lib4, .-inner_store_4x2_lib4
#endif
#endif




// common inner routine with file scope
//
// store n: write four 2-wide accumulator columns to D
//
// input arguments:
// r10 <- D
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]
//
// output arguments:
// r10 <- D
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_lib4, @function
inner_store_2x4_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_lib4; .scl 2; .type 32; .endef
inner_store_2x4_lib4:
#endif
#endif

	// D columns are bs*sizeof(double) = 32 bytes apart in the panel
	vmovapd		%xmm0, 0(%r10)
	vmovapd		%xmm1, 32(%r10)
	vmovapd		%xmm2, 64(%r10)
	vmovapd		%xmm3, 96(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_lib4, .-inner_store_2x4_lib4
#endif
#endif




// common inner routine with file scope
//
// store n: write two 2-wide accumulator columns to D
//
// input arguments:
// r10 <- D
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]
//
// output arguments:
// r10 <- D
// ymm0 <- [d00 d10]
// ymm1 <- [d01 d11]
// ymm2 <- [d02 d12]
// ymm3 <- [d03 d13]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X2_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x2_lib4, @function
inner_store_2x2_lib4:
#elif defined(OS_MAC)
_inner_store_2x2_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x2_lib4; .scl 2; .type 32; .endef
inner_store_2x2_lib4:
#endif
#endif

	vmovapd		%xmm0, 0(%r10)
	vmovapd		%xmm1, 32(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x2_lib4, .-inner_store_2x2_lib4
#endif
#endif




// common inner routine with file scope
//
// store n vs: masked store of at most km rows and kn columns
//
// input arguments:
// r10 <- D
// r11d <- km
// r12d <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
//
// output arguments:
// r10 <- D
// r11d <- km
// r12d <- kn
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X2_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x2_vs_lib4, @function
inner_store_4x2_vs_lib4:
#elif defined(OS_MAC)
_inner_store_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_4x2_vs_lib4; .scl 2; .type 32; .endef
inner_store_4x2_vs_lib4:
#endif
#endif

	// build a row mask from km: lane i is stored iff i < km
	// (LC02 presumably holds the constants [0.5 1.5 2.5 3.5] used for the
	// compare-by-subtraction trick — defined later in this file; verify there)
	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	vmaskmovpd	%ymm0, %ymm15, 0(%r10)
	cmpl		$2, %r12d
	jl			0f // end
	vmaskmovpd	%ymm1, %ymm15, 32(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_4x2_vs_lib4, .-inner_store_4x2_vs_lib4
#endif
#endif




// common inner routine with file scope
//
// store n vs (2x4 variant)
//
// input arguments:
// r10 <- D
// r11 <- m1
// r12 <- n1
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]
//
// output arguments:
// r10 <- D
// r11 <- m1
// r12 <- n1
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
// ymm4 <- [d40 d50 d60 d70]
// ymm5 <- [d41 d51 d61 d71]
// ymm6 <- [d42 d52 d62 d72]
// ymm7 <- [d43 d53 d63 d73]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_2X4_VS_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_2x4_vs_lib4, @function
inner_store_2x4_vs_lib4:
#elif defined(OS_MAC)
_inner_store_2x4_vs_lib4:
#elif defined(OS_WINDOWS)
	.def inner_store_2x4_vs_lib4; .scl 2; .type 32; .endef
inner_store_2x4_vs_lib4:
#endif
#endif

	// build a row mask from m1 (r11d); columns beyond n1 (r12d) are skipped below
	vcvtsi2sd	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd		.LC02(%rip), %ymm14
#elif defined(OS_MAC)
	vmovupd		LC02(%rip), %ymm14
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd		%ymm15, %ymm14, %ymm15

	// store up to n1 columns: fall through stores col j, then bail out early
	cmpl		$2, %r12d
	vmaskmovpd	%xmm0, %xmm15, 0(%r10)
	jl			0f // end
	cmpl		$3, %r12d
	vmaskmovpd	%xmm1, %xmm15, 32(%r10)
	jl			0f // end
	vmaskmovpd	%xmm2, %xmm15, 64(%r10)
	je			0f // end
	vmaskmovpd	%xmm3, %xmm15, 96(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

#if defined(OS_LINUX)
	.size	inner_store_2x4_vs_lib4, .-inner_store_2x4_vs_lib4
#endif
#endif




//                                1      2              3          4            5          6        7             8          9
// void kernel_dgemm_nn_4x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_lib4
	.type kernel_dgemm_nn_4x2_lib4, @function
kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_lib4
_kernel_dgemm_nn_4x2_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_lib4
	.def kernel_dgemm_nn_4x2_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd		%ymm0, %ymm0, %ymm0
	vmovapd		%ymm0, %ymm1
	vmovapd		%ymm0, %ymm2
	vmovapd		%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG5, %r12 // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_lib4, .-kernel_dgemm_nn_4x2_lib4
#endif




//                                   1      2              3          4            5          6        7             8          9          10      11
// void kernel_dgemm_nn_4x2_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1);

	.p2align 4,,15
#if defined(OS_LINUX)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.type kernel_dgemm_nn_4x2_vs_lib4, @function
kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_MAC)
	.globl _kernel_dgemm_nn_4x2_vs_lib4
_kernel_dgemm_nn_4x2_vs_lib4:
#elif defined(OS_WINDOWS)
	.globl kernel_dgemm_nn_4x2_vs_lib4
	.def kernel_dgemm_nn_4x2_vs_lib4; .scl 2; .type 32; .endef
kernel_dgemm_nn_4x2_vs_lib4:
#endif

	PROLOGUE

	// zero accumulation registers

	vxorpd		%ymm0, %ymm0, %ymm0
	vmovapd		%ymm0, %ymm1
	vmovapd		%ymm0, %ymm2
	vmovapd		%ymm0, %ymm3


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG5, %r12 // B
	movq	ARG6, %r13 // sdb
	sall	$5, %r13d // 4*sdb*sizeof(double)
	movq	ARG4, %r14 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_edge_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_edge_dgemm_add_nn_4x2_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_kernel_dgemm_add_nn_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_kernel_dgemm_add_nn_4x2_lib4
#endif
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X2_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_scale_ab_4x2_lib4
#elif defined(OS_MAC)
	callq _inner_scale_ab_4x2_lib4
#endif
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // m1
	movq	ARG11, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X2_VS_LIB4
#else
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	call inner_store_4x2_vs_lib4
#elif defined(OS_MAC)
	callq _inner_store_4x2_vs_lib4
#endif
#endif


	EPILOGUE

	ret

#if defined(OS_LINUX)
	.size	kernel_dgemm_nn_4x2_vs_lib4, .-kernel_dgemm_nn_4x2_vs_lib4
#endif




//                                1      2              3          4            5          6        7             8          9
// void
kernel_dgemm_nn_2x4_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); 1416 1417 .p2align 4,,15 1418#if defined(OS_LINUX) 1419 .globl kernel_dgemm_nn_2x4_lib4 1420 .type kernel_dgemm_nn_2x4_lib4, @function 1421kernel_dgemm_nn_2x4_lib4: 1422#elif defined(OS_MAC) 1423 .globl _kernel_dgemm_nn_2x4_lib4 1424_kernel_dgemm_nn_2x4_lib4: 1425#elif defined(OS_WINDOWS) 1426 .globl kernel_dgemm_nn_2x4_lib4 1427 .def kernel_dgemm_nn_2x4_lib4; .scl 2; .type 32; .endef 1428kernel_dgemm_nn_2x4_lib4: 1429#endif 1430 1431 PROLOGUE 1432 1433 // zero accumulation registers 1434 1435 vxorpd %ymm0, %ymm0, %ymm0 1436 vmovapd %ymm0, %ymm1 1437 vmovapd %ymm0, %ymm2 1438 vmovapd %ymm0, %ymm3 1439 vmovapd %ymm0, %ymm4 1440 vmovapd %ymm0, %ymm5 1441 vmovapd %ymm0, %ymm6 1442 vmovapd %ymm0, %ymm7 1443 1444 1445 // call inner dgemm kernel nn 1446 1447 movq ARG1, %r10 // k 1448 movq ARG3, %r11 // A 1449 movq ARG5, %r12 // B 1450 movq ARG6, %r13 // sdb 1451 sall $5, %r13d // 4*sdb*sizeof(double) 1452 movq ARG4, %r14 // offsetB 1453 1454#if MACRO_LEVEL>=1 1455 INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4 1456#else 1457#if defined(OS_LINUX) | defined(OS_WINDOWS) 1458 call inner_edge_dgemm_add_nn_2x4_lib4 1459#elif defined(OS_MAC) 1460 callq _inner_edge_dgemm_add_nn_2x4_lib4 1461#endif 1462#endif 1463 1464#if MACRO_LEVEL>=2 1465 INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4 1466#else 1467#if defined(OS_LINUX) | defined(OS_WINDOWS) 1468 call inner_kernel_dgemm_add_nn_2x4_lib4 1469#elif defined(OS_MAC) 1470 callq _inner_kernel_dgemm_add_nn_2x4_lib4 1471#endif 1472#endif 1473 1474 1475 // call inner blend 1476 1477 movq ARG2, %r10 // alpha 1478 movq ARG7, %r11 // beta 1479 movq ARG8, %r12 // C 1480 1481#if MACRO_LEVEL>=1 1482 INNER_SCALE_AB_2X4_LIB4 1483#else 1484#if defined(OS_LINUX) | defined(OS_WINDOWS) 1485 call inner_scale_ab_2x4_lib4 1486#elif defined(OS_MAC) 1487 callq _inner_scale_ab_2x4_lib4 1488#endif 1489#endif 1490 1491 1492 // store n 1493 1494 movq ARG9, 
%r10 // D 1495 1496#if MACRO_LEVEL>=1 1497 INNER_STORE_2X4_LIB4 1498#else 1499#if defined(OS_LINUX) | defined(OS_WINDOWS) 1500 call inner_store_2x4_lib4 1501#elif defined(OS_MAC) 1502 callq _inner_store_2x4_lib4 1503#endif 1504#endif 1505 1506 1507 EPILOGUE 1508 1509 ret 1510 1511#if defined(OS_LINUX) 1512 .size kernel_dgemm_nn_2x4_lib4, .-kernel_dgemm_nn_2x4_lib4 1513#endif 1514 1515 1516 1517 1518 1519// 1 2 3 4 5 6 7 8 9 10 11 1520// void kernel_dgemm_nn_2x4_vs_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D, int m1, int n1); 1521 1522 .p2align 4,,15 1523#if defined(OS_LINUX) 1524 .globl kernel_dgemm_nn_2x4_vs_lib4 1525 .type kernel_dgemm_nn_2x4_vs_lib4, @function 1526kernel_dgemm_nn_2x4_vs_lib4: 1527#elif defined(OS_MAC) 1528 .globl _kernel_dgemm_nn_2x4_vs_lib4 1529_kernel_dgemm_nn_2x4_vs_lib4: 1530#elif defined(OS_WINDOWS) 1531 .globl kernel_dgemm_nn_2x4_vs_lib4 1532 .def kernel_dgemm_nn_2x4_vs_lib4; .scl 2; .type 32; .endef 1533kernel_dgemm_nn_2x4_vs_lib4: 1534#endif 1535 1536 PROLOGUE 1537 1538 // zero accumulation registers 1539 1540 vxorpd %ymm0, %ymm0, %ymm0 1541 vmovapd %ymm0, %ymm1 1542 vmovapd %ymm0, %ymm2 1543 vmovapd %ymm0, %ymm3 1544 vmovapd %ymm0, %ymm4 1545 vmovapd %ymm0, %ymm5 1546 vmovapd %ymm0, %ymm6 1547 vmovapd %ymm0, %ymm7 1548 1549 1550 // call inner dgemm kernel nn 1551 1552 movq ARG1, %r10 // k 1553 movq ARG3, %r11 // A 1554 movq ARG5, %r12 // B 1555 movq ARG6, %r13 // sdb 1556 sall $5, %r13d // 4*sdb*sizeof(double) 1557 movq ARG4, %r14 // offsetB 1558 1559#if MACRO_LEVEL>=1 1560 INNER_EDGE_DGEMM_ADD_NN_2X4_LIB4 1561#else 1562#if defined(OS_LINUX) | defined(OS_WINDOWS) 1563 call inner_edge_dgemm_add_nn_2x4_lib4 1564#elif defined(OS_MAC) 1565 callq _inner_edge_dgemm_add_nn_2x4_lib4 1566#endif 1567#endif 1568 1569#if MACRO_LEVEL>=2 1570 INNER_KERNEL_DGEMM_ADD_NN_2X4_LIB4 1571#else 1572#if defined(OS_LINUX) | defined(OS_WINDOWS) 1573 call inner_kernel_dgemm_add_nn_2x4_lib4 1574#elif 
defined(OS_MAC) 1575 callq _inner_kernel_dgemm_add_nn_2x4_lib4 1576#endif 1577#endif 1578 1579 1580 // call inner blend 1581 1582 movq ARG2, %r10 // alpha 1583 movq ARG7, %r11 // beta 1584 movq ARG8, %r12 // C 1585 1586#if MACRO_LEVEL>=1 1587 INNER_SCALE_AB_2X4_LIB4 1588#else 1589#if defined(OS_LINUX) | defined(OS_WINDOWS) 1590 call inner_scale_ab_2x4_lib4 1591#elif defined(OS_MAC) 1592 callq _inner_scale_ab_2x4_lib4 1593#endif 1594#endif 1595 1596 1597 // store n 1598 1599 movq ARG9, %r10 // D 1600 movq ARG10, %r11 // m1 1601 movq ARG11, %r12 // n1 1602 1603#if MACRO_LEVEL>=1 1604 INNER_STORE_2X4_VS_LIB4 1605#else 1606#if defined(OS_LINUX) | defined(OS_WINDOWS) 1607 call inner_store_2x4_vs_lib4 1608#elif defined(OS_MAC) 1609 callq _inner_store_2x4_vs_lib4 1610#endif 1611#endif 1612 1613 1614 EPILOGUE 1615 1616 ret 1617 1618#if defined(OS_LINUX) 1619 .size kernel_dgemm_nn_2x4_vs_lib4, .-kernel_dgemm_nn_2x4_vs_lib4 1620#endif 1621 1622 1623 1624 1625 1626// 1 2 3 4 5 6 7 8 9 1627// void kernel_dgemm_nn_2x2_lib4(int k, double *alpha, double *A, int offsetB, double *B, int sdb, double *beta, double *C, double *D); 1628 1629 .p2align 4,,15 1630#if defined(OS_LINUX) 1631 .globl kernel_dgemm_nn_2x2_lib4 1632 .type kernel_dgemm_nn_2x2_lib4, @function 1633kernel_dgemm_nn_2x2_lib4: 1634#elif defined(OS_MAC) 1635 .globl _kernel_dgemm_nn_2x2_lib4 1636_kernel_dgemm_nn_2x2_lib4: 1637#elif defined(OS_WINDOWS) 1638 .globl kernel_dgemm_nn_2x2_lib4 1639 .def kernel_dgemm_nn_2x2_lib4; .scl 2; .type 32; .endef 1640kernel_dgemm_nn_2x2_lib4: 1641#endif 1642 1643 PROLOGUE 1644 1645 // zero accumulation registers 1646 1647 vxorpd %ymm0, %ymm0, %ymm0 1648 vmovapd %ymm0, %ymm1 1649 vmovapd %ymm0, %ymm2 1650 vmovapd %ymm0, %ymm3 1651 1652 1653 // call inner dgemm kernel nn 1654 1655 movq ARG1, %r10 // k 1656 movq ARG3, %r11 // A 1657 movq ARG5, %r12 // B 1658 movq ARG6, %r13 // sdb 1659 sall $5, %r13d // 4*sdb*sizeof(double) 1660 movq ARG4, %r14 // offsetB 1661 1662#if MACRO_LEVEL>=1 1663 
INNER_EDGE_DGEMM_ADD_NN_4X2_LIB4 1664#else 1665#if defined(OS_LINUX) | defined(OS_WINDOWS) 1666 call inner_edge_dgemm_add_nn_4x2_lib4 1667#elif defined(OS_MAC) 1668 callq _inner_edge_dgemm_add_nn_4x2_lib4 1669#endif 1670#endif 1671 1672#if MACRO_LEVEL>=2 1673 INNER_KERNEL_DGEMM_ADD_NN_4X2_LIB4 1674#else 1675#if defined(OS_LINUX) | defined(OS_WINDOWS) 1676 call inner_kernel_dgemm_add_nn_4x2_lib4 1677#elif defined(OS_MAC) 1678 callq _inner_kernel_dgemm_add_nn_4x2_lib4 1679#endif 1680#endif 1681 1682 1683 // call inner blend 1684 1685 movq ARG2, %r10 // alpha 1686 movq ARG7, %r11 // beta 1687 movq ARG8, %r12 // C 1688 1689#if MACRO_LEVEL>=1 1690 INNER_SCALE_AB_4X2_LIB4 1691#else 1692#if defined(OS_LINUX) | defined(OS_WINDOWS) 1693 call inner_scale_ab_4x2_lib4 1694#elif defined(OS_MAC) 1695 callq _inner_scale_ab_4x2_lib4 1696#endif 1697#endif 1698 1699 1700 // store n 1701 1702 movq ARG9, %r10 // D 1703 1704#if MACRO_LEVEL>=1 1705 INNER_STORE_2X2_LIB4 1706#else 1707#if defined(OS_LINUX) | defined(OS_WINDOWS) 1708 call inner_store_2x2_lib4 1709#elif defined(OS_MAC) 1710 callq _inner_store_2x2_lib4 1711#endif 1712#endif 1713 1714 1715 EPILOGUE 1716 1717 ret 1718 1719#if defined(OS_LINUX) 1720 .size kernel_dgemm_nn_2x2_lib4, .-kernel_dgemm_nn_2x2_lib4 1721#endif 1722 1723 1724 1725 1726 1727 // read-only data 1728#if defined(OS_LINUX) 1729 .section .rodata.cst32,"aM",@progbits,32 1730#elif defined(OS_MAC) 1731 .section __TEXT,__const 1732#elif defined(OS_WINDOWS) 1733 .section .rdata,"dr" 1734#endif 1735 1736#if defined(OS_LINUX) | defined(OS_WINDOWS) 1737 .align 32 1738.LC00: // { -1 -1 -1 1 } 1739#elif defined(OS_MAC) 1740LC00: // { -1 -1 -1 1 } 1741 .align 5 1742#endif 1743 .quad -1 1744 .quad -1 1745 .quad -1 1746 .quad 1 1747 1748#if defined(OS_LINUX) | defined(OS_WINDOWS) 1749 .align 32 1750.LC01: // { -1 -1 -1 -1 } 1751#elif defined(OS_MAC) 1752LC01: // { -1 -1 -1 -1 } 1753 .align 5 1754#endif 1755 .quad -1 1756 .quad -1 1757 .quad -1 1758 .quad -1 1759 1760#if 
defined(OS_LINUX) | defined(OS_WINDOWS) 1761 .align 32 1762.LC02: // { 3.5 2.5 1.5 0.5 } 1763#elif defined(OS_MAC) 1764LC02: // { 3.5 2.5 1.5 0.5 } 1765 .align 5 1766#endif 1767 .long 0 1768 .long 1071644672 1769 .long 0 1770 .long 1073217536 1771 .long 0 1772 .long 1074003968 1773 .long 0 1774 .long 1074528256 1775 1776#if defined(OS_LINUX) | defined(OS_WINDOWS) 1777 .align 32 1778.LC03: // { 7.5 6.5 5.5 4.5 } 1779#elif defined(OS_MAC) 1780LC03: // { 7.5 6.5 5.5 4.5 } 1781 .align 5 1782#endif 1783 .long 0 1784 .long 1074921472 1785 .long 0 1786 .long 1075183616 1787 .long 0 1788 .long 1075445760 1789 .long 0 1790 .long 1075707904 1791 1792#if defined(OS_LINUX) | defined(OS_WINDOWS) 1793 .align 32 1794.LC04: // { 1.0 1.0 1.0 1.0 } 1795#elif defined(OS_MAC) 1796LC04: // { 1.0 1.0 1.0 1.0 } 1797 .align 5 1798#endif 1799 .long 0 1800 .long 1072693248 1801 .long 0 1802 .long 1072693248 1803 .long 0 1804 .long 1072693248 1805 .long 0 1806 .long 1072693248 1807 1808#if defined(OS_LINUX) | defined(OS_WINDOWS) 1809 .align 32 1810.LC05: // { 1.0 1.0 1.0 -1.0 } 1811#elif defined(OS_MAC) 1812 .align 5 1813LC05: // { 1.0 1.0 1.0 -1.0 } 1814#endif 1815 .long 0 1816 .long -1074790400 1817 .long 0 1818 .long 1072693248 1819 .long 0 1820 .long 1072693248 1821 .long 0 1822 .long 1072693248 1823 1824#if defined(OS_LINUX) | defined(OS_WINDOWS) 1825 .align 32 1826.LC06: // { 1.0 1.0 -1.0 -1.0 } 1827#elif defined(OS_MAC) 1828 .align 5 1829LC06: // { 1.0 1.0 -1.0 -1.0 } 1830#endif 1831 .long 0 1832 .long -1074790400 1833 .long 0 1834 .long -1074790400 1835 .long 0 1836 .long 1072693248 1837 .long 0 1838 .long 1072693248 1839 1840#if defined(OS_LINUX) | defined(OS_WINDOWS) 1841 .align 32 1842.LC07: // { 1.0 -1.0 -1.0 -1.0 } 1843#elif defined(OS_MAC) 1844 .align 5 1845LC07: // { 1.0 -1.0 -1.0 -1.0 } 1846#endif 1847 .long 0 1848 .long -1074790400 1849 .long 0 1850 .long -1074790400 1851 .long 0 1852 .long -1074790400 1853 .long 0 1854 .long 1072693248 1855 1856#if defined(OS_LINUX) | 
defined(OS_WINDOWS) 1857 .align 32 1858.LC08: // { -1.0 -1.0 -1.0 1.0 } 1859#elif defined(OS_MAC) 1860 .align 5 1861LC08: // { -1.0 -1.0 -1.0 1.0 } 1862#endif 1863 .long 0 1864 .long 1072693248 1865 .long 0 1866 .long -1074790400 1867 .long 0 1868 .long -1074790400 1869 .long 0 1870 .long -1074790400 1871 1872#if defined(OS_LINUX) | defined(OS_WINDOWS) 1873 .align 32 1874.LC09: // { -1.0 -1.0 1.0 1.0 } 1875#elif defined(OS_MAC) 1876 .align 5 1877LC09: // { -1.0 -1.0 1.0 1.0 } 1878#endif 1879 .long 0 1880 .long 1072693248 1881 .long 0 1882 .long 1072693248 1883 .long 0 1884 .long -1074790400 1885 .long 0 1886 .long -1074790400 1887 1888#if defined(OS_LINUX) | defined(OS_WINDOWS) 1889 .align 32 1890.LC10: // { -1.0 1.0 1.0 1.0 } 1891#elif defined(OS_MAC) 1892 .align 5 1893LC10: // { -1.0 1.0 1.0 1.0 } 1894#endif 1895 .long 0 1896 .long 1072693248 1897 .long 0 1898 .long 1072693248 1899 .long 0 1900 .long 1072693248 1901 .long 0 1902 .long -1074790400 1903 1904 1905 1906 1907#if defined(OS_LINUX) 1908 .section .note.GNU-stack,"",@progbits 1909#elif defined(OS_MAC) 1910 .subsections_via_symbols 1911#endif 1912 1913 1914 1915