/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

#if defined(OS_LINUX) | defined(OS_MAC)

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#if defined(OS_LINUX)

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.type NAME, @function; \
NAME:
#define FUN_START(NAME) \
	.type NAME, @function; \
NAME:
#define FUN_END(NAME) \
	.size	NAME, .-NAME
#define CALL(NAME) \
	call NAME
//#define ZERO_ACC \
//	vxorpd	%ymm0, %ymm0, %ymm0; \
//	vmovapd	%ymm0, %ymm1; \
//	vmovapd	%ymm0, %ymm2; \
//	vmovapd	%ymm0, %ymm3
//#define NEG_ACC \
//	vmovapd	.LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#else // defined(OS_MAC)

#define GLOB_FUN_START(NAME) \
	.globl _ ## NAME; \
_ ## NAME:
#define FUN_START(NAME) \
_ ## NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	callq _ ## NAME
//#define ZERO_ACC \
//	vxorpd	%ymm0, %ymm0, %ymm0; \
//	vmovapd	%ymm0, %ymm1; \
//	vmovapd	%ymm0, %ymm2; \
//	vmovapd	%ymm0, %ymm3
//#define NEG_ACC \
//	vmovapd	LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#endif

#elif defined(OS_WINDOWS)

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6, 64(%rsp); \
	vmovups	%xmm7, 80(%rsp); \
	vmovups	%xmm8, 96(%rsp); \
	vmovups	%xmm9, 112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	64(%rsp), %xmm6; \
	vmovups	80(%rsp), %xmm7; \
	vmovups	96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_START(NAME) \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	call NAME
//#define ZERO_ACC \
//	vxorpd	%ymm0, %ymm0, %ymm0; \
//	vmovapd	%ymm0, %ymm1; \
//	vmovapd	%ymm0, %ymm2; \
//	vmovapd	%ymm0, %ymm3
//#define NEG_ACC \
//	vmovapd	.LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+8*k*sizeof(float)
// r12  <- x+k*sizeof(float)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_kernel_gemv_add_n_8_lib8)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	vmovaps			0(%r11), %ymm8
	vbroadcastss	0(%r12), %ymm12
	vmulps			%ymm8, %ymm12, %ymm15
	vaddps			%ymm0, %ymm15, %ymm0

	subl	$4, %r10d

	vmovaps			32(%r11), %ymm8
	vbroadcastss	4(%r12), %ymm12
	vmulps			%ymm8, %ymm12, %ymm15
	vaddps			%ymm1, %ymm15, %ymm1

	vmovaps			64(%r11), %ymm8
	vbroadcastss	8(%r12), %ymm12
	vmulps			%ymm8, %ymm12, %ymm15
	vaddps			%ymm2, %ymm15, %ymm2

	vmovaps			96(%r11), %ymm8
	vbroadcastss	12(%r12), %ymm12
	vmulps			%ymm8, %ymm12, %ymm15
	vaddps			%ymm3, %ymm15, %ymm3

	addq	$128, %r11
	addq	$16, %r12

	cmpl	$3, %r10d

	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up

	vmovaps			0(%r11), %ymm8
	vbroadcastss	0(%r12), %ymm12
	vmulps			%ymm8, %ymm12, %ymm15
	vaddps			%ymm0, %ymm15, %ymm0

	addq	$32, %r11
	addq	$4, %r12

	subl	$1, %r10d
	cmpl	$0, %r10d

	jg		0b // clean

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemv_add_n_8_lib8)
#endif
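



// For reference, a C sketch of what the inner routine above computes
// (illustrative only, not part of the build): A is stored in panel-major lib8
// format, i.e. column-major panels of bs=8 rows, so column j of the current
// panel starts at A+j*8; ymm0-ymm3 hold four partial copies of the 8
// accumulators, summed later by the blend routine.
//
//	for(j=0; j<k; j++)
//		for(i=0; i<8; i++)
//			z[i] += A[i+j*8] * x[j];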




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- bs*sda*sizeof(float) = 32*sda
// r13  <- x
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+k*sda*sizeof(float)
// r12  <- bs*sda*sizeof(float) = 32*sda
// r13  <- x+k*sizeof(float)
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_kernel_gemv_add_t_8_lib8)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$8, %r10d
	jl		0f // clean-up loop

	// main loop
	.p2align 3
1: // main loop

	vmovups	0(%r13), %ymm12

	vmovaps	0(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm0, %ymm15, %ymm0

	subl	$8, %r10d

	vmovaps	32(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm1, %ymm15, %ymm1

	vmovaps	64(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm2, %ymm15, %ymm2

	vmovaps	96(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm3, %ymm15, %ymm3

	vmovaps	128(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm4, %ymm15, %ymm4

	vmovaps	160(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm5, %ymm15, %ymm5

	vmovaps	192(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm6, %ymm15, %ymm6

	vmovaps	224(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm7, %ymm15, %ymm7

	addq	%r12, %r11
	addq	$32, %r13

	cmpl	$7, %r10d

	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up

	vcvtsi2ss	%r10d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm13
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm13
#endif
	vshufps		$0x00, %xmm14, %xmm14, %xmm14
	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
	vsubps		%ymm14, %ymm13, %ymm14

	vmaskmovps	0(%r13), %ymm14, %ymm12

	vmaskmovps	0(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm0, %ymm15, %ymm0

	vmaskmovps	32(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm1, %ymm15, %ymm1

	vmaskmovps	64(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm2, %ymm15, %ymm2

	vmaskmovps	96(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm3, %ymm15, %ymm3

	vmaskmovps	128(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm4, %ymm15, %ymm4

	vmaskmovps	160(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm5, %ymm15, %ymm5

	vmaskmovps	192(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm6, %ymm15, %ymm6

	vmaskmovps	224(%r11), %ymm14, %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm7, %ymm15, %ymm7

	sall	$2, %r10d
	addq	%r10, %r11
	addq	%r10, %r13
	xorl	%r10d, %r10d


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemv_add_t_8_lib8)
#endif
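



// For reference, a C sketch of the transposed inner routine above (illustrative
// only): each main-loop iteration consumes one full panel of 8 rows of A and 8
// entries of x; sda is the panel stride, so the panel step is 8*sda floats
// (r12 = bs*sda*sizeof(float) bytes).
//
//	for(i=0; i<k; i++)
//		for(j=0; j<8; j++)
//			z[j] += A[(i/8)*8*sda + i%8 + j*8] * x[i];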




// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- bs*sda*sizeof(float) = 32*sda
// r13  <- x
// r14d <- offA
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- kmax-(8-offA)
// r11  <- A+bs*sda*sizeof(float)-offA*sizeof(float)
// r12  <- bs*sda*sizeof(float) = 32*sda
// r13  <- x+(8-offA)*sizeof(float)
// r14d <- offA
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8  <- dirty
// ymm9  <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_EDGE_GEMV_ADD_T_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_edge_gemv_add_t_8_lib8)
#endif

	cmpl	$0, %r14d
	jle		0f // return

	movl	%r14d, %r15d
	sall	$2, %r15d // offA*sizeof(float)

	subq	%r15, %r11 // A - offA
	subq	%r15, %r13 // x - offA

	movl	%r10d, %r15d // kmax
	addl	%r14d, %r15d // kmax + offA

	vcvtsi2ss	%r14d, %xmm14, %xmm14 // offA
	vcvtsi2ss	%r15d, %xmm15, %xmm15 // offA + kmax
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm13
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm13
#endif
	vshufps		$0x00, %xmm14, %xmm14, %xmm14
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$1, %xmm14, %ymm14, %ymm14
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm13, %ymm14, %ymm14
	vsubps		%ymm15, %ymm13, %ymm15
	vandps		%ymm15, %ymm14, %ymm14

	vmaskmovps	0(%r13), %ymm14, %ymm12

	vmovaps	0(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm0, %ymm15, %ymm0

	vmovaps	32(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm1, %ymm15, %ymm1

	vmovaps	64(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm2, %ymm15, %ymm2

	vmovaps	96(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm3, %ymm15, %ymm3

	vmovaps	128(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm4, %ymm15, %ymm4

	vmovaps	160(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm5, %ymm15, %ymm5

	vmovaps	192(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm6, %ymm15, %ymm6

	vmovaps	224(%r11), %ymm8
	vmulps	%ymm8, %ymm12, %ymm15
	vaddps	%ymm7, %ymm15, %ymm7

	addq	$32, %r13 // x + 8
	addq	%r12, %r11 // A + bs*sda

	addl	%r14d, %r10d
	subl	$8, %r10d // kmax - (8-offA)

0: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_edge_gemv_add_t_8_lib8)
#endif
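



// Note on the mask above (derived from the code, for the reader's convenience):
// LC00 holds { 0.5, 1.5, ..., 7.5 }, so (offA - LC00[i]) has its sign bit set
// for lanes i >= offA, and (LC00[i] - (offA+kmax)) has it set for lanes
// i < offA+kmax; the vandps of the two keeps exactly the lanes
// offA <= i < offA+kmax, which vmaskmovps then uses to load only the valid
// entries of x.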




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSV_LN_INV_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_edge_trsv_ln_inv_8_lib8)
#endif

	vxorps	%ymm14, %ymm14, %ymm14

	vbroadcastss	0(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x01, %ymm1, %ymm0, %ymm0

	vmovaps			0(%r10), %ymm13
	vblendps		$0x01, %ymm14, %ymm13, %ymm13
	vpermilps		$0x00, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	4(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x02, %ymm1, %ymm0, %ymm0

	vmovaps			32(%r10), %ymm13
	vblendps		$0x03, %ymm14, %ymm13, %ymm13
	vpermilps		$0x55, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	8(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x04, %ymm1, %ymm0, %ymm0

	vmovaps			64(%r10), %ymm13
	vblendps		$0x07, %ymm14, %ymm13, %ymm13
	vpermilps		$0xaa, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	12(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x08, %ymm1, %ymm0, %ymm0

	vmovaps			96(%r10), %ymm13
	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
	vpermilps		$0xff, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	16(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x10, %ymm1, %ymm0, %ymm0

	vmovaps			128(%r10), %ymm13
	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
	vpermilps		$0x00, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	20(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x20, %ymm1, %ymm0, %ymm0

	vmovaps			160(%r10), %ymm13
	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
	vpermilps		$0x55, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	24(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x40, %ymm1, %ymm0, %ymm0

	vmovaps			192(%r10), %ymm13
	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
	vpermilps		$0xaa, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0
	vbroadcastss	28(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x80, %ymm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsv_ln_inv_8_lib8)
#endif
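



// For reference, a C sketch of the forward substitution above (illustrative
// only): E is lower triangular in an 8x8 panel (column j at E+j*8), its strictly
// upper part is masked to zero with vblendps, and inv_diag_E[i] holds
// 1.0/E[i+i*8]; z enters in ymm0 holding the right-hand side.
//
//	for(i=0; i<8; i++)
//		{
//		for(j=0; j<i; j++)
//			z[i] -= E[i+j*8] * z[j];
//		z[i] *= inv_diag_E[i];
//		}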




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10  <- E
// r11  <- inv_diag_E
// r12d <- kn
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10  <- E
// r11  <- inv_diag_E
// r12d <- kn
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_edge_trsv_ln_inv_8_vs_lib8)
#endif

	vxorps	%ymm14, %ymm14, %ymm14

	vbroadcastss	0(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x01, %ymm1, %ymm0, %ymm0
	vmovaps			0(%r10), %ymm13
	vblendps		$0x01, %ymm14, %ymm13, %ymm13
	vpermilps		$0x00, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$2, %r12d
	jl		0f // ret

	vbroadcastss	4(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x02, %ymm1, %ymm0, %ymm0
	vmovaps			32(%r10), %ymm13
	vblendps		$0x03, %ymm14, %ymm13, %ymm13
	vpermilps		$0x55, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$3, %r12d
	jl		0f // ret

	vbroadcastss	8(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x04, %ymm1, %ymm0, %ymm0
	vmovaps			64(%r10), %ymm13
	vblendps		$0x07, %ymm14, %ymm13, %ymm13
	vpermilps		$0xaa, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$4, %r12d
	jl		0f // ret

	vbroadcastss	12(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x08, %ymm1, %ymm0, %ymm0
	vmovaps			96(%r10), %ymm13
	vblendps		$0x0f, %ymm14, %ymm13, %ymm13
	vpermilps		$0xff, %ymm0, %ymm12
	vperm2f128		$0x00, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$5, %r12d
	jl		0f // ret

	vbroadcastss	16(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x10, %ymm1, %ymm0, %ymm0
	vmovaps			128(%r10), %ymm13
	vblendps		$0x1f, %ymm14, %ymm13, %ymm13
	vpermilps		$0x00, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$6, %r12d
	jl		0f // ret

	vbroadcastss	20(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x20, %ymm1, %ymm0, %ymm0
	vmovaps			160(%r10), %ymm13
	vblendps		$0x3f, %ymm14, %ymm13, %ymm13
	vpermilps		$0x55, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$7, %r12d
	jl		0f // ret

	vbroadcastss	24(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x40, %ymm1, %ymm0, %ymm0
	vmovaps			192(%r10), %ymm13
	vblendps		$0x7f, %ymm14, %ymm13, %ymm13
	vpermilps		$0xaa, %ymm0, %ymm12
	vperm2f128		$0x11, %ymm12, %ymm12, %ymm12
	vmulps			%ymm13, %ymm12, %ymm15
	vsubps			%ymm15, %ymm0, %ymm0

	cmpl	$8, %r12d
	jl		0f // ret

	vbroadcastss	28(%r11), %ymm12
	vmulps			%ymm0, %ymm12, %ymm1
	vblendps		$0x80, %ymm1, %ymm0, %ymm0

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsv_ln_inv_8_vs_lib8)
#endif




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSV_LT_INV_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_edge_trsv_lt_inv_8_lib8)
#endif

	vxorps	%ymm14, %ymm14, %ymm14

	vmovaps		0(%r10), %ymm12
	vblendps	$0x01, %ymm14, %ymm12, %ymm12
	vmovaps		32(%r10), %ymm13
	vblendps	$0x03, %ymm14, %ymm13, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm8
	vunpckhps	%ymm13, %ymm12, %ymm9

	vmovaps		64(%r10), %ymm12
	vblendps	$0x07, %ymm14, %ymm12, %ymm12
	vmovaps		96(%r10), %ymm13
	vblendps	$0x0f, %ymm14, %ymm13, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm10
	vunpckhps	%ymm13, %ymm12, %ymm11

	vshufps	$0x44, %ymm10, %ymm8, %ymm7
	vshufps	$0xee, %ymm10, %ymm8, %ymm4
	vshufps	$0x44, %ymm11, %ymm9, %ymm5
	vshufps	$0xee, %ymm11, %ymm9, %ymm6
	vextractf128	$0x1, %ymm7, %xmm7
	vextractf128	$0x1, %ymm4, %xmm8
	vextractf128	$0x1, %ymm5, %xmm9
	vextractf128	$0x1, %ymm6, %xmm10

	vmovaps		144(%r10), %xmm12
	vblendps	$0x01, %xmm14, %xmm12, %xmm12
	vmovaps		176(%r10), %xmm13
	vblendps	$0x03, %xmm14, %xmm13, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm1
	vunpckhps	%xmm13, %xmm12, %xmm2

	vmovaps		208(%r10), %xmm12
	vblendps	$0x07, %xmm14, %xmm12, %xmm12
	vmovaps		240(%r10), %xmm13
	vblendps	$0x0f, %xmm14, %xmm13, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm3
	vunpckhps	%xmm13, %xmm12, %xmm15

	vshufps	$0xee, %xmm3, %xmm1, %xmm11
	vshufps	$0x44, %xmm15, %xmm2, %xmm12
	vshufps	$0xee, %xmm15, %xmm2, %xmm13


	vxorps	%ymm14, %ymm14, %ymm14

	vextractf128	$0x1, %ymm0, %xmm1

	vshufps	$0xff, %xmm1, %xmm1, %xmm2
	vbroadcastss	28(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x08, %xmm2, %xmm1, %xmm1
	vmulps	%xmm10, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm13, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

	vshufps	$0xaa, %xmm1, %xmm1, %xmm2
	vbroadcastss	24(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x04, %xmm2, %xmm1, %xmm1
	vmulps	%xmm9, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm12, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

	vshufps	$0x55, %xmm1, %xmm1, %xmm2
	vbroadcastss	20(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x02, %xmm2, %xmm1, %xmm1
	vmulps	%xmm8, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm11, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

	vshufps	$0x00, %xmm1, %xmm1, %xmm2
	vbroadcastss	16(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x01, %xmm2, %xmm1, %xmm1
	vmulps	%xmm7, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

	vshufps	$0xff, %xmm0, %xmm0, %xmm2
	vbroadcastss	12(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x08, %xmm2, %xmm0, %xmm0
	vmulps	%xmm6, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

	vshufps	$0xaa, %xmm0, %xmm0, %xmm2
	vbroadcastss	8(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x04, %xmm2, %xmm0, %xmm0
	vmulps	%xmm5, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

	vshufps	$0x55, %xmm0, %xmm0, %xmm2
	vbroadcastss	4(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x02, %xmm2, %xmm0, %xmm0
	vmulps	%xmm4, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

	vshufps	$0x00, %xmm0, %xmm0, %xmm2
	vbroadcastss	0(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x01, %xmm2, %xmm0, %xmm0

	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsv_lt_inv_8_lib8)
#endif
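



// For reference, a C sketch of the backward substitution above (illustrative
// only): the routine solves with the transpose of the lower triangular E, so
// component i depends on the already-computed components j > i through E[j+i*8];
// the unpck/shufps block at the top transposes the needed columns of E into
// row-wise factors.
//
//	for(i=7; i>=0; i--)
//		{
//		for(j=i+1; j<8; j++)
//			z[i] -= E[j+i*8] * z[j];
//		z[i] *= inv_diag_E[i];
//		}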




// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12 <- km
// r13 <- kn
// r14 <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty
//
// output arguments:
// r10 <- E
// r11 <- inv_diag_E
// r12 <- km
// r13 <- kn
// r14 <- x
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm12 <- dirty
// ymm13 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_edge_trsv_lt_inv_8_vs_lib8)
#endif

	vcvtsi2ss	%r13d, %xmm14, %xmm14
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm13
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm13
#endif
	vshufps		$0x00, %xmm14, %xmm14, %xmm14
	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
	vsubps		%ymm14, %ymm13, %ymm14

	vmovups		0(%r14), %ymm15
	vblendvps	%ymm14, %ymm0, %ymm15, %ymm0



	vxorps	%ymm14, %ymm14, %ymm14

	vmovaps		0(%r10), %ymm12
	vblendps	$0x01, %ymm14, %ymm12, %ymm12
	cmpl	$2, %r13d
	jl		1f
	vmovaps		32(%r10), %ymm13
	vblendps	$0x03, %ymm14, %ymm13, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm8
	vunpckhps	%ymm13, %ymm12, %ymm9

	cmpl	$3, %r13d
	jl		2f
	vmovaps		64(%r10), %ymm12
	vblendps	$0x07, %ymm14, %ymm12, %ymm12
	cmpl	$4, %r13d
	jl		3f
	vmovaps		96(%r10), %ymm13
	vblendps	$0x0f, %ymm14, %ymm13, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm10
	vunpckhps	%ymm13, %ymm12, %ymm11

	vshufps	$0x44, %ymm10, %ymm8, %ymm7
	vshufps	$0xee, %ymm10, %ymm8, %ymm4
	vshufps	$0x44, %ymm11, %ymm9, %ymm5
	vshufps	$0xee, %ymm11, %ymm9, %ymm6
	vextractf128	$0x1, %ymm7, %xmm7
	vextractf128	$0x1, %ymm4, %xmm8
	vextractf128	$0x1, %ymm5, %xmm9
	vextractf128	$0x1, %ymm6, %xmm10

	cmpl	$5, %r13d
	jl		4f
	vmovaps		144(%r10), %xmm12
	vblendps	$0x01, %xmm14, %xmm12, %xmm12
	cmpl	$6, %r13d
	jl		5f
	vmovaps		176(%r10), %xmm13
	vblendps	$0x03, %xmm14, %xmm13, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm1
	vunpckhps	%xmm13, %xmm12, %xmm2

	cmpl	$7, %r13d
	jl		6f
	vmovaps		208(%r10), %xmm12
	vblendps	$0x07, %xmm14, %xmm12, %xmm12
	cmpl	$8, %r13d
	jl		7f
	vmovaps		240(%r10), %xmm13
	vblendps	$0x0f, %xmm14, %xmm13, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm3
	vunpckhps	%xmm13, %xmm12, %xmm15

	vshufps	$0xee, %xmm3, %xmm1, %xmm11
	vshufps	$0x44, %xmm15, %xmm2, %xmm12
	vshufps	$0xee, %xmm15, %xmm2, %xmm13

	jmp		0f



	vmovaps	%ymm14, %ymm12
1:
	vmovaps	%ymm14, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm8
	vunpckhps	%ymm13, %ymm12, %ymm9

2:
	vmovaps	%ymm14, %ymm12
3:
	vmovaps	%ymm14, %ymm13
	vunpcklps	%ymm13, %ymm12, %ymm10
	vunpckhps	%ymm13, %ymm12, %ymm11

	vshufps	$0x44, %ymm10, %ymm8, %ymm7
	vshufps	$0xee, %ymm10, %ymm8, %ymm4
	vshufps	$0x44, %ymm11, %ymm9, %ymm5
	vshufps	$0xee, %ymm11, %ymm9, %ymm6
	vextractf128	$0x1, %ymm7, %xmm7
	vextractf128	$0x1, %ymm4, %xmm8
	vextractf128	$0x1, %ymm5, %xmm9
	vextractf128	$0x1, %ymm6, %xmm10

	jmp		8f

4:
	vmovaps	%xmm14, %xmm12
5:
	vmovaps	%xmm14, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm1
	vunpckhps	%xmm13, %xmm12, %xmm2

6:
	vmovaps	%xmm14, %xmm12
7:
	vmovaps	%xmm14, %xmm13
	vunpcklps	%xmm13, %xmm12, %xmm3
	vunpckhps	%xmm13, %xmm12, %xmm15

	vshufps	$0xee, %xmm3, %xmm1, %xmm11
	vshufps	$0x44, %xmm15, %xmm2, %xmm12
	vshufps	$0xee, %xmm15, %xmm2, %xmm13

	jmp		0f // keep the factors just computed; only the kn<5 path must zero them

8:

	vmovaps	%xmm14, %xmm11
	vmovaps	%xmm14, %xmm12
	vmovaps	%xmm14, %xmm13

0:
	vxorps	%ymm14, %ymm14, %ymm14

	vextractf128	$0x1, %ymm0, %xmm1

	cmpl	$8, %r12d
	jl		0f

	vshufps	$0xff, %xmm1, %xmm1, %xmm2
	cmpl	$8, %r13d
	jl		1f
	vbroadcastss	28(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x08, %xmm2, %xmm1, %xmm1
1:
	vmulps	%xmm10, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm13, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

0:
	cmpl	$7, %r12d
	jl		0f

	vshufps	$0xaa, %xmm1, %xmm1, %xmm2
	cmpl	$7, %r13d
	jl		1f
	vbroadcastss	24(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x04, %xmm2, %xmm1, %xmm1
1:
	vmulps	%xmm9, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm12, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

0:
	cmpl	$6, %r12d
	jl		0f

	vshufps	$0x55, %xmm1, %xmm1, %xmm2
	cmpl	$6, %r13d
	jl		1f
	vbroadcastss	20(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x02, %xmm2, %xmm1, %xmm1
1:
	vmulps	%xmm8, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0
	vmulps	%xmm11, %xmm2, %xmm15
	vsubps	%xmm15, %xmm1, %xmm1

0:
	cmpl	$5, %r12d
	jl		0f

	vshufps	$0x00, %xmm1, %xmm1, %xmm2
	cmpl	$5, %r13d
	jl		1f
	vbroadcastss	16(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x01, %xmm2, %xmm1, %xmm1
1:
	vmulps	%xmm7, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

0:
	cmpl	$4, %r12d
	jl		0f

	vshufps	$0xff, %xmm0, %xmm0, %xmm2
	cmpl	$4, %r13d
	jl		1f
	vbroadcastss	12(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x08, %xmm2, %xmm0, %xmm0
1:
	vmulps	%xmm6, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

0:
	cmpl	$3, %r12d
	jl		0f

	vshufps	$0xaa, %xmm0, %xmm0, %xmm2
	cmpl	$3, %r13d
	jl		1f
	vbroadcastss	8(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x04, %xmm2, %xmm0, %xmm0
1:
	vmulps	%xmm5, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

0:
	cmpl	$2, %r12d
	jl		0f

	vshufps	$0x55, %xmm0, %xmm0, %xmm2
	cmpl	$2, %r13d
	jl		1f
	vbroadcastss	4(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x02, %xmm2, %xmm0, %xmm0
1:
	vmulps	%xmm4, %xmm2, %xmm15
	vsubps	%xmm15, %xmm0, %xmm0

0:
	cmpl	$1, %r12d
	jl		0f

	vshufps	$0x00, %xmm0, %xmm0, %xmm2
	cmpl	$1, %r13d
	jl		1f
	vbroadcastss	0(%r11), %xmm15
	vmulps		%xmm2, %xmm15, %xmm2
	vblendps	$0x01, %xmm2, %xmm0, %xmm0
1:

0:

	vinsertf128	$0x1, %xmm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsv_lt_inv_8_vs_lib8)
#endif




// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_AB_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_blend_n_scale_ab_8_lib8)
#endif

	// reduction
	vaddps	%ymm0, %ymm1, %ymm0
	vaddps	%ymm2, %ymm3, %ymm2
	vaddps	%ymm0, %ymm2, %ymm0

	// alpha
	vbroadcastss	0(%r10), %ymm15
	vmulps			%ymm0, %ymm15, %ymm0

	// beta
	vbroadcastss	0(%r11), %ymm15
	vmovups			0(%r12), %ymm14
	vmulps			%ymm15, %ymm14, %ymm14
	vaddps			%ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_n_scale_ab_8_lib8)
#endif
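



// For reference (illustrative only): with the four partial accumulators
// acc_a..acc_d of the n kernel in ymm0-ymm3, the blend/scale above computes
//
//	for(i=0; i<8; i++)
//		z[i] = alpha[0] * (acc_a[i] + acc_b[i] + acc_c[i] + acc_d[i]) + beta[0] * y[i];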




// common inner routine with file scope
//
// blend for ta==n, scale for alpha=-1.0 and beta=1.0
//
// input arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]_a
// ymm1 <- [z0 z1 z2 z3 z4 z5 z6 z7]_b
// ymm2 <- [z0 z1 z2 z3 z4 z5 z6 z7]_c
// ymm3 <- [z0 z1 z2 z3 z4 z5 z6 z7]_d
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_M11_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_blend_n_scale_m11_8_lib8)
#endif

	// reduction
	vaddps	%ymm0, %ymm1, %ymm0
	vaddps	%ymm2, %ymm3, %ymm2
	vaddps	%ymm0, %ymm2, %ymm0

	// beta
	vmovups	0(%r10), %ymm14
	vsubps	%ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_n_scale_m11_8_lib8)
#endif




// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_AB_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_ab_8_lib8)
#endif

	// reduction
	vhaddps	%ymm1, %ymm0, %ymm0
	vhaddps	%ymm3, %ymm2, %ymm2
	vhaddps	%ymm5, %ymm4, %ymm4
	vhaddps	%ymm7, %ymm6, %ymm6

	vhaddps	%ymm2, %ymm0, %ymm0
	vhaddps	%ymm6, %ymm4, %ymm4

	vperm2f128	$0x20, %ymm4, %ymm0, %ymm1
	vperm2f128	$0x13, %ymm0, %ymm4, %ymm0

	vaddps	%ymm0, %ymm1, %ymm0

	// alpha
	vbroadcastss	0(%r10), %ymm15
	vmulps			%ymm0, %ymm15, %ymm0

	// beta
	vbroadcastss	0(%r11), %ymm15
	vmovups			0(%r12), %ymm14
	vmulps			%ymm15, %ymm14, %ymm14
	vaddps			%ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_ab_8_lib8)
#endif
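



// Note on the reduction above (derived from the code): each of ymm0-ymm7 holds
// the 8 partial products of one output component, so the vhaddps/vperm2f128
// cascade transposes-and-sums them into a single vector; in C terms
// (illustrative only):
//
//	for(j=0; j<8; j++)
//		{
//		tmp = 0.0f;
//		for(i=0; i<8; i++)
//			tmp += acc[j][i];
//		z[j] = alpha[0] * tmp + beta[0] * y[j];
//		}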




// common inner routine with file scope
//
// blend for ta==t, scale for alpha=-1.0 and beta=1.0
//
// input arguments:
// r10 <- y
// ymm0 <- [z0a z0b z0c z0d z0e z0f z0g z0h]
// ymm1 <- [z1a z1b z1c z1d z1e z1f z1g z1h]
// ymm2 <- [z2a z2b z2c z2d z2e z2f z2g z2h]
// ymm3 <- [z3a z3b z3c z3d z3e z3f z3g z3h]
// ymm4 <- [z4a z4b z4c z4d z4e z4f z4g z4h]
// ymm5 <- [z5a z5b z5c z5d z5e z5f z5g z5h]
// ymm6 <- [z6a z6b z6c z6d z6e z6f z6g z6h]
// ymm7 <- [z7a z7b z7c z7d z7e z7f z7g z7h]
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- y
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm1 <- dirty
// ymm2 <- dirty
// ymm3 <- dirty
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_M11_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_m11_8_lib8)
#endif

	// reduction
	vhaddps	%ymm1, %ymm0, %ymm0
	vhaddps	%ymm3, %ymm2, %ymm2
	vhaddps	%ymm5, %ymm4, %ymm4
	vhaddps	%ymm7, %ymm6, %ymm6

	vhaddps	%ymm2, %ymm0, %ymm0
	vhaddps	%ymm6, %ymm4, %ymm4

	vperm2f128	$0x20, %ymm4, %ymm0, %ymm1
	vperm2f128	$0x13, %ymm0, %ymm4, %ymm0

	vaddps	%ymm0, %ymm1, %ymm0

	// beta
	vmovups	0(%r10), %ymm14
	vsubps	%ymm0, %ymm14, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_m11_8_lib8)
#endif




// common inner routine with file scope
//
// store
//
// input arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
//
// output arguments:
// r10 <- z
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_store_8_lib8)
#endif

	vmovups	%ymm0, 0(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_8_lib8)
#endif




// common inner routine with file scope
//
// store vs
//
// input arguments:
// r10  <- D
// r11d <- km
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- D
// r11d <- km
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8_VS_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_store_8_vs_lib8)
#endif

	vcvtsi2ss	%r11d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm14
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm14
#endif
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm15, %ymm14, %ymm15

	vmaskmovps	%ymm0, %ymm15, 0(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_8_vs_lib8)
#endif




// common inner routine with file scope
//
// store gen
//
// input arguments:
// r10  <- D
// r11d <- k0 : start from (inc)
// r12d <- k1 : up to (exc)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10  <- D
// r11d <- k0 : start from (inc)
// r12d <- k1 : up to (exc)
// ymm0 <- [z0 z1 z2 z3 z4 z5 z6 z7]
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_STORE_8_GEN_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_store_8_gen_lib8)
#endif

	// compute mask for rows
	vcvtsi2ss	%r11d, %xmm14, %xmm14
	vcvtsi2ss	%r12d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps		$0x00, %xmm14, %xmm14, %xmm14
	vshufps		$0x00, %xmm15, %xmm15, %xmm15
	vinsertf128	$0x1, %xmm14, %ymm14, %ymm14
	vinsertf128	$0x1, %xmm15, %ymm15, %ymm15
	vsubps		%ymm12, %ymm14, %ymm14
	vsubps		%ymm15, %ymm12, %ymm15
	vandps		%ymm14, %ymm15, %ymm15

	vmaskmovps	%ymm0, %ymm15, 0(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_8_gen_lib8)
#endif
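



// Note on the masked stores above (derived from the code): the write mask is
// built from LC00 = { 0.5, ..., 7.5 } with the same sign-bit trick used in the
// gemv_t edge routine; in C terms (illustrative only):
//
//	for(i=0; i<8; i++)
//		if(i>=k0 && i<k1) // inner_store_8_vs_lib8 is the special case k0=0, k1=km
//			z[i] = acc[i];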




//                             1      2             3         4         5            6         7
// void kernel_sgemv_n_8_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemv_n_8_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3


	// call inner sgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	CALL(inner_kernel_gemv_add_n_8_lib8)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_8_LIB8
#else
	CALL(inner_blend_n_scale_ab_8_lib8)
#endif


	// store

	movq	ARG7, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
	CALL(inner_store_8_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemv_n_8_lib8)




//                                1      2             3         4         5            6         7         8
// void kernel_sgemv_n_8_vs_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemv_n_8_vs_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3


	// call inner sgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	CALL(inner_kernel_gemv_add_n_8_lib8)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_8_LIB8
#else
	CALL(inner_blend_n_scale_ab_8_lib8)
#endif


	// store

	movq	ARG7, %r10 // z
	movq	ARG8, %r11 // k1

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
	CALL(inner_store_8_vs_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemv_n_8_vs_lib8)




//                                 1      2             3         4         5            6         7         8       9
// void kernel_sgemv_n_8_gen_lib8(int k, float *alpha, float *A, float *x, float *beta, float *y, float *z, int k0, int k1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemv_n_8_gen_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3


	// call inner sgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	CALL(inner_kernel_gemv_add_n_8_lib8)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_8_LIB8
#else
	CALL(inner_blend_n_scale_ab_8_lib8)
#endif


	// store

	movq	ARG7, %r10 // z
	movq	ARG8, %r11 // k0
	movq	ARG9, %r12 // k1

#if MACRO_LEVEL>=1
	INNER_STORE_8_GEN_LIB8
#else
	CALL(inner_store_8_gen_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemv_n_8_gen_lib8)
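



// Example usage from C (illustrative sketch, not part of this file): computing
// z[0:8] = alpha * A[0:8,0:k] * x[0:k] + beta * y[0:8], with A already packed in
// panel-major lib8 format (e.g. by the BLASFEO packing routines).
//
//	float alpha = 1.0f;
//	float beta = 1.0f;
//	kernel_sgemv_n_8_lib8(k, &alpha, A, x, &beta, y, z);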




//                             1      2             3         4         5        6         7            8         9
// void kernel_sgemv_t_8_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemv_t_8_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel t

	movq	ARG1, %r10 // k
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 8*sda*sizeof(float)
	movq	ARG6, %r13 // x
	movq	ARG3, %r14 // offA

#if MACRO_LEVEL>=2
	INNER_EDGE_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_edge_gemv_add_t_8_lib8)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_kernel_gemv_add_t_8_lib8)
#endif


	// call inner blender t

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_8_LIB8
#else
	CALL(inner_blend_t_scale_ab_8_lib8)
#endif


	// store

	movq	ARG9, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
	CALL(inner_store_8_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemv_t_8_lib8)




//                                1      2             3         4         5        6         7            8         9         10
// void kernel_sgemv_t_8_vs_lib8(int k, float *alpha, int offA, float *A, int sda, float *x, float *beta, float *y, float *z, int k1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemv_t_8_vs_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel t

	movq	ARG1, %r10 // k
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 8*sda*sizeof(float)
	movq	ARG6, %r13 // x
	movq	ARG3, %r14 // offA

#if MACRO_LEVEL>=2
	INNER_EDGE_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_edge_gemv_add_t_8_lib8)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_kernel_gemv_add_t_8_lib8)
#endif


	// call inner blender t

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_8_LIB8
#else
	CALL(inner_blend_t_scale_ab_8_lib8)
#endif


	// store

	movq	ARG9, %r10 // z
	movq	ARG10, %r11 // km

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
	CALL(inner_store_8_vs_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemv_t_8_vs_lib8)




//                                 1      2         3                   4         5         6
// void kernel_strsv_ln_inv_8_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z);

	.p2align 4,,15
	GLOB_FUN_START(kernel_strsv_ln_inv_8_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	CALL(inner_kernel_gemv_add_n_8_lib8)
#endif

	movq	%r11, %r13 // A+8*k*sizeof(float)


	// call inner blender n

	movq	ARG5, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_M11_8_LIB8
#else
	CALL(inner_blend_n_scale_m11_8_lib8)
#endif


	// solution

	movq	%r13, %r10 // A+8*k*sizeof(float)
	movq	ARG3, %r11 // inv_diag_A

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LN_INV_8_LIB8
#else
	CALL(inner_edge_trsv_ln_inv_8_lib8)
#endif


	// store

	movq	ARG6, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
	CALL(inner_store_8_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_strsv_ln_inv_8_lib8)
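



// For reference (illustrative only): the strsv_ln kernel above computes
//
//	t[0:8] = y[0:8] - A[0:8,0:k] * x[0:k];
//	z[0:8] = inv(L) * t[0:8];
//
// where L is the lower-triangular 8x8 block stored at column k of the panel
// (i.e. at A+8*k*sizeof(float)) and inv_diag_A holds its inverted diagonal.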




//                                    1      2         3                   4         5         6         7       8
// void kernel_strsv_ln_inv_8_vs_lib8(int k, float *A, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);

	.p2align 4,,15
	GLOB_FUN_START(kernel_strsv_ln_inv_8_vs_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel n

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_N_8_LIB8
#else
	CALL(inner_kernel_gemv_add_n_8_lib8)
#endif

	movq	%r11, %r13 // A+8*k*sizeof(float)


	// call inner blender n

	movq	ARG5, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_M11_8_LIB8
#else
	CALL(inner_blend_n_scale_m11_8_lib8)
#endif


	// solution

	movq	%r13, %r10 // A+8*k*sizeof(float)
	movq	ARG3, %r11 // inv_diag_A
	movq	ARG8, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LN_INV_8_VS_LIB8
#else
	CALL(inner_edge_trsv_ln_inv_8_vs_lib8)
#endif


	// store

	movq	ARG6, %r10 // z
	movq	ARG7, %r11 // km

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
	CALL(inner_store_8_vs_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_strsv_ln_inv_8_vs_lib8)




//                                 1      2         3        4                   5         6         7
// void kernel_strsv_lt_inv_8_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z);

	.p2align 4,,15
	GLOB_FUN_START(kernel_strsv_lt_inv_8_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel t

	movq	ARG1, %r10 // k
	subl	$8, %r10d
	movq	ARG2, %r11 // A
	movq	ARG3, %r12
	sall	$5, %r12d // 8*sda*sizeof(float)
	addq	%r12, %r11 // A+8*sda*sizeof(float)
	movq	ARG5, %r13 // x
	addq	$32, %r13 // x+8

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_kernel_gemv_add_t_8_lib8)
#endif


	// call inner blender t

	movq	ARG6, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_M11_8_LIB8
#else
	CALL(inner_blend_t_scale_m11_8_lib8)
#endif


	// solution

	movq	ARG2, %r10 // A
	movq	ARG4, %r11 // inv_diag_A

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LT_INV_8_LIB8
#else
	CALL(inner_edge_trsv_lt_inv_8_lib8)
#endif


	// store

	movq	ARG7, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_8_LIB8
#else
	CALL(inner_store_8_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_strsv_lt_inv_8_lib8)




//                                    1      2         3        4                   5         6         7         8       9
// void kernel_strsv_lt_inv_8_vs_lib8(int k, float *A, int sda, float *inv_diag_A, float *x, float *y, float *z, int km, int kn);

	.p2align 4,,15
	GLOB_FUN_START(kernel_strsv_lt_inv_8_vs_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorps	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner sgemv kernel t

	movq	ARG1, %r10 // k
	subl	$8, %r10d
	movq	ARG2, %r11 // A
	movq	ARG3, %r12
	sall	$5, %r12d // 8*sda*sizeof(float)
	addq	%r12, %r11 // A+8*sda*sizeof(float)
	movq	ARG5, %r13 // x
	addq	$32, %r13 // x+8

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMV_ADD_T_8_LIB8
#else
	CALL(inner_kernel_gemv_add_t_8_lib8)
#endif


	// call inner blender t

	movq	ARG6, %r10 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_M11_8_LIB8
#else
	CALL(inner_blend_t_scale_m11_8_lib8)
#endif


	// solution

	movq	ARG2, %r10 // A
	movq	ARG4, %r11 // inv_diag_A
	movq	ARG8, %r12 // km
	movq	ARG9, %r13 // kn
	movq	ARG5, %r14 // x

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSV_LT_INV_8_VS_LIB8
#else
	CALL(inner_edge_trsv_lt_inv_8_vs_lib8)
#endif


	// store

	movq	ARG7, %r10 // z
	movq	ARG9, %r11 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8_VS_LIB8
#else
	CALL(inner_store_8_vs_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_strsv_lt_inv_8_vs_lib8)




	// read-only data
#if defined(OS_LINUX)
	.section	.rodata.cst32,"aM",@progbits,32
#elif defined(OS_MAC)
	.section	__TEXT,__const
#elif defined(OS_WINDOWS)
	.section	.rdata,"dr"
#endif

#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.align 32
.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#elif defined(OS_MAC)
	.align 5
LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 }
#endif
	.float	0.5
	.float	1.5
	.float	2.5
	.float	3.5
	.float	4.5
	.float	5.5
	.float	6.5
	.float	7.5




#if defined(OS_LINUX)
	.section	.note.GNU-stack,"",@progbits
#elif defined(OS_MAC)
	.subsections_via_symbols
#endif