/**************************************************************************************************
*
* This file is part of BLASFEO.
*
* BLASFEO -- BLAS For Embedded Optimization.
* Copyright (C) 2019 by Gianluca Frison.
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.
* All rights reserved.
*
* The 2-Clause BSD License
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* 1. Redistributions of source code must retain the above copyright notice, this
*    list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright notice,
*    this list of conditions and the following disclaimer in the documentation
*    and/or other materials provided with the distribution.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de
*
**************************************************************************************************/

// -------------------------------------------------------------------------------------------------
// Syntax : GNU as, AT&T operand order (op src, dst), run through the C preprocessor.
// Target : x86-64 with SSE3 (movddup and haddpd are used).
// ABI    : System V AMD64 when OS_LINUX or OS_MAC is defined, Microsoft x64 when
//          OS_WINDOWS is defined. ARGn, PROLOGUE and EPILOGUE hide the difference.
// Build  : MACRO_LEVEL is expected to come from the build system. The "inner" routines
//          are expanded in place as assembler macros when MACRO_LEVEL is high enough,
//          and are reached with CALL otherwise.
// Data   : A is addressed in panels of 4 rows. One column of a panel is 4 doubles
//          (32 bytes), consecutive panels are 32*sda bytes apart. A is read with
//          movapd, so it has to be 16-byte aligned. x, y and z are accessed with
//          unaligned or scalar moves.
// Labels : only numeric local labels are used inside the inner routines, because the
//          same body may be expanded several times as a macro.
// -------------------------------------------------------------------------------------------------

#if defined(OS_LINUX) || defined(OS_MAC)

// System V AMD64: six integer arguments in registers, the rest on the stack above the
// return address. PROLOGUE saves the callee-saved integer registers in a private frame.

//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp);
#define EPILOGUE \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#if defined(OS_LINUX)

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.type NAME, @function; \
NAME:
#define FUN_START(NAME) \
	.type NAME, @function; \
NAME:
#define FUN_END(NAME) \
	.size NAME, .-NAME
#define CALL(NAME) \
	call NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd	.LC11(%rip), %xmm15; \
//	xorpd	%xmm15, %xmm0; \
//	xorpd	%xmm15, %xmm1; \
//	xorpd	%xmm15, %xmm2; \
//	xorpd	%xmm15, %xmm3; \
//	xorpd	%xmm15, %xmm4; \
//	xorpd	%xmm15, %xmm5; \
//	xorpd	%xmm15, %xmm6; \
//	xorpd	%xmm15, %xmm7

#else // defined(OS_MAC)

// Mach-O symbols carry a leading underscore and have no .type/.size directives.

#define GLOB_FUN_START(NAME) \
	.globl _ ## NAME; \
_ ## NAME:
#define FUN_START(NAME) \
_ ## NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	callq _ ## NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd	LC11(%rip), %xmm15; \
//	xorpd	%xmm15, %xmm0; \
//	xorpd	%xmm15, %xmm1; \
//	xorpd	%xmm15, %xmm2; \
//	xorpd	%xmm15, %xmm3; \
//	xorpd	%xmm15, %xmm4; \
//	xorpd	%xmm15, %xmm5; \
//	xorpd	%xmm15, %xmm6; \
//	xorpd	%xmm15, %xmm7

#endif

#elif defined(OS_WINDOWS)

// Microsoft x64: four integer arguments in registers, the rest on the stack above the
// return address and the 32-byte shadow area. rdi, rsi and xmm6-xmm15 are callee-saved
// in this ABI, so PROLOGUE saves them as well. The frame is not 16-byte aligned, hence
// movups for the vector saves.

#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE +  40(%rsp)
#define ARG6  STACKSIZE +  48(%rsp)
#define ARG7  STACKSIZE +  56(%rsp)
#define ARG8  STACKSIZE +  64(%rsp)
#define ARG9  STACKSIZE +  72(%rsp)
#define ARG10 STACKSIZE +  80(%rsp)
#define ARG11 STACKSIZE +  88(%rsp)
#define ARG12 STACKSIZE +  96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	movups	%xmm6,   64(%rsp); \
	movups	%xmm7,   80(%rsp); \
	movups	%xmm8,   96(%rsp); \
	movups	%xmm9,  112(%rsp); \
	movups	%xmm10, 128(%rsp); \
	movups	%xmm11, 144(%rsp); \
	movups	%xmm12, 160(%rsp); \
	movups	%xmm13, 176(%rsp); \
	movups	%xmm14, 192(%rsp); \
	movups	%xmm15, 208(%rsp);
#define EPILOGUE \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	movups	 64(%rsp), %xmm6; \
	movups	 80(%rsp), %xmm7; \
	movups	 96(%rsp), %xmm8; \
	movups	112(%rsp), %xmm9; \
	movups	128(%rsp), %xmm10; \
	movups	144(%rsp), %xmm11; \
	movups	160(%rsp), %xmm12; \
	movups	176(%rsp), %xmm13; \
	movups	192(%rsp), %xmm14; \
	movups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_START(NAME) \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	call NAME
#define ZERO_ACC \
	xorpd	%xmm0, %xmm0; \
	movapd	%xmm0, %xmm1; \
	movapd	%xmm0, %xmm2; \
	movapd	%xmm0, %xmm3
//#define NEG_ACC \
//	movapd	.LC11(%rip), %xmm15; \
//	xorpd	%xmm15, %xmm0; \
//	xorpd	%xmm15, %xmm1; \
//	xorpd	%xmm15, %xmm2; \
//	xorpd	%xmm15, %xmm3; \
//	xorpd	%xmm15, %xmm4; \
//	xorpd	%xmm15, %xmm5; \
//	xorpd	%xmm15, %xmm6; \
//	xorpd	%xmm15, %xmm7

#else

#error wrong OS

#endif



#if defined(OS_LINUX) || defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif





// common inner routine with file scope
//
// z += A * x for one panel of 4 rows and k columns (non-transposed product).
// Even and odd columns are accumulated in separate register pairs (_a and _b) in the
// main loop; the caller has to add the two pairs together afterwards.
//
// input arguments:
// r10d  <- k (number of columns)
// r11   <- A (16-byte aligned, 32 bytes per column)
// r12   <- x
// xmm0  <- [z0 z1]_a
// xmm1  <- [z2 z3]_a
// xmm2  <- [z0 z1]_b
// xmm3  <- [z2 z3]_b
//
// output arguments:
// r10d  <- 0 (unchanged if it was not positive on entry)
// r11   <- A+4*k*sizeof(double)
// r12   <- x+k*sizeof(double)
// xmm0  <- [z0 z1]_a
// xmm1  <- [z2 z3]_a
// xmm2  <- [z0 z1]_b
// xmm3  <- [z2 z3]_b
//
// clobbers: xmm8, xmm12, flags

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_n_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop: 4 columns per iteration
	.p2align 3
1: // main loop

	// column 0 -> accumulators _a
	movddup	0(%r12), %xmm12
	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1
	subl	$4, %r10d

	// column 1 -> accumulators _b
	movddup	8(%r12), %xmm12
	movapd	32(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movapd	48(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	// column 2 -> accumulators _a
	movddup	16(%r12), %xmm12
	movapd	64(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	80(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	// column 3 -> accumulators _b
	movddup	24(%r12), %xmm12
	movapd	96(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2
	movapd	112(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	addq	$128, %r11
	addq	$32, %r12

	cmpl	$3, %r10d
	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up: one column per iteration, always into accumulators _a

	movddup	0(%r12), %xmm12
	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	addq	$32, %r11
	addq	$8, %r12

	subl	$1, %r10d
	cmpl	$0, %r10d

	jg		0b // clean-up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_n_4_lib4)
#endif





// common inner routine with file scope
//
// z += A^T * x for 4 columns of A and k rows (transposed product).
// Each accumulator holds two partial sums of one output element; the caller reduces
// them with a horizontal add.
//
// input arguments:
// r10d  <- k (number of rows)
// r11   <- A (16-byte aligned for the main loop)
// r12   <- bs*sda*sizeof(double) = 32*sda (distance between panels)
// r13   <- x
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// output arguments:
// r10d  <- 0 (unchanged if it was not positive on entry)
// r11   <- A advanced by one panel stride per 4 rows of the main loop and by
//          sizeof(double) per row of the clean-up loop
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x+k*sizeof(double)
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// clobbers: xmm8, xmm12, flags

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_t_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop: one full panel (4 rows) per iteration
	.p2align 3
1: // main loop

	// rows 0-1 of the panel against x[0:1]
	movupd	0(%r13), %xmm12

	movapd	0(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0
	subl	$4, %r10d

	movapd	32(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	movapd	64(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2

	movapd	96(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	// rows 2-3 of the panel against x[2:3]
	movupd	16(%r13), %xmm12

	movapd	16(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm0

	movapd	48(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm1

	movapd	80(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm2

	movapd	112(%r11), %xmm8
	mulpd	%xmm12, %xmm8
	addpd	%xmm8, %xmm3

	addq	%r12, %r11 // next panel
	addq	$32, %r13

	cmpl	$3, %r10d
	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up: one row per iteration, scalar, into the low lane of each accumulator

	movsd	0(%r13), %xmm12

	movsd	0(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0
	subl	$1, %r10d

	movsd	32(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	64(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm2

	movsd	96(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm3

	addq	$8, %r11
	addq	$8, %r13

	cmpl	$0, %r10d
	jg		0b // clean-up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_t_4_lib4)
#endif





// common inner routine with file scope
//
// Fused non-transposed and transposed product over the same 4 columns of A:
//   z_n[0:k] += A * x_n   (x_n already scaled and broadcast in xmm4-xmm7, z_n updated
//                          in memory)
//   z_t      += A^T * x_t (accumulated in xmm0-xmm3)
//
// input arguments:
// r10d  <- k (number of rows)
// r11   <- A (16-byte aligned for the main loop)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t
// r14   <- z_n
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3
//
// output arguments:
// r10d  <- 0 (unchanged if it was not positive on entry)
// r11   <- A advanced by one panel stride per 4 rows of the main loop and by
//          sizeof(double) per row of the clean-up loop
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t+k*sizeof(double)
// r14   <- z_n+k*sizeof(double)
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3
//
// clobbers: xmm9-xmm12, xmm14, xmm15, flags

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemv_add_nt_4_lib4)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$4, %r10d
	jl		0f // clean-up loop

	// main loop: one full panel (4 rows) per iteration
	.p2align 3
1: // main loop

	movupd	0(%r13), %xmm9   // x_t[0:1]
	movupd	16(%r13), %xmm10 // x_t[2:3]
	movupd	0(%r14), %xmm11  // z_n[0:1]
	movupd	16(%r14), %xmm12 // z_n[2:3]

	subl	$4, %r10d

	// column 0
	movapd	0(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm11

	movapd	16(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm12

	// column 1
	movapd	32(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm11

	movapd	48(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm12

	// column 2
	movapd	64(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm2
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm11

	movapd	80(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm2
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm12

	// column 3
	movapd	96(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm3
	mulpd	%xmm7, %xmm15
	addpd	%xmm15, %xmm11

	movapd	112(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm3
	mulpd	%xmm7, %xmm15
	addpd	%xmm15, %xmm12

	movupd	%xmm11, 0(%r14)
	movupd	%xmm12, 16(%r14)

	addq	%r12, %r11 // next panel
	addq	$32, %r13
	addq	$32, %r14

	cmpl	$3, %r10d
	jg		1b // main loop


	// consider clean-up
	cmpl	$0, %r10d
	jle		2f // return

0: // clean-up: one row per iteration, scalar

	movsd	0(%r13), %xmm9
	movsd	0(%r14), %xmm11

	subl	$1, %r10d

	movsd	0(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm0
	mulsd	%xmm4, %xmm15
	addsd	%xmm15, %xmm11

	movsd	32(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm1
	mulsd	%xmm5, %xmm15
	addsd	%xmm15, %xmm11

	movsd	64(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm2
	mulsd	%xmm6, %xmm15
	addsd	%xmm15, %xmm11

	movsd	96(%r11), %xmm14
	movsd	%xmm14, %xmm15
	mulsd	%xmm9, %xmm14
	addsd	%xmm14, %xmm3
	mulsd	%xmm7, %xmm15
	addsd	%xmm15, %xmm11

	movsd	%xmm11, 0(%r14)

	addq	$8, %r11
	addq	$8, %r13
	addq	$8, %r14

	cmpl	$0, %r10d
	jg		0b // clean-up loop

2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_dgemv_add_nt_4_lib4)
#endif





// common inner routine with file scope
//
// Edge of the transposed product when A does not start on a panel boundary: handles
// the first kend = min(k, 4-offA) rows one at a time, then moves A to the start of the
// next panel so that inner_kernel_dgemv_add_t_4_lib4 can continue from there.
// Does nothing when offA or k is not positive.
//
// NOTE(review): the code to add offA*sizeof(double) to A is commented out below, so
// r11 is presumably expected to point at the first row to process already - confirm
// against the callers.
//
// input arguments:
// r10d  <- k (number of rows)
// r11   <- A
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x
// r14d  <- offA
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// output arguments:
// r10d  <- k-kend
// r11   <- A+kend*sizeof(double), plus 32*sda-32 if rows are left to process
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x+kend*sizeof(double)
// r14d  <- offA
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// clobbers: r15, xmm8, xmm12, flags

#if MACRO_LEVEL>=2
	.macro INNER_EDGE_GEMV_ADD_T_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dgemv_add_t_4_lib4)
#endif

	cmpl	$0, %r14d // offset==0
	jle		2f // end

	cmpl	$0, %r10d // k==0
	jle		2f // end

	movl	$4, %r15d // load 4
	subl	%r14d, %r15d // 4-offsetA
	cmpl	%r10d, %r15d // 4-offsetA > k ?
	cmovgl	%r10d, %r15d // kend=min(k,4-offsetA)

//	movl	%r14d, %eax // load offsetA
//	sall	$3, %eax // offsetA*sizeof(double)
//	addq	%rax, %r11 // A+offsetA*sizeof(double)

1:
	movsd	0(%r13), %xmm12

	movsd	0(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm0

	movsd	32(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm1

	movsd	64(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm2

	movsd	96(%r11), %xmm8
	mulsd	%xmm12, %xmm8
	addsd	%xmm8, %xmm3

	// exactly one row is consumed per iteration, so k is decremented exactly once
	subl	$1, %r10d // k-=1
	subl	$1, %r15d // k_panel-=1
	addq	$8, %r11 // A+=1 row
	addq	$8, %r13 // x+=1

	cmpl	$0, %r15d // if k_panel>0
	jg		1b // loop 1

	cmpl	$0, %r10d // if k<=0
	jle		2f // end

	addq	%r12, %r11 // A+=bs*sda*sizeof(double)
	subq	$32, %r11 // A-=4*sizeof(double): back to row 0 of the next panel

2:

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_edge_dgemv_add_t_4_lib4)
#endif





// common inner routine with file scope
//
// Diagonal 4x4 block of the fused product used by the symmetric kernel.
// Only the entries on and below the diagonal of the block are used. Each diagonal
// entry contributes once (to z_t), each entry below the diagonal contributes to both
// z_n and z_t. This is done by zeroing single lanes with xmm13 and by not reading
// rows 0-1 of columns 2 and 3 at all.
// The block is processed unconditionally: 4 rows are read and written whatever kmax is.
//
// input arguments:
// r10   <- kmax
// r11   <- A (16-byte aligned)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t
// r14   <- z_n
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3
//
// output arguments:
// r10   <- kmax-4
// r11   <- A+bs*sda*sizeof(double) (next panel)
// r12   <- bs*sda*sizeof(double) = 32*sda
// r13   <- x_t+4*sizeof(double)
// r14   <- z_n+4*sizeof(double)
// xmm0  <- [z_t_0a z_t_0b]
// xmm1  <- [z_t_1a z_t_1b]
// xmm2  <- [z_t_2a z_t_2b]
// xmm3  <- [z_t_3a z_t_3b]
// xmm4  <- x_n_0
// xmm5  <- x_n_1
// xmm6  <- x_n_2
// xmm7  <- x_n_3
//
// clobbers: xmm9-xmm15, flags

#if MACRO_LEVEL>=2
	.macro INNER_EDGE_DSYMV_ADD_NT_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dsymv_add_nt_4_lib4)
#endif

	xorpd	%xmm13, %xmm13 // zero, used to clear single lanes

	movupd	0(%r13), %xmm9   // x_t[0:1]
	movupd	16(%r13), %xmm10 // x_t[2:3]
	movupd	0(%r14), %xmm11  // z_n[0:1]
	movupd	16(%r14), %xmm12 // z_n[2:3]

	// column 0, rows 0-1: the diagonal entry (row 0) is dropped from the z_n update
	movapd	0(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm0
	movsd	%xmm13, %xmm15 // clear low lane
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm11

	// column 0, rows 2-3
	movapd	16(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm0
	mulpd	%xmm4, %xmm15
	addpd	%xmm15, %xmm12

	// column 1, rows 0-1: row 0 is dropped from the z_t update, no z_n update at all
	movapd	32(%r11), %xmm14
	movapd	%xmm14, %xmm15
	movsd	%xmm13, %xmm14 // clear low lane
	mulpd	%xmm9, %xmm14
	addpd	%xmm14, %xmm1

	// column 1, rows 2-3
	movapd	48(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm1
	mulpd	%xmm5, %xmm15
	addpd	%xmm15, %xmm12

	// column 2, rows 0-1 (offset 64): not read

	// column 2, rows 2-3: the diagonal entry (row 2) is dropped from the z_n update
	movapd	80(%r11), %xmm14
	movapd	%xmm14, %xmm15
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm2
	movsd	%xmm13, %xmm15 // clear low lane
	mulpd	%xmm6, %xmm15
	addpd	%xmm15, %xmm12

	// column 3, rows 0-1 (offset 96): not read

	// column 3, rows 2-3: row 2 is dropped from the z_t update, no z_n update at all
	movapd	112(%r11), %xmm14
	movapd	%xmm14, %xmm15
	movsd	%xmm13, %xmm14 // clear low lane
	mulpd	%xmm10, %xmm14
	addpd	%xmm14, %xmm3

	movupd	%xmm11, 0(%r14)
	movupd	%xmm12, 16(%r14)

	addq	%r12, %r11 // next panel
	addq	$32, %r13
	addq	$32, %r14

	subq	$4, %r10

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_edge_dsymv_add_nt_4_lib4)
#endif





// The routine below is disabled. It mixes SSE instructions with AVX instructions and
// ymm registers, so it is kept for reference only.

#if 0

// common inner routine with file scope
//
// triangular substitution with vector RHS
//
// input arguments:
// r10  <- E
// r11  <- inv_diag_E
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]
//
// output arguments:
// r10  <- E
// r11  <- inv_diag_E
// xmm0 <- [z0 z1]
// xmm1 <- [z2 z3]

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_DTRSV_LN_INV_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_edge_dtrsv_ln_inv_4_lib4)
#endif

	xorpd	%xmm14, %xmm14

	movddup	0(%r11), %xmm12
	mulpd	%xmm0, %xmm12
	movsd	%xmm12, %xmm0

	movapd	0(%r10), %xmm13
	movsd	%xmm14, %xmm13
	movddup	%xmm0, %xmm12
	mulpd	%xmm13, %xmm12
	subpd	%xmm12, %xmm0
	movddup	8(%r11), %xmm12
	mulpd	%xmm0, %xmm12
	movhpd	%xmm12, %xmm0

	movapd	32(%r10), %ymm13
	vblendpd	$0x3, %ymm14, %ymm13, %ymm13
	vpermilpd	$0x3, %ymm0, %ymm12
	vperm2f128	$0x00, %ymm12, %ymm12, %ymm12
	vmulpd	%ymm13, %ymm12, %ymm15
	vsubpd	%ymm15, %ymm0, %ymm0
	vbroadcastsd	16(%r11), %ymm12
	vmulpd	%ymm0, %ymm12, %ymm1
	vblendpd	$0x4, %ymm1, %ymm0, %ymm0

	vmovapd	64(%r10), %ymm13
	vblendpd	$0x7, %ymm14, %ymm13, %ymm13
	vpermilpd	$0x0, %ymm0, %ymm12
	vperm2f128	$0x11, %ymm12, %ymm12, %ymm12
	vmulpd	%ymm13, %ymm12, %ymm15
	vsubpd	%ymm15, %ymm0, %ymm0
	vbroadcastsd	24(%r11), %ymm12
	vmulpd	%ymm0, %ymm12, %ymm1
	vblendpd	$0x8, %ymm1, %ymm0, %ymm0

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_dtrsv_ln_inv_4_lib4)
#endif

#endif





// common inner routine with file scope
//
// blend for ta==n, scale for generic alpha and beta:
//   z = alpha*(acc_a + acc_b) + beta*y
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- y
// xmm0  <- [z0 z1]_a
// xmm1  <- [z2 z3]_a
// xmm2  <- [z0 z1]_b
// xmm3  <- [z2 z3]_b
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- y
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// clobbers: xmm14, xmm15

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_n_scale_ab_4_lib4)
#endif

	// reduction of the two accumulator pairs
	addpd	%xmm2, %xmm0
	addpd	%xmm3, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta
	movddup	0(%r11), %xmm15
	movupd	0(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_n_scale_ab_4_lib4)
#endif





// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta:
//   z_i = alpha*(z_ia + z_ib) + beta*y_i
//
// input arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- y
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// output arguments:
// r10   <- alpha
// r11   <- beta
// r12   <- y
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// clobbers: xmm2, xmm14, xmm15

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_ab_4_lib4)
#endif

	// horizontal reduction of the partial sums
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
	movapd	%xmm2, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta
	movddup	0(%r11), %xmm15
	movupd	0(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r12), %xmm14
	mulpd	%xmm15, %xmm14
	addpd	%xmm14, %xmm1


#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_ab_4_lib4)
#endif





// common inner routine with file scope
//
// blend for ta==t, scale for generic alpha and beta=1.0:
//   z_i = alpha*(z_ia + z_ib) + y_i
//
// input arguments:
// r10   <- alpha
// r11   <- y
// xmm0  <- [z0a z0b]
// xmm1  <- [z1a z1b]
// xmm2  <- [z2a z2b]
// xmm3  <- [z3a z3b]
//
// output arguments:
// r10   <- alpha
// r11   <- y
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// clobbers: xmm2, xmm14, xmm15

#if MACRO_LEVEL>=1
	.macro INNER_BLEND_T_SCALE_A1_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_blend_t_scale_a1_4_lib4)
#endif

	// horizontal reduction of the partial sums
	haddpd	%xmm1, %xmm0
	haddpd	%xmm3, %xmm2
	movapd	%xmm2, %xmm1

	// alpha
	movddup	0(%r10), %xmm15
	mulpd	%xmm15, %xmm0
	mulpd	%xmm15, %xmm1

	// beta = 1.0: y is added unscaled
	movupd	0(%r11), %xmm14
	addpd	%xmm14, %xmm0
	movupd	16(%r11), %xmm14
	addpd	%xmm14, %xmm1

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_blend_t_scale_a1_4_lib4)
#endif





// common inner routine with file scope
//
// store all 4 results (unaligned stores)
//
// input arguments:
// r10   <- z
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// output arguments:
// r10   <- z
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_store_4_lib4)
#endif

	movupd	%xmm0, 0(%r10)
	movupd	%xmm1, 16(%r10)

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_4_lib4)
#endif





// common inner routine with file scope
//
// store vs: only the first min(km,4) results are written, nothing if km is not positive
//
// input arguments:
// r10   <- D
// r11d  <- km
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// output arguments:
// r10   <- D
// r11d  <- km
// xmm0  <- [z0 z1]
// xmm1  <- [z2 z3]
//
// clobbers: flags

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4_VS_LIB4
#else
	.p2align 4,,15
	FUN_START(inner_store_4_vs_lib4)
#endif

	cmpl	$0, %r11d
	jle		0f // return

	movsd	%xmm0, 0(%r10)

	cmpl	$1, %r11d
	jle		0f // return

	movhpd	%xmm0, 8(%r10)

	cmpl	$2, %r11d
	jle		0f // return

	movsd	%xmm1, 16(%r10)

	cmpl	$3, %r11d
	jle		0f // return

	movhpd	%xmm1, 24(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_store_4_vs_lib4)
#endif





//                            1      2              3          4          5             6          7
// void kernel_dgemv_n_4_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z);
//
// z[0:4] = alpha * A[0:4,0:k] * x + beta * y[0:4]

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_n_4_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel n

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_n_4_lib4)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_n_scale_ab_4_lib4)
#endif


	// store

	movq	ARG7, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_n_4_lib4)





//                               1      2              3          4          5             6          7          8
// void kernel_dgemv_n_4_vs_lib4(int k, double *alpha, double *A, double *x, double *beta, double *y, double *z, int k1);
//
// Same computation as kernel_dgemv_n_4_lib4, but only the first k1 results are stored.

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_n_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemv kernel n

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // x

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_N_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_n_4_lib4)
#endif


	// call inner blend n scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_N_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_n_scale_ab_4_lib4)
#endif


	// store

	movq	ARG7, %r10 // z
	movq	ARG8, %r11 // k1 (only r11d is used)

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_n_4_vs_lib4)





//                            1      2              3         4          5        6          7             8          9
// void kernel_dgemv_t_4_lib4(int k, double *alpha, int offa, double *A, int sda, double *x, double *beta, double *y, double *z);
//
// z[0:4] = alpha * A[0:k,0:4]^T * x + beta * y[0:4]

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_t_4_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner edge and inner dgemv kernel t

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double); the 32-bit shift zero-extends into r12
//	movslq	%r12d, %r12
	movq	ARG6, %r13 // x
	movq	ARG3, %r14 // offA (only r14d is used)

#if MACRO_LEVEL>=2
	INNER_EDGE_GEMV_ADD_T_4_LIB4
#else
	CALL(inner_edge_dgemv_add_t_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_t_4_lib4)
#endif


	// call inner blend t scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG9, %r10 // z

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_t_4_lib4)





//                               1      2              3         4          5        6          7             8          9          10
// void kernel_dgemv_t_4_vs_lib4(int k, double *alpha, int offA, double *A, int sda, double *x, double *beta, double *y, double *z, int km);
//
// Same computation as kernel_dgemv_t_4_lib4, but only the first km results are stored.

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_t_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner edge and inner dgemv kernel t

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double); the 32-bit shift zero-extends into r12
//	movslq	%r12d, %r12
	movq	ARG6, %r13 // x
	movq	ARG3, %r14 // offA (only r14d is used)

#if MACRO_LEVEL>=2
	INNER_EDGE_GEMV_ADD_T_4_LIB4
#else
	CALL(inner_edge_dgemv_add_t_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_T_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_t_4_lib4)
#endif


	// call inner blend t scale ab

	movq	ARG2, %r10 // alpha
	movq	ARG7, %r11 // beta
	movq	ARG8, %r12 // y

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG9, %r10 // z
	movq	ARG10, %r11 // km (only r11d is used)

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_t_4_vs_lib4)





//                             1      2                3                4          5        6            7            8               9            10           11
// void kernel_dgemv_nt_4_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t);
//
// z_n[0:k] += alpha_n * A[0:k,0:4] * x_n[0:4]                     (updated in memory)
// z_t[0:4]  = alpha_t * A[0:k,0:4]^T * x_t[0:k] + beta_t * y_t[0:4]

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_nt_4_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	ZERO_ACC

	// initialize x_n: xmm4-xmm7 <- alpha_n * x_n[i], broadcast to both lanes
	movq	ARG2, %r10 // alpha_n
	movddup	0(%r10), %xmm15

	movq	ARG6, %r10 // x_n

	movddup	0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	movddup	8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	movddup	16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	movddup	24(%r10), %xmm7
	mulpd	%xmm15, %xmm7


	// inner kernel dgemv nt

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double); the 32-bit shift zero-extends into r12
//	movslq	%r12d, %r12
	movq	ARG7, %r13 // x_t
	movq	ARG10, %r14 // z_n

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// inner blend t scale ab

	movq	ARG3, %r10 // alpha_t
	movq	ARG8, %r11 // beta_t
	movq	ARG9, %r12 // y_t

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG11, %r10 // z_t

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_nt_4_lib4)





//                                1      2                3                4          5        6            7            8               9            10           11           12
// void kernel_dgemv_nt_4_vs_lib4(int k, double *alpha_n, double *alpha_t, double *A, int sda, double *x_n, double *x_t, double *beta_t, double *y_t, double *z_n, double *z_t, int km);
//
// Same computation as kernel_dgemv_nt_4_lib4 restricted to the first km columns:
// x_n[0] is always read, x_n[1] to x_n[3] are read only when km reaches 2, 3 and 4
// respectively (otherwise they are taken as zero), and only the first km entries of
// z_t are stored.

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgemv_nt_4_vs_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	xorpd	%xmm0, %xmm0
	movapd	%xmm0, %xmm1
	movapd	%xmm0, %xmm2
	movapd	%xmm0, %xmm3

	// zero x_n, so that entries not loaded below contribute nothing
	movapd	%xmm0, %xmm4
	movapd	%xmm0, %xmm5
	movapd	%xmm0, %xmm6
	movapd	%xmm0, %xmm7

	// initialize x_n
	movq	ARG2, %r10 // alpha_n
	movddup	0(%r10), %xmm15

	movq	ARG6, %r10 // x_n
	movq	ARG12, %r11 // km (only r11d is used)

	movddup	0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	cmpl	$2, %r11d
	jl		0f
	movddup	8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	cmpl	$3, %r11d
	jl		0f
	movddup	16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	je		0f // km==3: flags are still those of the cmpl above, SSE moves and multiplies do not touch them
	movddup	24(%r10), %xmm7
	mulpd	%xmm15, %xmm7
0:

	// inner kernel dgemv nt

	movq	ARG1, %r10 // k (only r10d is used)
	movq	ARG4, %r11 // A
	movq	ARG5, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double); the 32-bit shift zero-extends into r12
//	movslq	%r12d, %r12
	movq	ARG7, %r13 // x_t
	movq	ARG10, %r14 // z_n

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// inner blend t scale ab

	movq	ARG3, %r10 // alpha_t
	movq	ARG8, %r11 // beta_t
	movq	ARG9, %r12 // y_t

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_AB_4_LIB4
#else
	CALL(inner_blend_t_scale_ab_4_lib4)
#endif


	// store

	movq	ARG11, %r10 // z_t
	movq	ARG12, %r11 // km (only r11d is used)

#if MACRO_LEVEL>=1
	INNER_STORE_4_VS_LIB4
#else
	CALL(inner_store_4_vs_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgemv_nt_4_vs_lib4)





//                            1      2              3          4        5          6
// void kernel_dsymv_l_4_lib4(int k, double *alpha, double *A, int sda, double *x, double *z);
//
// Symmetric update using 4 columns of A, starting with the diagonal 4x4 block:
// z is updated in place, x is used both as x_n (first 4 entries) and as x_t, and z is
// used both as z_n and as z_t.

	.p2align 4,,15
	GLOB_FUN_START(kernel_dsymv_l_4_lib4)

	PROLOGUE

	// zero accumulation registers y_t

	ZERO_ACC

	// initialize x_n: xmm4-xmm7 <- alpha * x[i], broadcast to both lanes
	movq	ARG2, %r10 // alpha
	movddup	0(%r10), %xmm15

	movq	ARG5, %r10 // x_n

	movddup	0(%r10), %xmm4
	mulpd	%xmm15, %xmm4
	movddup	8(%r10), %xmm5
	mulpd	%xmm15, %xmm5
	movddup	16(%r10), %xmm6
	mulpd	%xmm15, %xmm6
	movddup	24(%r10), %xmm7
	mulpd	%xmm15, %xmm7


	// inner edge dsymv & kernel dgemv nt

	movq	ARG1, %r10 // k (only r10d is used by the kernel)
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // sda
	sall	$5, %r12d // 4*sda*sizeof(double); the 32-bit shift zero-extends into r12
	movq	ARG5, %r13 // x_t
	movq	ARG6, %r14 // z_n

#if MACRO_LEVEL>=2
	INNER_EDGE_DSYMV_ADD_NT_4_LIB4
#else
	CALL(inner_edge_dsymv_add_nt_4_lib4)
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMV_ADD_NT_4_LIB4
#else
	CALL(inner_kernel_dgemv_add_nt_4_lib4)
#endif


	// call inner blend t scale a1

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // z_t

#if MACRO_LEVEL>=1
	INNER_BLEND_T_SCALE_A1_4_LIB4
#else
	CALL(inner_blend_t_scale_a1_4_lib4)
#endif


	// store

	movq	ARG6, %r10 // z_t

#if MACRO_LEVEL>=1
	INNER_STORE_4_LIB4
#else
	CALL(inner_store_4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dsymv_l_4_lib4)