/**************************************************************************************************
*                                                                                                  *
* This file is part of BLASFEO.                                                                    *
*                                                                                                  *
* BLASFEO -- BLAS For Embedded Optimization.                                                       *
* Copyright (C) 2019 by Gianluca Frison.                                                           *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.               *
* All rights reserved.                                                                             *
*                                                                                                  *
* The 2-Clause BSD License                                                                         *
*                                                                                                  *
* Redistribution and use in source and binary forms, with or without                               *
* modification, are permitted provided that the following conditions are met:                      *
*                                                                                                  *
* 1. Redistributions of source code must retain the above copyright notice, this                   *
*    list of conditions and the following disclaimer.                                              *
* 2. Redistributions in binary form must reproduce the above copyright notice,                     *
*    this list of conditions and the following disclaimer in the documentation                     *
*    and/or other materials provided with the distribution.                                        *
*                                                                                                  *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                  *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                    *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                           *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                  *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                   *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                     *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                      *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                       *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                    *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                     *
*                                                                                                  *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                              *
*                                                                                                  *
**************************************************************************************************/

// common inner routine with file scope
//
// input arguments:
// r10d <- k
// r11 <- A
// r12 <- B
// r13 <- ldb
// ymm0 <- [d00 d10 d20 d30]
// ymm1 <- [d01 d11 d21 d31]
// ymm2 <- [d02 d12 d22 d32]
// ymm3 <- [d03 d13 d23 d33]
//
// output arguments:

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_DGEMM_NT_4X4_LIB4C
#else
	.p2align 4,,15
	FUN_START(inner_kernel_dgemm_nt_4x4_lib4c)
#endif

	cmpl	$0, %r10d
	jle		5f // return

	// preload

	vxorpd	%ymm4, %ymm4, %ymm4
	vmovapd	%ymm4, %ymm5
	vmovapd	%ymm4, %ymm6
	vmovapd	%ymm4, %ymm7

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

//	prefetcht0	0(%r12, %r13, 2) // software prefetch
//	prefetcht0	64(%r12, %r13, 2) // software prefetch

	// unroll 0
	vmovupd			0(%r11), %ymm13 // A
	vbroadcastsd	0(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm0, %ymm15, %ymm0
	vbroadcastsd	8(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm1, %ymm15, %ymm1
	vbroadcastsd	16(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm2, %ymm15, %ymm2
	vbroadcastsd	24(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm3, %ymm15, %ymm3
	addq	%r13, %r12

	// unroll 1
	vmovupd			32(%r11), %ymm13 // A
	vbroadcastsd	0(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm4, %ymm15, %ymm4
	vbroadcastsd	8(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd			%ymm5, %ymm15, %ymm5
	vbroadcastsd	16(%r12), %ymm12 // B
	vmulpd			%ymm13, %ymm12, %ymm15
	vaddpd
%ymm6, %ymm15, %ymm6 104 vbroadcastsd 24(%r12), %ymm12 // B 105 vmulpd %ymm13, %ymm12, %ymm15 106 vaddpd %ymm7, %ymm15, %ymm7 107 addq %r13, %r12 108 109 // unroll 2 110 vmovupd 64(%r11), %ymm13 // A 111 vbroadcastsd 0(%r12), %ymm12 // B 112 vmulpd %ymm13, %ymm12, %ymm15 113 vaddpd %ymm0, %ymm15, %ymm0 114 vbroadcastsd 8(%r12), %ymm12 // B 115 vmulpd %ymm13, %ymm12, %ymm15 116 vaddpd %ymm1, %ymm15, %ymm1 117 vbroadcastsd 16(%r12), %ymm12 // B 118 vmulpd %ymm13, %ymm12, %ymm15 119 vaddpd %ymm2, %ymm15, %ymm2 120 vbroadcastsd 24(%r12), %ymm12 // B 121 vmulpd %ymm13, %ymm12, %ymm15 122 vaddpd %ymm3, %ymm15, %ymm3 123 addq %r13, %r12 124 125 // unroll 3 126 vmovupd 96(%r11), %ymm13 // A 127 vbroadcastsd 0(%r12), %ymm12 // B 128 vmulpd %ymm13, %ymm12, %ymm15 129 vaddpd %ymm4, %ymm15, %ymm4 130 vbroadcastsd 8(%r12), %ymm12 // B 131 vmulpd %ymm13, %ymm12, %ymm15 132 vaddpd %ymm5, %ymm15, %ymm5 133 vbroadcastsd 16(%r12), %ymm12 // B 134 vmulpd %ymm13, %ymm12, %ymm15 135 vaddpd %ymm6, %ymm15, %ymm6 136 vbroadcastsd 24(%r12), %ymm12 // B 137 vmulpd %ymm13, %ymm12, %ymm15 138 vaddpd %ymm7, %ymm15, %ymm7 139 addq %r13, %r12 140 141 subl $4, %r10d 142 addq $128, %r11 143 144 cmpl $4, %r10d 145 jg 1b // main loop 146 147 1480: // consider clean4-up 149 150 cmpl $3, %r10d 151 jle 4f // clean1 152 153 // unroll 0 154 vmovupd 0(%r11), %ymm13 // A 155 vbroadcastsd 0(%r12), %ymm12 // B 156 vmulpd %ymm13, %ymm12, %ymm15 157 vaddpd %ymm0, %ymm15, %ymm0 158 vbroadcastsd 8(%r12), %ymm12 // B 159 vmulpd %ymm13, %ymm12, %ymm15 160 vaddpd %ymm1, %ymm15, %ymm1 161 vbroadcastsd 16(%r12), %ymm12 // B 162 vmulpd %ymm13, %ymm12, %ymm15 163 vaddpd %ymm2, %ymm15, %ymm2 164 vbroadcastsd 24(%r12), %ymm12 // B 165 vmulpd %ymm13, %ymm12, %ymm15 166 vaddpd %ymm3, %ymm15, %ymm3 167 addq %r13, %r12 168 169 // unroll 1 170 vmovupd 32(%r11), %ymm13 // A 171 vbroadcastsd 0(%r12), %ymm12 // B 172 vmulpd %ymm13, %ymm12, %ymm15 173 vaddpd %ymm4, %ymm15, %ymm4 174 vbroadcastsd 8(%r12), %ymm12 // B 175 vmulpd %ymm13, %ymm12, %ymm15 176 vaddpd %ymm5, %ymm15, %ymm5 177 vbroadcastsd 16(%r12), %ymm12 // B 178 vmulpd %ymm13, %ymm12, %ymm15 179 vaddpd %ymm6, %ymm15, %ymm6 180 vbroadcastsd 24(%r12), %ymm12 // B 181 vmulpd %ymm13, %ymm12, %ymm15 182 vaddpd %ymm7, %ymm15, %ymm7 183 addq %r13, %r12 184 185 // unroll 2 186 vmovupd 64(%r11), %ymm13 // A 187 vbroadcastsd 0(%r12), %ymm12 // B 188 vmulpd %ymm13, %ymm12, %ymm15 189 vaddpd %ymm0, %ymm15, %ymm0 190 vbroadcastsd 8(%r12), %ymm12 // B 191 vmulpd %ymm13, %ymm12, %ymm15 192 vaddpd %ymm1, %ymm15, %ymm1 193 vbroadcastsd 16(%r12), %ymm12 // B 194 vmulpd %ymm13, %ymm12, %ymm15 195 vaddpd %ymm2, %ymm15, %ymm2 196 vbroadcastsd 24(%r12), %ymm12 // B 197 vmulpd %ymm13, %ymm12, %ymm15 198 vaddpd %ymm3, %ymm15, %ymm3 199 addq %r13, %r12 200 201 // unroll 3 202 vmovupd 96(%r11), %ymm13 // A 203 vbroadcastsd 0(%r12), %ymm12 // B 204 vmulpd %ymm13, %ymm12, %ymm15 205 vaddpd %ymm4, %ymm15, %ymm4 206 vbroadcastsd 8(%r12), %ymm12 // B 207 vmulpd %ymm13, %ymm12, %ymm15 208 vaddpd %ymm5, %ymm15, %ymm5 209 vbroadcastsd 16(%r12), %ymm12 // B 210 vmulpd %ymm13, %ymm12, %ymm15 211 vaddpd %ymm6, %ymm15, %ymm6 212 vbroadcastsd 24(%r12), %ymm12 // B 213 vmulpd %ymm13, %ymm12, %ymm15 214 vaddpd %ymm7, %ymm15, %ymm7 215 addq %r13, %r12 216 217 subl $4, %r10d 218 addq $128, %r11 219 220 jmp 2f // return 221 222 2234: // consider clean1-up loop 224 225 cmpl $0, %r10d 226 jle 2f // return 227 228 // clean-up loop 2293: // clean up loop 230 231 // unroll 0 232 vmovupd 0(%r11), %ymm13 // A 233 vbroadcastsd 0(%r12), %ymm12 
// B 234 vmulpd %ymm13, %ymm12, %ymm15 235 vaddpd %ymm0, %ymm15, %ymm0 236 vbroadcastsd 8(%r12), %ymm12 // B 237 vmulpd %ymm13, %ymm12, %ymm15 238 vaddpd %ymm1, %ymm15, %ymm1 239 vbroadcastsd 16(%r12), %ymm12 // B 240 vmulpd %ymm13, %ymm12, %ymm15 241 vaddpd %ymm2, %ymm15, %ymm2 242 vbroadcastsd 24(%r12), %ymm12 // B 243 vmulpd %ymm13, %ymm12, %ymm15 244 vaddpd %ymm3, %ymm15, %ymm3 245 addq %r13, %r12 246 247 subl $1, %r10d 248 addq $32, %r11 249 250 cmpl $0, %r10d 251 jg 3b // clean up loop 252 253 2542: // return 255 256 vaddpd %ymm4, %ymm0, %ymm0 257 vaddpd %ymm5, %ymm1, %ymm1 258 vaddpd %ymm6, %ymm2, %ymm2 259 vaddpd %ymm7, %ymm3, %ymm3 260 2615: // return 262 263#if MACRO_LEVEL>=2 264 .endm 265#else 266 ret 267 268 FUN_END(inner_kernel_dgemm_nt_4x4_lib4c) 269#endif 270 271 272 273 274 275// common inner routine with file scope 276// 277// input arguments: 278// r10d <- k 279// r11 <- A 280// r12 <- B 281// r13 <- ldb 282// ymm0 <- [d00 d10 d20 d30] 283// ymm1 <- [d01 d11 d21 d31] 284// ymm2 <- [d02 d12 d22 d32] 285// ymm3 <- [d03 d13 d23 d33] 286// 287// output arguments: 288 289#if MACRO_LEVEL>=2 290 .macro INNER_KERNEL_DGEMM_NT_4X3_LIB4C 291#else 292 .p2align 4,,15 293 FUN_START(inner_kernel_dgemm_nt_4x3_lib4c) 294#endif 295 296 cmpl $0, %r10d 297 jle 5f // return 298 299 // preload 300 301 vxorpd %ymm4, %ymm4, %ymm4 302 vmovapd %ymm4, %ymm5 303 vmovapd %ymm4, %ymm6 304 305 cmpl $4, %r10d 306 jle 0f // consider clean-up loop 307 308 // main loop 309 .p2align 3 3101: // main loop 311 312// prefetcht0 0(%r12, %r13, 2) // software prefetch 313// prefetcht0 64(%r12, %r13, 2) // software prefetch 314 315 // unroll 0 316 vmovupd 0(%r11), %ymm13 // A 317 vbroadcastsd 0(%r12), %ymm12 // B 318 vmulpd %ymm13, %ymm12, %ymm15 319 vaddpd %ymm0, %ymm15, %ymm0 320 vbroadcastsd 8(%r12), %ymm12 // B 321 vmulpd %ymm13, %ymm12, %ymm15 322 vaddpd %ymm1, %ymm15, %ymm1 323 vbroadcastsd 16(%r12), %ymm12 // B 324 vmulpd %ymm13, %ymm12, %ymm15 325 vaddpd %ymm2, %ymm15, %ymm2 326 addq %r13, %r12 327 328 // unroll 1 329 vmovupd 32(%r11), %ymm13 // A 330 vbroadcastsd 0(%r12), %ymm12 // B 331 vmulpd %ymm13, %ymm12, %ymm15 332 vaddpd %ymm4, %ymm15, %ymm4 333 vbroadcastsd 8(%r12), %ymm12 // B 334 vmulpd %ymm13, %ymm12, %ymm15 335 vaddpd %ymm5, %ymm15, %ymm5 336 vbroadcastsd 16(%r12), %ymm12 // B 337 vmulpd %ymm13, %ymm12, %ymm15 338 vaddpd %ymm6, %ymm15, %ymm6 339 addq %r13, %r12 340 341 // unroll 2 342 vmovupd 64(%r11), %ymm13 // A 343 vbroadcastsd 0(%r12), %ymm12 // B 344 vmulpd %ymm13, %ymm12, %ymm15 345 vaddpd %ymm0, %ymm15, %ymm0 346 vbroadcastsd 8(%r12), %ymm12 // B 347 vmulpd %ymm13, %ymm12, %ymm15 348 vaddpd %ymm1, %ymm15, %ymm1 349 vbroadcastsd 16(%r12), %ymm12 // B 350 vmulpd %ymm13, %ymm12, %ymm15 351 vaddpd %ymm2, %ymm15, %ymm2 352 addq %r13, %r12 353 354 // unroll 3 355 vmovupd 96(%r11), %ymm13 // A 356 vbroadcastsd 0(%r12), %ymm12 // B 357 vmulpd %ymm13, %ymm12, %ymm15 358 vaddpd %ymm4, %ymm15, %ymm4 359 vbroadcastsd 8(%r12), %ymm12 // B 360 vmulpd %ymm13, %ymm12, %ymm15 361 vaddpd %ymm5, %ymm15, %ymm5 362 vbroadcastsd 16(%r12), %ymm12 // B 363 vmulpd %ymm13, %ymm12, %ymm15 364 vaddpd %ymm6, %ymm15, %ymm6 365 addq %r13, %r12 366 367 subl $4, %r10d 368 addq $128, %r11 369 370 cmpl $4, %r10d 371 jg 1b // main loop 372 373 3740: // consider clean4-up 375 376 cmpl $3, %r10d 377 jle 4f // clean1 378 379 // unroll 0 380 vmovupd 0(%r11), %ymm13 // A 381 vbroadcastsd 0(%r12), %ymm12 // B 382 vmulpd %ymm13, %ymm12, %ymm15 383 vaddpd %ymm0, %ymm15, %ymm0 384 vbroadcastsd 8(%r12), %ymm12 // B 385 vmulpd 
%ymm13, %ymm12, %ymm15 386 vaddpd %ymm1, %ymm15, %ymm1 387 vbroadcastsd 16(%r12), %ymm12 // B 388 vmulpd %ymm13, %ymm12, %ymm15 389 vaddpd %ymm2, %ymm15, %ymm2 390 addq %r13, %r12 391 392 // unroll 1 393 vmovupd 32(%r11), %ymm13 // A 394 vbroadcastsd 0(%r12), %ymm12 // B 395 vmulpd %ymm13, %ymm12, %ymm15 396 vaddpd %ymm4, %ymm15, %ymm4 397 vbroadcastsd 8(%r12), %ymm12 // B 398 vmulpd %ymm13, %ymm12, %ymm15 399 vaddpd %ymm5, %ymm15, %ymm5 400 vbroadcastsd 16(%r12), %ymm12 // B 401 vmulpd %ymm13, %ymm12, %ymm15 402 vaddpd %ymm6, %ymm15, %ymm6 403 addq %r13, %r12 404 405 // unroll 2 406 vmovupd 64(%r11), %ymm13 // A 407 vbroadcastsd 0(%r12), %ymm12 // B 408 vmulpd %ymm13, %ymm12, %ymm15 409 vaddpd %ymm0, %ymm15, %ymm0 410 vbroadcastsd 8(%r12), %ymm12 // B 411 vmulpd %ymm13, %ymm12, %ymm15 412 vaddpd %ymm1, %ymm15, %ymm1 413 vbroadcastsd 16(%r12), %ymm12 // B 414 vmulpd %ymm13, %ymm12, %ymm15 415 vaddpd %ymm2, %ymm15, %ymm2 416 addq %r13, %r12 417 418 // unroll 3 419 vmovupd 96(%r11), %ymm13 // A 420 vbroadcastsd 0(%r12), %ymm12 // B 421 vmulpd %ymm13, %ymm12, %ymm15 422 vaddpd %ymm4, %ymm15, %ymm4 423 vbroadcastsd 8(%r12), %ymm12 // B 424 vmulpd %ymm13, %ymm12, %ymm15 425 vaddpd %ymm5, %ymm15, %ymm5 426 vbroadcastsd 16(%r12), %ymm12 // B 427 vmulpd %ymm13, %ymm12, %ymm15 428 vaddpd %ymm6, %ymm15, %ymm6 429 addq %r13, %r12 430 431 subl $4, %r10d 432 addq $128, %r11 433 434 jmp 2f // return 435 436 4374: // consider clean1-up loop 438 439 cmpl $0, %r10d 440 jle 2f // return 441 442 // clean-up loop 4433: // clean up loop 444 445 // unroll 0 446 vmovupd 0(%r11), %ymm13 // A 447 vbroadcastsd 0(%r12), %ymm12 // B 448 vmulpd %ymm13, %ymm12, %ymm15 449 vaddpd %ymm0, %ymm15, %ymm0 450 vbroadcastsd 8(%r12), %ymm12 // B 451 vmulpd %ymm13, %ymm12, %ymm15 452 vaddpd %ymm1, %ymm15, %ymm1 453 vbroadcastsd 16(%r12), %ymm12 // B 454 vmulpd %ymm13, %ymm12, %ymm15 455 vaddpd %ymm2, %ymm15, %ymm2 456 addq %r13, %r12 457 458 subl $1, %r10d 459 addq $32, %r11 460 461 cmpl $0, %r10d 462 jg 3b // clean up loop 463 464 4652: // return 466 467 vaddpd %ymm4, %ymm0, %ymm0 468 vaddpd %ymm5, %ymm1, %ymm1 469 vaddpd %ymm6, %ymm2, %ymm2 470 4715: // return 472 473#if MACRO_LEVEL>=2 474 .endm 475#else 476 ret 477 478 FUN_END(inner_kernel_dgemm_nt_4x3_lib4c) 479#endif 480 481 482 483 484 485// common inner routine with file scope 486// 487// input arguments: 488// r10d <- k 489// r11 <- A 490// r12 <- B 491// r13 <- ldb 492// ymm0 <- [d00 d10 d20 d30] 493// ymm1 <- [d01 d11 d21 d31] 494// ymm2 <- [d02 d12 d22 d32] 495// ymm3 <- [d03 d13 d23 d33] 496// 497// output arguments: 498 499#if MACRO_LEVEL>=2 500 .macro INNER_KERNEL_DGEMM_NT_4X2_LIB4C 501#else 502 .p2align 4,,15 503 FUN_START(inner_kernel_dgemm_nt_4x2_lib4c) 504#endif 505 506 cmpl $0, %r10d 507 jle 5f // return 508 509 // preload 510 511 vxorpd %ymm4, %ymm4, %ymm4 512 vmovapd %ymm4, %ymm5 513 514 cmpl $4, %r10d 515 jle 0f // consider clean-up loop 516 517 // main loop 518 .p2align 3 5191: // main loop 520 521// prefetcht0 0(%r12, %r13, 2) // software prefetch 522// prefetcht0 64(%r12, %r13, 2) // software prefetch 523 524 // unroll 0 525 vmovupd 0(%r11), %ymm13 // A 526 vbroadcastsd 0(%r12), %ymm12 // B 527 vmulpd %ymm13, %ymm12, %ymm15 528 vaddpd %ymm0, %ymm15, %ymm0 529 vbroadcastsd 8(%r12), %ymm12 // B 530 vmulpd %ymm13, %ymm12, %ymm15 531 vaddpd %ymm1, %ymm15, %ymm1 532 addq %r13, %r12 533 534 // unroll 1 535 vmovupd 32(%r11), %ymm13 // A 536 vbroadcastsd 0(%r12), %ymm12 // B 537 vmulpd %ymm13, %ymm12, %ymm15 538 vaddpd %ymm4, %ymm15, %ymm4 539 vbroadcastsd 
8(%r12), %ymm12 // B 540 vmulpd %ymm13, %ymm12, %ymm15 541 vaddpd %ymm5, %ymm15, %ymm5 542 addq %r13, %r12 543 544 // unroll 2 545 vmovupd 64(%r11), %ymm13 // A 546 vbroadcastsd 0(%r12), %ymm12 // B 547 vmulpd %ymm13, %ymm12, %ymm15 548 vaddpd %ymm0, %ymm15, %ymm0 549 vbroadcastsd 8(%r12), %ymm12 // B 550 vmulpd %ymm13, %ymm12, %ymm15 551 vaddpd %ymm1, %ymm15, %ymm1 552 addq %r13, %r12 553 554 // unroll 3 555 vmovupd 96(%r11), %ymm13 // A 556 vbroadcastsd 0(%r12), %ymm12 // B 557 vmulpd %ymm13, %ymm12, %ymm15 558 vaddpd %ymm4, %ymm15, %ymm4 559 vbroadcastsd 8(%r12), %ymm12 // B 560 vmulpd %ymm13, %ymm12, %ymm15 561 vaddpd %ymm5, %ymm15, %ymm5 562 addq %r13, %r12 563 564 subl $4, %r10d 565 addq $128, %r11 566 567 cmpl $4, %r10d 568 jg 1b // main loop 569 570 5710: // consider clean4-up 572 573 cmpl $3, %r10d 574 jle 4f // clean1 575 576 // unroll 0 577 vmovupd 0(%r11), %ymm13 // A 578 vbroadcastsd 0(%r12), %ymm12 // B 579 vmulpd %ymm13, %ymm12, %ymm15 580 vaddpd %ymm0, %ymm15, %ymm0 581 vbroadcastsd 8(%r12), %ymm12 // B 582 vmulpd %ymm13, %ymm12, %ymm15 583 vaddpd %ymm1, %ymm15, %ymm1 584 addq %r13, %r12 585 586 // unroll 1 587 vmovupd 32(%r11), %ymm13 // A 588 vbroadcastsd 0(%r12), %ymm12 // B 589 vmulpd %ymm13, %ymm12, %ymm15 590 vaddpd %ymm4, %ymm15, %ymm4 591 vbroadcastsd 8(%r12), %ymm12 // B 592 vmulpd %ymm13, %ymm12, %ymm15 593 vaddpd %ymm5, %ymm15, %ymm5 594 addq %r13, %r12 595 596 // unroll 2 597 vmovupd 64(%r11), %ymm13 // A 598 vbroadcastsd 0(%r12), %ymm12 // B 599 vmulpd %ymm13, %ymm12, %ymm15 600 vaddpd %ymm0, %ymm15, %ymm0 601 vbroadcastsd 8(%r12), %ymm12 // B 602 vmulpd %ymm13, %ymm12, %ymm15 603 vaddpd %ymm1, %ymm15, %ymm1 604 addq %r13, %r12 605 606 // unroll 3 607 vmovupd 96(%r11), %ymm13 // A 608 vbroadcastsd 0(%r12), %ymm12 // B 609 vmulpd %ymm13, %ymm12, %ymm15 610 vaddpd %ymm4, %ymm15, %ymm4 611 vbroadcastsd 8(%r12), %ymm12 // B 612 vmulpd %ymm13, %ymm12, %ymm15 613 vaddpd %ymm5, %ymm15, %ymm5 614 addq %r13, %r12 615 616 subl $4, %r10d 617 addq $128, %r11 618 619 jmp 2f // return 620 621 6224: // consider clean1-up loop 623 624 cmpl $0, %r10d 625 jle 2f // return 626 627 // clean-up loop 6283: // clean up loop 629 630 // unroll 0 631 vmovupd 0(%r11), %ymm13 // A 632 vbroadcastsd 0(%r12), %ymm12 // B 633 vmulpd %ymm13, %ymm12, %ymm15 634 vaddpd %ymm0, %ymm15, %ymm0 635 vbroadcastsd 8(%r12), %ymm12 // B 636 vmulpd %ymm13, %ymm12, %ymm15 637 vaddpd %ymm1, %ymm15, %ymm1 638 addq %r13, %r12 639 640 subl $1, %r10d 641 addq $32, %r11 642 643 cmpl $0, %r10d 644 jg 3b // clean up loop 645 646 6472: // return 648 649 vaddpd %ymm4, %ymm0, %ymm0 650 vaddpd %ymm5, %ymm1, %ymm1 651 6525: // return 653 654#if MACRO_LEVEL>=2 655 .endm 656#else 657 ret 658 659 FUN_END(inner_kernel_dgemm_nt_4x2_lib4c) 660#endif 661 662 663 664 665 666// common inner routine with file scope 667// 668// input arguments: 669// r10d <- k 670// r11 <- A 671// r12 <- B 672// r13 <- ldb 673// ymm0 <- [d00 d10 d20 d30] 674// ymm1 <- [d01 d11 d21 d31] 675// ymm2 <- [d02 d12 d22 d32] 676// ymm3 <- [d03 d13 d23 d33] 677// 678// output arguments: 679 680#if MACRO_LEVEL>=2 681 .macro INNER_KERNEL_DGEMM_NT_4X1_LIB4C 682#else 683 .p2align 4,,15 684 FUN_START(inner_kernel_dgemm_nt_4x1_lib4c) 685#endif 686 687 cmpl $0, %r10d 688 jle 5f // return 689 690 // preload 691 692 vxorpd %ymm4, %ymm4, %ymm4 693 694 cmpl $4, %r10d 695 jle 0f // consider clean-up loop 696 697 // main loop 698 .p2align 3 6991: // main loop 700 701// prefetcht0 0(%r12, %r13, 2) // software prefetch 702// prefetcht0 64(%r12, %r13, 2) // software 
prefetch 703 704 // unroll 0 705 vmovupd 0(%r11), %ymm13 // A 706 vbroadcastsd 0(%r12), %ymm12 // B 707 vmulpd %ymm13, %ymm12, %ymm15 708 vaddpd %ymm0, %ymm15, %ymm0 709 addq %r13, %r12 710 711 // unroll 1 712 vmovupd 32(%r11), %ymm13 // A 713 vbroadcastsd 0(%r12), %ymm12 // B 714 vmulpd %ymm13, %ymm12, %ymm15 715 vaddpd %ymm4, %ymm15, %ymm4 716 addq %r13, %r12 717 718 // unroll 2 719 vmovupd 64(%r11), %ymm13 // A 720 vbroadcastsd 0(%r12), %ymm12 // B 721 vmulpd %ymm13, %ymm12, %ymm15 722 vaddpd %ymm0, %ymm15, %ymm0 723 addq %r13, %r12 724 725 // unroll 3 726 vmovupd 96(%r11), %ymm13 // A 727 vbroadcastsd 0(%r12), %ymm12 // B 728 vmulpd %ymm13, %ymm12, %ymm15 729 vaddpd %ymm4, %ymm15, %ymm4 730 addq %r13, %r12 731 732 subl $4, %r10d 733 addq $128, %r11 734 735 cmpl $4, %r10d 736 jg 1b // main loop 737 738 7390: // consider clean4-up 740 741 cmpl $3, %r10d 742 jle 4f // clean1 743 744 // unroll 0 745 vmovupd 0(%r11), %ymm13 // A 746 vbroadcastsd 0(%r12), %ymm12 // B 747 vmulpd %ymm13, %ymm12, %ymm15 748 vaddpd %ymm0, %ymm15, %ymm0 749 addq %r13, %r12 750 751 // unroll 1 752 vmovupd 32(%r11), %ymm13 // A 753 vbroadcastsd 0(%r12), %ymm12 // B 754 vmulpd %ymm13, %ymm12, %ymm15 755 vaddpd %ymm4, %ymm15, %ymm4 756 addq %r13, %r12 757 758 // unroll 2 759 vmovupd 64(%r11), %ymm13 // A 760 vbroadcastsd 0(%r12), %ymm12 // B 761 vmulpd %ymm13, %ymm12, %ymm15 762 vaddpd %ymm0, %ymm15, %ymm0 763 addq %r13, %r12 764 765 // unroll 3 766 vmovupd 96(%r11), %ymm13 // A 767 vbroadcastsd 0(%r12), %ymm12 // B 768 vmulpd %ymm13, %ymm12, %ymm15 769 vaddpd %ymm4, %ymm15, %ymm4 770 addq %r13, %r12 771 772 subl $4, %r10d 773 addq $128, %r11 774 775 jmp 2f // return 776 777 7784: // consider clean1-up loop 779 780 cmpl $0, %r10d 781 jle 2f // return 782 783 // clean-up loop 7843: // clean up loop 785 786 // unroll 0 787 vmovupd 0(%r11), %ymm13 // A 788 vbroadcastsd 0(%r12), %ymm12 // B 789 vmulpd %ymm13, %ymm12, %ymm15 790 vaddpd %ymm0, %ymm15, %ymm0 791 addq %r13, %r12 792 793 subl $1, %r10d 794 addq $32, %r11 795 796 cmpl $0, %r10d 797 jg 3b // clean up loop 798 799 8002: // return 801 802 vaddpd %ymm4, %ymm0, %ymm0 803 8045: // return 805 806#if MACRO_LEVEL>=2 807 .endm 808#else 809 ret 810 811 FUN_END(inner_kernel_dgemm_nt_4x1_lib4c) 812#endif 813 814 815 816 817 818// common inner routine with file scope 819// 820// input arguments: 821// r10d <- k 822// r11 <- A 823// r12 <- B 824// r13 <- ldb 825// ymm0 <- [d00 d10 d20 d30] 826// ymm1 <- [d01 d11 d21 d31] 827// ymm2 <- [d02 d12 d22 d32] 828// ymm3 <- [d03 d13 d23 d33] 829// 830// output arguments: 831 832#if MACRO_LEVEL>=2 833 .macro INNER_KERNEL_DGEMM_NN_4X4_LIB4C 834#else 835 .p2align 4,,15 836 FUN_START(inner_kernel_dgemm_nn_4x4_lib4c) 837#endif 838 839 cmpl $0, %r10d 840 jle 5f // return 841 842 movq %r12, %r15 843 addq %r13, %r15 844 addq %r13, %r15 // B+2*ldb 845 846 // preload 847 848 vxorpd %ymm4, %ymm4, %ymm4 849 vmovapd %ymm4, %ymm5 850 vmovapd %ymm4, %ymm6 851 vmovapd %ymm4, %ymm7 852 853 cmpl $4, %r10d 854 jle 0f // consider clean-up loop 855 856 // main loop 857 .p2align 3 8581: // main loop 859 860// prefetcht0 0(%r12, %r13, 2) // software prefetch 861// prefetcht0 64(%r12, %r13, 2) // software prefetch 862 863 // unroll 0 864 vmovupd 0(%r11), %ymm13 // A 865 vbroadcastsd 0(%r12), %ymm12 // B 866 vmulpd %ymm13, %ymm12, %ymm15 867 vaddpd %ymm0, %ymm15, %ymm0 868 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 869 vmulpd %ymm13, %ymm12, %ymm15 870 vaddpd %ymm1, %ymm15, %ymm1 871 vbroadcastsd 0(%r15), %ymm12 // B 872 vmulpd %ymm13, %ymm12, %ymm15 873 
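
	// note (added): %r15 is kept at B+2*ldb in the nn routines, so the four entries of
	// the current B row can be reached as 0(%r12), 0(%r12, %r13, 1), 0(%r15) and
	// 0(%r15, %r13, 1) with plain base+index addressing (there is no x3 scale on x86-64)
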
vaddpd %ymm2, %ymm15, %ymm2 874 vbroadcastsd 0(%r15, %r13, 1), %ymm12 // B 875 vmulpd %ymm13, %ymm12, %ymm15 876 vaddpd %ymm3, %ymm15, %ymm3 877 878 // unroll 1 879 vmovupd 32(%r11), %ymm13 // A 880 vbroadcastsd 8(%r12), %ymm12 // B 881 vmulpd %ymm13, %ymm12, %ymm15 882 vaddpd %ymm4, %ymm15, %ymm4 883 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 884 vmulpd %ymm13, %ymm12, %ymm15 885 vaddpd %ymm5, %ymm15, %ymm5 886 vbroadcastsd 8(%r15), %ymm12 // B 887 vmulpd %ymm13, %ymm12, %ymm15 888 vaddpd %ymm6, %ymm15, %ymm6 889 vbroadcastsd 8(%r15, %r13, 1), %ymm12 // B 890 vmulpd %ymm13, %ymm12, %ymm15 891 vaddpd %ymm7, %ymm15, %ymm7 892 893 // unroll 2 894 vmovupd 64(%r11), %ymm13 // A 895 vbroadcastsd 16(%r12), %ymm12 // B 896 vmulpd %ymm13, %ymm12, %ymm15 897 vaddpd %ymm0, %ymm15, %ymm0 898 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 899 vmulpd %ymm13, %ymm12, %ymm15 900 vaddpd %ymm1, %ymm15, %ymm1 901 vbroadcastsd 16(%r15), %ymm12 // B 902 vmulpd %ymm13, %ymm12, %ymm15 903 vaddpd %ymm2, %ymm15, %ymm2 904 vbroadcastsd 16(%r15, %r13, 1), %ymm12 // B 905 vmulpd %ymm13, %ymm12, %ymm15 906 vaddpd %ymm3, %ymm15, %ymm3 907 908 // unroll 3 909 vmovupd 96(%r11), %ymm13 // A 910 vbroadcastsd 24(%r12), %ymm12 // B 911 vmulpd %ymm13, %ymm12, %ymm15 912 vaddpd %ymm4, %ymm15, %ymm4 913 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 914 vmulpd %ymm13, %ymm12, %ymm15 915 vaddpd %ymm5, %ymm15, %ymm5 916 vbroadcastsd 24(%r15), %ymm12 // B 917 vmulpd %ymm13, %ymm12, %ymm15 918 vaddpd %ymm6, %ymm15, %ymm6 919 vbroadcastsd 24(%r15, %r13, 1), %ymm12 // B 920 vmulpd %ymm13, %ymm12, %ymm15 921 vaddpd %ymm7, %ymm15, %ymm7 922 923 subl $4, %r10d 924 addq $32, %r12 925 addq $32, %r15 926 addq $128, %r11 927 928 cmpl $4, %r10d 929 jg 1b // main loop 930 931 9320: // consider clean4-up 933 934 cmpl $3, %r10d 935 jle 4f // clean1 936 937 // unroll 0 938 vmovupd 0(%r11), %ymm13 // A 939 vbroadcastsd 0(%r12), %ymm12 // B 940 vmulpd %ymm13, %ymm12, %ymm15 941 vaddpd %ymm0, %ymm15, %ymm0 942 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 943 vmulpd %ymm13, %ymm12, %ymm15 944 vaddpd %ymm1, %ymm15, %ymm1 945 vbroadcastsd 0(%r15), %ymm12 // B 946 vmulpd %ymm13, %ymm12, %ymm15 947 vaddpd %ymm2, %ymm15, %ymm2 948 vbroadcastsd 0(%r15, %r13, 1), %ymm12 // B 949 vmulpd %ymm13, %ymm12, %ymm15 950 vaddpd %ymm3, %ymm15, %ymm3 951 952 // unroll 1 953 vmovupd 32(%r11), %ymm13 // A 954 vbroadcastsd 8(%r12), %ymm12 // B 955 vmulpd %ymm13, %ymm12, %ymm15 956 vaddpd %ymm4, %ymm15, %ymm4 957 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 958 vmulpd %ymm13, %ymm12, %ymm15 959 vaddpd %ymm5, %ymm15, %ymm5 960 vbroadcastsd 8(%r15), %ymm12 // B 961 vmulpd %ymm13, %ymm12, %ymm15 962 vaddpd %ymm6, %ymm15, %ymm6 963 vbroadcastsd 8(%r15, %r13, 1), %ymm12 // B 964 vmulpd %ymm13, %ymm12, %ymm15 965 vaddpd %ymm7, %ymm15, %ymm7 966 967 // unroll 2 968 vmovupd 64(%r11), %ymm13 // A 969 vbroadcastsd 16(%r12), %ymm12 // B 970 vmulpd %ymm13, %ymm12, %ymm15 971 vaddpd %ymm0, %ymm15, %ymm0 972 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 973 vmulpd %ymm13, %ymm12, %ymm15 974 vaddpd %ymm1, %ymm15, %ymm1 975 vbroadcastsd 16(%r15), %ymm12 // B 976 vmulpd %ymm13, %ymm12, %ymm15 977 vaddpd %ymm2, %ymm15, %ymm2 978 vbroadcastsd 16(%r15, %r13, 1), %ymm12 // B 979 vmulpd %ymm13, %ymm12, %ymm15 980 vaddpd %ymm3, %ymm15, %ymm3 981 982 // unroll 3 983 vmovupd 96(%r11), %ymm13 // A 984 vbroadcastsd 24(%r12), %ymm12 // B 985 vmulpd %ymm13, %ymm12, %ymm15 986 vaddpd %ymm4, %ymm15, %ymm4 987 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 988 vmulpd %ymm13, %ymm12, %ymm15 989 vaddpd %ymm5, %ymm15, 
%ymm5 990 vbroadcastsd 24(%r15), %ymm12 // B 991 vmulpd %ymm13, %ymm12, %ymm15 992 vaddpd %ymm6, %ymm15, %ymm6 993 vbroadcastsd 24(%r15, %r13, 1), %ymm12 // B 994 vmulpd %ymm13, %ymm12, %ymm15 995 vaddpd %ymm7, %ymm15, %ymm7 996 997 subl $4, %r10d 998 addq $32, %r12 999 addq $32, %r15 1000 addq $128, %r11 1001 1002 jmp 2f // return 1003 1004 10054: // consider clean1-up loop 1006 1007 cmpl $0, %r10d 1008 jle 2f // return 1009 1010 // clean-up loop 10113: // clean up loop 1012 1013 // unroll 0 1014 vmovupd 0(%r11), %ymm13 // A 1015 vbroadcastsd 0(%r12), %ymm12 // B 1016 vmulpd %ymm13, %ymm12, %ymm15 1017 vaddpd %ymm0, %ymm15, %ymm0 1018 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1019 vmulpd %ymm13, %ymm12, %ymm15 1020 vaddpd %ymm1, %ymm15, %ymm1 1021 vbroadcastsd 0(%r15), %ymm12 // B 1022 vmulpd %ymm13, %ymm12, %ymm15 1023 vaddpd %ymm2, %ymm15, %ymm2 1024 vbroadcastsd 0(%r15, %r13, 1), %ymm12 // B 1025 vmulpd %ymm13, %ymm12, %ymm15 1026 vaddpd %ymm3, %ymm15, %ymm3 1027 1028 subl $1, %r10d 1029 addq $8, %r12 1030 addq $8, %r15 1031 addq $32, %r11 1032 1033 cmpl $0, %r10d 1034 jg 3b // clean up loop 1035 1036 10372: // return 1038 1039 vaddpd %ymm4, %ymm0, %ymm0 1040 vaddpd %ymm5, %ymm1, %ymm1 1041 vaddpd %ymm6, %ymm2, %ymm2 1042 vaddpd %ymm7, %ymm3, %ymm3 1043 10445: // return 1045 1046#if MACRO_LEVEL>=2 1047 .endm 1048#else 1049 ret 1050 1051 FUN_END(inner_kernel_dgemm_nn_4x4_lib4c) 1052#endif 1053 1054 1055 1056 1057 1058// common inner routine with file scope 1059// 1060// input arguments: 1061// r10d <- k 1062// r11 <- A 1063// r12 <- B 1064// r13 <- ldb 1065// ymm0 <- [d00 d10 d20 d30] 1066// ymm1 <- [d01 d11 d21 d31] 1067// ymm2 <- [d02 d12 d22 d32] 1068// ymm3 <- [d03 d13 d23 d33] 1069// 1070// output arguments: 1071 1072#if MACRO_LEVEL>=2 1073 .macro INNER_KERNEL_DGEMM_NN_4X3_LIB4C 1074#else 1075 .p2align 4,,15 1076 FUN_START(inner_kernel_dgemm_nn_4x3_lib4c) 1077#endif 1078 1079 cmpl $0, %r10d 1080 jle 5f // return 1081 1082 movq %r12, %r15 1083 addq %r13, %r15 1084 addq %r13, %r15 // B+2*ldb 1085 1086 // preload 1087 1088 vxorpd %ymm4, %ymm4, %ymm4 1089 vmovapd %ymm4, %ymm5 1090 vmovapd %ymm4, %ymm6 1091 1092 cmpl $4, %r10d 1093 jle 0f // consider clean-up loop 1094 1095 // main loop 1096 .p2align 3 10971: // main loop 1098 1099// prefetcht0 0(%r12, %r13, 2) // software prefetch 1100// prefetcht0 64(%r12, %r13, 2) // software prefetch 1101 1102 // unroll 0 1103 vmovupd 0(%r11), %ymm13 // A 1104 vbroadcastsd 0(%r12), %ymm12 // B 1105 vmulpd %ymm13, %ymm12, %ymm15 1106 vaddpd %ymm0, %ymm15, %ymm0 1107 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1108 vmulpd %ymm13, %ymm12, %ymm15 1109 vaddpd %ymm1, %ymm15, %ymm1 1110 vbroadcastsd 0(%r15), %ymm12 // B 1111 vmulpd %ymm13, %ymm12, %ymm15 1112 vaddpd %ymm2, %ymm15, %ymm2 1113 1114 // unroll 1 1115 vmovupd 32(%r11), %ymm13 // A 1116 vbroadcastsd 8(%r12), %ymm12 // B 1117 vmulpd %ymm13, %ymm12, %ymm15 1118 vaddpd %ymm4, %ymm15, %ymm4 1119 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 1120 vmulpd %ymm13, %ymm12, %ymm15 1121 vaddpd %ymm5, %ymm15, %ymm5 1122 vbroadcastsd 8(%r15), %ymm12 // B 1123 vmulpd %ymm13, %ymm12, %ymm15 1124 vaddpd %ymm6, %ymm15, %ymm6 1125 1126 // unroll 2 1127 vmovupd 64(%r11), %ymm13 // A 1128 vbroadcastsd 16(%r12), %ymm12 // B 1129 vmulpd %ymm13, %ymm12, %ymm15 1130 vaddpd %ymm0, %ymm15, %ymm0 1131 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1132 vmulpd %ymm13, %ymm12, %ymm15 1133 vaddpd %ymm1, %ymm15, %ymm1 1134 vbroadcastsd 16(%r15), %ymm12 // B 1135 vmulpd %ymm13, %ymm12, %ymm15 1136 vaddpd %ymm2, %ymm15, %ymm2 1137 
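
	// note (added): a minimal C sketch of what this nn inner kernel accumulates,
	// assuming A in 4-wide panel (lib4) format and B column-major with leading
	// dimension ldb (variable names are illustrative only):
	//
	//   for(kk=0; kk<k; kk++)
	//     for(jj=0; jj<3; jj++)
	//       for(ii=0; ii<4; ii++)
	//         acc[ii+4*jj] += A[ii+4*kk] * B[kk+ldb*jj];
	//
	// even unrolls accumulate into ymm0-ymm2 and odd unrolls into ymm4-ymm6; the two
	// partial sums are only merged at label "2: // return", which keeps the
	// accumulation dependency chains of consecutive unrolls independent
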
1138 // unroll 3 1139 vmovupd 96(%r11), %ymm13 // A 1140 vbroadcastsd 24(%r12), %ymm12 // B 1141 vmulpd %ymm13, %ymm12, %ymm15 1142 vaddpd %ymm4, %ymm15, %ymm4 1143 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1144 vmulpd %ymm13, %ymm12, %ymm15 1145 vaddpd %ymm5, %ymm15, %ymm5 1146 vbroadcastsd 24(%r15), %ymm12 // B 1147 vmulpd %ymm13, %ymm12, %ymm15 1148 vaddpd %ymm6, %ymm15, %ymm6 1149 1150 subl $4, %r10d 1151 addq $32, %r12 1152 addq $32, %r15 1153 addq $128, %r11 1154 1155 cmpl $4, %r10d 1156 jg 1b // main loop 1157 1158 11590: // consider clean4-up 1160 1161 cmpl $3, %r10d 1162 jle 4f // clean1 1163 1164 // unroll 0 1165 vmovupd 0(%r11), %ymm13 // A 1166 vbroadcastsd 0(%r12), %ymm12 // B 1167 vmulpd %ymm13, %ymm12, %ymm15 1168 vaddpd %ymm0, %ymm15, %ymm0 1169 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1170 vmulpd %ymm13, %ymm12, %ymm15 1171 vaddpd %ymm1, %ymm15, %ymm1 1172 vbroadcastsd 0(%r15), %ymm12 // B 1173 vmulpd %ymm13, %ymm12, %ymm15 1174 vaddpd %ymm2, %ymm15, %ymm2 1175 1176 // unroll 1 1177 vmovupd 32(%r11), %ymm13 // A 1178 vbroadcastsd 8(%r12), %ymm12 // B 1179 vmulpd %ymm13, %ymm12, %ymm15 1180 vaddpd %ymm4, %ymm15, %ymm4 1181 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 1182 vmulpd %ymm13, %ymm12, %ymm15 1183 vaddpd %ymm5, %ymm15, %ymm5 1184 vbroadcastsd 8(%r15), %ymm12 // B 1185 vmulpd %ymm13, %ymm12, %ymm15 1186 vaddpd %ymm6, %ymm15, %ymm6 1187 1188 // unroll 2 1189 vmovupd 64(%r11), %ymm13 // A 1190 vbroadcastsd 16(%r12), %ymm12 // B 1191 vmulpd %ymm13, %ymm12, %ymm15 1192 vaddpd %ymm0, %ymm15, %ymm0 1193 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1194 vmulpd %ymm13, %ymm12, %ymm15 1195 vaddpd %ymm1, %ymm15, %ymm1 1196 vbroadcastsd 16(%r15), %ymm12 // B 1197 vmulpd %ymm13, %ymm12, %ymm15 1198 vaddpd %ymm2, %ymm15, %ymm2 1199 1200 // unroll 3 1201 vmovupd 96(%r11), %ymm13 // A 1202 vbroadcastsd 24(%r12), %ymm12 // B 1203 vmulpd %ymm13, %ymm12, %ymm15 1204 vaddpd %ymm4, %ymm15, %ymm4 1205 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1206 vmulpd %ymm13, %ymm12, %ymm15 1207 vaddpd %ymm5, %ymm15, %ymm5 1208 vbroadcastsd 24(%r15), %ymm12 // B 1209 vmulpd %ymm13, %ymm12, %ymm15 1210 vaddpd %ymm6, %ymm15, %ymm6 1211 1212 subl $4, %r10d 1213 addq $32, %r12 1214 addq $32, %r15 1215 addq $128, %r11 1216 1217 jmp 2f // return 1218 1219 12204: // consider clean1-up loop 1221 1222 cmpl $0, %r10d 1223 jle 2f // return 1224 1225 // clean-up loop 12263: // clean up loop 1227 1228 // unroll 0 1229 vmovupd 0(%r11), %ymm13 // A 1230 vbroadcastsd 0(%r12), %ymm12 // B 1231 vmulpd %ymm13, %ymm12, %ymm15 1232 vaddpd %ymm0, %ymm15, %ymm0 1233 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1234 vmulpd %ymm13, %ymm12, %ymm15 1235 vaddpd %ymm1, %ymm15, %ymm1 1236 vbroadcastsd 0(%r15), %ymm12 // B 1237 vmulpd %ymm13, %ymm12, %ymm15 1238 vaddpd %ymm2, %ymm15, %ymm2 1239 1240 subl $1, %r10d 1241 addq $8, %r12 1242 addq $8, %r15 1243 addq $32, %r11 1244 1245 cmpl $0, %r10d 1246 jg 3b // clean up loop 1247 1248 12492: // return 1250 1251 vaddpd %ymm4, %ymm0, %ymm0 1252 vaddpd %ymm5, %ymm1, %ymm1 1253 vaddpd %ymm6, %ymm2, %ymm2 1254 12555: // return 1256 1257#if MACRO_LEVEL>=2 1258 .endm 1259#else 1260 ret 1261 1262 FUN_END(inner_kernel_dgemm_nn_4x3_lib4c) 1263#endif 1264 1265 1266 1267 1268 1269// common inner routine with file scope 1270// 1271// input arguments: 1272// r10d <- k 1273// r11 <- A 1274// r12 <- B 1275// r13 <- ldb 1276// ymm0 <- [d00 d10 d20 d30] 1277// ymm1 <- [d01 d11 d21 d31] 1278// ymm2 <- [d02 d12 d22 d32] 1279// ymm3 <- [d03 d13 d23 d33] 1280// 1281// output arguments: 1282 1283#if 
MACRO_LEVEL>=2 1284 .macro INNER_KERNEL_DGEMM_NN_4X2_LIB4C 1285#else 1286 .p2align 4,,15 1287 FUN_START(inner_kernel_dgemm_nn_4x2_lib4c) 1288#endif 1289 1290 cmpl $0, %r10d 1291 jle 5f // return 1292 1293 movq %r12, %r15 1294 addq %r13, %r15 1295 addq %r13, %r15 // B+2*ldb 1296 1297 // preload 1298 1299 vxorpd %ymm4, %ymm4, %ymm4 1300 vmovapd %ymm4, %ymm5 1301 1302 cmpl $4, %r10d 1303 jle 0f // consider clean-up loop 1304 1305 // main loop 1306 .p2align 3 13071: // main loop 1308 1309// prefetcht0 0(%r12, %r13, 2) // software prefetch 1310// prefetcht0 64(%r12, %r13, 2) // software prefetch 1311 1312 // unroll 0 1313 vmovupd 0(%r11), %ymm13 // A 1314 vbroadcastsd 0(%r12), %ymm12 // B 1315 vmulpd %ymm13, %ymm12, %ymm15 1316 vaddpd %ymm0, %ymm15, %ymm0 1317 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1318 vmulpd %ymm13, %ymm12, %ymm15 1319 vaddpd %ymm1, %ymm15, %ymm1 1320 1321 // unroll 1 1322 vmovupd 32(%r11), %ymm13 // A 1323 vbroadcastsd 8(%r12), %ymm12 // B 1324 vmulpd %ymm13, %ymm12, %ymm15 1325 vaddpd %ymm4, %ymm15, %ymm4 1326 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 1327 vmulpd %ymm13, %ymm12, %ymm15 1328 vaddpd %ymm5, %ymm15, %ymm5 1329 1330 // unroll 2 1331 vmovupd 64(%r11), %ymm13 // A 1332 vbroadcastsd 16(%r12), %ymm12 // B 1333 vmulpd %ymm13, %ymm12, %ymm15 1334 vaddpd %ymm0, %ymm15, %ymm0 1335 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1336 vmulpd %ymm13, %ymm12, %ymm15 1337 vaddpd %ymm1, %ymm15, %ymm1 1338 1339 // unroll 3 1340 vmovupd 96(%r11), %ymm13 // A 1341 vbroadcastsd 24(%r12), %ymm12 // B 1342 vmulpd %ymm13, %ymm12, %ymm15 1343 vaddpd %ymm4, %ymm15, %ymm4 1344 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1345 vmulpd %ymm13, %ymm12, %ymm15 1346 vaddpd %ymm5, %ymm15, %ymm5 1347 1348 subl $4, %r10d 1349 addq $32, %r12 1350 addq $32, %r15 1351 addq $128, %r11 1352 1353 cmpl $4, %r10d 1354 jg 1b // main loop 1355 1356 13570: // consider clean4-up 1358 1359 cmpl $3, %r10d 1360 jle 4f // clean1 1361 1362 // unroll 0 1363 vmovupd 0(%r11), %ymm13 // A 1364 vbroadcastsd 0(%r12), %ymm12 // B 1365 vmulpd %ymm13, %ymm12, %ymm15 1366 vaddpd %ymm0, %ymm15, %ymm0 1367 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1368 vmulpd %ymm13, %ymm12, %ymm15 1369 vaddpd %ymm1, %ymm15, %ymm1 1370 1371 // unroll 1 1372 vmovupd 32(%r11), %ymm13 // A 1373 vbroadcastsd 8(%r12), %ymm12 // B 1374 vmulpd %ymm13, %ymm12, %ymm15 1375 vaddpd %ymm4, %ymm15, %ymm4 1376 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 1377 vmulpd %ymm13, %ymm12, %ymm15 1378 vaddpd %ymm5, %ymm15, %ymm5 1379 1380 // unroll 2 1381 vmovupd 64(%r11), %ymm13 // A 1382 vbroadcastsd 16(%r12), %ymm12 // B 1383 vmulpd %ymm13, %ymm12, %ymm15 1384 vaddpd %ymm0, %ymm15, %ymm0 1385 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1386 vmulpd %ymm13, %ymm12, %ymm15 1387 vaddpd %ymm1, %ymm15, %ymm1 1388 1389 // unroll 3 1390 vmovupd 96(%r11), %ymm13 // A 1391 vbroadcastsd 24(%r12), %ymm12 // B 1392 vmulpd %ymm13, %ymm12, %ymm15 1393 vaddpd %ymm4, %ymm15, %ymm4 1394 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1395 vmulpd %ymm13, %ymm12, %ymm15 1396 vaddpd %ymm5, %ymm15, %ymm5 1397 1398 subl $4, %r10d 1399 addq $32, %r12 1400 addq $32, %r15 1401 addq $128, %r11 1402 1403 jmp 2f // return 1404 1405 14064: // consider clean1-up loop 1407 1408 cmpl $0, %r10d 1409 jle 2f // return 1410 1411 // clean-up loop 14123: // clean up loop 1413 1414 // unroll 0 1415 vmovupd 0(%r11), %ymm13 // A 1416 vbroadcastsd 0(%r12), %ymm12 // B 1417 vmulpd %ymm13, %ymm12, %ymm15 1418 vaddpd %ymm0, %ymm15, %ymm0 1419 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1420 vmulpd %ymm13, 
%ymm12, %ymm15 1421 vaddpd %ymm1, %ymm15, %ymm1 1422 1423 subl $1, %r10d 1424 addq $8, %r12 1425 addq $8, %r15 1426 addq $32, %r11 1427 1428 cmpl $0, %r10d 1429 jg 3b // clean up loop 1430 1431 14322: // return 1433 1434 vaddpd %ymm4, %ymm0, %ymm0 1435 vaddpd %ymm5, %ymm1, %ymm1 1436 14375: // return 1438 1439#if MACRO_LEVEL>=2 1440 .endm 1441#else 1442 ret 1443 1444 FUN_END(inner_kernel_dgemm_nn_4x2_lib4c) 1445#endif 1446 1447 1448 1449 1450 1451// common inner routine with file scope 1452// 1453// input arguments: 1454// r10d <- k 1455// r11 <- A 1456// r12 <- B 1457// r13 <- ldb 1458// ymm0 <- [d00 d10 d20 d30] 1459// ymm1 <- [d01 d11 d21 d31] 1460// ymm2 <- [d02 d12 d22 d32] 1461// ymm3 <- [d03 d13 d23 d33] 1462// 1463// output arguments: 1464 1465#if MACRO_LEVEL>=2 1466 .macro INNER_KERNEL_DGEMM_NN_4X1_LIB4C 1467#else 1468 .p2align 4,,15 1469 FUN_START(inner_kernel_dgemm_nn_4x1_lib4c) 1470#endif 1471 1472 cmpl $0, %r10d 1473 jle 5f // return 1474 1475 movq %r12, %r15 1476 addq %r13, %r15 1477 addq %r13, %r15 // B+2*ldb 1478 1479 // preload 1480 1481 vxorpd %ymm4, %ymm4, %ymm4 1482 1483 cmpl $4, %r10d 1484 jle 0f // consider clean-up loop 1485 1486 // main loop 1487 .p2align 3 14881: // main loop 1489 1490// prefetcht0 0(%r12, %r13, 2) // software prefetch 1491// prefetcht0 64(%r12, %r13, 2) // software prefetch 1492 1493 // unroll 0 1494 vmovupd 0(%r11), %ymm13 // A 1495 vbroadcastsd 0(%r12), %ymm12 // B 1496 vmulpd %ymm13, %ymm12, %ymm15 1497 vaddpd %ymm0, %ymm15, %ymm0 1498 1499 // unroll 1 1500 vmovupd 32(%r11), %ymm13 // A 1501 vbroadcastsd 8(%r12), %ymm12 // B 1502 vmulpd %ymm13, %ymm12, %ymm15 1503 vaddpd %ymm4, %ymm15, %ymm4 1504 1505 // unroll 2 1506 vmovupd 64(%r11), %ymm13 // A 1507 vbroadcastsd 16(%r12), %ymm12 // B 1508 vmulpd %ymm13, %ymm12, %ymm15 1509 vaddpd %ymm0, %ymm15, %ymm0 1510 1511 // unroll 3 1512 vmovupd 96(%r11), %ymm13 // A 1513 vbroadcastsd 24(%r12), %ymm12 // B 1514 vmulpd %ymm13, %ymm12, %ymm15 1515 vaddpd %ymm4, %ymm15, %ymm4 1516 1517 subl $4, %r10d 1518 addq $32, %r12 1519 addq $32, %r15 1520 addq $128, %r11 1521 1522 cmpl $4, %r10d 1523 jg 1b // main loop 1524 1525 15260: // consider clean4-up 1527 1528 cmpl $3, %r10d 1529 jle 4f // clean1 1530 1531 // unroll 0 1532 vmovupd 0(%r11), %ymm13 // A 1533 vbroadcastsd 0(%r12), %ymm12 // B 1534 vmulpd %ymm13, %ymm12, %ymm15 1535 vaddpd %ymm0, %ymm15, %ymm0 1536 1537 // unroll 1 1538 vmovupd 32(%r11), %ymm13 // A 1539 vbroadcastsd 8(%r12), %ymm12 // B 1540 vmulpd %ymm13, %ymm12, %ymm15 1541 vaddpd %ymm4, %ymm15, %ymm4 1542 1543 // unroll 2 1544 vmovupd 64(%r11), %ymm13 // A 1545 vbroadcastsd 16(%r12), %ymm12 // B 1546 vmulpd %ymm13, %ymm12, %ymm15 1547 vaddpd %ymm0, %ymm15, %ymm0 1548 1549 // unroll 3 1550 vmovupd 96(%r11), %ymm13 // A 1551 vbroadcastsd 24(%r12), %ymm12 // B 1552 vmulpd %ymm13, %ymm12, %ymm15 1553 vaddpd %ymm4, %ymm15, %ymm4 1554 1555 subl $4, %r10d 1556 addq $32, %r12 1557 addq $32, %r15 1558 addq $128, %r11 1559 1560 jmp 2f // return 1561 1562 15634: // consider clean1-up loop 1564 1565 cmpl $0, %r10d 1566 jle 2f // return 1567 1568 // clean-up loop 15693: // clean up loop 1570 1571 // unroll 0 1572 vmovupd 0(%r11), %ymm13 // A 1573 vbroadcastsd 0(%r12), %ymm12 // B 1574 vmulpd %ymm13, %ymm12, %ymm15 1575 vaddpd %ymm0, %ymm15, %ymm0 1576 1577 subl $1, %r10d 1578 addq $8, %r12 1579 addq $8, %r15 1580 addq $32, %r11 1581 1582 cmpl $0, %r10d 1583 jg 3b // clean up loop 1584 1585 15862: // return 1587 1588 vaddpd %ymm4, %ymm0, %ymm0 1589 15905: // return 1591 1592#if MACRO_LEVEL>=2 1593 .endm 
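
// note (added): as in the other inner routines of this file, with MACRO_LEVEL>=2 the
// body above is emitted as an assembler macro (presumably so the outer kernels can
// expand it inline), while the #else branch below assembles it as a standalone local
// function (FUN_START/ret/FUN_END) to be called instead
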
1594#else 1595 ret 1596 1597 FUN_END(inner_kernel_dgemm_nn_4x1_lib4c) 1598#endif 1599 1600 1601 1602 1603 1604// common inner routine with file scope 1605// 1606// edge for B lower triangular 1607// 1608// input arguments: 1609// r10d <- k 1610// r11 <- A 1611// r12 <- B 1612// r13 <- ldb 1613// ymm0 <- [d00 d10 d20 d30] 1614// ymm1 <- [d01 d11 d21 d31] 1615// ymm2 <- [d02 d12 d22 d32] 1616// ymm3 <- [d03 d13 d23 d33] 1617// 1618// output arguments: 1619 1620#if MACRO_LEVEL>=1 1621 .macro INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C 1622#else 1623 .p2align 4,,15 1624 FUN_START(inner_edge_dtrmm_nn_rl_4x4_lib4c) 1625#endif 1626 1627 movq %r12, %r15 1628 addq %r13, %r15 1629 addq %r13, %r15 // B+2*ldb 1630 1631 // unroll 0 1632 vmovupd 0(%r11), %ymm13 // A 1633 vbroadcastsd 0(%r12), %ymm12 // B 1634 vmulpd %ymm13, %ymm12, %ymm15 1635 vaddpd %ymm0, %ymm15, %ymm0 1636 1637 // unroll 1 1638 vmovupd 32(%r11), %ymm13 // A 1639 vbroadcastsd 8(%r12), %ymm12 // B 1640 vmulpd %ymm13, %ymm12, %ymm15 1641 vaddpd %ymm0, %ymm15, %ymm0 1642 vbroadcastsd 8(%r12, %r13, 1), %ymm12 // B 1643 vmulpd %ymm13, %ymm12, %ymm15 1644 vaddpd %ymm1, %ymm15, %ymm1 1645 1646 // unroll 2 1647 vmovupd 64(%r11), %ymm13 // A 1648 vbroadcastsd 16(%r12), %ymm12 // B 1649 vmulpd %ymm13, %ymm12, %ymm15 1650 vaddpd %ymm0, %ymm15, %ymm0 1651 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1652 vmulpd %ymm13, %ymm12, %ymm15 1653 vaddpd %ymm1, %ymm15, %ymm1 1654 vbroadcastsd 16(%r15), %ymm12 // B 1655 vmulpd %ymm13, %ymm12, %ymm15 1656 vaddpd %ymm2, %ymm15, %ymm2 1657 1658 // unroll 3 1659 vmovupd 96(%r11), %ymm13 // A 1660 vbroadcastsd 24(%r12), %ymm12 // B 1661 vmulpd %ymm13, %ymm12, %ymm15 1662 vaddpd %ymm0, %ymm15, %ymm0 1663 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1664 vmulpd %ymm13, %ymm12, %ymm15 1665 vaddpd %ymm1, %ymm15, %ymm1 1666 vbroadcastsd 24(%r15), %ymm12 // B 1667 vmulpd %ymm13, %ymm12, %ymm15 1668 vaddpd %ymm2, %ymm15, %ymm2 1669 vbroadcastsd 24(%r15, %r13, 1), %ymm12 // B 1670 vmulpd %ymm13, %ymm12, %ymm15 1671 vaddpd %ymm3, %ymm15, %ymm3 1672 1673 subl $4, %r10d 1674 addq $32, %r12 1675 addq $32, %r15 1676 addq $128, %r11 1677 1678#if MACRO_LEVEL>=1 1679 .endm 1680#else 1681 ret 1682 1683 FUN_END(inner_edge_dtrmm_nn_rl_4x4_lib4c) 1684#endif 1685 1686 1687 1688 1689 1690// common inner routine with file scope 1691// 1692// edge for B lower triangular 1693// 1694// input arguments: 1695// r10d <- k 1696// r11 <- A 1697// r12 <- B 1698// r13 <- ldb 1699// ymm0 <- [d00 d10 d20 d30] 1700// ymm1 <- [d01 d11 d21 d31] 1701// ymm2 <- [d02 d12 d22 d32] 1702// ymm3 <- [d03 d13 d23 d33] 1703// 1704// output arguments: 1705 1706#if MACRO_LEVEL>=1 1707 .macro INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C 1708#else 1709 .p2align 4,,15 1710 FUN_START(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c) 1711#endif 1712 1713 cmpl $0, %r10d 1714 jle 0f // end 1715 1716 movq %r12, %r15 1717 addq %r13, %r15 1718 addq %r13, %r15 // B+2*ldb 1719 1720 // unroll 0 1721 vmovupd 0(%r11), %ymm13 // A 1722 vbroadcastsd 0(%r12), %ymm12 // B 1723 vmulpd %ymm13, %ymm12, %ymm15 1724 vaddpd %ymm0, %ymm15, %ymm0 1725 1726 subl $1, %r10d 1727 addq $8, %r12 1728 addq $8, %r15 1729 addq $32, %r11 1730 1731 cmpl $0, %r10d 1732 jle 0f // end 1733 1734 // unroll 1 1735 vmovupd 0(%r11), %ymm13 // A 1736 vbroadcastsd 0(%r12), %ymm12 // B 1737 vmulpd %ymm13, %ymm12, %ymm15 1738 vaddpd %ymm0, %ymm15, %ymm0 1739 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1740 vmulpd %ymm13, %ymm12, %ymm15 1741 vaddpd %ymm1, %ymm15, %ymm1 1742 1743 subl $1, %r10d 1744 addq $8, %r12 1745 addq $8, %r15 1746 addq $32, 
%r11 1747 1748 cmpl $0, %r10d 1749 jle 0f // end 1750 1751 // unroll 2 1752 vmovupd 0(%r11), %ymm13 // A 1753 vbroadcastsd 0(%r12), %ymm12 // B 1754 vmulpd %ymm13, %ymm12, %ymm15 1755 vaddpd %ymm0, %ymm15, %ymm0 1756 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1757 vmulpd %ymm13, %ymm12, %ymm15 1758 vaddpd %ymm1, %ymm15, %ymm1 1759 vbroadcastsd 0(%r15), %ymm12 // B 1760 vmulpd %ymm13, %ymm12, %ymm15 1761 vaddpd %ymm2, %ymm15, %ymm2 1762 1763 subl $1, %r10d 1764 addq $8, %r12 1765 addq $8, %r15 1766 addq $32, %r11 1767 1768 cmpl $0, %r10d 1769 jle 0f // end 1770 1771 // unroll 3 1772 vmovupd 0(%r11), %ymm13 // A 1773 vbroadcastsd 0(%r12), %ymm12 // B 1774 vmulpd %ymm13, %ymm12, %ymm15 1775 vaddpd %ymm0, %ymm15, %ymm0 1776 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1777 vmulpd %ymm13, %ymm12, %ymm15 1778 vaddpd %ymm1, %ymm15, %ymm1 1779 vbroadcastsd 0(%r15), %ymm12 // B 1780 vmulpd %ymm13, %ymm12, %ymm15 1781 vaddpd %ymm2, %ymm15, %ymm2 1782 vbroadcastsd 0(%r15, %r13, 1), %ymm12 // B 1783 vmulpd %ymm13, %ymm12, %ymm15 1784 vaddpd %ymm3, %ymm15, %ymm3 1785 1786 subl $1, %r10d 1787 addq $8, %r12 1788 addq $8, %r15 1789 addq $32, %r11 1790 17910: 1792 1793#if MACRO_LEVEL>=1 1794 .endm 1795#else 1796 ret 1797 1798 FUN_END(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c) 1799#endif 1800 1801 1802 1803 1804 1805// common inner routine with file scope 1806// 1807// edge for B lower triangular 1808// 1809// input arguments: 1810// r10d <- k 1811// r11 <- A 1812// r12 <- B 1813// r13 <- ldb 1814// ymm0 <- [d00 d10 d20 d30] 1815// ymm1 <- [d01 d11 d21 d31] 1816// ymm2 <- [d02 d12 d22 d32] 1817// ymm3 <- [d03 d13 d23 d33] 1818// 1819// output arguments: 1820 1821#if MACRO_LEVEL>=1 1822 .macro INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C 1823#else 1824 .p2align 4,,15 1825 FUN_START(inner_edge_dtrmm_nn_rl_one_4x4_lib4c) 1826#endif 1827 1828 movq %r12, %r15 1829 addq %r13, %r15 1830 addq %r13, %r15 // B+2*ldb 1831 1832 // unroll 0 1833 vmovupd 0(%r11), %ymm13 // A 1834 vaddpd %ymm0, %ymm13, %ymm0 1835 1836 // unroll 1 1837 vmovupd 32(%r11), %ymm13 // A 1838 vbroadcastsd 8(%r12), %ymm12 // B 1839 vmulpd %ymm13, %ymm12, %ymm15 1840 vaddpd %ymm0, %ymm15, %ymm0 1841 vaddpd %ymm1, %ymm13, %ymm1 1842 1843 // unroll 2 1844 vmovupd 64(%r11), %ymm13 // A 1845 vbroadcastsd 16(%r12), %ymm12 // B 1846 vmulpd %ymm13, %ymm12, %ymm15 1847 vaddpd %ymm0, %ymm15, %ymm0 1848 vbroadcastsd 16(%r12, %r13, 1), %ymm12 // B 1849 vmulpd %ymm13, %ymm12, %ymm15 1850 vaddpd %ymm1, %ymm15, %ymm1 1851 vaddpd %ymm2, %ymm13, %ymm2 1852 1853 // unroll 3 1854 vmovupd 96(%r11), %ymm13 // A 1855 vbroadcastsd 24(%r12), %ymm12 // B 1856 vmulpd %ymm13, %ymm12, %ymm15 1857 vaddpd %ymm0, %ymm15, %ymm0 1858 vbroadcastsd 24(%r12, %r13, 1), %ymm12 // B 1859 vmulpd %ymm13, %ymm12, %ymm15 1860 vaddpd %ymm1, %ymm15, %ymm1 1861 vbroadcastsd 24(%r15), %ymm12 // B 1862 vmulpd %ymm13, %ymm12, %ymm15 1863 vaddpd %ymm2, %ymm15, %ymm2 1864 vaddpd %ymm3, %ymm13, %ymm3 1865 1866 subl $4, %r10d 1867 addq $32, %r12 1868 addq $32, %r15 1869 addq $128, %r11 1870 1871#if MACRO_LEVEL>=1 1872 .endm 1873#else 1874 ret 1875 1876 FUN_END(inner_edge_dtrmm_nn_rl_one_4x4_lib4c) 1877#endif 1878 1879 1880 1881 1882 1883// common inner routine with file scope 1884// 1885// edge for B lower triangular 1886// 1887// input arguments: 1888// r10d <- k 1889// r11 <- A 1890// r12 <- B 1891// r13 <- ldb 1892// ymm0 <- [d00 d10 d20 d30] 1893// ymm1 <- [d01 d11 d21 d31] 1894// ymm2 <- [d02 d12 d22 d32] 1895// ymm3 <- [d03 d13 d23 d33] 1896// 1897// output arguments: 1898 1899#if MACRO_LEVEL>=1 1900 .macro 
INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C 1901#else 1902 .p2align 4,,15 1903 FUN_START(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c) 1904#endif 1905 1906 cmpl $0, %r10d 1907 jle 0f // end 1908 1909 movq %r12, %r15 1910 addq %r13, %r15 1911 addq %r13, %r15 // B+2*ldb 1912 1913 // unroll 0 1914 vmovupd 0(%r11), %ymm13 // A 1915 vaddpd %ymm0, %ymm13, %ymm0 1916 1917 subl $1, %r10d 1918 addq $8, %r12 1919 addq $8, %r15 1920 addq $32, %r11 1921 1922 cmpl $0, %r10d 1923 jle 0f // end 1924 1925 // unroll 1 1926 vmovupd 0(%r11), %ymm13 // A 1927 vbroadcastsd 0(%r12), %ymm12 // B 1928 vmulpd %ymm13, %ymm12, %ymm15 1929 vaddpd %ymm0, %ymm15, %ymm0 1930 vaddpd %ymm1, %ymm13, %ymm1 1931 1932 subl $1, %r10d 1933 addq $8, %r12 1934 addq $8, %r15 1935 addq $32, %r11 1936 1937 cmpl $0, %r10d 1938 jle 0f // end 1939 1940 // unroll 2 1941 vmovupd 0(%r11), %ymm13 // A 1942 vbroadcastsd 0(%r12), %ymm12 // B 1943 vmulpd %ymm13, %ymm12, %ymm15 1944 vaddpd %ymm0, %ymm15, %ymm0 1945 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1946 vmulpd %ymm13, %ymm12, %ymm15 1947 vaddpd %ymm1, %ymm15, %ymm1 1948 vaddpd %ymm2, %ymm13, %ymm2 1949 1950 subl $1, %r10d 1951 addq $8, %r12 1952 addq $8, %r15 1953 addq $32, %r11 1954 1955 cmpl $0, %r10d 1956 jle 0f // end 1957 1958 // unroll 3 1959 vmovupd 0(%r11), %ymm13 // A 1960 vbroadcastsd 0(%r12), %ymm12 // B 1961 vmulpd %ymm13, %ymm12, %ymm15 1962 vaddpd %ymm0, %ymm15, %ymm0 1963 vbroadcastsd 0(%r12, %r13, 1), %ymm12 // B 1964 vmulpd %ymm13, %ymm12, %ymm15 1965 vaddpd %ymm1, %ymm15, %ymm1 1966 vbroadcastsd 0(%r15), %ymm12 // B 1967 vmulpd %ymm13, %ymm12, %ymm15 1968 vaddpd %ymm2, %ymm15, %ymm2 1969 vaddpd %ymm3, %ymm13, %ymm3 1970 1971 subl $1, %r10d 1972 addq $8, %r12 1973 addq $8, %r15 1974 addq $32, %r11 1975 19760: 1977 1978#if MACRO_LEVEL>=1 1979 .endm 1980#else 1981 ret 1982 1983 FUN_END(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c) 1984#endif 1985 1986 1987 1988 1989 1990// common inner routine with file scope 1991// 1992// edge for B upper triangular 1993// 1994// input arguments: 1995// r10d <- k 1996// r11 <- A 1997// r12 <- B 1998// r13 <- ldb 1999// ymm0 <- [d00 d10 d20 d30] 2000// ymm1 <- [d01 d11 d21 d31] 2001// ymm2 <- [d02 d12 d22 d32] 2002// ymm3 <- [d03 d13 d23 d33] 2003// 2004// output arguments: 2005 2006#if MACRO_LEVEL>=1 2007 .macro INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C 2008#else 2009 .p2align 4,,15 2010 FUN_START(inner_edge_dtrmm_nn_ru_4x4_lib4c) 2011#endif 2012 2013 movq %r12, %r15 2014 addq %r13, %r15 2015 addq %r13, %r15 // B+2*ldb 2016 2017 // unroll 0 2018 vmovupd 0(%r11), %ymm13 // A 2019 vbroadcastsd 0(%r12), %ymm12 // B 2020 vmulpd %ymm13, %ymm12, %ymm15 2021 vaddpd %ymm0, %ymm15, %ymm0 2022 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2023 vmulpd %ymm13, %ymm12, %ymm15 2024 vaddpd %ymm1, %ymm15, %ymm1 2025 vbroadcastsd 0(%r15), %ymm12 // B 2026 vmulpd %ymm13, %ymm12, %ymm15 2027 vaddpd %ymm2, %ymm15, %ymm2 2028 vbroadcastsd 0(%r15, %r13), %ymm12 // B 2029 vmulpd %ymm13, %ymm12, %ymm15 2030 vaddpd %ymm3, %ymm15, %ymm3 2031 2032 // unroll 1 2033 vmovupd 32(%r11), %ymm13 // A 2034 vbroadcastsd 8(%r12, %r13), %ymm12 // B 2035 vmulpd %ymm13, %ymm12, %ymm15 2036 vaddpd %ymm1, %ymm15, %ymm1 2037 vbroadcastsd 8(%r15), %ymm12 // B 2038 vmulpd %ymm13, %ymm12, %ymm15 2039 vaddpd %ymm2, %ymm15, %ymm2 2040 vbroadcastsd 8(%r15, %r13), %ymm12 // B 2041 vmulpd %ymm13, %ymm12, %ymm15 2042 vaddpd %ymm3, %ymm15, %ymm3 2043 2044 // unroll 2 2045 vmovupd 64(%r11), %ymm13 // A 2046 vbroadcastsd 16(%r15), %ymm12 // B 2047 vmulpd %ymm13, %ymm12, %ymm15 2048 vaddpd %ymm2, %ymm15, %ymm2 2049 
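
	// note (added): B is upper triangular in this edge routine, so row kk of B only
	// has entries in columns jj >= kk; this is why each successive unroll drops the
	// broadcasts to the left of the diagonal instead of multiplying by explicit zeros
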
vbroadcastsd 16(%r15, %r13), %ymm12 // B 2050 vmulpd %ymm13, %ymm12, %ymm15 2051 vaddpd %ymm3, %ymm15, %ymm3 2052 2053 // unroll 3 2054 vmovupd 96(%r11), %ymm13 // A 2055 vbroadcastsd 24(%r15, %r13), %ymm12 // B 2056 vmulpd %ymm13, %ymm12, %ymm15 2057 vaddpd %ymm3, %ymm15, %ymm3 2058 2059 subl $4, %r10d 2060 addq $32, %r12 2061 addq $32, %r15 2062 addq $128, %r11 2063 2064#if MACRO_LEVEL>=1 2065 .endm 2066#else 2067 ret 2068 2069 FUN_END(inner_edge_dtrmm_nn_ru_4x4_lib4c) 2070#endif 2071 2072 2073 2074 2075 2076// common inner routine with file scope 2077// 2078// edge for B upper triangular 2079// 2080// input arguments: 2081// r10d <- k 2082// r11 <- A 2083// r12 <- B 2084// r13 <- ldb 2085// r14 <- n1 2086// ymm0 <- [d00 d10 d20 d30] 2087// ymm1 <- [d01 d11 d21 d31] 2088// ymm2 <- [d02 d12 d22 d32] 2089// ymm3 <- [d03 d13 d23 d33] 2090// 2091// output arguments: 2092 2093#if MACRO_LEVEL>=1 2094 .macro INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C 2095#else 2096 .p2align 4,,15 2097 FUN_START(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c) 2098#endif 2099 2100 cmpl $0, %r14d 2101 jle 0f // end 2102 2103 movq %r12, %r15 2104 addq %r13, %r15 2105 addq %r13, %r15 // B+2*ldb 2106 2107 cmpl $4, %r14d 2108 jl 1f // end 2109 2110 // unroll 0 2111 vmovupd 0(%r11), %ymm13 // A 2112 vbroadcastsd 0(%r12), %ymm12 // B 2113 vmulpd %ymm13, %ymm12, %ymm15 2114 vaddpd %ymm0, %ymm15, %ymm0 2115 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2116 vmulpd %ymm13, %ymm12, %ymm15 2117 vaddpd %ymm1, %ymm15, %ymm1 2118 vbroadcastsd 0(%r15), %ymm12 // B 2119 vmulpd %ymm13, %ymm12, %ymm15 2120 vaddpd %ymm2, %ymm15, %ymm2 2121 vbroadcastsd 0(%r15, %r13), %ymm12 // B 2122 vmulpd %ymm13, %ymm12, %ymm15 2123 vaddpd %ymm3, %ymm15, %ymm3 2124 2125 // unroll 1 2126 vmovupd 32(%r11), %ymm13 // A 2127 vbroadcastsd 8(%r12, %r13), %ymm12 // B 2128 vmulpd %ymm13, %ymm12, %ymm15 2129 vaddpd %ymm1, %ymm15, %ymm1 2130 vbroadcastsd 8(%r15), %ymm12 // B 2131 vmulpd %ymm13, %ymm12, %ymm15 2132 vaddpd %ymm2, %ymm15, %ymm2 2133 vbroadcastsd 8(%r15, %r13), %ymm12 // B 2134 vmulpd %ymm13, %ymm12, %ymm15 2135 vaddpd %ymm3, %ymm15, %ymm3 2136 2137 // unroll 2 2138 vmovupd 64(%r11), %ymm13 // A 2139 vbroadcastsd 16(%r15), %ymm12 // B 2140 vmulpd %ymm13, %ymm12, %ymm15 2141 vaddpd %ymm2, %ymm15, %ymm2 2142 vbroadcastsd 16(%r15, %r13), %ymm12 // B 2143 vmulpd %ymm13, %ymm12, %ymm15 2144 vaddpd %ymm3, %ymm15, %ymm3 2145 2146 // unroll 3 2147 vmovupd 96(%r11), %ymm13 // A 2148 vbroadcastsd 24(%r15, %r13), %ymm12 // B 2149 vmulpd %ymm13, %ymm12, %ymm15 2150 vaddpd %ymm3, %ymm15, %ymm3 2151 2152 subl $4, %r10d 2153 addq $32, %r12 2154 addq $32, %r15 2155 addq $128, %r11 2156 2157 jmp 0f 2158 21591: 2160 2161 cmpl $3, %r14d 2162 jl 2f // end 2163 2164 // unroll 0 2165 vmovupd 0(%r11), %ymm13 // A 2166 vbroadcastsd 0(%r12), %ymm12 // B 2167 vmulpd %ymm13, %ymm12, %ymm15 2168 vaddpd %ymm0, %ymm15, %ymm0 2169 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2170 vmulpd %ymm13, %ymm12, %ymm15 2171 vaddpd %ymm1, %ymm15, %ymm1 2172 vbroadcastsd 0(%r15), %ymm12 // B 2173 vmulpd %ymm13, %ymm12, %ymm15 2174 vaddpd %ymm2, %ymm15, %ymm2 2175 2176 // unroll 1 2177 vmovupd 32(%r11), %ymm13 // A 2178 vbroadcastsd 8(%r12, %r13), %ymm12 // B 2179 vmulpd %ymm13, %ymm12, %ymm15 2180 vaddpd %ymm1, %ymm15, %ymm1 2181 vbroadcastsd 8(%r15), %ymm12 // B 2182 vmulpd %ymm13, %ymm12, %ymm15 2183 vaddpd %ymm2, %ymm15, %ymm2 2184 2185 // unroll 2 2186 vmovupd 64(%r11), %ymm13 // A 2187 vbroadcastsd 16(%r15), %ymm12 // B 2188 vmulpd %ymm13, %ymm12, %ymm15 2189 vaddpd %ymm2, %ymm15, %ymm2 2190 2191 // unroll 3 
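
	// note (added): in this branch %r14d (n1, presumably the number of columns in use)
	// equals 3, so the fourth column does not exist and, B being upper triangular,
	// row 3 contributes nothing to columns 0-2: unroll 3 is empty and only the counter
	// and pointer updates remain
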
2192 2193 subl $3, %r10d 2194 addq $24, %r12 2195 addq $24, %r15 2196 addq $96, %r11 2197 2198 jmp 0f 2199 22002: 2201 2202 cmpl $2, %r14d 2203 jl 3f // end 2204 2205 // unroll 0 2206 vmovupd 0(%r11), %ymm13 // A 2207 vbroadcastsd 0(%r12), %ymm12 // B 2208 vmulpd %ymm13, %ymm12, %ymm15 2209 vaddpd %ymm0, %ymm15, %ymm0 2210 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2211 vmulpd %ymm13, %ymm12, %ymm15 2212 vaddpd %ymm1, %ymm15, %ymm1 2213 2214 // unroll 1 2215 vmovupd 32(%r11), %ymm13 // A 2216 vbroadcastsd 8(%r12, %r13), %ymm12 // B 2217 vmulpd %ymm13, %ymm12, %ymm15 2218 vaddpd %ymm1, %ymm15, %ymm1 2219 2220 // unroll 2 2221 2222 // unroll 3 2223 2224 subl $2, %r10d 2225 addq $16, %r12 2226 addq $16, %r15 2227 addq $64, %r11 2228 2229 jmp 0f 2230 22313: 2232 2233// cmpl $1, %r14d 2234// jl 0f // end 2235 2236 // unroll 0 2237 vmovupd 0(%r11), %ymm13 // A 2238 vbroadcastsd 0(%r12), %ymm12 // B 2239 vmulpd %ymm13, %ymm12, %ymm15 2240 vaddpd %ymm0, %ymm15, %ymm0 2241 2242 // unroll 1 2243 2244 // unroll 2 2245 2246 // unroll 3 2247 2248 subl $1, %r10d 2249 addq $8, %r12 2250 addq $8, %r15 2251 addq $32, %r11 2252 22530: 2254 2255#if MACRO_LEVEL>=1 2256 .endm 2257#else 2258 ret 2259 2260 FUN_END(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c) 2261#endif 2262 2263 2264 2265 2266 2267// common inner routine with file scope 2268// 2269// edge for B upper triangular 2270// 2271// input arguments: 2272// r10d <- k 2273// r11 <- A 2274// r12 <- B 2275// r13 <- ldb 2276// ymm0 <- [d00 d10 d20 d30] 2277// ymm1 <- [d01 d11 d21 d31] 2278// ymm2 <- [d02 d12 d22 d32] 2279// ymm3 <- [d03 d13 d23 d33] 2280// 2281// output arguments: 2282 2283#if MACRO_LEVEL>=1 2284 .macro INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C 2285#else 2286 .p2align 4,,15 2287 FUN_START(inner_edge_dtrmm_nn_ru_one_4x4_lib4c) 2288#endif 2289 2290 movq %r12, %r15 2291 addq %r13, %r15 2292 addq %r13, %r15 // B+2*ldb 2293 2294 // unroll 0 2295 vmovupd 0(%r11), %ymm13 // A 2296 vaddpd %ymm0, %ymm13, %ymm0 2297 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2298 vmulpd %ymm13, %ymm12, %ymm15 2299 vaddpd %ymm1, %ymm15, %ymm1 2300 vbroadcastsd 0(%r15), %ymm12 // B 2301 vmulpd %ymm13, %ymm12, %ymm15 2302 vaddpd %ymm2, %ymm15, %ymm2 2303 vbroadcastsd 0(%r15, %r13), %ymm12 // B 2304 vmulpd %ymm13, %ymm12, %ymm15 2305 vaddpd %ymm3, %ymm15, %ymm3 2306 2307 // unroll 1 2308 vmovupd 32(%r11), %ymm13 // A 2309 vaddpd %ymm1, %ymm13, %ymm1 2310 vbroadcastsd 8(%r15), %ymm12 // B 2311 vmulpd %ymm13, %ymm12, %ymm15 2312 vaddpd %ymm2, %ymm15, %ymm2 2313 vbroadcastsd 8(%r15, %r13), %ymm12 // B 2314 vmulpd %ymm13, %ymm12, %ymm15 2315 vaddpd %ymm3, %ymm15, %ymm3 2316 2317 // unroll 2 2318 vmovupd 64(%r11), %ymm13 // A 2319 vaddpd %ymm2, %ymm13, %ymm2 2320 vbroadcastsd 16(%r15, %r13), %ymm12 // B 2321 vmulpd %ymm13, %ymm12, %ymm15 2322 vaddpd %ymm3, %ymm15, %ymm3 2323 2324 // unroll 3 2325 vmovupd 96(%r11), %ymm13 // A 2326 vaddpd %ymm3, %ymm13, %ymm3 2327 2328 subl $4, %r10d 2329 addq $32, %r12 2330 addq $32, %r15 2331 addq $128, %r11 2332 2333#if MACRO_LEVEL>=1 2334 .endm 2335#else 2336 ret 2337 2338 FUN_END(inner_edge_dtrmm_nn_ru_one_4x4_lib4c) 2339#endif 2340 2341 2342 2343 2344 2345// common inner routine with file scope 2346// 2347// edge for B upper triangular 2348// 2349// input arguments: 2350// r10d <- k 2351// r11 <- A 2352// r12 <- B 2353// r13 <- ldb 2354// r14 <- n1 2355// ymm0 <- [d00 d10 d20 d30] 2356// ymm1 <- [d01 d11 d21 d31] 2357// ymm2 <- [d02 d12 d22 d32] 2358// ymm3 <- [d03 d13 d23 d33] 2359// 2360// output arguments: 2361 2362#if MACRO_LEVEL>=1 2363 .macro 
INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C 2364#else 2365 .p2align 4,,15 2366 FUN_START(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c) 2367#endif 2368 2369 cmpl $0, %r14d 2370 jle 0f // end 2371 2372 movq %r12, %r15 2373 addq %r13, %r15 2374 addq %r13, %r15 // B+2*ldb 2375 2376 cmpl $4, %r14d 2377 jl 1f // end 2378 2379 // unroll 0 2380 vmovupd 0(%r11), %ymm13 // A 2381 vaddpd %ymm0, %ymm13, %ymm0 2382 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2383 vmulpd %ymm13, %ymm12, %ymm15 2384 vaddpd %ymm1, %ymm15, %ymm1 2385 vbroadcastsd 0(%r15), %ymm12 // B 2386 vmulpd %ymm13, %ymm12, %ymm15 2387 vaddpd %ymm2, %ymm15, %ymm2 2388 vbroadcastsd 0(%r15, %r13), %ymm12 // B 2389 vmulpd %ymm13, %ymm12, %ymm15 2390 vaddpd %ymm3, %ymm15, %ymm3 2391 2392 // unroll 1 2393 vmovupd 32(%r11), %ymm13 // A 2394 vaddpd %ymm1, %ymm13, %ymm1 2395 vbroadcastsd 8(%r15), %ymm12 // B 2396 vmulpd %ymm13, %ymm12, %ymm15 2397 vaddpd %ymm2, %ymm15, %ymm2 2398 vbroadcastsd 8(%r15, %r13), %ymm12 // B 2399 vmulpd %ymm13, %ymm12, %ymm15 2400 vaddpd %ymm3, %ymm15, %ymm3 2401 2402 // unroll 2 2403 vmovupd 64(%r11), %ymm13 // A 2404 vaddpd %ymm2, %ymm13, %ymm2 2405 vbroadcastsd 16(%r15, %r13), %ymm12 // B 2406 vmulpd %ymm13, %ymm12, %ymm15 2407 vaddpd %ymm3, %ymm15, %ymm3 2408 2409 // unroll 3 2410 vmovupd 96(%r11), %ymm13 // A 2411 vaddpd %ymm3, %ymm13, %ymm3 2412 2413 subl $4, %r10d 2414 addq $32, %r12 2415 addq $32, %r15 2416 addq $128, %r11 2417 2418 jmp 0f 2419 24201: 2421 2422 cmpl $3, %r14d 2423 jl 2f // end 2424 2425 // unroll 0 2426 vmovupd 0(%r11), %ymm13 // A 2427 vaddpd %ymm0, %ymm13, %ymm0 2428 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2429 vmulpd %ymm13, %ymm12, %ymm15 2430 vaddpd %ymm1, %ymm15, %ymm1 2431 vbroadcastsd 0(%r15), %ymm12 // B 2432 vmulpd %ymm13, %ymm12, %ymm15 2433 vaddpd %ymm2, %ymm15, %ymm2 2434 2435 // unroll 1 2436 vmovupd 32(%r11), %ymm13 // A 2437 vaddpd %ymm1, %ymm13, %ymm1 2438 vbroadcastsd 8(%r15), %ymm12 // B 2439 vmulpd %ymm13, %ymm12, %ymm15 2440 vaddpd %ymm2, %ymm15, %ymm2 2441 2442 // unroll 2 2443 vmovupd 64(%r11), %ymm13 // A 2444 vaddpd %ymm2, %ymm13, %ymm2 2445 2446 // unroll 3 2447 2448 subl $3, %r10d 2449 addq $24, %r12 2450 addq $24, %r15 2451 addq $96, %r11 2452 2453 jmp 0f 2454 24552: 2456 2457 cmpl $2, %r14d 2458 jl 3f // end 2459 2460 // unroll 0 2461 vmovupd 0(%r11), %ymm13 // A 2462 vaddpd %ymm0, %ymm13, %ymm0 2463 vbroadcastsd 0(%r12, %r13), %ymm12 // B 2464 vmulpd %ymm13, %ymm12, %ymm15 2465 vaddpd %ymm1, %ymm15, %ymm1 2466 2467 // unroll 1 2468 vmovupd 32(%r11), %ymm13 // A 2469 vaddpd %ymm1, %ymm13, %ymm1 2470 2471 // unroll 2 2472 2473 // unroll 3 2474 2475 subl $2, %r10d 2476 addq $16, %r12 2477 addq $16, %r15 2478 addq $64, %r11 2479 2480 jmp 0f 2481 24823: 2483 2484// cmpl $1, %r14d 2485// jl 0f // end 2486 2487 // unroll 0 2488 vmovupd 0(%r11), %ymm13 // A 2489 vaddpd %ymm0, %ymm13, %ymm0 2490 2491 // unroll 1 2492 2493 // unroll 2 2494 2495 // unroll 3 2496 2497 subl $1, %r10d 2498 addq $8, %r12 2499 addq $8, %r15 2500 addq $32, %r11 2501 25020: 2503 2504#if MACRO_LEVEL>=1 2505 .endm 2506#else 2507 ret 2508 2509 FUN_END(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c) 2510#endif 2511 2512 2513 2514 2515 2516// common inner routine with file scope 2517// 2518// edge for B lower triangular 2519// 2520// input arguments: 2521// r10d <- k 2522// r11 <- A 2523// r12 <- B 2524// r13 <- ldb 2525// ymm0 <- [d00 d10 d20 d30] 2526// ymm1 <- [d01 d11 d21 d31] 2527// ymm2 <- [d02 d12 d22 d32] 2528// ymm3 <- [d03 d13 d23 d33] 2529// 2530// output arguments: 2531 2532#if MACRO_LEVEL>=1 2533 .macro 
INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C 2534#else 2535 .p2align 4,,15 2536 FUN_START(inner_edge_dtrmm_nt_rl_4x4_lib4c) 2537#endif 2538 2539 // unroll 0 2540 vmovupd 0(%r11), %ymm13 // A 2541 vbroadcastsd 0(%r12), %ymm12 // B 2542 vmulpd %ymm13, %ymm12, %ymm15 2543 vaddpd %ymm0, %ymm15, %ymm0 2544 vbroadcastsd 8(%r12), %ymm12 // B 2545 vmulpd %ymm13, %ymm12, %ymm15 2546 vaddpd %ymm1, %ymm15, %ymm1 2547 vbroadcastsd 16(%r12), %ymm12 // B 2548 vmulpd %ymm13, %ymm12, %ymm15 2549 vaddpd %ymm2, %ymm15, %ymm2 2550 vbroadcastsd 24(%r12), %ymm12 // B 2551 vmulpd %ymm13, %ymm12, %ymm15 2552 vaddpd %ymm3, %ymm15, %ymm3 2553 addq %r13, %r12 2554 2555 // unroll 1 2556 vmovupd 32(%r11), %ymm13 // A 2557 vbroadcastsd 8(%r12), %ymm12 // B 2558 vmulpd %ymm13, %ymm12, %ymm15 2559 vaddpd %ymm1, %ymm15, %ymm1 2560 vbroadcastsd 16(%r12), %ymm12 // B 2561 vmulpd %ymm13, %ymm12, %ymm15 2562 vaddpd %ymm2, %ymm15, %ymm2 2563 vbroadcastsd 24(%r12), %ymm12 // B 2564 vmulpd %ymm13, %ymm12, %ymm15 2565 vaddpd %ymm3, %ymm15, %ymm3 2566 addq %r13, %r12 2567 2568 // unroll 2 2569 vmovupd 64(%r11), %ymm13 // A 2570 vbroadcastsd 16(%r12), %ymm12 // B 2571 vmulpd %ymm13, %ymm12, %ymm15 2572 vaddpd %ymm2, %ymm15, %ymm2 2573 vbroadcastsd 24(%r12), %ymm12 // B 2574 vmulpd %ymm13, %ymm12, %ymm15 2575 vaddpd %ymm3, %ymm15, %ymm3 2576 addq %r13, %r12 2577 2578 // unroll 3 2579 vmovupd 96(%r11), %ymm13 // A 2580 vbroadcastsd 24(%r12), %ymm12 // B 2581 vmulpd %ymm13, %ymm12, %ymm15 2582 vaddpd %ymm3, %ymm15, %ymm3 2583 addq %r13, %r12 2584 2585 subl $4, %r10d 2586 addq $128, %r11 2587 2588#if MACRO_LEVEL>=1 2589 .endm 2590#else 2591 ret 2592 2593 FUN_END(inner_edge_dtrmm_nt_rl_4x4_lib4c) 2594#endif 2595 2596 2597 2598 2599 2600// common inner routine with file scope 2601// 2602// edge for B lower triangular 2603// 2604// input arguments: 2605// r10d <- k 2606// r11 <- A 2607// r12 <- B 2608// r13 <- ldb 2609// r14 <- n1 2610// ymm0 <- [d00 d10 d20 d30] 2611// ymm1 <- [d01 d11 d21 d31] 2612// ymm2 <- [d02 d12 d22 d32] 2613// ymm3 <- [d03 d13 d23 d33] 2614// 2615// output arguments: 2616 2617#if MACRO_LEVEL>=1 2618 .macro INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C 2619#else 2620 .p2align 4,,15 2621 FUN_START(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c) 2622#endif 2623 2624 cmpl $0, %r14d 2625 jle 0f // end 2626 2627 cmpl $4, %r14d 2628 jl 1f // end 2629 2630 // unroll 0 2631 vmovupd 0(%r11), %ymm13 // A 2632 vbroadcastsd 0(%r12), %ymm12 // B 2633 vmulpd %ymm13, %ymm12, %ymm15 2634 vaddpd %ymm0, %ymm15, %ymm0 2635 vbroadcastsd 8(%r12), %ymm12 // B 2636 vmulpd %ymm13, %ymm12, %ymm15 2637 vaddpd %ymm1, %ymm15, %ymm1 2638 vbroadcastsd 16(%r12), %ymm12 // B 2639 vmulpd %ymm13, %ymm12, %ymm15 2640 vaddpd %ymm2, %ymm15, %ymm2 2641 vbroadcastsd 24(%r12), %ymm12 // B 2642 vmulpd %ymm13, %ymm12, %ymm15 2643 vaddpd %ymm3, %ymm15, %ymm3 2644 addq %r13, %r12 2645 2646 // unroll 1 2647 vmovupd 32(%r11), %ymm13 // A 2648 vbroadcastsd 8(%r12), %ymm12 // B 2649 vmulpd %ymm13, %ymm12, %ymm15 2650 vaddpd %ymm1, %ymm15, %ymm1 2651 vbroadcastsd 16(%r12), %ymm12 // B 2652 vmulpd %ymm13, %ymm12, %ymm15 2653 vaddpd %ymm2, %ymm15, %ymm2 2654 vbroadcastsd 24(%r12), %ymm12 // B 2655 vmulpd %ymm13, %ymm12, %ymm15 2656 vaddpd %ymm3, %ymm15, %ymm3 2657 addq %r13, %r12 2658 2659 // unroll 2 2660 vmovupd 64(%r11), %ymm13 // A 2661 vbroadcastsd 16(%r12), %ymm12 // B 2662 vmulpd %ymm13, %ymm12, %ymm15 2663 vaddpd %ymm2, %ymm15, %ymm2 2664 vbroadcastsd 24(%r12), %ymm12 // B 2665 vmulpd %ymm13, %ymm12, %ymm15 2666 vaddpd %ymm3, %ymm15, %ymm3 2667 addq %r13, %r12 2668 2669 // unroll 3 2670 
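	// note: a rough reading of this edge: with B lower triangular in the nt product
	// D += A*B^T, edge step k only updates columns j >= k, since B(j,k) == 0 for j < k;
	// each unroll therefore drops one more broadcast, and unroll 3 below touches ymm3 only.
	// scalar sketch (column-major B, leading dimension ldb assumed):
	//   for(j=k; j<4; j++) D(:,j) += A(:,k) * B[j+k*ldb];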
vmovupd 96(%r11), %ymm13 // A 2671 vbroadcastsd 24(%r12), %ymm12 // B 2672 vmulpd %ymm13, %ymm12, %ymm15 2673 vaddpd %ymm3, %ymm15, %ymm3 2674 addq %r13, %r12 2675 2676 subl $4, %r10d 2677 addq $128, %r11 2678 2679 jmp 0f 2680 26811: 2682 2683 cmpl $3, %r14d 2684 jl 2f // end 2685 2686 // unroll 0 2687 vmovupd 0(%r11), %ymm13 // A 2688 vbroadcastsd 0(%r12), %ymm12 // B 2689 vmulpd %ymm13, %ymm12, %ymm15 2690 vaddpd %ymm0, %ymm15, %ymm0 2691 vbroadcastsd 8(%r12), %ymm12 // B 2692 vmulpd %ymm13, %ymm12, %ymm15 2693 vaddpd %ymm1, %ymm15, %ymm1 2694 vbroadcastsd 16(%r12), %ymm12 // B 2695 vmulpd %ymm13, %ymm12, %ymm15 2696 vaddpd %ymm2, %ymm15, %ymm2 2697 addq %r13, %r12 2698 2699 // unroll 1 2700 vmovupd 32(%r11), %ymm13 // A 2701 vbroadcastsd 8(%r12), %ymm12 // B 2702 vmulpd %ymm13, %ymm12, %ymm15 2703 vaddpd %ymm1, %ymm15, %ymm1 2704 vbroadcastsd 16(%r12), %ymm12 // B 2705 vmulpd %ymm13, %ymm12, %ymm15 2706 vaddpd %ymm2, %ymm15, %ymm2 2707 addq %r13, %r12 2708 2709 // unroll 2 2710 vmovupd 64(%r11), %ymm13 // A 2711 vbroadcastsd 16(%r12), %ymm12 // B 2712 vmulpd %ymm13, %ymm12, %ymm15 2713 vaddpd %ymm2, %ymm15, %ymm2 2714 addq %r13, %r12 2715 2716 // unroll 3 2717 2718 subl $3, %r10d 2719 addq $96, %r11 2720 2721 jmp 0f 2722 27232: 2724 2725 cmpl $2, %r14d 2726 jl 3f // end 2727 2728 // unroll 0 2729 vmovupd 0(%r11), %ymm13 // A 2730 vbroadcastsd 0(%r12), %ymm12 // B 2731 vmulpd %ymm13, %ymm12, %ymm15 2732 vaddpd %ymm0, %ymm15, %ymm0 2733 vbroadcastsd 8(%r12), %ymm12 // B 2734 vmulpd %ymm13, %ymm12, %ymm15 2735 vaddpd %ymm1, %ymm15, %ymm1 2736 addq %r13, %r12 2737 2738 // unroll 1 2739 vmovupd 32(%r11), %ymm13 // A 2740 vbroadcastsd 8(%r12), %ymm12 // B 2741 vmulpd %ymm13, %ymm12, %ymm15 2742 vaddpd %ymm1, %ymm15, %ymm1 2743 addq %r13, %r12 2744 2745 // unroll 2 2746 2747 // unroll 3 2748 2749 subl $2, %r10d 2750 addq $64, %r11 2751 2752 jmp 0f 2753 27543: 2755 2756// cmpl $1, %r14d 2757// jl 0f // end 2758 2759 // unroll 0 2760 vmovupd 0(%r11), %ymm13 // A 2761 vbroadcastsd 0(%r12), %ymm12 // B 2762 vmulpd %ymm13, %ymm12, %ymm15 2763 vaddpd %ymm0, %ymm15, %ymm0 2764 addq %r13, %r12 2765 2766 // unroll 1 2767 2768 // unroll 2 2769 2770 // unroll 3 2771 2772 subl $1, %r10d 2773 addq $32, %r11 2774 27750: 2776 2777#if MACRO_LEVEL>=1 2778 .endm 2779#else 2780 ret 2781 2782 FUN_END(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c) 2783#endif 2784 2785 2786 2787 2788 2789// common inner routine with file scope 2790// 2791// edge for B lower triangular 2792// 2793// input arguments: 2794// r10d <- k 2795// r11 <- A 2796// r12 <- B 2797// r13 <- ldb 2798// ymm0 <- [d00 d10 d20 d30] 2799// ymm1 <- [d01 d11 d21 d31] 2800// ymm2 <- [d02 d12 d22 d32] 2801// ymm3 <- [d03 d13 d23 d33] 2802// 2803// output arguments: 2804 2805#if MACRO_LEVEL>=1 2806 .macro INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C 2807#else 2808 .p2align 4,,15 2809 FUN_START(inner_edge_dtrmm_nt_rl_one_4x4_lib4c) 2810#endif 2811 2812 // unroll 0 2813 vmovupd 0(%r11), %ymm13 // A 2814 vaddpd %ymm0, %ymm13, %ymm0 2815 vbroadcastsd 8(%r12), %ymm12 // B 2816 vmulpd %ymm13, %ymm12, %ymm15 2817 vaddpd %ymm1, %ymm15, %ymm1 2818 vbroadcastsd 16(%r12), %ymm12 // B 2819 vmulpd %ymm13, %ymm12, %ymm15 2820 vaddpd %ymm2, %ymm15, %ymm2 2821 vbroadcastsd 24(%r12), %ymm12 // B 2822 vmulpd %ymm13, %ymm12, %ymm15 2823 vaddpd %ymm3, %ymm15, %ymm3 2824 addq %r13, %r12 2825 2826 // unroll 1 2827 vmovupd 32(%r11), %ymm13 // A 2828 vaddpd %ymm1, %ymm13, %ymm1 2829 vbroadcastsd 16(%r12), %ymm12 // B 2830 vmulpd %ymm13, %ymm12, %ymm15 2831 vaddpd %ymm2, %ymm15, %ymm2 2832 
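	// note: this is the unit-diagonal variant, so the vaddpd above folds A(:,1)
	// straight into ymm1: the diagonal entry B(1,1) is taken as 1.0 and never loaded
	// from memory; the remaining broadcasts handle the strictly lower part of B.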
vbroadcastsd 24(%r12), %ymm12 // B 2833 vmulpd %ymm13, %ymm12, %ymm15 2834 vaddpd %ymm3, %ymm15, %ymm3 2835 addq %r13, %r12 2836 2837 // unroll 2 2838 vmovupd 64(%r11), %ymm13 // A 2839 vaddpd %ymm2, %ymm13, %ymm2 2840 vbroadcastsd 24(%r12), %ymm12 // B 2841 vmulpd %ymm13, %ymm12, %ymm15 2842 vaddpd %ymm3, %ymm15, %ymm3 2843 addq %r13, %r12 2844 2845 // unroll 3 2846 vmovupd 96(%r11), %ymm13 // A 2847 vaddpd %ymm3, %ymm13, %ymm3 2848 addq %r13, %r12 2849 2850 subl $4, %r10d 2851 addq $128, %r11 2852 2853#if MACRO_LEVEL>=1 2854 .endm 2855#else 2856 ret 2857 2858 FUN_END(inner_edge_dtrmm_nt_rl_one_4x4_lib4c) 2859#endif 2860 2861 2862 2863 2864 2865// common inner routine with file scope 2866// 2867// edge for B lower triangular 2868// 2869// input arguments: 2870// r10d <- k 2871// r11 <- A 2872// r12 <- B 2873// r13 <- ldb 2874// r14 <- n1 2875// ymm0 <- [d00 d10 d20 d30] 2876// ymm1 <- [d01 d11 d21 d31] 2877// ymm2 <- [d02 d12 d22 d32] 2878// ymm3 <- [d03 d13 d23 d33] 2879// 2880// output arguments: 2881 2882#if MACRO_LEVEL>=1 2883 .macro INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C 2884#else 2885 .p2align 4,,15 2886 FUN_START(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c) 2887#endif 2888 2889 cmpl $0, %r14d 2890 jle 0f // end 2891 2892 cmpl $4, %r14d 2893 jl 1f // end 2894 2895 // unroll 0 2896 vmovupd 0(%r11), %ymm13 // A 2897 vaddpd %ymm0, %ymm13, %ymm0 2898 vbroadcastsd 8(%r12), %ymm12 // B 2899 vmulpd %ymm13, %ymm12, %ymm15 2900 vaddpd %ymm1, %ymm15, %ymm1 2901 vbroadcastsd 16(%r12), %ymm12 // B 2902 vmulpd %ymm13, %ymm12, %ymm15 2903 vaddpd %ymm2, %ymm15, %ymm2 2904 vbroadcastsd 24(%r12), %ymm12 // B 2905 vmulpd %ymm13, %ymm12, %ymm15 2906 vaddpd %ymm3, %ymm15, %ymm3 2907 addq %r13, %r12 2908 2909 // unroll 1 2910 vmovupd 32(%r11), %ymm13 // A 2911 vaddpd %ymm1, %ymm13, %ymm1 2912 vbroadcastsd 16(%r12), %ymm12 // B 2913 vmulpd %ymm13, %ymm12, %ymm15 2914 vaddpd %ymm2, %ymm15, %ymm2 2915 vbroadcastsd 24(%r12), %ymm12 // B 2916 vmulpd %ymm13, %ymm12, %ymm15 2917 vaddpd %ymm3, %ymm15, %ymm3 2918 addq %r13, %r12 2919 2920 // unroll 2 2921 vmovupd 64(%r11), %ymm13 // A 2922 vaddpd %ymm2, %ymm13, %ymm2 2923 vbroadcastsd 24(%r12), %ymm12 // B 2924 vmulpd %ymm13, %ymm12, %ymm15 2925 vaddpd %ymm3, %ymm15, %ymm3 2926 addq %r13, %r12 2927 2928 // unroll 3 2929 vmovupd 96(%r11), %ymm13 // A 2930 vaddpd %ymm3, %ymm13, %ymm3 2931 addq %r13, %r12 2932 2933 subl $4, %r10d 2934 addq $128, %r11 2935 2936 jmp 0f 2937 29381: 2939 2940 cmpl $3, %r14d 2941 jl 2f // end 2942 2943 // unroll 0 2944 vmovupd 0(%r11), %ymm13 // A 2945 vaddpd %ymm0, %ymm13, %ymm0 2946 vbroadcastsd 8(%r12), %ymm12 // B 2947 vmulpd %ymm13, %ymm12, %ymm15 2948 vaddpd %ymm1, %ymm15, %ymm1 2949 vbroadcastsd 16(%r12), %ymm12 // B 2950 vmulpd %ymm13, %ymm12, %ymm15 2951 vaddpd %ymm2, %ymm15, %ymm2 2952 addq %r13, %r12 2953 2954 // unroll 1 2955 vmovupd 32(%r11), %ymm13 // A 2956 vaddpd %ymm1, %ymm13, %ymm1 2957 vbroadcastsd 16(%r12), %ymm12 // B 2958 vmulpd %ymm13, %ymm12, %ymm15 2959 vaddpd %ymm2, %ymm15, %ymm2 2960 addq %r13, %r12 2961 2962 // unroll 2 2963 vmovupd 64(%r11), %ymm13 // A 2964 vaddpd %ymm2, %ymm13, %ymm2 2965 addq %r13, %r12 2966 2967 // unroll 3 2968 2969 subl $3, %r10d 2970 addq $96, %r11 2971 2972 jmp 0f 2973 29742: 2975 2976 cmpl $2, %r14d 2977 jl 3f // end 2978 2979 // unroll 0 2980 vmovupd 0(%r11), %ymm13 // A 2981 vaddpd %ymm0, %ymm13, %ymm0 2982 vbroadcastsd 8(%r12), %ymm12 // B 2983 vmulpd %ymm13, %ymm12, %ymm15 2984 vaddpd %ymm1, %ymm15, %ymm1 2985 addq %r13, %r12 2986 2987 // unroll 1 2988 vmovupd 32(%r11), %ymm13 // A 
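	// note: 2-column tail of the unit-diagonal edge; as above, A(:,1) is about to be
	// accumulated directly into ymm1 (implicit 1.0 on the diagonal), so no B element
	// is needed for this last column.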
2989 vaddpd %ymm1, %ymm13, %ymm1 2990 addq %r13, %r12 2991 2992 // unroll 2 2993 2994 // unroll 3 2995 2996 subl $2, %r10d 2997 addq $64, %r11 2998 2999 jmp 0f 3000 30013: 3002 3003// cmpl $1, %r14d 3004// jl 0f // end 3005 3006 // unroll 0 3007 vmovupd 0(%r11), %ymm13 // A 3008 vaddpd %ymm0, %ymm13, %ymm0 3009 addq %r13, %r12 3010 3011 // unroll 1 3012 3013 // unroll 2 3014 3015 // unroll 3 3016 3017 subl $1, %r10d 3018 addq $32, %r11 3019 30200: 3021 3022#if MACRO_LEVEL>=1 3023 .endm 3024#else 3025 ret 3026 3027 FUN_END(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c) 3028#endif 3029 3030 3031 3032 3033 3034// common inner routine with file scope 3035// 3036// edge for B upper triangular 3037// 3038// input arguments: 3039// r10 <- kmax 3040// r11 <- A 3041// r12 <- B 3042// r13 <- ldb 3043// ymm0 <- [d00 d10 d20 d30] 3044// ymm1 <- [d01 d11 d21 d31] 3045// ymm2 <- [d02 d12 d22 d32] 3046// ymm3 <- [d03 d13 d23 d33] 3047 3048// 3049// output arguments: 3050 3051 3052#if MACRO_LEVEL>=1 3053 .macro INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C 3054#else 3055 .p2align 4,,15 3056 FUN_START(inner_edge_dtrmm_nt_ru_4x4_lib4c) 3057#endif 3058 3059 vmovapd 0(%r11), %ymm8 3060 vbroadcastsd 0(%r12), %ymm12 3061 vmulpd %ymm8, %ymm12, %ymm15 3062 vaddpd %ymm0, %ymm15, %ymm0 3063 addq %r13, %r12 3064 3065 vmovapd 32(%r11), %ymm8 3066 vbroadcastsd 0(%r12), %ymm12 3067 vmulpd %ymm8, %ymm12, %ymm15 3068 vaddpd %ymm0, %ymm15, %ymm0 3069 vbroadcastsd 8(%r12), %ymm12 3070 vmulpd %ymm8, %ymm12, %ymm15 3071 vaddpd %ymm1, %ymm15, %ymm1 3072 addq %r13, %r12 3073 3074 vmovapd 64(%r11), %ymm8 3075 vbroadcastsd 0(%r12), %ymm12 3076 vmulpd %ymm8, %ymm12, %ymm15 3077 vaddpd %ymm0, %ymm15, %ymm0 3078 vbroadcastsd 8(%r12), %ymm12 3079 vmulpd %ymm8, %ymm12, %ymm15 3080 vaddpd %ymm1, %ymm15, %ymm1 3081 vbroadcastsd 16(%r12), %ymm12 3082 vmulpd %ymm8, %ymm12, %ymm15 3083 vaddpd %ymm2, %ymm15, %ymm2 3084 addq %r13, %r12 3085 3086 vmovapd 96(%r11), %ymm8 3087 vbroadcastsd 0(%r12), %ymm12 3088 vmulpd %ymm8, %ymm12, %ymm15 3089 vaddpd %ymm0, %ymm15, %ymm0 3090 vbroadcastsd 8(%r12), %ymm12 3091 vmulpd %ymm8, %ymm12, %ymm15 3092 vaddpd %ymm1, %ymm15, %ymm1 3093 vbroadcastsd 16(%r12), %ymm12 3094 vmulpd %ymm8, %ymm12, %ymm15 3095 vaddpd %ymm2, %ymm15, %ymm2 3096 vbroadcastsd 24(%r12), %ymm12 3097 vmulpd %ymm8, %ymm12, %ymm15 3098 vaddpd %ymm3, %ymm15, %ymm3 3099 addq %r13, %r12 3100 3101 subl $4, %r10d 3102 addq $128, %r11 3103 3104#if MACRO_LEVEL>=1 3105 .endm 3106#else 3107 ret 3108 3109 FUN_END(inner_edge_dtrmm_nt_ru_4x4_lib4c) 3110#endif 3111 3112 3113 3114 3115 3116// common inner routine with file scope 3117// 3118// edge for B upper triangular 3119// 3120// input arguments: 3121// r10d <- k 3122// r11 <- A 3123// r12 <- B 3124// r13 <- ldb 3125// ymm0 <- [d00 d10 d20 d30] 3126// ymm1 <- [d01 d11 d21 d31] 3127// ymm2 <- [d02 d12 d22 d32] 3128// ymm3 <- [d03 d13 d23 d33] 3129 3130// 3131// output arguments: 3132 3133 3134#if MACRO_LEVEL>=1 3135 .macro INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C 3136#else 3137 .p2align 4,,15 3138 FUN_START(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c) 3139#endif 3140 3141 cmpl $0, %r10d 3142 jle 0f // end 3143 3144 vmovapd 0(%r11), %ymm8 3145 subl $1, %r10d 3146 vbroadcastsd 0(%r12), %ymm12 3147 vmulpd %ymm8, %ymm12, %ymm15 3148 vaddpd %ymm0, %ymm15, %ymm0 3149 addq %r13, %r12 3150 addq $32, %r11 3151 3152 cmpl $0, %r10d 3153 jle 0f 3154 3155 vmovapd 0(%r11), %ymm8 3156 subl $1, %r10d 3157 vbroadcastsd 0(%r12), %ymm12 3158 vmulpd %ymm8, %ymm12, %ymm15 3159 vaddpd %ymm0, %ymm15, %ymm0 3160 vbroadcastsd 8(%r12), %ymm12 3161 addq 
$32, %r11 3162 vmulpd %ymm8, %ymm12, %ymm15 3163 vaddpd %ymm1, %ymm15, %ymm1 3164 addq %r13, %r12 3165 3166 cmpl $0, %r10d 3167 jle 0f 3168 3169 vmovapd 0(%r11), %ymm8 3170 subl $1, %r10d 3171 vbroadcastsd 0(%r12), %ymm12 3172 vmulpd %ymm8, %ymm12, %ymm15 3173 vaddpd %ymm0, %ymm15, %ymm0 3174 vbroadcastsd 8(%r12), %ymm12 3175 addq $32, %r11 3176 vmulpd %ymm8, %ymm12, %ymm15 3177 vaddpd %ymm1, %ymm15, %ymm1 3178 vbroadcastsd 16(%r12), %ymm12 3179 vmulpd %ymm8, %ymm12, %ymm15 3180 vaddpd %ymm2, %ymm15, %ymm2 3181 addq %r13, %r12 3182 3183 cmpl $0, %r10d 3184 jle 0f 3185 3186 vmovapd 0(%r11), %ymm8 3187 subl $1, %r10d 3188 vbroadcastsd 0(%r12), %ymm12 3189 vmulpd %ymm8, %ymm12, %ymm15 3190 vaddpd %ymm0, %ymm15, %ymm0 3191 vbroadcastsd 8(%r12), %ymm12 3192 addq $32, %r11 3193 vmulpd %ymm8, %ymm12, %ymm15 3194 vaddpd %ymm1, %ymm15, %ymm1 3195 vbroadcastsd 16(%r12), %ymm12 3196 vmulpd %ymm8, %ymm12, %ymm15 3197 vaddpd %ymm2, %ymm15, %ymm2 3198 vbroadcastsd 24(%r12), %ymm12 3199 vmulpd %ymm8, %ymm12, %ymm15 3200 vaddpd %ymm3, %ymm15, %ymm3 3201 addq %r13, %r12 3202 32030: 3204 3205#if MACRO_LEVEL>=1 3206 .endm 3207#else 3208 ret 3209 3210 FUN_END(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c) 3211#endif 3212 3213 3214 3215 3216 3217// common inner routine with file scope 3218// 3219// edge for B upper triangular 3220// 3221// input arguments: 3222// r10 <- kmax 3223// r11 <- A 3224// r12 <- B 3225// r13 <- ldb 3226// ymm0 <- [d00 d10 d20 d30] 3227// ymm1 <- [d01 d11 d21 d31] 3228// ymm2 <- [d02 d12 d22 d32] 3229// ymm3 <- [d03 d13 d23 d33] 3230 3231// 3232// output arguments: 3233 3234 3235#if MACRO_LEVEL>=1 3236 .macro INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C 3237#else 3238 .p2align 4,,15 3239 FUN_START(inner_edge_dtrmm_nt_ru_one_4x4_lib4c) 3240#endif 3241 3242 vmovapd 0(%r11), %ymm8 3243 vaddpd %ymm0, %ymm8, %ymm0 3244 addq %r13, %r12 3245 3246 vmovapd 32(%r11), %ymm8 3247 vbroadcastsd 0(%r12), %ymm12 3248 vmulpd %ymm8, %ymm12, %ymm15 3249 vaddpd %ymm0, %ymm15, %ymm0 3250 vaddpd %ymm1, %ymm8, %ymm1 3251 addq %r13, %r12 3252 3253 vmovapd 64(%r11), %ymm8 3254 vbroadcastsd 0(%r12), %ymm12 3255 vmulpd %ymm8, %ymm12, %ymm15 3256 vaddpd %ymm0, %ymm15, %ymm0 3257 vbroadcastsd 8(%r12), %ymm12 3258 vmulpd %ymm8, %ymm12, %ymm15 3259 vaddpd %ymm1, %ymm15, %ymm1 3260 vaddpd %ymm2, %ymm8, %ymm2 3261 addq %r13, %r12 3262 3263 vmovapd 96(%r11), %ymm8 3264 vbroadcastsd 0(%r12), %ymm12 3265 vmulpd %ymm8, %ymm12, %ymm15 3266 vaddpd %ymm0, %ymm15, %ymm0 3267 vbroadcastsd 8(%r12), %ymm12 3268 vmulpd %ymm8, %ymm12, %ymm15 3269 vaddpd %ymm1, %ymm15, %ymm1 3270 vbroadcastsd 16(%r12), %ymm12 3271 vmulpd %ymm8, %ymm12, %ymm15 3272 vaddpd %ymm2, %ymm15, %ymm2 3273 vaddpd %ymm3, %ymm8, %ymm3 3274 addq %r13, %r12 3275 3276 subl $4, %r10d 3277 addq $128, %r11 3278 3279#if MACRO_LEVEL>=1 3280 .endm 3281#else 3282 ret 3283 3284 FUN_END(inner_edge_dtrmm_nt_ru_one_4x4_lib4c) 3285#endif 3286 3287 3288 3289 3290 3291// common inner routine with file scope 3292// 3293// edge for B upper triangular 3294// 3295// input arguments: 3296// r10d <- k 3297// r11 <- A 3298// r12 <- B 3299// r13 <- ldb 3300// ymm0 <- [d00 d10 d20 d30] 3301// ymm1 <- [d01 d11 d21 d31] 3302// ymm2 <- [d02 d12 d22 d32] 3303// ymm3 <- [d03 d13 d23 d33] 3304 3305// 3306// output arguments: 3307 3308 3309#if MACRO_LEVEL>=1 3310 .macro INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C 3311#else 3312 .p2align 4,,15 3313 FUN_START(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c) 3314#endif 3315 3316 cmpl $0, %r10d 3317 jle 0f // end 3318 3319 vmovapd 0(%r11), %ymm8 3320 subl $1, %r10d 3321 
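	// note: upper-triangular, unit-diagonal edge; each step consumes one k iteration
	// (hence the subl just above) and exits early once k runs out. in this first step
	// only the implicit unit diagonal contributes, so A(:,0) is added straight into
	// ymm0 below; later steps also broadcast the strictly upper B entries into the
	// earlier columns before adding their own diagonal term.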
vaddpd %ymm0, %ymm8, %ymm0 3322 addq $32, %r11 3323 addq %r13, %r12 3324 3325 cmpl $0, %r10d 3326 jle 0f 3327 3328 vmovapd 0(%r11), %ymm8 3329 subl $1, %r10d 3330 vbroadcastsd 0(%r12), %ymm12 3331 addq $32, %r11 3332 vmulpd %ymm8, %ymm12, %ymm15 3333 vaddpd %ymm0, %ymm15, %ymm0 3334 vaddpd %ymm1, %ymm8, %ymm1 3335 addq %r13, %r12 3336 3337 cmpl $0, %r10d 3338 jle 0f 3339 3340 vmovapd 0(%r11), %ymm8 3341 subl $1, %r10d 3342 vbroadcastsd 0(%r12), %ymm12 3343 vmulpd %ymm8, %ymm12, %ymm15 3344 vaddpd %ymm0, %ymm15, %ymm0 3345 vbroadcastsd 8(%r12), %ymm12 3346 addq $32, %r11 3347 vmulpd %ymm8, %ymm12, %ymm15 3348 vaddpd %ymm1, %ymm15, %ymm1 3349 vaddpd %ymm2, %ymm8, %ymm2 3350 addq %r13, %r12 3351 3352 cmpl $0, %r10d 3353 jle 0f 3354 3355 vmovapd 0(%r11), %ymm8 3356 subl $1, %r10d 3357 vbroadcastsd 0(%r12), %ymm12 3358 vmulpd %ymm8, %ymm12, %ymm15 3359 vaddpd %ymm0, %ymm15, %ymm0 3360 vbroadcastsd 8(%r12), %ymm12 3361 addq $32, %r11 3362 vmulpd %ymm8, %ymm12, %ymm15 3363 vaddpd %ymm1, %ymm15, %ymm1 3364 vbroadcastsd 16(%r12), %ymm12 3365 vmulpd %ymm8, %ymm12, %ymm15 3366 vaddpd %ymm2, %ymm15, %ymm2 3367 vaddpd %ymm3, %ymm8, %ymm3 3368 addq %r13, %r12 3369 33700: 3371 3372#if MACRO_LEVEL>=1 3373 .endm 3374#else 3375 ret 3376 3377 FUN_END(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c) 3378#endif 3379 3380 3381 3382 3383 3384// common inner routine with file scope 3385// 3386// triangular substitution: 3387// side = left 3388// uplo = lower 3389// tran = not-transposed 3390// unit diagonal 3391// 3392// input arguments: 3393// r10 <- E 3394// r11 <- lde 3395// ymm0 <- [d00 d10 d20 d30] 3396// ymm1 <- [d01 d11 d21 d31] 3397// ymm2 <- [d02 d12 d22 d32] 3398// ymm3 <- [d03 d13 d23 d33] 3399// 3400// output arguments: 3401 3402#if MACRO_LEVEL>=1 3403 .macro INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB 3404#else 3405 .p2align 4,,15 3406 FUN_START(inner_edge_dtrsm_lln_one_4x4_lib) 3407#endif 3408 3409 vxorpd %ymm14, %ymm14, %ymm14 3410 3411 vmovupd 0(%r10), %ymm12 3412 vblendpd $0x1, %ymm14, %ymm12, %ymm12 3413 vperm2f128 $0x00, %ymm0, %ymm0, %ymm13 3414 vpermilpd $0x0, %ymm13, %ymm13 3415 vmulpd %ymm12, %ymm13, %ymm15 3416 vsubpd %ymm15, %ymm0, %ymm0 3417 vperm2f128 $0x00, %ymm1, %ymm1, %ymm13 3418 vpermilpd $0x0, %ymm13, %ymm13 3419 vmulpd %ymm12, %ymm13, %ymm15 3420 vsubpd %ymm15, %ymm1, %ymm1 3421 vperm2f128 $0x00, %ymm2, %ymm2, %ymm13 3422 vpermilpd $0x0, %ymm13, %ymm13 3423 vmulpd %ymm12, %ymm13, %ymm15 3424 vsubpd %ymm15, %ymm2, %ymm2 3425 vperm2f128 $0x00, %ymm3, %ymm3, %ymm13 3426 vpermilpd $0x0, %ymm13, %ymm13 3427 vmulpd %ymm12, %ymm13, %ymm15 3428 vsubpd %ymm15, %ymm3, %ymm3 3429 add %r11, %r10 3430 3431 vmovupd 0(%r10), %ymm12 3432 vblendpd $0x3, %ymm14, %ymm12, %ymm12 3433 vperm2f128 $0x00, %ymm0, %ymm0, %ymm13 3434 vpermilpd $0xf, %ymm13, %ymm13 3435 vmulpd %ymm12, %ymm13, %ymm15 3436 vsubpd %ymm15, %ymm0, %ymm0 3437 vperm2f128 $0x00, %ymm1, %ymm1, %ymm13 3438 vpermilpd $0xf, %ymm13, %ymm13 3439 vmulpd %ymm12, %ymm13, %ymm15 3440 vsubpd %ymm15, %ymm1, %ymm1 3441 vperm2f128 $0x00, %ymm2, %ymm2, %ymm13 3442 vpermilpd $0xf, %ymm13, %ymm13 3443 vmulpd %ymm12, %ymm13, %ymm15 3444 vsubpd %ymm15, %ymm2, %ymm2 3445 vperm2f128 $0x00, %ymm3, %ymm3, %ymm13 3446 vpermilpd $0xf, %ymm13, %ymm13 3447 vmulpd %ymm12, %ymm13, %ymm15 3448 vsubpd %ymm15, %ymm3, %ymm3 3449 add %r11, %r10 3450 3451 vmovupd 0(%r10), %ymm12 3452 vblendpd $0x7, %ymm14, %ymm12, %ymm12 3453 vperm2f128 $0x11, %ymm0, %ymm0, %ymm13 3454 vpermilpd $0x0, %ymm13, %ymm13 3455 vmulpd %ymm12, %ymm13, %ymm15 3456 vsubpd %ymm15, %ymm0, %ymm0 3457 vperm2f128 
$0x11, %ymm1, %ymm1, %ymm13 3458 vpermilpd $0x0, %ymm13, %ymm13 3459 vmulpd %ymm12, %ymm13, %ymm15 3460 vsubpd %ymm15, %ymm1, %ymm1 3461 vperm2f128 $0x11, %ymm2, %ymm2, %ymm13 3462 vpermilpd $0x0, %ymm13, %ymm13 3463 vmulpd %ymm12, %ymm13, %ymm15 3464 vsubpd %ymm15, %ymm2, %ymm2 3465 vperm2f128 $0x11, %ymm3, %ymm3, %ymm13 3466 vpermilpd $0x0, %ymm13, %ymm13 3467 vmulpd %ymm12, %ymm13, %ymm15 3468 vsubpd %ymm15, %ymm3, %ymm3 3469 3470#if MACRO_LEVEL>=1 3471 .endm 3472#else 3473 ret 3474 3475 FUN_END(inner_edge_dtrsm_lln_one_4x4_lib) 3476#endif 3477 3478 3479 3480 3481 3482// common inner routine with file scope 3483// 3484// triangular substitution: 3485// side = right 3486// uplo = lower 3487// tran = not-transposed 3488// requires explicit inverse of diagonal 3489// 3490// input arguments: 3491// r10 <- E 3492// r11 <- lde 3493// r12 <- inv_diag_E 3494// ymm0 <- [d00 d10 d20 d30] 3495// ymm1 <- [d01 d11 d21 d31] 3496// ymm2 <- [d02 d12 d22 d32] 3497// ymm3 <- [d03 d13 d23 d33] 3498// 3499// output arguments: 3500 3501#if MACRO_LEVEL>=1 3502 .macro INNER_EDGE_DTRSM_RLN_INV_4X4_LIB 3503#else 3504 .p2align 4,,15 3505 FUN_START(inner_edge_dtrsm_rln_inv_4x4_lib) 3506#endif 3507 3508 // 4th column 3509 vbroadcastsd 24(%r12), %ymm13 3510 vmulpd %ymm3, %ymm13, %ymm3 3511 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3512 vmulpd %ymm3, %ymm13, %ymm15 3513 vsubpd %ymm15, %ymm2, %ymm2 3514 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3515 vmulpd %ymm3, %ymm13, %ymm15 3516 vsubpd %ymm15, %ymm1, %ymm1 3517 vbroadcastsd 24(%r10), %ymm13 3518 vmulpd %ymm3, %ymm13, %ymm15 3519 vsubpd %ymm15, %ymm0, %ymm0 3520 3521 // 3rd column 3522 vbroadcastsd 16(%r12), %ymm13 3523 vmulpd %ymm2, %ymm13, %ymm2 3524 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3525 vmulpd %ymm2, %ymm13, %ymm15 3526 vsubpd %ymm15, %ymm1, %ymm1 3527 vbroadcastsd 16(%r10), %ymm13 3528 vmulpd %ymm2, %ymm13, %ymm15 3529 vsubpd %ymm15, %ymm0, %ymm0 3530 3531 // 2nd column 3532 vbroadcastsd 8(%r12), %ymm13 3533 vmulpd %ymm1, %ymm13, %ymm1 3534 vbroadcastsd 8(%r10), %ymm13 3535 vmulpd %ymm1, %ymm13, %ymm15 3536 vsubpd %ymm15, %ymm0, %ymm0 3537 3538 // 1st column 3539 vbroadcastsd 0(%r12), %ymm13 3540 vmulpd %ymm0, %ymm13, %ymm0 3541 3542#if MACRO_LEVEL>=1 3543 .endm 3544#else 3545 ret 3546 3547 FUN_END(inner_edge_dtrsm_rln_inv_4x4_lib) 3548#endif 3549 3550 3551 3552 3553 3554// common inner routine with file scope 3555// 3556// triangular substitution: 3557// side = right 3558// uplo = lower 3559// tran = not-transposed 3560// requires explicit inverse of diagonal 3561// 3562// input arguments: 3563// r10 <- E 3564// r11 <- lde 3565// r12 <- inv_diag_E 3566// r13 <- n1 3567// ymm0 <- [d00 d10 d20 d30] 3568// ymm1 <- [d01 d11 d21 d31] 3569// ymm2 <- [d02 d12 d22 d32] 3570// ymm3 <- [d03 d13 d23 d33] 3571// 3572// output arguments: 3573 3574#if MACRO_LEVEL>=1 3575 .macro INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB 3576#else 3577 .p2align 4,,15 3578 FUN_START(inner_edge_dtrsm_rln_inv_4x4_vs_lib) 3579#endif 3580 3581 cmpl $3, %r13d 3582 jle 0f 3583 3584 // 4th column 3585 vbroadcastsd 24(%r12), %ymm13 3586 vmulpd %ymm3, %ymm13, %ymm3 3587 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3588 vmulpd %ymm3, %ymm13, %ymm15 3589 vsubpd %ymm15, %ymm2, %ymm2 3590 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3591 vmulpd %ymm3, %ymm13, %ymm15 3592 vsubpd %ymm15, %ymm1, %ymm1 3593 vbroadcastsd 24(%r10), %ymm13 3594 vmulpd %ymm3, %ymm13, %ymm15 3595 vsubpd %ymm15, %ymm0, %ymm0 3596 35970: 3598 cmpl $2, %r13d 3599 jle 0f 3600 3601 // 3rd column 3602 vbroadcastsd 16(%r12), %ymm13 3603 vmulpd %ymm2, 
%ymm13, %ymm2 3604 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3605 vmulpd %ymm2, %ymm13, %ymm15 3606 vsubpd %ymm15, %ymm1, %ymm1 3607 vbroadcastsd 16(%r10), %ymm13 3608 vmulpd %ymm2, %ymm13, %ymm15 3609 vsubpd %ymm15, %ymm0, %ymm0 3610 36110: 3612 cmpl $1, %r13d 3613 jle 0f 3614 3615 // 2nd column 3616 vbroadcastsd 8(%r12), %ymm13 3617 vmulpd %ymm1, %ymm13, %ymm1 3618 vbroadcastsd 8(%r10), %ymm13 3619 vmulpd %ymm1, %ymm13, %ymm15 3620 vsubpd %ymm15, %ymm0, %ymm0 3621 36220: 3623 3624 // 1st column 3625 vbroadcastsd 0(%r12), %ymm13 3626 vmulpd %ymm0, %ymm13, %ymm0 3627 3628#if MACRO_LEVEL>=1 3629 .endm 3630#else 3631 ret 3632 3633 FUN_END(inner_edge_dtrsm_rln_inv_4x4_vs_lib) 3634#endif 3635 3636 3637 3638 3639 3640// common inner routine with file scope 3641// 3642// triangular substitution: 3643// side = right 3644// uplo = lower 3645// tran = not-transposed 3646// unit diagonal 3647// 3648// input arguments: 3649// r10 <- E 3650// r11 <- lde 3651// ymm0 <- [d00 d10 d20 d30] 3652// ymm1 <- [d01 d11 d21 d31] 3653// ymm2 <- [d02 d12 d22 d32] 3654// ymm3 <- [d03 d13 d23 d33] 3655// 3656// output arguments: 3657 3658#if MACRO_LEVEL>=1 3659 .macro INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB 3660#else 3661 .p2align 4,,15 3662 FUN_START(inner_edge_dtrsm_rln_one_4x4_lib) 3663#endif 3664 3665 // 4th column 3666 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3667 vmulpd %ymm3, %ymm13, %ymm15 3668 vsubpd %ymm15, %ymm2, %ymm2 3669 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3670 vmulpd %ymm3, %ymm13, %ymm15 3671 vsubpd %ymm15, %ymm1, %ymm1 3672 vbroadcastsd 24(%r10), %ymm13 3673 vmulpd %ymm3, %ymm13, %ymm15 3674 vsubpd %ymm15, %ymm0, %ymm0 3675 3676 // 3rd column 3677 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3678 vmulpd %ymm2, %ymm13, %ymm15 3679 vsubpd %ymm15, %ymm1, %ymm1 3680 vbroadcastsd 16(%r10), %ymm13 3681 vmulpd %ymm2, %ymm13, %ymm15 3682 vsubpd %ymm15, %ymm0, %ymm0 3683 3684 // 2nd column 3685 vbroadcastsd 8(%r10), %ymm13 3686 vmulpd %ymm1, %ymm13, %ymm15 3687 vsubpd %ymm15, %ymm0, %ymm0 3688 3689 // 1st column 3690 3691#if MACRO_LEVEL>=1 3692 .endm 3693#else 3694 ret 3695 3696 FUN_END(inner_edge_dtrsm_rln_one_4x4_lib) 3697#endif 3698 3699 3700 3701 3702 3703// common inner routine with file scope 3704// 3705// triangular substitution: 3706// side = right 3707// uplo = lower 3708// tran = not-transposed 3709// unit diagonal 3710// 3711// input arguments: 3712// r10 <- E 3713// r11 <- lde 3714// r12 <- n1 3715// ymm0 <- [d00 d10 d20 d30] 3716// ymm1 <- [d01 d11 d21 d31] 3717// ymm2 <- [d02 d12 d22 d32] 3718// ymm3 <- [d03 d13 d23 d33] 3719// 3720// output arguments: 3721 3722#if MACRO_LEVEL>=1 3723 .macro INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB 3724#else 3725 .p2align 4,,15 3726 FUN_START(inner_edge_dtrsm_rln_one_4x4_vs_lib) 3727#endif 3728 3729 cmpl $3, %r12d 3730 jle 0f 3731 3732 // 4th column 3733 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3734 vmulpd %ymm3, %ymm13, %ymm15 3735 vsubpd %ymm15, %ymm2, %ymm2 3736 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3737 vmulpd %ymm3, %ymm13, %ymm15 3738 vsubpd %ymm15, %ymm1, %ymm1 3739 vbroadcastsd 24(%r10), %ymm13 3740 vmulpd %ymm3, %ymm13, %ymm15 3741 vsubpd %ymm15, %ymm0, %ymm0 3742 37430: 3744 cmpl $2, %r12d 3745 jle 0f 3746 3747 // 3rd column 3748 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3749 vmulpd %ymm2, %ymm13, %ymm15 3750 vsubpd %ymm15, %ymm1, %ymm1 3751 vbroadcastsd 16(%r10), %ymm13 3752 vmulpd %ymm2, %ymm13, %ymm15 3753 vsubpd %ymm15, %ymm0, %ymm0 3754 37550: 3756 cmpl $1, %r12d 3757 jle 0f 3758 3759 // 2nd column 3760 vbroadcastsd 8(%r10), %ymm13 3761 vmulpd %ymm1, %ymm13, %ymm15 3762 
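	// note: final step of the right/lower/no-transpose back substitution: column 1
	// (already solved; unit diagonal, so never scaled) times E(1,0) is subtracted from
	// column 0 below. columns are processed 3 -> 0, each one gated on n1.
	// scalar sketch (column-major E, leading dimension lde assumed):
	//   for(j=3; j>=0; j--) { for(i=0; i<j; i++) c_i -= c_j * E[j+i*lde]; }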
vsubpd %ymm15, %ymm0, %ymm0 3763 37640: 3765 3766 // 1st column 3767 3768#if MACRO_LEVEL>=1 3769 .endm 3770#else 3771 ret 3772 3773 FUN_END(inner_edge_dtrsm_rln_one_4x4_vs_lib) 3774#endif 3775 3776 3777 3778 3779 3780// common inner routine with file scope 3781// 3782// triangular substitution: 3783// side = right 3784// uplo = lower 3785// tran = transposed 3786// requires explicit inverse of diagonal 3787// 3788// input arguments: 3789// r10 <- E 3790// r11 <- lde 3791// r12 <- inv_diag_E 3792// ymm0 <- [d00 d10 d20 d30] 3793// ymm1 <- [d01 d11 d21 d31] 3794// ymm2 <- [d02 d12 d22 d32] 3795// ymm3 <- [d03 d13 d23 d33] 3796// 3797// output arguments: 3798 3799#if MACRO_LEVEL>=1 3800 .macro INNER_EDGE_DTRSM_RLT_INV_4X4_LIB 3801#else 3802 .p2align 4,,15 3803 FUN_START(inner_edge_dtrsm_rlt_inv_4x4_lib) 3804#endif 3805 3806 vbroadcastsd 0(%r12), %ymm13 3807 vmulpd %ymm0, %ymm13, %ymm0 3808 vbroadcastsd 8(%r10), %ymm13 3809 vmulpd %ymm0, %ymm13, %ymm15 3810 vsubpd %ymm15, %ymm1, %ymm1 3811 vbroadcastsd 16(%r10), %ymm13 3812 vmulpd %ymm0, %ymm13, %ymm15 3813 vsubpd %ymm15, %ymm2, %ymm2 3814 vbroadcastsd 24(%r10), %ymm13 3815 vmulpd %ymm0, %ymm13, %ymm15 3816 vsubpd %ymm15, %ymm3, %ymm3 3817 3818 vbroadcastsd 8(%r12), %ymm13 3819 vmulpd %ymm1, %ymm13, %ymm1 3820 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3821 vmulpd %ymm1, %ymm13, %ymm15 3822 vsubpd %ymm15, %ymm2, %ymm2 3823 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3824 vmulpd %ymm1, %ymm13, %ymm15 3825 vsubpd %ymm15, %ymm3, %ymm3 3826 3827 vbroadcastsd 16(%r12), %ymm13 3828 vmulpd %ymm2, %ymm13, %ymm2 3829 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3830 vmulpd %ymm2, %ymm13, %ymm15 3831 vsubpd %ymm15, %ymm3, %ymm3 3832 3833 vbroadcastsd 24(%r12), %ymm13 3834 vmulpd %ymm3, %ymm13, %ymm3 3835 3836#if MACRO_LEVEL>=1 3837 .endm 3838#else 3839 ret 3840 3841 FUN_END(inner_edge_dtrsm_rlt_inv_4x4_lib) 3842#endif 3843 3844 3845 3846 3847 3848// common inner routine with file scope 3849// 3850// triangular substitution: 3851// side = right 3852// uplo = lower 3853// tran = transposed 3854// requires explicit inverse of diagonal 3855// 3856// input arguments: 3857// r10 <- E 3858// r11 <- lde 3859// r12 <- inv_diag_E 3860// r13d <- kn 3861// ymm0 <- [d00 d10 d20 d30] 3862// ymm1 <- [d01 d11 d21 d31] 3863// ymm2 <- [d02 d12 d22 d32] 3864// ymm3 <- [d03 d13 d23 d33] 3865// 3866// output arguments: 3867 3868#if MACRO_LEVEL>=1 3869 .macro INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB 3870#else 3871 .p2align 4,,15 3872 FUN_START(inner_edge_dtrsm_rlt_inv_4x4_vs_lib) 3873#endif 3874 3875 vbroadcastsd 0(%r12), %ymm13 3876 vmulpd %ymm0, %ymm13, %ymm0 3877 3878 cmpl $2, %r13d 3879 jl 0f // ret 3880 3881 vbroadcastsd 8(%r10), %ymm13 3882 vmulpd %ymm0, %ymm13, %ymm15 3883 vsubpd %ymm15, %ymm1, %ymm1 3884 vbroadcastsd 8(%r12), %ymm13 3885 vmulpd %ymm1, %ymm13, %ymm1 3886 3887 cmpl $3, %r13d 3888 jl 0f // ret 3889 3890 vbroadcastsd 16(%r10), %ymm13 3891 vmulpd %ymm0, %ymm13, %ymm15 3892 vsubpd %ymm15, %ymm2, %ymm2 3893 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3894 vmulpd %ymm1, %ymm13, %ymm15 3895 vsubpd %ymm15, %ymm2, %ymm2 3896 vbroadcastsd 16(%r12), %ymm13 3897 vmulpd %ymm2, %ymm13, %ymm2 3898 3899 cmpl $4, %r13d 3900 jl 0f // ret 3901 3902 vbroadcastsd 24(%r10), %ymm13 3903 vmulpd %ymm0, %ymm13, %ymm15 3904 vsubpd %ymm15, %ymm3, %ymm3 3905 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3906 vmulpd %ymm1, %ymm13, %ymm15 3907 vsubpd %ymm15, %ymm3, %ymm3 3908 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3909 vmulpd %ymm2, %ymm13, %ymm15 3910 vsubpd %ymm15, %ymm3, %ymm3 3911 vbroadcastsd 24(%r12), %ymm13 
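	// note: inv_diag_E[3] was just broadcast; as for the previous columns, the divide
	// by the diagonal is replaced by a multiply with its precomputed reciprocal. this
	// is the forward substitution for the right/lower/transposed case, columns 0 -> 3,
	// with columns 1..3 gated on kn.
	// scalar sketch of one column step (column-major E, lde assumed):
	//   x_j = c_j * inv_diag_E[j]; for(i=j+1; i<kn; i++) c_i -= x_j * E[i+j*lde];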
3912 vmulpd %ymm3, %ymm13, %ymm3 3913 39140: 3915 3916#if MACRO_LEVEL>=1 3917 .endm 3918#else 3919 ret 3920 3921 FUN_END(inner_edge_dtrsm_rlt_inv_4x4_vs_lib) 3922#endif 3923 3924 3925 3926 3927 3928// common inner routine with file scope 3929// 3930// triangular substitution: 3931// side = right 3932// uplo = lower 3933// tran = transposed 3934// unit diagonal 3935// 3936// input arguments: 3937// r10 <- E 3938// r11 <- lde 3939// ymm0 <- [d00 d10 d20 d30] 3940// ymm1 <- [d01 d11 d21 d31] 3941// ymm2 <- [d02 d12 d22 d32] 3942// ymm3 <- [d03 d13 d23 d33] 3943// 3944// output arguments: 3945 3946#if MACRO_LEVEL>=1 3947 .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB 3948#else 3949 .p2align 4,,15 3950 FUN_START(inner_edge_dtrsm_rlt_one_4x4_lib) 3951#endif 3952 3953 vbroadcastsd 8(%r10), %ymm13 3954 vmulpd %ymm0, %ymm13, %ymm15 3955 vsubpd %ymm15, %ymm1, %ymm1 3956 vbroadcastsd 16(%r10), %ymm13 3957 vmulpd %ymm0, %ymm13, %ymm15 3958 vsubpd %ymm15, %ymm2, %ymm2 3959 vbroadcastsd 24(%r10), %ymm13 3960 vmulpd %ymm0, %ymm13, %ymm15 3961 vsubpd %ymm15, %ymm3, %ymm3 3962 3963 vbroadcastsd 16(%r10, %r11, 1), %ymm13 3964 vmulpd %ymm1, %ymm13, %ymm15 3965 vsubpd %ymm15, %ymm2, %ymm2 3966 vbroadcastsd 24(%r10, %r11, 1), %ymm13 3967 vmulpd %ymm1, %ymm13, %ymm15 3968 vsubpd %ymm15, %ymm3, %ymm3 3969 3970 vbroadcastsd 24(%r10, %r11, 2), %ymm13 3971 vmulpd %ymm2, %ymm13, %ymm15 3972 vsubpd %ymm15, %ymm3, %ymm3 3973 3974#if MACRO_LEVEL>=1 3975 .endm 3976#else 3977 ret 3978 3979 FUN_END(inner_edge_dtrsm_rlt_one_4x4_lib) 3980#endif 3981 3982 3983 3984 3985 3986// common inner routine with file scope 3987// 3988// triangular substitution: 3989// side = right 3990// uplo = lower 3991// tran = transposed 3992// unit diagonal 3993// 3994// input arguments: 3995// r10 <- E 3996// r11 <- lde 3997// r12d <- kn 3998// ymm0 <- [d00 d10 d20 d30] 3999// ymm1 <- [d01 d11 d21 d31] 4000// ymm2 <- [d02 d12 d22 d32] 4001// ymm3 <- [d03 d13 d23 d33] 4002// 4003// output arguments: 4004 4005#if MACRO_LEVEL>=1 4006 .macro INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB 4007#else 4008 .p2align 4,,15 4009 FUN_START(inner_edge_dtrsm_rlt_one_4x4_vs_lib) 4010#endif 4011 4012 cmpl $2, %r12d 4013 jl 0f // ret 4014 4015 vbroadcastsd 8(%r10), %ymm13 4016 vmulpd %ymm0, %ymm13, %ymm15 4017 vsubpd %ymm15, %ymm1, %ymm1 4018 4019 cmpl $3, %r12d 4020 jl 0f // ret 4021 4022 vbroadcastsd 16(%r10), %ymm13 4023 vmulpd %ymm0, %ymm13, %ymm15 4024 vsubpd %ymm15, %ymm2, %ymm2 4025 vbroadcastsd 16(%r10, %r11, 1), %ymm13 4026 vmulpd %ymm1, %ymm13, %ymm15 4027 vsubpd %ymm15, %ymm2, %ymm2 4028 4029 cmpl $4, %r12d 4030 jl 0f // ret 4031 4032 vbroadcastsd 24(%r10), %ymm13 4033 vmulpd %ymm0, %ymm13, %ymm15 4034 vsubpd %ymm15, %ymm3, %ymm3 4035 vbroadcastsd 24(%r10, %r11, 1), %ymm13 4036 vmulpd %ymm1, %ymm13, %ymm15 4037 vsubpd %ymm15, %ymm3, %ymm3 4038 vbroadcastsd 24(%r10, %r11, 2), %ymm13 4039 vmulpd %ymm2, %ymm13, %ymm15 4040 vsubpd %ymm15, %ymm3, %ymm3 4041 40420: 4043 4044#if MACRO_LEVEL>=1 4045 .endm 4046#else 4047 ret 4048 4049 FUN_END(inner_edge_dtrsm_rlt_one_4x4_vs_lib) 4050#endif 4051 4052 4053 4054 4055 4056// common inner routine with file scope 4057// 4058// triangular substitution: 4059// side = right 4060// uplo = upper 4061// tran = not-transposed 4062// requires explicit inverse of diagonal 4063// 4064// input arguments: 4065// r10 <- E 4066// r11 <- lde 4067// r12 <- inv_diag_E 4068// ymm0 <- [d00 d10 d20 d30] 4069// ymm1 <- [d01 d11 d21 d31] 4070// ymm2 <- [d02 d12 d22 d32] 4071// ymm3 <- [d03 d13 d23 d33] 4072// 4073// output arguments: 4074 4075#if 
MACRO_LEVEL>=1 4076 .macro INNER_EDGE_DTRSM_RUN_INV_4X4_LIB 4077#else 4078 .p2align 4,,15 4079 FUN_START(inner_edge_dtrsm_run_inv_4x4_lib) 4080#endif 4081 4082 addq %r11, %r10 4083 4084 vbroadcastsd 0(%r12), %ymm13 4085 vmulpd %ymm0, %ymm13, %ymm0 4086 vbroadcastsd 0(%r10), %ymm13 4087 vmulpd %ymm0, %ymm13, %ymm15 4088 vsubpd %ymm15, %ymm1, %ymm1 4089 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4090 vmulpd %ymm0, %ymm13, %ymm15 4091 vsubpd %ymm15, %ymm2, %ymm2 4092 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4093 vmulpd %ymm0, %ymm13, %ymm15 4094 vsubpd %ymm15, %ymm3, %ymm3 4095 4096 vbroadcastsd 8(%r12), %ymm13 4097 vmulpd %ymm1, %ymm13, %ymm1 4098 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4099 vmulpd %ymm1, %ymm13, %ymm15 4100 vsubpd %ymm15, %ymm2, %ymm2 4101 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4102 vmulpd %ymm1, %ymm13, %ymm15 4103 vsubpd %ymm15, %ymm3, %ymm3 4104 4105 vbroadcastsd 16(%r12), %ymm13 4106 vmulpd %ymm2, %ymm13, %ymm2 4107 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4108 vmulpd %ymm2, %ymm13, %ymm15 4109 vsubpd %ymm15, %ymm3, %ymm3 4110 4111 vbroadcastsd 24(%r12), %ymm13 4112 vmulpd %ymm3, %ymm13, %ymm3 4113 4114#if MACRO_LEVEL>=1 4115 .endm 4116#else 4117 ret 4118 4119 FUN_END(inner_edge_dtrsm_run_inv_4x4_lib) 4120#endif 4121 4122 4123 4124 4125 4126// common inner routine with file scope 4127// 4128// triangular substitution: 4129// side = right 4130// uplo = upper 4131// tran = not-transposed 4132// requires explicit inverse of diagonal 4133// 4134// input arguments: 4135// r10 <- E 4136// r11 <- lde 4137// r12 <- inv_diag_E 4138// r13d <- kn 4139// ymm0 <- [d00 d10 d20 d30] 4140// ymm1 <- [d01 d11 d21 d31] 4141// ymm2 <- [d02 d12 d22 d32] 4142// ymm3 <- [d03 d13 d23 d33] 4143// 4144// output arguments: 4145 4146#if MACRO_LEVEL>=1 4147 .macro INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB 4148#else 4149 .p2align 4,,15 4150 FUN_START(inner_edge_dtrsm_run_inv_4x4_vs_lib) 4151#endif 4152 4153 addq %r11, %r10 4154 4155 vbroadcastsd 0(%r12), %ymm13 4156 vmulpd %ymm0, %ymm13, %ymm0 4157 4158 cmpl $2, %r13d 4159 jl 0f // ret 4160 4161 vbroadcastsd 0(%r10), %ymm13 4162 vmulpd %ymm0, %ymm13, %ymm15 4163 vsubpd %ymm15, %ymm1, %ymm1 4164 vbroadcastsd 8(%r12), %ymm13 4165 vmulpd %ymm1, %ymm13, %ymm1 4166 4167 cmpl $3, %r13d 4168 jl 0f // ret 4169 4170 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4171 vmulpd %ymm0, %ymm13, %ymm15 4172 vsubpd %ymm15, %ymm2, %ymm2 4173 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4174 vmulpd %ymm1, %ymm13, %ymm15 4175 vsubpd %ymm15, %ymm2, %ymm2 4176 vbroadcastsd 16(%r12), %ymm13 4177 vmulpd %ymm2, %ymm13, %ymm2 4178 4179 cmpl $4, %r13d 4180 jl 0f // ret 4181 4182 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4183 vmulpd %ymm0, %ymm13, %ymm15 4184 vsubpd %ymm15, %ymm3, %ymm3 4185 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4186 vmulpd %ymm1, %ymm13, %ymm15 4187 vsubpd %ymm15, %ymm3, %ymm3 4188 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4189 vmulpd %ymm2, %ymm13, %ymm15 4190 vsubpd %ymm15, %ymm3, %ymm3 4191 vbroadcastsd 24(%r12), %ymm13 4192 vmulpd %ymm3, %ymm13, %ymm3 4193 41940: 4195 4196#if MACRO_LEVEL>=1 4197 .endm 4198#else 4199 ret 4200 4201 FUN_END(inner_edge_dtrsm_run_inv_4x4_vs_lib) 4202#endif 4203 4204 4205 4206 4207 4208// common inner routine with file scope 4209// 4210// triangular substitution: 4211// side = right 4212// uplo = upper 4213// tran = not-transposed 4214// unit diagonal 4215// 4216// input arguments: 4217// r10 <- E 4218// r11 <- lde 4219// ymm0 <- [d00 d10 d20 d30] 4220// ymm1 <- [d01 d11 d21 d31] 4221// ymm2 <- [d02 d12 d22 d32] 4222// ymm3 <- [d03 d13 d23 d33] 4223// 4224// output 
arguments: 4225 4226#if MACRO_LEVEL>=1 4227 .macro INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB 4228#else 4229 .p2align 4,,15 4230 FUN_START(inner_edge_dtrsm_run_one_4x4_lib) 4231#endif 4232 4233 addq %r11, %r10 4234 4235 vbroadcastsd 0(%r10), %ymm13 4236 vmulpd %ymm0, %ymm13, %ymm15 4237 vsubpd %ymm15, %ymm1, %ymm1 4238 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4239 vmulpd %ymm0, %ymm13, %ymm15 4240 vsubpd %ymm15, %ymm2, %ymm2 4241 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4242 vmulpd %ymm0, %ymm13, %ymm15 4243 vsubpd %ymm15, %ymm3, %ymm3 4244 4245 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4246 vmulpd %ymm1, %ymm13, %ymm15 4247 vsubpd %ymm15, %ymm2, %ymm2 4248 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4249 vmulpd %ymm1, %ymm13, %ymm15 4250 vsubpd %ymm15, %ymm3, %ymm3 4251 4252 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4253 vmulpd %ymm2, %ymm13, %ymm15 4254 vsubpd %ymm15, %ymm3, %ymm3 4255 4256#if MACRO_LEVEL>=1 4257 .endm 4258#else 4259 ret 4260 4261 FUN_END(inner_edge_dtrsm_run_one_4x4_lib) 4262#endif 4263 4264 4265 4266 4267 4268// common inner routine with file scope 4269// 4270// triangular substitution: 4271// side = right 4272// uplo = upper 4273// tran = not-transposed 4274// unit diagonal 4275// 4276// input arguments: 4277// r10 <- E 4278// r11 <- lde 4279// r12d <- kn 4280// ymm0 <- [d00 d10 d20 d30] 4281// ymm1 <- [d01 d11 d21 d31] 4282// ymm2 <- [d02 d12 d22 d32] 4283// ymm3 <- [d03 d13 d23 d33] 4284// 4285// output arguments: 4286 4287#if MACRO_LEVEL>=1 4288 .macro INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB 4289#else 4290 .p2align 4,,15 4291 FUN_START(inner_edge_dtrsm_run_one_4x4_vs_lib) 4292#endif 4293 4294 addq %r11, %r10 4295 4296 cmpl $2, %r12d 4297 jl 0f // ret 4298 4299 vbroadcastsd 0(%r10), %ymm13 4300 vmulpd %ymm0, %ymm13, %ymm15 4301 vsubpd %ymm15, %ymm1, %ymm1 4302 4303 cmpl $3, %r12d 4304 jl 0f // ret 4305 4306 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4307 vmulpd %ymm0, %ymm13, %ymm15 4308 vsubpd %ymm15, %ymm2, %ymm2 4309 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4310 vmulpd %ymm1, %ymm13, %ymm15 4311 vsubpd %ymm15, %ymm2, %ymm2 4312 4313 cmpl $4, %r12d 4314 jl 0f // ret 4315 4316 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4317 vmulpd %ymm0, %ymm13, %ymm15 4318 vsubpd %ymm15, %ymm3, %ymm3 4319 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4320 vmulpd %ymm1, %ymm13, %ymm15 4321 vsubpd %ymm15, %ymm3, %ymm3 4322 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4323 vmulpd %ymm2, %ymm13, %ymm15 4324 vsubpd %ymm15, %ymm3, %ymm3 4325 43260: 4327 4328#if MACRO_LEVEL>=1 4329 .endm 4330#else 4331 ret 4332 4333 FUN_END(inner_edge_dtrsm_run_one_4x4_vs_lib) 4334#endif 4335 4336 4337 4338 4339 4340// common inner routine with file scope 4341// 4342// triangular substitution: 4343// side = right 4344// uplo = upper 4345// tran = transposed 4346// requires explicit inverse of diagonal 4347// 4348// input arguments: 4349// r10 <- E 4350// r11 <- lde 4351// r12 <- inv_diag_E 4352// ymm0 <- [d00 d10 d20 d30] 4353// ymm1 <- [d01 d11 d21 d31] 4354// ymm2 <- [d02 d12 d22 d32] 4355// ymm3 <- [d03 d13 d23 d33] 4356// 4357// output arguments: 4358 4359#if MACRO_LEVEL>=1 4360 .macro INNER_EDGE_DTRSM_RUT_INV_4X4_LIB 4361#else 4362 .p2align 4,,15 4363 FUN_START(inner_edge_dtrsm_rut_inv_4x4_lib) 4364#endif 4365 4366 addq %r11, %r10 4367 4368 // 4th column 4369 vbroadcastsd 24(%r12), %ymm13 4370 vmulpd %ymm3, %ymm13, %ymm3 4371 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4372 vmulpd %ymm3, %ymm13, %ymm15 4373 vsubpd %ymm15, %ymm2, %ymm2 4374 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4375 vmulpd %ymm3, %ymm13, %ymm15 4376 vsubpd %ymm15, %ymm1, %ymm1 4377 vbroadcastsd 
0(%r10, %r11, 2), %ymm13 4378 vmulpd %ymm3, %ymm13, %ymm15 4379 vsubpd %ymm15, %ymm0, %ymm0 4380 4381 // 3rd column 4382 vbroadcastsd 16(%r12), %ymm13 4383 vmulpd %ymm2, %ymm13, %ymm2 4384 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4385 vmulpd %ymm2, %ymm13, %ymm15 4386 vsubpd %ymm15, %ymm1, %ymm1 4387 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4388 vmulpd %ymm2, %ymm13, %ymm15 4389 vsubpd %ymm15, %ymm0, %ymm0 4390 4391 // 2nd column 4392 vbroadcastsd 8(%r12), %ymm13 4393 vmulpd %ymm1, %ymm13, %ymm1 4394 vbroadcastsd 0(%r10), %ymm13 4395 vmulpd %ymm1, %ymm13, %ymm15 4396 vsubpd %ymm15, %ymm0, %ymm0 4397 4398 // 1st column 4399 vbroadcastsd 0(%r12), %ymm13 4400 vmulpd %ymm0, %ymm13, %ymm0 4401 4402#if MACRO_LEVEL>=1 4403 .endm 4404#else 4405 ret 4406 4407 FUN_END(inner_edge_dtrsm_rut_inv_4x4_lib) 4408#endif 4409 4410 4411 4412 4413 4414// common inner routine with file scope 4415// 4416// triangular substitution: 4417// side = right 4418// uplo = upper 4419// tran = transposed 4420// requires explicit inverse of diagonal 4421// 4422// input arguments: 4423// r10 <- E 4424// r11 <- lde 4425// r12 <- inv_diag_E 4426// r13 <- n1 4427// ymm0 <- [d00 d10 d20 d30] 4428// ymm1 <- [d01 d11 d21 d31] 4429// ymm2 <- [d02 d12 d22 d32] 4430// ymm3 <- [d03 d13 d23 d33] 4431// 4432// output arguments: 4433 4434#if MACRO_LEVEL>=1 4435 .macro INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB 4436#else 4437 .p2align 4,,15 4438 FUN_START(inner_edge_dtrsm_rut_inv_4x4_vs_lib) 4439#endif 4440 4441 addq %r11, %r10 4442 4443 cmpl $3, %r13d 4444 jle 0f 4445 4446 // 4th column 4447 vbroadcastsd 24(%r12), %ymm13 4448 vmulpd %ymm3, %ymm13, %ymm3 4449 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4450 vmulpd %ymm3, %ymm13, %ymm15 4451 vsubpd %ymm15, %ymm2, %ymm2 4452 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4453 vmulpd %ymm3, %ymm13, %ymm15 4454 vsubpd %ymm15, %ymm1, %ymm1 4455 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4456 vmulpd %ymm3, %ymm13, %ymm15 4457 vsubpd %ymm15, %ymm0, %ymm0 4458 44590: 4460 cmpl $2, %r13d 4461 jle 0f 4462 4463 // 3rd column 4464 vbroadcastsd 16(%r12), %ymm13 4465 vmulpd %ymm2, %ymm13, %ymm2 4466 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4467 vmulpd %ymm2, %ymm13, %ymm15 4468 vsubpd %ymm15, %ymm1, %ymm1 4469 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4470 vmulpd %ymm2, %ymm13, %ymm15 4471 vsubpd %ymm15, %ymm0, %ymm0 4472 44730: 4474 cmpl $1, %r13d 4475 jle 0f 4476 4477 // 2nd column 4478 vbroadcastsd 8(%r12), %ymm13 4479 vmulpd %ymm1, %ymm13, %ymm1 4480 vbroadcastsd 0(%r10), %ymm13 4481 vmulpd %ymm1, %ymm13, %ymm15 4482 vsubpd %ymm15, %ymm0, %ymm0 4483 44840: 4485 4486 // 1st column 4487 vbroadcastsd 0(%r12), %ymm13 4488 vmulpd %ymm0, %ymm13, %ymm0 4489 4490#if MACRO_LEVEL>=1 4491 .endm 4492#else 4493 ret 4494 4495 FUN_END(inner_edge_dtrsm_rut_inv_4x4_vs_lib) 4496#endif 4497 4498 4499 4500 4501 4502// common inner routine with file scope 4503// 4504// triangular substitution: 4505// side = right 4506// uplo = upper 4507// tran = transposed 4508// unit diagonal 4509// 4510// input arguments: 4511// r10 <- E 4512// r11 <- lde 4513// ymm0 <- [d00 d10 d20 d30] 4514// ymm1 <- [d01 d11 d21 d31] 4515// ymm2 <- [d02 d12 d22 d32] 4516// ymm3 <- [d03 d13 d23 d33] 4517// 4518// output arguments: 4519 4520#if MACRO_LEVEL>=1 4521 .macro INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB 4522#else 4523 .p2align 4,,15 4524 FUN_START(inner_edge_dtrsm_rut_one_4x4_lib) 4525#endif 4526 4527 addq %r11, %r10 4528 4529 // 4th column 4530 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4531 vmulpd %ymm3, %ymm13, %ymm15 4532 vsubpd %ymm15, %ymm2, %ymm2 4533 vbroadcastsd 8(%r10, %r11, 2), 
%ymm13 4534 vmulpd %ymm3, %ymm13, %ymm15 4535 vsubpd %ymm15, %ymm1, %ymm1 4536 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4537 vmulpd %ymm3, %ymm13, %ymm15 4538 vsubpd %ymm15, %ymm0, %ymm0 4539 4540 // 3rd column 4541 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4542 vmulpd %ymm2, %ymm13, %ymm15 4543 vsubpd %ymm15, %ymm1, %ymm1 4544 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4545 vmulpd %ymm2, %ymm13, %ymm15 4546 vsubpd %ymm15, %ymm0, %ymm0 4547 4548 // 2nd column 4549 vbroadcastsd 0(%r10), %ymm13 4550 vmulpd %ymm1, %ymm13, %ymm15 4551 vsubpd %ymm15, %ymm0, %ymm0 4552 4553 // 1st column 4554 4555#if MACRO_LEVEL>=1 4556 .endm 4557#else 4558 ret 4559 4560 FUN_END(inner_edge_dtrsm_rut_one_4x4_lib) 4561#endif 4562 4563 4564 4565 4566 4567// common inner routine with file scope 4568// 4569// triangular substitution: 4570// side = right 4571// uplo = upper 4572// tran = transposed 4573// 4574// input arguments: 4575// r10 <- E 4576// r11 <- lde 4577// r12 <- n1 4578// ymm0 <- [d00 d10 d20 d30] 4579// ymm1 <- [d01 d11 d21 d31] 4580// ymm2 <- [d02 d12 d22 d32] 4581// ymm3 <- [d03 d13 d23 d33] 4582// 4583// output arguments: 4584 4585#if MACRO_LEVEL>=1 4586 .macro INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB 4587#else 4588 .p2align 4,,15 4589 FUN_START(inner_edge_dtrsm_rut_one_4x4_vs_lib) 4590#endif 4591 4592 addq %r11, %r10 4593 4594 cmpl $3, %r12d 4595 jle 0f 4596 4597 // 4th column 4598 vbroadcastsd 16(%r10, %r11, 2), %ymm13 4599 vmulpd %ymm3, %ymm13, %ymm15 4600 vsubpd %ymm15, %ymm2, %ymm2 4601 vbroadcastsd 8(%r10, %r11, 2), %ymm13 4602 vmulpd %ymm3, %ymm13, %ymm15 4603 vsubpd %ymm15, %ymm1, %ymm1 4604 vbroadcastsd 0(%r10, %r11, 2), %ymm13 4605 vmulpd %ymm3, %ymm13, %ymm15 4606 vsubpd %ymm15, %ymm0, %ymm0 4607 46080: 4609 cmpl $2, %r12d 4610 jle 0f 4611 4612 // 3rd column 4613 vbroadcastsd 8(%r10, %r11, 1), %ymm13 4614 vmulpd %ymm2, %ymm13, %ymm15 4615 vsubpd %ymm15, %ymm1, %ymm1 4616 vbroadcastsd 0(%r10, %r11, 1), %ymm13 4617 vmulpd %ymm2, %ymm13, %ymm15 4618 vsubpd %ymm15, %ymm0, %ymm0 4619 46200: 4621 cmpl $1, %r12d 4622 jle 0f 4623 4624 // 2nd column 4625 vbroadcastsd 0(%r10), %ymm13 4626 vmulpd %ymm1, %ymm13, %ymm15 4627 vsubpd %ymm15, %ymm0, %ymm0 4628 46290: 4630 4631 // 1st column 4632 4633#if MACRO_LEVEL>=1 4634 .endm 4635#else 4636 ret 4637 4638 FUN_END(inner_edge_dtrsm_rut_one_4x4_vs_lib) 4639#endif 4640 4641 4642 4643 4644 4645// common inner routine with file scope 4646// 4647// scale for generic alpha and beta 4648// 4649// input arguments: 4650// r10 <- alpha 4651// r11 <- beta 4652// r12 <- C 4653// r13 <- ldc 4654// ymm0 <- [d00 d11 d22 d33] 4655// ymm1 <- [d01 d10 d23 d32] 4656// ymm2 <- [d03 d12 d21 d30] 4657// ymm3 <- [d02 d13 d20 d31] 4658// 4659// output arguments: 4660 4661#if MACRO_LEVEL>=1 4662 .macro INNER_BLEND_SCALE_AB_4X4_LIB 4663#else 4664 .p2align 4,,15 4665 FUN_START(inner_blend_scale_ab_4x4_lib) 4666#endif 4667 4668 vblendpd $0xa, %ymm1, %ymm0, %ymm8 4669 vblendpd $0x5, %ymm1, %ymm0, %ymm9 4670 vblendpd $0xa, %ymm3, %ymm2, %ymm10 4671 vblendpd $0x5, %ymm3, %ymm2, %ymm11 4672 4673 vblendpd $0xc, %ymm10, %ymm8, %ymm0 4674 vblendpd $0x3, %ymm10, %ymm8, %ymm2 4675 vblendpd $0xc, %ymm11, %ymm9, %ymm1 4676 vblendpd $0x3, %ymm11, %ymm9, %ymm3 4677 4678 // alpha 4679 vbroadcastsd 0(%r10), %ymm15 4680 4681 vmulpd %ymm0, %ymm15, %ymm0 4682 vmulpd %ymm1, %ymm15, %ymm1 4683 vmulpd %ymm2, %ymm15, %ymm2 4684 vmulpd %ymm3, %ymm15, %ymm3 4685 4686 // beta 4687 vbroadcastsd 0(%r11), %ymm14 4688 4689 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 4690 4691 vucomisd %xmm15, %xmm14 // beta==0.0 ? 
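	// note: beta == 0.0 takes the branch below and skips reading C altogether, so C
	// may be left uninitialized in that case; otherwise each column of C is loaded,
	// scaled by beta and added to the alpha-scaled accumulator. the vblendpd pairs
	// above undo the interleaved accumulator layout documented in the header comment.
	// scalar sketch: D(:,j) = alpha*acc(:,j) + (beta != 0.0 ? beta*C(:,j) : 0.0);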
4692 je 0f // end 4693 4694 vmovupd 0(%r12), %ymm15 4695 vmulpd %ymm14, %ymm15, %ymm15 4696 vaddpd %ymm0, %ymm15, %ymm0 4697 addq %r13, %r12 4698 vmovupd 0(%r12), %ymm15 4699 vmulpd %ymm14, %ymm15, %ymm15 4700 vaddpd %ymm1, %ymm15, %ymm1 4701 addq %r13, %r12 4702 vmovupd 0(%r12), %ymm15 4703 vmulpd %ymm14, %ymm15, %ymm15 4704 vaddpd %ymm2, %ymm15, %ymm2 4705 addq %r13, %r12 4706 vmovupd 0(%r12), %ymm15 4707 vmulpd %ymm14, %ymm15, %ymm15 4708 vaddpd %ymm3, %ymm15, %ymm3 4709// addq %r13, %r12 4710 47110: 4712 4713#if MACRO_LEVEL>=1 4714 .endm 4715#else 4716 ret 4717 4718 FUN_END(inner_blend_scale_ab_4x4_lib) 4719#endif 4720 4721 4722 4723 4724 4725// common inner routine with file scope 4726// 4727// scale for generic alpha and beta 4728// 4729// input arguments: 4730// r10 <- alpha 4731// r11 <- beta 4732// r12 <- C 4733// r13 <- ldc 4734// r14d <- km 4735// r15d <- kn 4736// ymm0 <- [d00 d11 d22 d33] 4737// ymm1 <- [d01 d10 d23 d32] 4738// ymm2 <- [d03 d12 d21 d30] 4739// ymm3 <- [d02 d13 d20 d31] 4740// 4741// output arguments: 4742 4743#if MACRO_LEVEL>=1 4744 .macro INNER_BLEND_SCALE_AB_4X4_VS_LIB 4745#else 4746 .p2align 4,,15 4747 FUN_START(inner_blend_scale_ab_4x4_vs_lib) 4748#endif 4749 4750 vblendpd $0xa, %ymm1, %ymm0, %ymm8 4751 vblendpd $0x5, %ymm1, %ymm0, %ymm9 4752 vblendpd $0xa, %ymm3, %ymm2, %ymm10 4753 vblendpd $0x5, %ymm3, %ymm2, %ymm11 4754 4755 vblendpd $0xc, %ymm10, %ymm8, %ymm0 4756 vblendpd $0x3, %ymm10, %ymm8, %ymm2 4757 vblendpd $0xc, %ymm11, %ymm9, %ymm1 4758 vblendpd $0x3, %ymm11, %ymm9, %ymm3 4759 4760 // alpha 4761 vbroadcastsd 0(%r10), %ymm15 4762 4763 vmulpd %ymm0, %ymm15, %ymm0 4764 vmulpd %ymm1, %ymm15, %ymm1 4765 vmulpd %ymm2, %ymm15, %ymm2 4766 vmulpd %ymm3, %ymm15, %ymm3 4767 4768 // beta 4769 vbroadcastsd 0(%r11), %ymm14 4770 4771 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 4772 vucomisd %xmm15, %xmm14 // beta==0.0 ? 
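	// note (vs = variable-size): below, km is converted to double and subtracted from
	// the LC02 constant (assumed to hold 0.5, 1.5, 2.5, 3.5), so the sign bits of the
	// result form the row mask used by vmaskmovpd; only the first km rows of each C
	// column are read, while kn gates how many columns are processed at all.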
4773 je 0f // end 4774 4775 4776 vcvtsi2sd %r14d, %xmm15, %xmm15 4777#if defined(OS_LINUX) | defined(OS_WINDOWS) 4778 vmovupd .LC02(%rip), %ymm13 4779#elif defined(OS_MAC) 4780 vmovupd LC02(%rip), %ymm13 4781#endif 4782 vmovddup %xmm15, %xmm15 4783 vinsertf128 $1, %xmm15, %ymm15, %ymm15 4784 vsubpd %ymm15, %ymm13, %ymm13 4785 4786 4787 vmaskmovpd 0(%r12), %ymm13, %ymm15 4788 vmulpd %ymm14, %ymm15, %ymm15 4789 vaddpd %ymm0, %ymm15, %ymm0 4790 addq %r13, %r12 4791 cmpl $2, %r15d 4792 jl 0f // end 4793 vmaskmovpd 0(%r12), %ymm13, %ymm15 4794 vmulpd %ymm14, %ymm15, %ymm15 4795 vaddpd %ymm1, %ymm15, %ymm1 4796 addq %r13, %r12 4797 cmpl $3, %r15d 4798 jl 0f // end 4799 vmaskmovpd 0(%r12), %ymm13, %ymm15 4800 vmulpd %ymm14, %ymm15, %ymm15 4801 vaddpd %ymm2, %ymm15, %ymm2 4802 addq %r13, %r12 4803 cmpl $3, %r15d 4804 je 0f // end 4805 vmaskmovpd 0(%r12), %ymm13, %ymm15 4806 vmulpd %ymm14, %ymm15, %ymm15 4807 vaddpd %ymm3, %ymm15, %ymm3 4808// addq %r13, %r12 4809 48100: 4811 4812#if MACRO_LEVEL>=1 4813 .endm 4814#else 4815 ret 4816 4817 FUN_END(inner_blend_scale_ab_4x4_vs_lib) 4818#endif 4819 4820 4821 4822 4823 4824// common inner routine with file scope 4825// 4826// scale for alpha=-1 and generic beta 4827// 4828// input arguments: 4829// r10 <- beta 4830// r11 <- C 4831// r12 <- ldc 4832// ymm0 <- [d00 d11 d22 d33] 4833// ymm1 <- [d01 d10 d23 d32] 4834// ymm2 <- [d03 d12 d21 d30] 4835// ymm3 <- [d02 d13 d20 d31] 4836// 4837// output arguments: 4838 4839#if MACRO_LEVEL>=1 4840 .macro INNER_BLEND_SCALE_M1B_4X4_LIB 4841#else 4842 .p2align 4,,15 4843 FUN_START(inner_blend_scale_m1b_4x4_lib) 4844#endif 4845 4846 vblendpd $0xa, %ymm1, %ymm0, %ymm8 4847 vblendpd $0x5, %ymm1, %ymm0, %ymm9 4848 vblendpd $0xa, %ymm3, %ymm2, %ymm10 4849 vblendpd $0x5, %ymm3, %ymm2, %ymm11 4850 4851 vblendpd $0xc, %ymm10, %ymm8, %ymm0 4852 vblendpd $0x3, %ymm10, %ymm8, %ymm2 4853 vblendpd $0xc, %ymm11, %ymm9, %ymm1 4854 vblendpd $0x3, %ymm11, %ymm9, %ymm3 4855 4856 // beta 4857 vbroadcastsd 0(%r10), %ymm14 4858 4859 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 4860 4861 vucomisd %xmm15, %xmm14 // beta==0.0 ? 
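	// note: m1b hard-wires alpha = -1.0 (no alpha argument is taken), so the vsubpd
	// operand order below computes beta*C(:,j) - acc(:,j) for each column; the same
	// convention applies to the _vs variant that follows.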
4862 je 0f // end 4863 4864 vmovupd 0(%r11), %ymm15 4865 vmulpd %ymm14, %ymm15, %ymm15 4866 vsubpd %ymm0, %ymm15, %ymm0 4867 addq %r12, %r11 4868 vmovupd 0(%r11), %ymm15 4869 vmulpd %ymm14, %ymm15, %ymm15 4870 vsubpd %ymm1, %ymm15, %ymm1 4871 addq %r12, %r11 4872 vmovupd 0(%r11), %ymm15 4873 vmulpd %ymm14, %ymm15, %ymm15 4874 vsubpd %ymm2, %ymm15, %ymm2 4875 addq %r12, %r11 4876 vmovupd 0(%r11), %ymm15 4877 vmulpd %ymm14, %ymm15, %ymm15 4878 vsubpd %ymm3, %ymm15, %ymm3 4879// addq %r12, %r11 4880 48810: 4882 4883#if MACRO_LEVEL>=1 4884 .endm 4885#else 4886 ret 4887 4888 FUN_END(inner_blend_scale_m1b_4x4_lib) 4889#endif 4890 4891 4892 4893 4894 4895// common inner routine with file scope 4896// 4897// scale for generic alpha and beta 4898// 4899// input arguments: 4900// r10 <- beta 4901// r11 <- C 4902// r12 <- ldc 4903// r13d <- km 4904// r14d <- kn 4905// ymm0 <- [d00 d11 d22 d33] 4906// ymm1 <- [d01 d10 d23 d32] 4907// ymm2 <- [d03 d12 d21 d30] 4908// ymm3 <- [d02 d13 d20 d31] 4909// 4910// output arguments: 4911 4912#if MACRO_LEVEL>=1 4913 .macro INNER_BLEND_SCALE_M1B_4X4_VS_LIB 4914#else 4915 .p2align 4,,15 4916 FUN_START(inner_blend_scale_m1b_4x4_vs_lib) 4917#endif 4918 4919 vblendpd $0xa, %ymm1, %ymm0, %ymm8 4920 vblendpd $0x5, %ymm1, %ymm0, %ymm9 4921 vblendpd $0xa, %ymm3, %ymm2, %ymm10 4922 vblendpd $0x5, %ymm3, %ymm2, %ymm11 4923 4924 vblendpd $0xc, %ymm10, %ymm8, %ymm0 4925 vblendpd $0x3, %ymm10, %ymm8, %ymm2 4926 vblendpd $0xc, %ymm11, %ymm9, %ymm1 4927 vblendpd $0x3, %ymm11, %ymm9, %ymm3 4928 4929 // beta 4930 vbroadcastsd 0(%r10), %ymm14 4931 4932 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 4933 vucomisd %xmm15, %xmm14 // beta==0.0 ? 4934 je 0f // end 4935 4936 4937 vcvtsi2sd %r13d, %xmm15, %xmm15 4938#if defined(OS_LINUX) | defined(OS_WINDOWS) 4939 vmovupd .LC02(%rip), %ymm13 4940#elif defined(OS_MAC) 4941 vmovupd LC02(%rip), %ymm13 4942#endif 4943 vmovddup %xmm15, %xmm15 4944 vinsertf128 $1, %xmm15, %ymm15, %ymm15 4945 vsubpd %ymm15, %ymm13, %ymm13 4946 4947 4948 vmaskmovpd 0(%r11), %ymm13, %ymm15 4949 vmulpd %ymm14, %ymm15, %ymm15 4950 vsubpd %ymm0, %ymm15, %ymm0 4951 addq %r12, %r11 4952 cmpl $2, %r14d 4953 jl 0f // end 4954 vmaskmovpd 0(%r11), %ymm13, %ymm15 4955 vmulpd %ymm14, %ymm15, %ymm15 4956 vsubpd %ymm1, %ymm15, %ymm1 4957 addq %r12, %r11 4958 cmpl $3, %r14d 4959 jl 0f // end 4960 vmaskmovpd 0(%r11), %ymm13, %ymm15 4961 vmulpd %ymm14, %ymm15, %ymm15 4962 vsubpd %ymm2, %ymm15, %ymm2 4963 addq %r12, %r11 4964 cmpl $3, %r14d 4965 je 0f // end 4966 vmaskmovpd 0(%r11), %ymm13, %ymm15 4967 vmulpd %ymm14, %ymm15, %ymm15 4968 vsubpd %ymm3, %ymm15, %ymm3 4969// addq %r12, %r11 4970 49710: 4972 4973#if MACRO_LEVEL>=1 4974 .endm 4975#else 4976 ret 4977 4978 FUN_END(inner_blend_scale_m1b_4x4_vs_lib) 4979#endif 4980 4981 4982 4983 4984 4985// common inner routine with file scope 4986// 4987// scale for alpha=-1 and beta=1 4988// 4989// input arguments: 4990// r10 <- C 4991// r11 <- ldc 4992// ymm0 <- [d00 d11 d22 d33] 4993// ymm1 <- [d01 d10 d23 d32] 4994// ymm2 <- [d03 d12 d21 d30] 4995// ymm3 <- [d02 d13 d20 d31] 4996// 4997// output arguments: 4998 4999#if MACRO_LEVEL>=1 5000 .macro INNER_BLEND_SCALE_M11_4X4_LIB 5001#else 5002 .p2align 4,,15 5003 FUN_START(inner_blend_scale_m11_4x4_lib) 5004#endif 5005 5006 vblendpd $0xa, %ymm1, %ymm0, %ymm8 5007 vblendpd $0x5, %ymm1, %ymm0, %ymm9 5008 vblendpd $0xa, %ymm3, %ymm2, %ymm10 5009 vblendpd $0x5, %ymm3, %ymm2, %ymm11 5010 5011 vblendpd $0xc, %ymm10, %ymm8, %ymm0 5012 vblendpd $0x3, %ymm10, %ymm8, %ymm2 5013 vblendpd $0xc, %ymm11, %ymm9, 
%ymm1 5014 vblendpd $0x3, %ymm11, %ymm9, %ymm3 5015 5016 vmovupd 0(%r10), %ymm15 5017 vsubpd %ymm0, %ymm15, %ymm0 5018 addq %r11, %r10 5019 vmovupd 0(%r10), %ymm15 5020 vsubpd %ymm1, %ymm15, %ymm1 5021 addq %r11, %r10 5022 vmovupd 0(%r10), %ymm15 5023 vsubpd %ymm2, %ymm15, %ymm2 5024 addq %r11, %r10 5025 vmovupd 0(%r10), %ymm15 5026 vsubpd %ymm3, %ymm15, %ymm3 5027// addq %r11, %r10 5028 50290: 5030 5031#if MACRO_LEVEL>=1 5032 .endm 5033#else 5034 ret 5035 5036 FUN_END(inner_blend_scale_m11_4x4_lib) 5037#endif 5038 5039 5040 5041 5042 5043// common inner routine with file scope 5044// 5045// scale for alpha=-1 and beta=1 5046// 5047// input arguments: 5048// r10 <- C 5049// r11 <- ldc 5050// r12d <- km 5051// r13d <- kn 5052// ymm0 <- [d00 d11 d22 d33] 5053// ymm1 <- [d01 d10 d23 d32] 5054// ymm2 <- [d03 d12 d21 d30] 5055// ymm3 <- [d02 d13 d20 d31] 5056// 5057// output arguments: 5058 5059#if MACRO_LEVEL>=1 5060 .macro INNER_BLEND_SCALE_M11_4X4_VS_LIB 5061#else 5062 .p2align 4,,15 5063 FUN_START(inner_blend_scale_m11_4x4_vs_lib) 5064#endif 5065 5066 vblendpd $0xa, %ymm1, %ymm0, %ymm8 5067 vblendpd $0x5, %ymm1, %ymm0, %ymm9 5068 vblendpd $0xa, %ymm3, %ymm2, %ymm10 5069 vblendpd $0x5, %ymm3, %ymm2, %ymm11 5070 5071 vblendpd $0xc, %ymm10, %ymm8, %ymm0 5072 vblendpd $0x3, %ymm10, %ymm8, %ymm2 5073 vblendpd $0xc, %ymm11, %ymm9, %ymm1 5074 vblendpd $0x3, %ymm11, %ymm9, %ymm3 5075 5076 vcvtsi2sd %r12d, %xmm15, %xmm15 5077#if defined(OS_LINUX) | defined(OS_WINDOWS) 5078 vmovupd .LC02(%rip), %ymm13 5079#elif defined(OS_MAC) 5080 vmovupd LC02(%rip), %ymm13 5081#endif 5082 vmovddup %xmm15, %xmm15 5083 vinsertf128 $1, %xmm15, %ymm15, %ymm15 5084 vsubpd %ymm15, %ymm13, %ymm13 5085 5086 5087 vmaskmovpd 0(%r10), %ymm13, %ymm15 5088 vsubpd %ymm0, %ymm15, %ymm0 5089 addq %r11, %r10 5090 cmpl $2, %r13d 5091 jl 0f // end 5092 vmaskmovpd 0(%r10), %ymm13, %ymm15 5093 vsubpd %ymm1, %ymm15, %ymm1 5094 addq %r11, %r10 5095 cmpl $3, %r13d 5096 jl 0f // end 5097 vmaskmovpd 0(%r10), %ymm13, %ymm15 5098 vsubpd %ymm2, %ymm15, %ymm2 5099 addq %r11, %r10 5100 cmpl $3, %r13d 5101 je 0f // end 5102 vmaskmovpd 0(%r10), %ymm13, %ymm15 5103 vsubpd %ymm3, %ymm15, %ymm3 5104// addq %r11, %r10 5105 51060: 5107 5108#if MACRO_LEVEL>=1 5109 .endm 5110#else 5111 ret 5112 5113 FUN_END(inner_blend_scale_m11_4x4_vs_lib) 5114#endif 5115 5116 5117 5118 5119 5120// common inner routine with file scope 5121// 5122// scale for generic alpha and beta 5123// 5124// input arguments: 5125// r10 <- alpha 5126// r11 <- beta 5127// r12 <- C 5128// r13 <- ldc 5129// ymm0 <- [d00 d11 d22 d33] 5130// ymm1 <- [d01 d10 d23 d32] 5131// ymm2 <- [d03 d12 d21 d30] 5132// ymm3 <- [d02 d13 d20 d31] 5133// 5134// output arguments: 5135 5136#if MACRO_LEVEL>=1 5137 .macro INNER_SCALE_AB_4X4_LIB 5138#else 5139 .p2align 4,,15 5140 FUN_START(inner_scale_ab_4x4_lib) 5141#endif 5142 5143 // alpha 5144 vbroadcastsd 0(%r10), %ymm15 5145 5146 vmulpd %ymm0, %ymm15, %ymm0 5147 vmulpd %ymm1, %ymm15, %ymm1 5148 vmulpd %ymm2, %ymm15, %ymm2 5149 vmulpd %ymm3, %ymm15, %ymm3 5150 5151 // beta 5152 vbroadcastsd 0(%r11), %ymm14 5153 5154 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 5155 5156 vucomisd %xmm15, %xmm14 // beta==0.0 ? 
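	// beta==0.0 takes the jump below and skips reading C altogether; otherwise each of the four
	// accumulator columns (already multiplied by alpha above) gets beta times the corresponding
	// column of C added to it. Reference semantics as a C sketch (documentation only, not part of
	// the kernel; acc[jj] is column jj of the 4x4 tile, C is column-major with leading dimension
	// ldc counted in elements here, while the value passed in r13 is already scaled to bytes):
	//   for(jj=0; jj<4; jj++)
	//     for(ii=0; ii<4; ii++)
	//       acc[jj][ii] = alpha*acc[jj][ii] + beta*C[ii+ldc*jj];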
5157 je 0f // end 5158 5159 vmovupd 0(%r12), %ymm15 5160 vmulpd %ymm14, %ymm15, %ymm15 5161 vaddpd %ymm0, %ymm15, %ymm0 5162 addq %r13, %r12 5163 vmovupd 0(%r12), %ymm15 5164 vmulpd %ymm14, %ymm15, %ymm15 5165 vaddpd %ymm1, %ymm15, %ymm1 5166 addq %r13, %r12 5167 vmovupd 0(%r12), %ymm15 5168 vmulpd %ymm14, %ymm15, %ymm15 5169 vaddpd %ymm2, %ymm15, %ymm2 5170 addq %r13, %r12 5171 vmovupd 0(%r12), %ymm15 5172 vmulpd %ymm14, %ymm15, %ymm15 5173 vaddpd %ymm3, %ymm15, %ymm3 5174// addq %r13, %r12 5175 51760: 5177 5178#if MACRO_LEVEL>=1 5179 .endm 5180#else 5181 ret 5182 5183 FUN_END(inner_scale_ab_4x4_lib) 5184#endif 5185 5186 5187 5188 5189 5190// common inner routine with file scope 5191// 5192// scale for generic alpha and beta 5193// 5194// input arguments: 5195// r10 <- alpha 5196// r11 <- beta 5197// r12 <- C 5198// r13 <- ldc 5199// r14d <- km 5200// r15d <- kn 5201// ymm0 <- [d00 d11 d22 d33] 5202// ymm1 <- [d01 d10 d23 d32] 5203// ymm2 <- [d03 d12 d21 d30] 5204// ymm3 <- [d02 d13 d20 d31] 5205// 5206// output arguments: 5207 5208#if MACRO_LEVEL>=1 5209 .macro INNER_SCALE_AB_4X4_VS_LIB 5210#else 5211 .p2align 4,,15 5212 FUN_START(inner_scale_ab_4x4_vs_lib) 5213#endif 5214 5215 // alpha 5216 vbroadcastsd 0(%r10), %ymm15 5217 5218 vmulpd %ymm0, %ymm15, %ymm0 5219 vmulpd %ymm1, %ymm15, %ymm1 5220 vmulpd %ymm2, %ymm15, %ymm2 5221 vmulpd %ymm3, %ymm15, %ymm3 5222 5223 // beta 5224 vbroadcastsd 0(%r11), %ymm14 5225 5226 vxorpd %ymm15, %ymm15, %ymm15 // 0.0 5227 vucomisd %xmm15, %xmm14 // beta==0.0 ? 5228 je 0f // end 5229 5230 5231 vcvtsi2sd %r14d, %xmm15, %xmm15 5232#if defined(OS_LINUX) | defined(OS_WINDOWS) 5233 vmovupd .LC02(%rip), %ymm13 5234#elif defined(OS_MAC) 5235 vmovupd LC02(%rip), %ymm13 5236#endif 5237 vmovddup %xmm15, %xmm15 5238 vinsertf128 $1, %xmm15, %ymm15, %ymm15 5239 vsubpd %ymm15, %ymm13, %ymm13 5240 5241 5242 vmaskmovpd 0(%r12), %ymm13, %ymm15 5243 vmulpd %ymm14, %ymm15, %ymm15 5244 vaddpd %ymm0, %ymm15, %ymm0 5245 addq %r13, %r12 5246 cmpl $2, %r15d 5247 jl 0f // end 5248 vmaskmovpd 0(%r12), %ymm13, %ymm15 5249 vmulpd %ymm14, %ymm15, %ymm15 5250 vaddpd %ymm1, %ymm15, %ymm1 5251 addq %r13, %r12 5252 cmpl $3, %r15d 5253 jl 0f // end 5254 vmaskmovpd 0(%r12), %ymm13, %ymm15 5255 vmulpd %ymm14, %ymm15, %ymm15 5256 vaddpd %ymm2, %ymm15, %ymm2 5257 addq %r13, %r12 5258 cmpl $3, %r15d 5259 je 0f // end 5260 vmaskmovpd 0(%r12), %ymm13, %ymm15 5261 vmulpd %ymm14, %ymm15, %ymm15 5262 vaddpd %ymm3, %ymm15, %ymm3 5263// addq %r13, %r12 5264 52650: 5266 5267#if MACRO_LEVEL>=1 5268 .endm 5269#else 5270 ret 5271 5272 FUN_END(inner_scale_ab_4x4_vs_lib) 5273#endif 5274 5275 5276 5277 5278 5279// common inner routine with file scope 5280// 5281// tran_scale for generic alpha and beta 5282// 5283// input arguments: 5284// r10 <- alpha 5285// r11 <- beta 5286// r12 <- C 5287// r13 <- ldc 5288// ymm0 <- [d00 d11 d22 d33] 5289// ymm1 <- [d01 d10 d23 d32] 5290// ymm2 <- [d03 d12 d21 d30] 5291// ymm3 <- [d02 d13 d20 d31] 5292// 5293// output arguments: 5294 5295#if MACRO_LEVEL>=1 5296 .macro INNER_TRAN_SCALE_AB_4X4_LIB 5297#else 5298 .p2align 4,,15 5299 FUN_START(inner_tran_scale_ab_4x4_lib) 5300#endif 5301 5302 vunpcklpd %ymm1, %ymm0, %ymm12 5303 vunpckhpd %ymm1, %ymm0, %ymm13 5304 vunpcklpd %ymm3, %ymm2, %ymm14 5305 vunpckhpd %ymm3, %ymm2, %ymm15 5306 5307 vperm2f128 $0x20, %ymm14, %ymm12, %ymm0 5308 vperm2f128 $0x31, %ymm14, %ymm12, %ymm2 5309 vperm2f128 $0x20, %ymm15, %ymm13, %ymm1 5310 vperm2f128 $0x31, %ymm15, %ymm13, %ymm3 5311 5312 // alpha 5313 vbroadcastsd 0(%r10), %ymm15 5314 5315 
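	// The vunpcklpd/vunpckhpd pairs followed by the vperm2f128 shuffles above are the standard
	// 4x4 double-precision in-register transpose of ymm0..ymm3; the transposed tile is then
	// scaled by alpha (broadcast into ymm15 just above) and, if beta!=0.0, gets beta*C added
	// column by column.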
	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1
	vmulpd	%ymm2, %ymm15, %ymm2
	vmulpd	%ymm3, %ymm15, %ymm3

	// beta
	vbroadcastsd	0(%r11), %ymm14

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je	0f // end

	vmovupd	0(%r12), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm0, %ymm15, %ymm0
	addq	%r13, %r12
	vmovupd	0(%r12), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm1, %ymm15, %ymm1
	addq	%r13, %r12
	vmovupd	0(%r12), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm2, %ymm15, %ymm2
	addq	%r13, %r12
	vmovupd	0(%r12), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm3, %ymm15, %ymm3
//	addq	%r13, %r12

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_tran_scale_ab_4x4_lib)
#endif




// common inner routine with file scope
//
// tran scale for generic alpha and beta
//
// input arguments:
// r10 <- alpha
// r11 <- beta
// r12 <- C
// r13 <- ldc
// r14d <- km
// r15d <- kn
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_TRAN_SCALE_AB_4X4_VS_LIB
#else
	.p2align 4,,15
	FUN_START(inner_tran_scale_ab_4x4_vs_lib)
#endif

	vunpcklpd	%ymm1, %ymm0, %ymm12
	vunpckhpd	%ymm1, %ymm0, %ymm13
	vunpcklpd	%ymm3, %ymm2, %ymm14
	vunpckhpd	%ymm3, %ymm2, %ymm15

	vperm2f128	$0x20, %ymm14, %ymm12, %ymm0
	vperm2f128	$0x31, %ymm14, %ymm12, %ymm2
	vperm2f128	$0x20, %ymm15, %ymm13, %ymm1
	vperm2f128	$0x31, %ymm15, %ymm13, %ymm3

	// alpha
	vbroadcastsd	0(%r10), %ymm15

	vmulpd	%ymm0, %ymm15, %ymm0
	vmulpd	%ymm1, %ymm15, %ymm1
	vmulpd	%ymm2, %ymm15, %ymm2
	vmulpd	%ymm3, %ymm15, %ymm3

	// beta
	vbroadcastsd	0(%r11), %ymm14

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
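	// beta==0.0 skips the read of C below. Otherwise a vmaskmovpd load mask is built from km
	// (r14d): each lane holds .LC02[lane] - (double)km, and only lanes with the sign bit set
	// (i.e. .LC02[lane] < km) are loaded. Assuming .LC02 holds { 0.5, 1.5, 2.5, 3.5 } (it is
	// defined elsewhere in this file), this touches exactly the first km rows of each column,
	// while kn (r15d) bounds the number of columns through the cmpl/jl chain.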
	je	0f // end


	vcvtsi2sd	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovupd	.LC02(%rip), %ymm13
#elif defined(OS_MAC)
	vmovupd	LC02(%rip), %ymm13
#endif
	vmovddup	%xmm15, %xmm15
	vinsertf128	$1, %xmm15, %ymm15, %ymm15
	vsubpd	%ymm15, %ymm13, %ymm13


	vmaskmovpd	0(%r12), %ymm13, %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm0, %ymm15, %ymm0
	addq	%r13, %r12
	cmpl	$2, %r15d
	jl	0f // end
	vmaskmovpd	0(%r12), %ymm13, %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm1, %ymm15, %ymm1
	addq	%r13, %r12
	cmpl	$3, %r15d
	jl	0f // end
	vmaskmovpd	0(%r12), %ymm13, %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm2, %ymm15, %ymm2
	addq	%r13, %r12
	cmpl	$3, %r15d
	je	0f // end
	vmaskmovpd	0(%r12), %ymm13, %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vaddpd	%ymm3, %ymm15, %ymm3
//	addq	%r13, %r12

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_tran_scale_ab_4x4_vs_lib)
#endif




// common inner routine with file scope
//
// scale for alpha=-1 and generic beta
//
// input arguments:
// r10 <- beta
// r11 <- C
// r12 <- ldc
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M1B_4X4_LIB
#else
	.p2align 4,,15
	FUN_START(inner_scale_m1b_4x4_lib)
#endif

	// beta
	vbroadcastsd	0(%r10), %ymm14

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0

	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
	je	0f // end

	vmovupd	0(%r11), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vsubpd	%ymm0, %ymm15, %ymm0
	addq	%r12, %r11
	vmovupd	0(%r11), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vsubpd	%ymm1, %ymm15, %ymm1
	addq	%r12, %r11
	vmovupd	0(%r11), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vsubpd	%ymm2, %ymm15, %ymm2
	addq	%r12, %r11
	vmovupd	0(%r11), %ymm15
	vmulpd	%ymm14, %ymm15, %ymm15
	vsubpd	%ymm3, %ymm15, %ymm3
//	addq	%r12, %r11

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_scale_m1b_4x4_lib)
#endif




// common inner routine with file scope
//
// scale for alpha=-1 and generic beta
//
// input arguments:
// r10 <- beta
// r11 <- C
// r12 <- ldc
// r13d <- km
// r14d <- kn
// ymm0 <- [d00 d11 d22 d33]
// ymm1 <- [d01 d10 d23 d32]
// ymm2 <- [d03 d12 d21 d30]
// ymm3 <- [d02 d13 d20 d31]
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M1B_4X4_VS_LIB
#else
	.p2align 4,,15
	FUN_START(inner_scale_m1b_4x4_vs_lib)
#endif

	// beta
	vbroadcastsd	0(%r10), %ymm14

	vxorpd	%ymm15, %ymm15, %ymm15 // 0.0
	vucomisd	%xmm15, %xmm14 // beta==0.0 ?
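	// "m1b" scaling: alpha is fixed at -1.0 and beta is generic. For each column j < kn (r14d)
	// the first km (r13d) rows are updated as acc = beta*C - acc (vmulpd by beta, then vsubpd of
	// the accumulator), reusing the .LC02-based vmaskmovpd row mask of the other _vs_ routines.
	// When beta==0.0 the jump below leaves ymm0..ymm3 unchanged.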
5548 je 0f // end 5549 5550 5551 vcvtsi2sd %r13d, %xmm15, %xmm15 5552#if defined(OS_LINUX) | defined(OS_WINDOWS) 5553 vmovupd .LC02(%rip), %ymm13 5554#elif defined(OS_MAC) 5555 vmovupd LC02(%rip), %ymm13 5556#endif 5557 vmovddup %xmm15, %xmm15 5558 vinsertf128 $1, %xmm15, %ymm15, %ymm15 5559 vsubpd %ymm15, %ymm13, %ymm13 5560 5561 5562 vmaskmovpd 0(%r11), %ymm13, %ymm15 5563 vmulpd %ymm14, %ymm15, %ymm15 5564 vsubpd %ymm0, %ymm15, %ymm0 5565 addq %r12, %r11 5566 cmpl $2, %r14d 5567 jl 0f // end 5568 vmaskmovpd 0(%r11), %ymm13, %ymm15 5569 vmulpd %ymm14, %ymm15, %ymm15 5570 vsubpd %ymm1, %ymm15, %ymm1 5571 addq %r12, %r11 5572 cmpl $3, %r14d 5573 jl 0f // end 5574 vmaskmovpd 0(%r11), %ymm13, %ymm15 5575 vmulpd %ymm14, %ymm15, %ymm15 5576 vsubpd %ymm2, %ymm15, %ymm2 5577 addq %r12, %r11 5578 cmpl $3, %r14d 5579 je 0f // end 5580 vmaskmovpd 0(%r11), %ymm13, %ymm15 5581 vmulpd %ymm14, %ymm15, %ymm15 5582 vsubpd %ymm3, %ymm15, %ymm3 5583// addq %r12, %r11 5584 55850: 5586 5587#if MACRO_LEVEL>=1 5588 .endm 5589#else 5590 ret 5591 5592 FUN_END(inner_scale_m1b_4x4_vs_lib) 5593#endif 5594 5595 5596 5597 5598 5599// common inner routine with file scope 5600// 5601// scale for alpha=-1 and beta=1 5602// 5603// input arguments: 5604// r10 <- C 5605// r11 <- ldc 5606// ymm0 <- [d00 d11 d22 d33] 5607// ymm1 <- [d01 d10 d23 d32] 5608// ymm2 <- [d03 d12 d21 d30] 5609// ymm3 <- [d02 d13 d20 d31] 5610// 5611// output arguments: 5612 5613#if MACRO_LEVEL>=1 5614 .macro INNER_SCALE_M11_4X4_LIB 5615#else 5616 .p2align 4,,15 5617 FUN_START(inner_scale_m11_4x4_lib) 5618#endif 5619 5620 vmovupd 0(%r10), %ymm15 5621 vsubpd %ymm0, %ymm15, %ymm0 5622 addq %r11, %r10 5623 vmovupd 0(%r10), %ymm15 5624 vsubpd %ymm1, %ymm15, %ymm1 5625 addq %r11, %r10 5626 vmovupd 0(%r10), %ymm15 5627 vsubpd %ymm2, %ymm15, %ymm2 5628 addq %r11, %r10 5629 vmovupd 0(%r10), %ymm15 5630 vsubpd %ymm3, %ymm15, %ymm3 5631// addq %r11, %r10 5632 56330: 5634 5635#if MACRO_LEVEL>=1 5636 .endm 5637#else 5638 ret 5639 5640 FUN_END(inner_scale_m11_4x4_lib) 5641#endif 5642 5643 5644 5645 5646 5647// common inner routine with file scope 5648// 5649// scale for alpha=-1 and beta=1 5650// 5651// input arguments: 5652// r10 <- C 5653// r11 <- ldc 5654// r12d <- km 5655// r13d <- kn 5656// ymm0 <- [d00 d11 d22 d33] 5657// ymm1 <- [d01 d10 d23 d32] 5658// ymm2 <- [d03 d12 d21 d30] 5659// ymm3 <- [d02 d13 d20 d31] 5660// 5661// output arguments: 5662 5663#if MACRO_LEVEL>=1 5664 .macro INNER_SCALE_M11_4X4_VS_LIB 5665#else 5666 .p2align 4,,15 5667 FUN_START(inner_scale_m11_4x4_vs_lib) 5668#endif 5669 5670 vcvtsi2sd %r12d, %xmm15, %xmm15 5671#if defined(OS_LINUX) | defined(OS_WINDOWS) 5672 vmovupd .LC02(%rip), %ymm13 5673#elif defined(OS_MAC) 5674 vmovupd LC02(%rip), %ymm13 5675#endif 5676 vmovddup %xmm15, %xmm15 5677 vinsertf128 $1, %xmm15, %ymm15, %ymm15 5678 vsubpd %ymm15, %ymm13, %ymm13 5679 5680 5681 vmaskmovpd 0(%r10), %ymm13, %ymm15 5682 vsubpd %ymm0, %ymm15, %ymm0 5683 addq %r11, %r10 5684 cmpl $2, %r13d 5685 jl 0f // end 5686 vmaskmovpd 0(%r10), %ymm13, %ymm15 5687 vsubpd %ymm1, %ymm15, %ymm1 5688 addq %r11, %r10 5689 cmpl $3, %r13d 5690 jl 0f // end 5691 vmaskmovpd 0(%r10), %ymm13, %ymm15 5692 vsubpd %ymm2, %ymm15, %ymm2 5693 addq %r11, %r10 5694 cmpl $3, %r13d 5695 je 0f // end 5696 vmaskmovpd 0(%r10), %ymm13, %ymm15 5697 vsubpd %ymm3, %ymm15, %ymm3 5698// addq %r11, %r10 5699 57000: 5701 5702#if MACRO_LEVEL>=1 5703 .endm 5704#else 5705 ret 5706 5707 FUN_END(inner_scale_m11_4x4_vs_lib) 5708#endif 5709 5710 5711 5712 5713 5714// common inner routine 
with file scope 5715// 5716// store n 5717// 5718// input arguments: 5719// r10 <- D 5720// r11 <- ldd 5721// ymm0 <- [d00 d11 d22 d33] 5722// ymm1 <- [d01 d10 d23 d32] 5723// ymm2 <- [d03 d12 d21 d30] 5724// ymm3 <- [d02 d13 d20 d31] 5725// 5726// output arguments: 5727 5728#if MACRO_LEVEL>=1 5729 .macro INNER_STORE_4X4_LIB 5730#else 5731 .p2align 4,,15 5732 FUN_START(inner_store_4x4_lib) 5733#endif 5734 5735 vmovupd %ymm0, 0(%r10) 5736 addq %r11, %r10 5737 vmovupd %ymm1, 0(%r10) 5738 addq %r11, %r10 5739 vmovupd %ymm2, 0(%r10) 5740 addq %r11, %r10 5741 vmovupd %ymm3, 0(%r10) 5742// addq %r11, %r10 5743 5744#if MACRO_LEVEL>=1 5745 .endm 5746#else 5747 ret 5748 5749 FUN_END(inner_store_4x4_lib) 5750#endif 5751 5752 5753 5754 5755 5756// common inner routine with file scope 5757// 5758// store n 5759// 5760// input arguments: 5761// r10 <- D 5762// r11 <- ldd 5763// ymm0 <- [d00 d11 d22 d33] 5764// ymm1 <- [d01 d10 d23 d32] 5765// ymm2 <- [d03 d12 d21 d30] 5766// ymm3 <- [d02 d13 d20 d31] 5767// 5768// output arguments: 5769 5770#if MACRO_LEVEL>=1 5771 .macro INNER_TRAN_STORE_4X4_LIB 5772#else 5773 .p2align 4,,15 5774 FUN_START(inner_tran_store_4x4_lib) 5775#endif 5776 5777 vunpcklpd %ymm1, %ymm0, %ymm12 5778 vunpckhpd %ymm1, %ymm0, %ymm13 5779 vunpcklpd %ymm3, %ymm2, %ymm14 5780 vunpckhpd %ymm3, %ymm2, %ymm15 5781 5782 vperm2f128 $0x20, %ymm14, %ymm12, %ymm0 5783 vperm2f128 $0x31, %ymm14, %ymm12, %ymm2 5784 vperm2f128 $0x20, %ymm15, %ymm13, %ymm1 5785 vperm2f128 $0x31, %ymm15, %ymm13, %ymm3 5786 5787 vmovupd %ymm0, 0(%r10) 5788 addq %r11, %r10 5789 vmovupd %ymm1, 0(%r10) 5790 addq %r11, %r10 5791 vmovupd %ymm2, 0(%r10) 5792 addq %r11, %r10 5793 vmovupd %ymm3, 0(%r10) 5794// addq %r11, %r10 5795 5796#if MACRO_LEVEL>=1 5797 .endm 5798#else 5799 ret 5800 5801 FUN_END(inner_tran_store_4x4_lib) 5802#endif 5803 5804 5805 5806 5807 5808// common inner routine with file scope 5809// 5810// store n 5811// 5812// input arguments: 5813// r10 <- D 5814// r11 <- ldd 5815// ymm0 <- [d00 d11 d22 d33] 5816// ymm1 <- [d01 d10 d23 d32] 5817// ymm2 <- [d03 d12 d21 d30] 5818// ymm3 <- [d02 d13 d20 d31] 5819// 5820// output arguments: 5821 5822#if MACRO_LEVEL>=1 5823 .macro INNER_STORE_L_4X4_LIB 5824#else 5825 .p2align 4,,15 5826 FUN_START(inner_store_l_4x4_lib) 5827#endif 5828 5829 vmovupd %ymm0, 0(%r10) 5830 addq %r11, %r10 5831 vmovupd 0(%r10), %ymm15 5832 vblendpd $0x1, %ymm15, %ymm1, %ymm1 5833 vmovupd %ymm1, 0(%r10) 5834 addq %r11, %r10 5835 vmovupd 0(%r10), %ymm15 5836 vblendpd $0x3, %ymm15, %ymm2, %ymm2 5837 vmovupd %ymm2, 0(%r10) 5838 addq %r11, %r10 5839 vmovupd 0(%r10), %ymm15 5840 vblendpd $0x7, %ymm15, %ymm3, %ymm3 5841 vmovupd %ymm3, 0(%r10) 5842// addq %r11, %r10 5843 5844#if MACRO_LEVEL>=1 5845 .endm 5846#else 5847 ret 5848 5849 FUN_END(inner_store_l_4x4_lib) 5850#endif 5851 5852 5853 5854 5855 5856// common inner routine with file scope 5857// 5858// store n 5859// 5860// input arguments: 5861// r10 <- D 5862// r11 <- ldd 5863// ymm0 <- [d00 d11 d22 d33] 5864// ymm1 <- [d01 d10 d23 d32] 5865// ymm2 <- [d03 d12 d21 d30] 5866// ymm3 <- [d02 d13 d20 d31] 5867// 5868// output arguments: 5869 5870#if MACRO_LEVEL>=1 5871 .macro INNER_STORE_U_4X4_LIB 5872#else 5873 .p2align 4,,15 5874 FUN_START(inner_store_u_4x4_lib) 5875#endif 5876 5877 vmovupd 0(%r10), %ymm15 5878 vblendpd $0x1, %ymm0, %ymm15, %ymm0 5879 vmovupd %ymm0, 0(%r10) 5880 addq %r11, %r10 5881 vmovupd 0(%r10), %ymm15 5882 vblendpd $0x3, %ymm1, %ymm15, %ymm1 5883 vmovupd %ymm1, 0(%r10) 5884 addq %r11, %r10 5885 vmovupd 0(%r10), %ymm15 5886 
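	// Upper-triangular store: for column j only rows 0..j are taken from the accumulator; the
	// remaining rows are re-read from D (vmovupd above) and carried through the vblendpd mask,
	// so the elements below the diagonal of D keep their previous values.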
vblendpd $0x7, %ymm2, %ymm15, %ymm2 5887 vmovupd %ymm2, 0(%r10) 5888 addq %r11, %r10 5889 vmovupd %ymm3, 0(%r10) 5890// addq %r11, %r10 5891 5892#if MACRO_LEVEL>=1 5893 .endm 5894#else 5895 ret 5896 5897 FUN_END(inner_store_u_4x4_lib) 5898#endif 5899 5900 5901 5902 5903 5904// common inner routine with file scope 5905// 5906// store n vs 5907// 5908// input arguments: 5909// r10 <- D 5910// r11 <- ldd 5911// r12d <- km 5912// r13d <- kn 5913// ymm0 <- [d00 d11 d22 d33] 5914// ymm1 <- [d01 d10 d23 d32] 5915// ymm2 <- [d03 d12 d21 d30] 5916// ymm3 <- [d02 d13 d20 d31] 5917// 5918// output arguments: 5919 5920#if MACRO_LEVEL>=1 5921 .macro INNER_STORE_4X4_VS_LIB 5922#else 5923 .p2align 4,,15 5924 FUN_START(inner_store_4x4_vs_lib) 5925#endif 5926 5927 vcvtsi2sd %r12d, %xmm15, %xmm15 5928#if defined(OS_LINUX) | defined(OS_WINDOWS) 5929 vmovupd .LC02(%rip), %ymm14 5930#elif defined(OS_MAC) 5931 vmovupd LC02(%rip), %ymm14 5932#endif 5933 vmovddup %xmm15, %xmm15 5934 vinsertf128 $1, %xmm15, %ymm15, %ymm15 5935 vsubpd %ymm15, %ymm14, %ymm15 5936 5937 vmaskmovpd %ymm0, %ymm15, 0(%r10) 5938 addq %r11, %r10 5939 cmpl $2, %r13d 5940 jl 0f // end 5941 vmaskmovpd %ymm1, %ymm15, 0(%r10) 5942 addq %r11, %r10 5943 cmpl $3, %r13d 5944 jl 0f // end 5945 vmaskmovpd %ymm2, %ymm15, 0(%r10) 5946 addq %r11, %r10 5947 cmpl $3, %r13d 5948 je 0f // end 5949 vmaskmovpd %ymm3, %ymm15, 0(%r10) 5950// addq %r11, %r10 5951 59520: 5953 5954#if MACRO_LEVEL>=1 5955 .endm 5956#else 5957 ret 5958 5959 FUN_END(inner_store_4x4_vs_lib) 5960#endif 5961 5962 5963 5964 5965 5966// common inner routine with file scope 5967// 5968// store n vs 5969// 5970// input arguments: 5971// r10 <- D 5972// r11 <- ldd 5973// r12d <- km 5974// r13d <- kn 5975// ymm0 <- [d00 d11 d22 d33] 5976// ymm1 <- [d01 d10 d23 d32] 5977// ymm2 <- [d03 d12 d21 d30] 5978// ymm3 <- [d02 d13 d20 d31] 5979// 5980// output arguments: 5981 5982#if MACRO_LEVEL>=1 5983 .macro INNER_TRAN_STORE_4X4_VS_LIB 5984#else 5985 .p2align 4,,15 5986 FUN_START(inner_tran_store_4x4_vs_lib) 5987#endif 5988 5989 vunpcklpd %ymm1, %ymm0, %ymm12 5990 vunpckhpd %ymm1, %ymm0, %ymm13 5991 vunpcklpd %ymm3, %ymm2, %ymm14 5992 vunpckhpd %ymm3, %ymm2, %ymm15 5993 5994 vperm2f128 $0x20, %ymm14, %ymm12, %ymm0 5995 vperm2f128 $0x31, %ymm14, %ymm12, %ymm2 5996 vperm2f128 $0x20, %ymm15, %ymm13, %ymm1 5997 vperm2f128 $0x31, %ymm15, %ymm13, %ymm3 5998 5999 vcvtsi2sd %r12d, %xmm15, %xmm15 6000#if defined(OS_LINUX) | defined(OS_WINDOWS) 6001 vmovupd .LC02(%rip), %ymm14 6002#elif defined(OS_MAC) 6003 vmovupd LC02(%rip), %ymm14 6004#endif 6005 vmovddup %xmm15, %xmm15 6006 vinsertf128 $1, %xmm15, %ymm15, %ymm15 6007 vsubpd %ymm15, %ymm14, %ymm15 6008 6009 vmaskmovpd %ymm0, %ymm15, 0(%r10) 6010 addq %r11, %r10 6011 cmpl $2, %r13d 6012 jl 0f // end 6013 vmaskmovpd %ymm1, %ymm15, 0(%r10) 6014 addq %r11, %r10 6015 cmpl $3, %r13d 6016 jl 0f // end 6017 vmaskmovpd %ymm2, %ymm15, 0(%r10) 6018 addq %r11, %r10 6019 cmpl $3, %r13d 6020 je 0f // end 6021 vmaskmovpd %ymm3, %ymm15, 0(%r10) 6022// addq %r11, %r10 6023 60240: 6025 6026#if MACRO_LEVEL>=1 6027 .endm 6028#else 6029 ret 6030 6031 FUN_END(inner_tran_store_4x4_vs_lib) 6032#endif 6033 6034 6035 6036 6037 6038// common inner routine with file scope 6039// 6040// store n 6041// 6042// input arguments: 6043// r10 <- D 6044// r11 <- ldd 6045// r12d <- km 6046// r13d <- kn 6047// ymm0 <- [d00 d11 d22 d33] 6048// ymm1 <- [d01 d10 d23 d32] 6049// ymm2 <- [d03 d12 d21 d30] 6050// ymm3 <- [d02 d13 d20 d31] 6051// 6052// output arguments: 6053 6054#if MACRO_LEVEL>=1 
6055 .macro INNER_STORE_L_4X4_VS_LIB 6056#else 6057 .p2align 4,,15 6058 FUN_START(inner_store_l_4x4_vs_lib) 6059#endif 6060 6061 vcvtsi2sd %r12d, %xmm15, %xmm15 6062#if defined(OS_LINUX) | defined(OS_WINDOWS) 6063 vmovupd .LC02(%rip), %ymm14 6064#elif defined(OS_MAC) 6065 vmovupd LC02(%rip), %ymm14 6066#endif 6067 vmovddup %xmm15, %xmm15 6068 vinsertf128 $1, %xmm15, %ymm15, %ymm15 6069 vsubpd %ymm15, %ymm14, %ymm14 6070 6071 vmaskmovpd %ymm0, %ymm14, 0(%r10) 6072 addq %r11, %r10 6073 cmpl $2, %r13d 6074 jl 0f // end 6075 vmovupd 0(%r10), %ymm15 6076 vblendpd $0x1, %ymm15, %ymm1, %ymm1 6077 vmaskmovpd %ymm1, %ymm14, 0(%r10) 6078 addq %r11, %r10 6079 cmpl $3, %r13d 6080 jl 0f // end 6081 vmovupd 0(%r10), %ymm15 6082 vblendpd $0x3, %ymm15, %ymm2, %ymm2 6083 vmaskmovpd %ymm2, %ymm14, 0(%r10) 6084 addq %r11, %r10 6085 cmpl $3, %r13d 6086 je 0f // end 6087 vmovupd 0(%r10), %ymm15 6088 vblendpd $0x7, %ymm15, %ymm3, %ymm3 6089 vmaskmovpd %ymm3, %ymm14, 0(%r10) 6090// addq %r11, %r10 6091 60920: 6093 6094#if MACRO_LEVEL>=1 6095 .endm 6096#else 6097 ret 6098 6099 FUN_END(inner_store_l_4x4_vs_lib) 6100#endif 6101 6102 6103 6104 6105 6106// common inner routine with file scope 6107// 6108// store n 6109// 6110// input arguments: 6111// r10 <- D 6112// r11 <- ldd 6113// r12d <- km 6114// r13d <- kn 6115// ymm0 <- [d00 d11 d22 d33] 6116// ymm1 <- [d01 d10 d23 d32] 6117// ymm2 <- [d03 d12 d21 d30] 6118// ymm3 <- [d02 d13 d20 d31] 6119// 6120// output arguments: 6121 6122#if MACRO_LEVEL>=1 6123 .macro INNER_STORE_U_4X4_VS_LIB 6124#else 6125 .p2align 4,,15 6126 FUN_START(inner_store_u_4x4_vs_lib) 6127#endif 6128 6129 vcvtsi2sd %r12d, %xmm15, %xmm15 6130#if defined(OS_LINUX) | defined(OS_WINDOWS) 6131 vmovupd .LC02(%rip), %ymm14 6132#elif defined(OS_MAC) 6133 vmovupd LC02(%rip), %ymm14 6134#endif 6135 vmovddup %xmm15, %xmm15 6136 vinsertf128 $1, %xmm15, %ymm15, %ymm15 6137 vsubpd %ymm15, %ymm14, %ymm14 6138 6139 vmovupd 0(%r10), %ymm15 6140 vblendpd $0x1, %ymm0, %ymm15, %ymm0 6141 vmaskmovpd %ymm0, %ymm14, 0(%r10) 6142 addq %r11, %r10 6143 cmpl $2, %r13d 6144 jl 0f // end 6145 vmovupd 0(%r10), %ymm15 6146 vblendpd $0x3, %ymm1, %ymm15, %ymm1 6147 vmaskmovpd %ymm1, %ymm14, 0(%r10) 6148 addq %r11, %r10 6149 cmpl $3, %r13d 6150 jl 0f // end 6151 vmovupd 0(%r10), %ymm15 6152 vblendpd $0x7, %ymm2, %ymm15, %ymm2 6153 vmaskmovpd %ymm2, %ymm14, 0(%r10) 6154 addq %r11, %r10 6155 cmpl $3, %r13d 6156 je 0f // end 6157 vmaskmovpd %ymm3, %ymm14, 0(%r10) 6158// addq %r11, %r10 6159 61600: 6161 6162#if MACRO_LEVEL>=1 6163 .endm 6164#else 6165 ret 6166 6167 FUN_END(inner_store_u_4x4_vs_lib) 6168#endif 6169 6170 6171 6172 6173 6174// 1 2 3 4 5 6 7 8 9 10 6175// void kernel_dgemm_nt_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 6176 6177 .p2align 4,,15 6178 GLOB_FUN_START(kernel_dgemm_nt_4x4_lib4ccc) 6179 6180 PROLOGUE 6181 6182 // zero accumulation registers 6183 6184 ZERO_ACC 6185 6186 6187 // call inner dgemm kernel nn 6188 6189 movq ARG1, %r10 // k 6190 movq ARG3, %r11 // A 6191 movq ARG4, %r12 // B 6192 movq ARG5, %r13 // ldb 6193 sall $3, %r13d 6194 6195#if MACRO_LEVEL>=2 6196 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 6197#else 6198 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 6199#endif 6200 6201 6202 // call inner blend 6203 6204 movq ARG2, %r10 // alpha 6205 movq ARG6, %r11 // beta 6206 movq ARG7, %r12 // C 6207 movq ARG8, %r13 // ldc 6208 sall $3, %r13d 6209 6210#if MACRO_LEVEL>=1 6211 INNER_SCALE_AB_4X4_LIB 6212#else 6213 CALL(inner_scale_ab_4x4_lib) 
6214#endif 6215 6216 6217 // store n 6218 6219 movq ARG9, %r10 // D 6220 movq ARG10, %r11 // ldd 6221 sall $3, %r11d 6222 6223#if MACRO_LEVEL>=1 6224 INNER_STORE_4X4_LIB 6225#else 6226 CALL(inner_store_4x4_lib) 6227#endif 6228 6229 6230 EPILOGUE 6231 6232 ret 6233 6234 FUN_END(kernel_dgemm_nt_4x4_lib4ccc) 6235 6236 6237 6238 6239 6240// 1 2 3 4 5 6 7 8 9 10 11 12 6241// void kernel_dgemm_nt_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 6242 6243 .p2align 4,,15 6244 GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib4ccc) 6245 6246 PROLOGUE 6247 6248 // zero accumulation registers 6249 6250 ZERO_ACC 6251 6252 6253 // call inner dgemm kernel nn 6254 6255 movq ARG1, %r10 // k 6256 movq ARG3, %r11 // A 6257 movq ARG4, %r12 // B 6258 movq ARG5, %r13 // ldb 6259 sall $3, %r13d 6260 6261 movq ARG12, %r14 // n1 6262 cmpl $1, %r14d 6263 jg 100f 6264 6265#if MACRO_LEVEL>=2 6266 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 6267#else 6268 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 6269#endif 6270 6271 jmp 103f 6272 6273100: 6274 6275 movq ARG12, %r14 // n1 6276 cmpl $2, %r14d 6277 jg 101f 6278 6279#if MACRO_LEVEL>=2 6280 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 6281#else 6282 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 6283#endif 6284 6285 jmp 103f 6286 6287101: 6288 6289 movq ARG12, %r14 // n1 6290 cmpl $3, %r14d 6291 jg 102f 6292 6293#if MACRO_LEVEL>=2 6294 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 6295#else 6296 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 6297#endif 6298 6299 jmp 103f 6300 6301102: 6302 6303#if MACRO_LEVEL>=2 6304 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 6305#else 6306 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 6307#endif 6308 6309103: 6310 6311 6312 // call inner blend 6313 6314 movq ARG2, %r10 // alpha 6315 movq ARG6, %r11 // beta 6316 movq ARG7, %r12 // C 6317 movq ARG8, %r13 // ldc 6318 sall $3, %r13d 6319 movq ARG11, %r14 // m1 6320 movq ARG12, %r15 // n1 6321 6322#if MACRO_LEVEL>=1 6323 INNER_SCALE_AB_4X4_VS_LIB 6324#else 6325 CALL(inner_scale_ab_4x4_vs_lib) 6326#endif 6327 6328 6329 // store n 6330 6331 movq ARG9, %r10 // D 6332 movq ARG10, %r11 // ldd 6333 sall $3, %r11d 6334 movq ARG11, %r12 // m1 6335 movq ARG12, %r13 // n1 6336 6337#if MACRO_LEVEL>=1 6338 INNER_STORE_4X4_VS_LIB 6339#else 6340 CALL(inner_store_4x4_vs_lib) 6341#endif 6342 6343 6344 EPILOGUE 6345 6346 ret 6347 6348 FUN_END(kernel_dgemm_nt_4x4_vs_lib4ccc) 6349 6350 6351 6352 6353 6354// 1 2 3 4 5 6 7 8 9 6355// void kernel_dgemm_nt_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 6356 6357 .p2align 4,,15 6358 GLOB_FUN_START(kernel_dgemm_nt_4x4_lib44cc) 6359 6360 PROLOGUE 6361 6362 // zero accumulation registers 6363 6364 ZERO_ACC 6365 6366 6367 // call inner dgemm kernel nn 6368 6369 movq ARG1, %r10 // k 6370 movq ARG3, %r11 // A 6371 movq ARG4, %r12 // B 6372 6373#if MACRO_LEVEL>=2 6374 INNER_KERNEL_DGEMM_NT_4X4_LIB4 6375#else 6376 CALL(inner_kernel_dgemm_nt_4x4_lib4) 6377#endif 6378 6379 6380 // call inner blend 6381 6382 movq ARG2, %r10 // alpha 6383 movq ARG5, %r11 // beta 6384 movq ARG6, %r12 // C 6385 movq ARG7, %r13 // ldc 6386 sall $3, %r13d 6387 6388#if MACRO_LEVEL>=1 6389 INNER_BLEND_SCALE_AB_4X4_LIB 6390#else 6391 CALL(inner_blend_scale_ab_4x4_lib) 6392#endif 6393 6394 6395 // store n 6396 6397 movq ARG8, %r10 // D 6398 movq ARG9, %r11 // ldd 6399 sall $3, %r11d 6400 6401#if MACRO_LEVEL>=1 6402 INNER_STORE_4X4_LIB 6403#else 6404 CALL(inner_store_4x4_lib) 6405#endif 6406 6407 6408 EPILOGUE 6409 6410 ret 6411 6412 
FUN_END(kernel_dgemm_nt_4x4_lib44cc) 6413 6414 6415 6416 6417 6418// 1 2 3 4 5 6 7 8 9 10 11 6419// void kernel_dgemm_nt_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 6420 6421 .p2align 4,,15 6422 GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib44cc) 6423 6424 PROLOGUE 6425 6426 // zero accumulation registers 6427 6428 ZERO_ACC 6429 6430 6431 // call inner dgemm kernel nn 6432 6433 movq ARG1, %r10 // k 6434 movq ARG3, %r11 // A 6435 movq ARG4, %r12 // B 6436 6437#if MACRO_LEVEL>=2 6438 INNER_KERNEL_DGEMM_NT_4X4_LIB4 6439#else 6440 CALL(inner_kernel_dgemm_nt_4x4_lib4) 6441#endif 6442 6443 6444 // call inner blend 6445 6446 movq ARG2, %r10 // alpha 6447 movq ARG5, %r11 // beta 6448 movq ARG6, %r12 // C 6449 movq ARG7, %r13 // ldc 6450 sall $3, %r13d 6451 movq ARG10, %r14 // m1 6452 movq ARG11, %r15 // n1 6453 6454#if MACRO_LEVEL>=1 6455 INNER_BLEND_SCALE_AB_4X4_VS_LIB 6456#else 6457 CALL(inner_blend_scale_ab_4x4_vs_lib) 6458#endif 6459 6460 6461 // store n 6462 6463 movq ARG8, %r10 // D 6464 movq ARG9, %r11 // ldd 6465 sall $3, %r11d 6466 movq ARG10, %r12 // m1 6467 movq ARG11, %r13 // n1 6468 6469#if MACRO_LEVEL>=1 6470 INNER_STORE_4X4_VS_LIB 6471#else 6472 CALL(inner_store_4x4_vs_lib) 6473#endif 6474 6475 6476 EPILOGUE 6477 6478 ret 6479 6480 FUN_END(kernel_dgemm_nt_4x4_vs_lib44cc) 6481 6482 6483 6484 6485 6486// 1 2 3 4 5 6 7 8 9 10 6487// void kernel_dgemm_nt_4x4_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd); 6488 6489 .p2align 4,,15 6490 GLOB_FUN_START(kernel_dgemm_nt_4x4_libc4cc) 6491 6492 PROLOGUE 6493 6494 // zero accumulation registers 6495 6496 ZERO_ACC 6497 6498 6499 // call inner dgemm kernel nn 6500 6501 movq ARG1, %r10 // k 6502 movq ARG5, %r11 // B 6503 movq ARG3, %r12 // A 6504 movq ARG4, %r13 // lda 6505 sall $3, %r13d 6506 6507#if MACRO_LEVEL>=2 6508 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 6509#else 6510 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 6511#endif 6512 6513 6514 // call inner blend 6515 6516 movq ARG2, %r10 // alpha 6517 movq ARG6, %r11 // beta 6518 movq ARG7, %r12 // C 6519 movq ARG8, %r13 // ldc 6520 sall $3, %r13d 6521 6522#if MACRO_LEVEL>=1 6523 INNER_TRAN_SCALE_AB_4X4_LIB 6524#else 6525 CALL(inner_tran_scale_ab_4x4_lib) 6526#endif 6527 6528 6529 // store n 6530 6531 movq ARG9, %r10 // D 6532 movq ARG10, %r11 // ldd 6533 sall $3, %r11d 6534 6535#if MACRO_LEVEL>=1 6536 INNER_STORE_4X4_LIB 6537#else 6538 CALL(inner_store_4x4_lib) 6539#endif 6540 6541 6542 EPILOGUE 6543 6544 ret 6545 6546 FUN_END(kernel_dgemm_nt_4x4_libc4cc) 6547 6548 6549 6550 6551 6552// 1 2 3 4 5 6 7 8 9 10 11 12 6553// void kernel_dgemm_nt_4x4_vs_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 6554 6555 .p2align 4,,15 6556 GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_libc4cc) 6557 6558 PROLOGUE 6559 6560 // zero accumulation registers 6561 6562 ZERO_ACC 6563 6564 6565 // call inner dgemm kernel nn 6566 6567 movq ARG1, %r10 // k 6568 movq ARG5, %r11 // B 6569 movq ARG3, %r12 // A 6570 movq ARG4, %r13 // lda 6571 sall $3, %r13d 6572 6573 movq ARG11, %r14 // m1 6574 cmpl $1, %r14d 6575 jg 100f 6576 6577#if MACRO_LEVEL>=2 6578 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 6579#else 6580 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 6581#endif 6582 6583 jmp 103f 6584 6585100: 6586 6587 movq ARG11, %r14 // m1 6588 cmpl $2, %r14d 6589 jg 101f 6590 6591#if MACRO_LEVEL>=2 6592 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 
6593#else 6594 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 6595#endif 6596 6597 jmp 103f 6598 6599101: 6600 6601 movq ARG11, %r14 // m1 6602 cmpl $3, %r14d 6603 jg 102f 6604 6605#if MACRO_LEVEL>=2 6606 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 6607#else 6608 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 6609#endif 6610 6611 jmp 103f 6612 6613102: 6614 6615#if MACRO_LEVEL>=2 6616 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 6617#else 6618 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 6619#endif 6620 6621103: 6622 6623 6624 // call inner blend 6625 6626 movq ARG2, %r10 // alpha 6627 movq ARG6, %r11 // beta 6628 movq ARG7, %r12 // C 6629 movq ARG8, %r13 // ldc 6630 sall $3, %r13d 6631 movq ARG11, %r14 // m1 6632 movq ARG12, %r15 // n1 6633 6634#if MACRO_LEVEL>=1 6635 INNER_TRAN_SCALE_AB_4X4_VS_LIB 6636#else 6637 CALL(inner_tran_scale_ab_4x4_vs_lib) 6638#endif 6639 6640 6641 // store n 6642 6643 movq ARG9, %r10 // D 6644 movq ARG10, %r11 // ldd 6645 sall $3, %r11d 6646 movq ARG11, %r12 // m1 6647 movq ARG12, %r13 // n1 6648 6649#if MACRO_LEVEL>=1 6650 INNER_STORE_4X4_VS_LIB 6651#else 6652 CALL(inner_store_4x4_vs_lib) 6653#endif 6654 6655 6656 EPILOGUE 6657 6658 ret 6659 6660 FUN_END(kernel_dgemm_nt_4x4_vs_libc4cc) 6661 6662 6663 6664 6665 6666// 1 2 3 4 5 6 7 8 9 10 6667// void kernel_dgemm_nn_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 6668 6669 .p2align 4,,15 6670 GLOB_FUN_START(kernel_dgemm_nn_4x4_lib4ccc) 6671 6672 PROLOGUE 6673 6674 // zero accumulation registers 6675 6676 ZERO_ACC 6677 6678 6679 // call inner dgemm kernel nn 6680 6681 movq ARG1, %r10 // k 6682 movq ARG3, %r11 // A 6683 movq ARG4, %r12 // B 6684 movq ARG5, %r13 // ldb 6685 sall $3, %r13d 6686 6687#if MACRO_LEVEL>=2 6688 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 6689#else 6690 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 6691#endif 6692 6693 6694 // call inner blend 6695 6696 movq ARG2, %r10 // alpha 6697 movq ARG6, %r11 // beta 6698 movq ARG7, %r12 // C 6699 movq ARG8, %r13 // ldc 6700 sall $3, %r13d 6701 6702#if MACRO_LEVEL>=1 6703 INNER_SCALE_AB_4X4_LIB 6704#else 6705 CALL(inner_scale_ab_4x4_lib) 6706#endif 6707 6708 6709 // store n 6710 6711 movq ARG9, %r10 // D 6712 movq ARG10, %r11 // ldd 6713 sall $3, %r11d 6714 6715#if MACRO_LEVEL>=1 6716 INNER_STORE_4X4_LIB 6717#else 6718 CALL(inner_store_4x4_lib) 6719#endif 6720 6721 6722 EPILOGUE 6723 6724 ret 6725 6726 FUN_END(kernel_dgemm_nn_4x4_lib4ccc) 6727 6728 6729 6730 6731 6732// 1 2 3 4 5 6 7 8 9 10 11 12 6733// void kernel_dgemm_nn_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 6734 6735 .p2align 4,,15 6736 GLOB_FUN_START(kernel_dgemm_nn_4x4_vs_lib4ccc) 6737 6738 PROLOGUE 6739 6740 // zero accumulation registers 6741 6742 ZERO_ACC 6743 6744 6745 // call inner dgemm kernel nn 6746 6747 movq ARG1, %r10 // k 6748 movq ARG3, %r11 // A 6749 movq ARG4, %r12 // B 6750 movq ARG5, %r13 // ldb 6751 sall $3, %r13d 6752 6753 movq ARG12, %r14 // n1 6754 cmpl $1, %r14d 6755 jg 100f 6756 6757#if MACRO_LEVEL>=2 6758 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 6759#else 6760 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 6761#endif 6762 6763 jmp 103f 6764 6765100: 6766 6767 movq ARG12, %r14 // n1 6768 cmpl $2, %r14d 6769 jg 101f 6770 6771#if MACRO_LEVEL>=2 6772 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 6773#else 6774 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 6775#endif 6776 6777 jmp 103f 6778 6779101: 6780 6781 movq ARG12, %r14 // n1 6782 cmpl $3, %r14d 6783 jg 102f 6784 6785#if MACRO_LEVEL>=2 6786 
INNER_KERNEL_DGEMM_NN_4X3_LIB4C 6787#else 6788 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 6789#endif 6790 6791 jmp 103f 6792 6793102: 6794 6795#if MACRO_LEVEL>=2 6796 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 6797#else 6798 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 6799#endif 6800 6801103: 6802 6803 6804 // call inner blend 6805 6806 movq ARG2, %r10 // alpha 6807 movq ARG6, %r11 // beta 6808 movq ARG7, %r12 // C 6809 movq ARG8, %r13 // ldc 6810 sall $3, %r13d 6811 movq ARG11, %r14 // m1 6812 movq ARG12, %r15 // n1 6813 6814#if MACRO_LEVEL>=1 6815 INNER_SCALE_AB_4X4_VS_LIB 6816#else 6817 CALL(inner_scale_ab_4x4_vs_lib) 6818#endif 6819 6820 6821 // store n 6822 6823 movq ARG9, %r10 // D 6824 movq ARG10, %r11 // ldd 6825 sall $3, %r11d 6826 movq ARG11, %r12 // m1 6827 movq ARG12, %r13 // n1 6828 6829#if MACRO_LEVEL>=1 6830 INNER_STORE_4X4_VS_LIB 6831#else 6832 CALL(inner_store_4x4_vs_lib) 6833#endif 6834 6835 6836 EPILOGUE 6837 6838 ret 6839 6840 FUN_END(kernel_dgemm_nn_4x4_vs_lib4ccc) 6841 6842 6843 6844 6845 6846// 1 2 3 4 5 6 7 8 9 10 6847// void kernel_dgemm_tt_4x4_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd); 6848 6849 .p2align 4,,15 6850 GLOB_FUN_START(kernel_dgemm_tt_4x4_libc4cc) 6851 6852 PROLOGUE 6853 6854 // zero accumulation registers 6855 6856 ZERO_ACC 6857 6858 6859 // call inner dgemm kernel nn 6860 6861 movq ARG1, %r10 // k 6862 movq ARG5, %r11 // B 6863 movq ARG3, %r12 // A 6864 movq ARG4, %r13 // lda 6865 sall $3, %r13d 6866 6867#if MACRO_LEVEL>=2 6868 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 6869#else 6870 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 6871#endif 6872 6873 6874 // call inner blend 6875 6876 movq ARG2, %r10 // alpha 6877 movq ARG6, %r11 // beta 6878 movq ARG7, %r12 // C 6879 movq ARG8, %r13 // ldc 6880 sall $3, %r13d 6881 6882#if MACRO_LEVEL>=1 6883 INNER_TRAN_SCALE_AB_4X4_LIB 6884#else 6885 CALL(inner_tran_scale_ab_4x4_lib) 6886#endif 6887 6888 6889 // store n 6890 6891 movq ARG9, %r10 // D 6892 movq ARG10, %r11 // ldd 6893 sall $3, %r11d 6894 6895#if MACRO_LEVEL>=1 6896 INNER_STORE_4X4_LIB 6897#else 6898 CALL(inner_store_4x4_lib) 6899#endif 6900 6901 6902 EPILOGUE 6903 6904 ret 6905 6906 FUN_END(kernel_dgemm_tt_4x4_libc4cc) 6907 6908 6909 6910 6911 6912// 1 2 3 4 5 6 7 8 9 10 11 12 6913// void kernel_dgemm_tt_4x4_vs_libc4cc(int k, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 6914 6915 .p2align 4,,15 6916 GLOB_FUN_START(kernel_dgemm_tt_4x4_vs_libc4cc) 6917 6918 PROLOGUE 6919 6920 // zero accumulation registers 6921 6922 ZERO_ACC 6923 6924 6925 // call inner dgemm kernel nn 6926 6927 movq ARG1, %r10 // k 6928 movq ARG5, %r11 // B 6929 movq ARG3, %r12 // A 6930 movq ARG4, %r13 // lda 6931 sall $3, %r13d 6932 6933 movq ARG11, %r14 // m1 6934 cmpl $1, %r14d 6935 jg 100f 6936 6937#if MACRO_LEVEL>=2 6938 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 6939#else 6940 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 6941#endif 6942 6943 jmp 103f 6944 6945100: 6946 6947 movq ARG11, %r14 // m1 6948 cmpl $2, %r14d 6949 jg 101f 6950 6951#if MACRO_LEVEL>=2 6952 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 6953#else 6954 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 6955#endif 6956 6957 jmp 103f 6958 6959101: 6960 6961 movq ARG11, %r14 // m1 6962 cmpl $3, %r14d 6963 jg 102f 6964 6965#if MACRO_LEVEL>=2 6966 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 6967#else 6968 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 6969#endif 6970 6971 jmp 103f 6972 6973102: 6974 6975#if MACRO_LEVEL>=2 6976 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 6977#else 6978 
CALL(inner_kernel_dgemm_nn_4x4_lib4c) 6979#endif 6980 6981103: 6982 6983 6984 // call inner blend 6985 6986 movq ARG2, %r10 // alpha 6987 movq ARG6, %r11 // beta 6988 movq ARG7, %r12 // C 6989 movq ARG8, %r13 // ldc 6990 sall $3, %r13d 6991 movq ARG11, %r14 // m1 6992 movq ARG12, %r15 // n1 6993 6994#if MACRO_LEVEL>=1 6995 INNER_TRAN_SCALE_AB_4X4_VS_LIB 6996#else 6997 CALL(inner_tran_scale_ab_4x4_vs_lib) 6998#endif 6999 7000 7001 // store n 7002 7003 movq ARG9, %r10 // D 7004 movq ARG10, %r11 // ldd 7005 sall $3, %r11d 7006 movq ARG11, %r12 // m1 7007 movq ARG12, %r13 // n1 7008 7009#if MACRO_LEVEL>=1 7010 INNER_STORE_4X4_VS_LIB 7011#else 7012 CALL(inner_store_4x4_vs_lib) 7013#endif 7014 7015 7016 EPILOGUE 7017 7018 ret 7019 7020 FUN_END(kernel_dgemm_tt_4x4_vs_libc4cc) 7021 7022 7023 7024 7025 7026// 1 2 3 4 5 6 7 8 9 7027// void kernel_dsyrk_nt_l_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 7028 7029 .p2align 4,,15 7030 GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_lib44cc) 7031 7032 PROLOGUE 7033 7034 // zero accumulation registers 7035 7036 ZERO_ACC 7037 7038 7039 // call inner dgemm kernel nn 7040 7041 movq ARG1, %r10 // k 7042 movq ARG3, %r11 // A 7043 movq ARG4, %r12 // B 7044 7045#if MACRO_LEVEL>=2 7046 INNER_KERNEL_DGEMM_NT_4X4_LIB4 7047#else 7048 CALL(inner_kernel_dgemm_nt_4x4_lib4) 7049#endif 7050 7051 7052 // call inner blend 7053 7054 movq ARG2, %r10 // alpha 7055 movq ARG5, %r11 // beta 7056 movq ARG6, %r12 // C 7057 movq ARG7, %r13 // ldc 7058 sall $3, %r13d 7059 7060#if MACRO_LEVEL>=1 7061 INNER_BLEND_SCALE_AB_4X4_LIB 7062#else 7063 CALL(inner_blend_scale_ab_4x4_lib) 7064#endif 7065 7066 7067 // store n 7068 7069 movq ARG8, %r10 // D 7070 movq ARG9, %r11 // ldd 7071 sall $3, %r11d 7072 7073#if MACRO_LEVEL>=1 7074 INNER_STORE_L_4X4_LIB 7075#else 7076 CALL(inner_store_l_4x4_lib) 7077#endif 7078 7079 7080 EPILOGUE 7081 7082 ret 7083 7084 FUN_END(kernel_dsyrk_nt_l_4x4_lib44cc) 7085 7086 7087 7088 7089 7090// 1 2 3 4 5 6 7 8 9 10 11 7091// void kernel_dsyrk_nt_l_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 7092 7093 .p2align 4,,15 7094 GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_vs_lib44cc) 7095 7096 PROLOGUE 7097 7098 // zero accumulation registers 7099 7100 ZERO_ACC 7101 7102 7103 // call inner dgemm kernel nn 7104 7105 movq ARG1, %r10 // k 7106 movq ARG3, %r11 // A 7107 movq ARG4, %r12 // B 7108 7109#if MACRO_LEVEL>=2 7110 INNER_KERNEL_DGEMM_NT_4X4_LIB4 7111#else 7112 CALL(inner_kernel_dgemm_nt_4x4_lib4) 7113#endif 7114 7115 7116 // call inner blend 7117 7118 movq ARG2, %r10 // alpha 7119 movq ARG5, %r11 // beta 7120 movq ARG6, %r12 // C 7121 movq ARG7, %r13 // ldc 7122 sall $3, %r13d 7123 movq ARG10, %r14 // m1 7124 movq ARG11, %r15 // n1 7125 7126#if MACRO_LEVEL>=1 7127 INNER_BLEND_SCALE_AB_4X4_VS_LIB 7128#else 7129 CALL(inner_blend_scale_ab_4x4_vs_lib) 7130#endif 7131 7132 7133 // store n 7134 7135 movq ARG8, %r10 // D 7136 movq ARG9, %r11 // ldd 7137 sall $3, %r11d 7138 movq ARG10, %r12 // m1 7139 movq ARG11, %r13 // n1 7140 7141#if MACRO_LEVEL>=1 7142 INNER_STORE_L_4X4_VS_LIB 7143#else 7144 CALL(inner_store_l_4x4_vs_lib) 7145#endif 7146 7147 7148 EPILOGUE 7149 7150 ret 7151 7152 FUN_END(kernel_dsyrk_nt_l_4x4_vs_lib44cc) 7153 7154 7155 7156 7157 7158// 1 2 3 4 5 6 7 8 9 7159// void kernel_dsyrk_nt_u_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 7160 7161 .p2align 4,,15 7162 
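	// dsyrk nt upper kernel: computes a 4x4 tile of the update alpha*A*B^T + beta*C and writes
	// only its upper-triangular part to D (INNER_STORE_U_4X4_LIB below); entries below the
	// diagonal of D keep their previous values.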
GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_lib44cc) 7163 7164 PROLOGUE 7165 7166 // zero accumulation registers 7167 7168 ZERO_ACC 7169 7170 7171 // call inner dgemm kernel nn 7172 7173 movq ARG1, %r10 // k 7174 movq ARG3, %r11 // A 7175 movq ARG4, %r12 // B 7176 7177#if MACRO_LEVEL>=2 7178 INNER_KERNEL_DGEMM_NT_4X4_LIB4 7179#else 7180 CALL(inner_kernel_dgemm_nt_4x4_lib4) 7181#endif 7182 7183 7184 // call inner blend 7185 7186 movq ARG2, %r10 // alpha 7187 movq ARG5, %r11 // beta 7188 movq ARG6, %r12 // C 7189 movq ARG7, %r13 // ldc 7190 sall $3, %r13d 7191 7192#if MACRO_LEVEL>=1 7193 INNER_BLEND_SCALE_AB_4X4_LIB 7194#else 7195 CALL(inner_blend_scale_ab_4x4_lib) 7196#endif 7197 7198 7199 // store n 7200 7201 movq ARG8, %r10 // D 7202 movq ARG9, %r11 // ldd 7203 sall $3, %r11d 7204 7205#if MACRO_LEVEL>=1 7206 INNER_STORE_U_4X4_LIB 7207#else 7208 CALL(inner_store_u_4x4_lib) 7209#endif 7210 7211 7212 EPILOGUE 7213 7214 ret 7215 7216 FUN_END(kernel_dsyrk_nt_u_4x4_lib44cc) 7217 7218 7219 7220 7221 7222// 1 2 3 4 5 6 7 8 9 10 11 7223// void kernel_dsyrk_nt_u_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 7224 7225 .p2align 4,,15 7226 GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_vs_lib44cc) 7227 7228 PROLOGUE 7229 7230 // zero accumulation registers 7231 7232 ZERO_ACC 7233 7234 7235 // call inner dgemm kernel nn 7236 7237 movq ARG1, %r10 // k 7238 movq ARG3, %r11 // A 7239 movq ARG4, %r12 // B 7240 7241#if MACRO_LEVEL>=2 7242 INNER_KERNEL_DGEMM_NT_4X4_LIB4 7243#else 7244 CALL(inner_kernel_dgemm_nt_4x4_lib4) 7245#endif 7246 7247 7248 // call inner blend 7249 7250 movq ARG2, %r10 // alpha 7251 movq ARG5, %r11 // beta 7252 movq ARG6, %r12 // C 7253 movq ARG7, %r13 // ldc 7254 sall $3, %r13d 7255 movq ARG10, %r14 // m1 7256 movq ARG11, %r15 // n1 7257 7258#if MACRO_LEVEL>=1 7259 INNER_BLEND_SCALE_AB_4X4_VS_LIB 7260#else 7261 CALL(inner_blend_scale_ab_4x4_vs_lib) 7262#endif 7263 7264 7265 // store n 7266 7267 movq ARG8, %r10 // D 7268 movq ARG9, %r11 // ldd 7269 sall $3, %r11d 7270 movq ARG10, %r12 // m1 7271 movq ARG11, %r13 // n1 7272 7273#if MACRO_LEVEL>=1 7274 INNER_STORE_U_4X4_VS_LIB 7275#else 7276 CALL(inner_store_u_4x4_vs_lib) 7277#endif 7278 7279 7280 EPILOGUE 7281 7282 ret 7283 7284 FUN_END(kernel_dsyrk_nt_u_4x4_vs_lib44cc) 7285 7286 7287 7288 7289 7290// 1 2 3 4 5 6 7 8 9 10 7291// void kernel_dtrmm_nn_rl_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 7292 7293 .p2align 4,,15 7294 GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_lib4ccc) 7295 7296 PROLOGUE 7297 7298 // zero accumulation registers 7299 7300 ZERO_ACC 7301 7302 7303 // call inner dgemm kernel nn 7304 7305 movq ARG1, %r10 // k 7306 movq ARG3, %r11 // A 7307 movq ARG4, %r12 // B 7308 movq ARG5, %r13 // ldb 7309 sall $3, %r13d 7310 7311#if MACRO_LEVEL>=1 7312 INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C 7313#else 7314 CALL(inner_edge_dtrmm_nn_rl_4x4_lib4c) 7315#endif 7316 7317#if MACRO_LEVEL>=2 7318 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7319#else 7320 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7321#endif 7322 7323 7324 // call inner blend 7325 7326 movq ARG2, %r10 // alpha 7327 movq ARG6, %r11 // beta 7328 movq ARG7, %r12 // C 7329 movq ARG8, %r13 // ldc 7330 sall $3, %r13d 7331 7332#if MACRO_LEVEL>=1 7333 INNER_SCALE_AB_4X4_LIB 7334#else 7335 CALL(inner_scale_ab_4x4_lib) 7336#endif 7337 7338 7339 // store n 7340 7341 movq ARG9, %r10 // D 7342 movq ARG10, %r11 // ldd 7343 sall $3, %r11d 7344 7345#if MACRO_LEVEL>=1 7346 
INNER_STORE_4X4_LIB 7347#else 7348 CALL(inner_store_4x4_lib) 7349#endif 7350 7351 7352 EPILOGUE 7353 7354 ret 7355 7356 FUN_END(kernel_dtrmm_nn_rl_4x4_lib4ccc) 7357 7358 7359 7360 7361 7362// 1 2 3 4 5 6 7 8 9 10 11 12 7363// void kernel_dtrmm_nn_rl_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 7364 7365 .p2align 4,,15 7366 GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_vs_lib4ccc) 7367 7368 PROLOGUE 7369 7370 // zero accumulation registers 7371 7372 ZERO_ACC 7373 7374 7375 // call inner dgemm kernel nn 7376 7377 movq ARG1, %r10 // k 7378 movq ARG3, %r11 // A 7379 movq ARG4, %r12 // B 7380 movq ARG5, %r13 // ldb 7381 sall $3, %r13d 7382 7383#if MACRO_LEVEL>=1 7384 INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C 7385#else 7386 CALL(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c) 7387#endif 7388 7389 movq ARG12, %r14 // n1 7390 cmpl $1, %r14d 7391 jg 100f 7392 7393#if MACRO_LEVEL>=2 7394 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 7395#else 7396 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 7397#endif 7398 7399 jmp 103f 7400 7401100: 7402 7403 movq ARG12, %r14 // n1 7404 cmpl $2, %r14d 7405 jg 101f 7406 7407#if MACRO_LEVEL>=2 7408 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 7409#else 7410 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 7411#endif 7412 7413 jmp 103f 7414 7415101: 7416 7417 movq ARG12, %r14 // n1 7418 cmpl $3, %r14d 7419 jg 102f 7420 7421#if MACRO_LEVEL>=2 7422 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 7423#else 7424 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 7425#endif 7426 7427 jmp 103f 7428 7429102: 7430 7431#if MACRO_LEVEL>=2 7432 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7433#else 7434 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7435#endif 7436 7437103: 7438 7439 7440 // call inner blend 7441 7442 movq ARG2, %r10 // alpha 7443 movq ARG6, %r11 // beta 7444 movq ARG7, %r12 // C 7445 movq ARG8, %r13 // ldc 7446 sall $3, %r13d 7447 movq ARG11, %r14 // m1 7448 movq ARG12, %r15 // n1 7449 7450#if MACRO_LEVEL>=1 7451 INNER_SCALE_AB_4X4_VS_LIB 7452#else 7453 CALL(inner_scale_ab_4x4_vs_lib) 7454#endif 7455 7456 7457 // store n 7458 7459 movq ARG9, %r10 // D 7460 movq ARG10, %r11 // ldd 7461 sall $3, %r11d 7462 movq ARG11, %r12 // m1 7463 movq ARG12, %r13 // n1 7464 7465#if MACRO_LEVEL>=1 7466 INNER_STORE_4X4_VS_LIB 7467#else 7468 CALL(inner_store_4x4_vs_lib) 7469#endif 7470 7471 7472 EPILOGUE 7473 7474 ret 7475 7476 FUN_END(kernel_dtrmm_nn_rl_4x4_vs_lib4ccc) 7477 7478 7479 7480 7481 7482// 1 2 3 4 5 6 7 8 9 10 7483// void kernel_dtrmm_nn_rl_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 7484 7485 .p2align 4,,15 7486 GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_tran_lib4c4c) 7487 7488 PROLOGUE 7489 7490 // zero accumulation registers 7491 7492 ZERO_ACC 7493 7494 7495 // call inner dgemm kernel nn 7496 7497 movq ARG1, %r10 // k 7498 movq ARG3, %r11 // A 7499 movq ARG4, %r12 // B 7500 movq ARG5, %r13 // ldb 7501 sall $3, %r13d 7502 7503#if MACRO_LEVEL>=1 7504 INNER_EDGE_DTRMM_NN_RL_4X4_LIB4C 7505#else 7506 CALL(inner_edge_dtrmm_nn_rl_4x4_lib4c) 7507#endif 7508 7509#if MACRO_LEVEL>=2 7510 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7511#else 7512 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7513#endif 7514 7515 7516 // call inner blend 7517 7518 movq ARG2, %r10 // alpha 7519 movq ARG6, %r11 // beta 7520 movq ARG7, %r12 // C 7521 7522#if MACRO_LEVEL>=1 7523 INNER_SCALE_AB_4X4_LIB4 7524#else 7525 CALL(inner_scale_ab_4x4_lib4) 7526#endif 7527 7528 7529 // store n 7530 7531 movq ARG8, %r10 // D 7532 movq ARG9, %r11 // ldd 7533 sall $3, %r11d 7534 7535#if 
MACRO_LEVEL>=1 7536 INNER_TRAN_STORE_4X4_LIB 7537#else 7538 CALL(inner_tran_store_4x4_lib) 7539#endif 7540 7541 7542 EPILOGUE 7543 7544 ret 7545 7546 FUN_END(kernel_dtrmm_nn_rl_4x4_tran_lib4c4c) 7547 7548 7549 7550 7551 7552// 1 2 3 4 5 6 7 8 9 10 11 12 7553// void kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 7554 7555 .p2align 4,,15 7556 GLOB_FUN_START(kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c) 7557 7558 PROLOGUE 7559 7560 // zero accumulation registers 7561 7562 ZERO_ACC 7563 7564 7565 // call inner dgemm kernel nn 7566 7567 movq ARG1, %r10 // k 7568 movq ARG3, %r11 // A 7569 movq ARG4, %r12 // B 7570 movq ARG5, %r13 // ldb 7571 sall $3, %r13d 7572 7573#if MACRO_LEVEL>=1 7574 INNER_EDGE_DTRMM_NN_RL_4X4_VS_LIB4C 7575#else 7576 CALL(inner_edge_dtrmm_nn_rl_4x4_vs_lib4c) 7577#endif 7578 7579 movq ARG10, %r14 // m1 7580 cmpl $1, %r14d 7581 jg 100f 7582 7583#if MACRO_LEVEL>=2 7584 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 7585#else 7586 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 7587#endif 7588 7589 jmp 103f 7590 7591100: 7592 7593 movq ARG10, %r14 // m1 7594 cmpl $2, %r14d 7595 jg 101f 7596 7597#if MACRO_LEVEL>=2 7598 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 7599#else 7600 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 7601#endif 7602 7603 jmp 103f 7604 7605101: 7606 7607 movq ARG10, %r14 // m1 7608 cmpl $3, %r14d 7609 jg 102f 7610 7611#if MACRO_LEVEL>=2 7612 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 7613#else 7614 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 7615#endif 7616 7617 jmp 103f 7618 7619102: 7620 7621#if MACRO_LEVEL>=2 7622 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7623#else 7624 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7625#endif 7626 7627103: 7628 7629 7630 // call inner blend 7631 7632 movq ARG2, %r10 // alpha 7633 movq ARG6, %r11 // beta 7634 movq ARG7, %r12 // C 7635 7636#if MACRO_LEVEL>=1 7637 INNER_SCALE_AB_4X4_LIB4 7638#else 7639 CALL(inner_scale_ab_4x4_lib4) 7640#endif 7641 7642 7643 // store n 7644 7645 movq ARG8, %r10 // D 7646 movq ARG9, %r11 // ldd 7647 sall $3, %r11d 7648 movq ARG10, %r12 // m1 7649 movq ARG11, %r13 // n1 7650 7651#if MACRO_LEVEL>=1 7652 INNER_TRAN_STORE_4X4_VS_LIB 7653#else 7654 CALL(inner_tran_store_4x4_vs_lib) 7655#endif 7656 7657 7658 EPILOGUE 7659 7660 ret 7661 7662 FUN_END(kernel_dtrmm_nn_rl_4x4_tran_vs_lib4c4c) 7663 7664 7665 7666 7667 7668// 1 2 3 4 5 6 7 8 9 10 7669// void kernel_dtrmm_nn_rl_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 7670 7671 .p2align 4,,15 7672 GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_lib4ccc) 7673 7674 PROLOGUE 7675 7676 // zero accumulation registers 7677 7678 ZERO_ACC 7679 7680 7681 // call inner dgemm kernel nn 7682 7683 movq ARG1, %r10 // k 7684 movq ARG3, %r11 // A 7685 movq ARG4, %r12 // B 7686 movq ARG5, %r13 // ldb 7687 sall $3, %r13d 7688 7689#if MACRO_LEVEL>=1 7690 INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C 7691#else 7692 CALL(inner_edge_dtrmm_nn_rl_one_4x4_lib4c) 7693#endif 7694 7695#if MACRO_LEVEL>=2 7696 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7697#else 7698 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7699#endif 7700 7701 7702 // call inner blend 7703 7704 movq ARG2, %r10 // alpha 7705 movq ARG6, %r11 // beta 7706 movq ARG7, %r12 // C 7707 movq ARG8, %r13 // ldc 7708 sall $3, %r13d 7709 7710#if MACRO_LEVEL>=1 7711 INNER_SCALE_AB_4X4_LIB 7712#else 7713 CALL(inner_scale_ab_4x4_lib) 7714#endif 7715 7716 7717 // store n 7718 7719 movq ARG9, %r10 // D 7720 movq ARG10, %r11 // ldd 7721 sall $3, %r11d 7722 7723#if 
MACRO_LEVEL>=1 7724 INNER_STORE_4X4_LIB 7725#else 7726 CALL(inner_store_4x4_lib) 7727#endif 7728 7729 7730 EPILOGUE 7731 7732 ret 7733 7734 FUN_END(kernel_dtrmm_nn_rl_one_4x4_lib4ccc) 7735 7736 7737 7738 7739 7740// 1 2 3 4 5 6 7 8 9 10 11 12 7741// void kernel_dtrmm_nn_rl_4x4_one_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 7742 7743 .p2align 4,,15 7744 GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_vs_lib4ccc) 7745 7746 PROLOGUE 7747 7748 // zero accumulation registers 7749 7750 ZERO_ACC 7751 7752 7753 // call inner dgemm kernel nn 7754 7755 movq ARG1, %r10 // k 7756 movq ARG3, %r11 // A 7757 movq ARG4, %r12 // B 7758 movq ARG5, %r13 // ldb 7759 sall $3, %r13d 7760 7761#if MACRO_LEVEL>=1 7762 INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C 7763#else 7764 CALL(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c) 7765#endif 7766 7767 movq ARG12, %r14 // n1 7768 cmpl $1, %r14d 7769 jg 100f 7770 7771#if MACRO_LEVEL>=2 7772 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 7773#else 7774 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 7775#endif 7776 7777 jmp 103f 7778 7779100: 7780 7781 movq ARG12, %r14 // n1 7782 cmpl $2, %r14d 7783 jg 101f 7784 7785#if MACRO_LEVEL>=2 7786 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 7787#else 7788 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 7789#endif 7790 7791 jmp 103f 7792 7793101: 7794 7795 movq ARG12, %r14 // n1 7796 cmpl $3, %r14d 7797 jg 102f 7798 7799#if MACRO_LEVEL>=2 7800 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 7801#else 7802 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 7803#endif 7804 7805 jmp 103f 7806 7807102: 7808 7809#if MACRO_LEVEL>=2 7810 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7811#else 7812 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7813#endif 7814 7815103: 7816 7817 7818 // call inner blend 7819 7820 movq ARG2, %r10 // alpha 7821 movq ARG6, %r11 // beta 7822 movq ARG7, %r12 // C 7823 movq ARG8, %r13 // ldc 7824 sall $3, %r13d 7825 movq ARG11, %r14 // m1 7826 movq ARG12, %r15 // n1 7827 7828#if MACRO_LEVEL>=1 7829 INNER_SCALE_AB_4X4_VS_LIB 7830#else 7831 CALL(inner_scale_ab_4x4_vs_lib) 7832#endif 7833 7834 7835 // store n 7836 7837 movq ARG9, %r10 // D 7838 movq ARG10, %r11 // ldd 7839 sall $3, %r11d 7840 movq ARG11, %r12 // m1 7841 movq ARG12, %r13 // n1 7842 7843#if MACRO_LEVEL>=1 7844 INNER_STORE_4X4_VS_LIB 7845#else 7846 CALL(inner_store_4x4_vs_lib) 7847#endif 7848 7849 7850 EPILOGUE 7851 7852 ret 7853 7854 FUN_END(kernel_dtrmm_nn_rl_one_4x4_vs_lib4ccc) 7855 7856 7857 7858 7859 7860// 1 2 3 4 5 6 7 8 9 10 7861// void kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 7862 7863 .p2align 4,,15 7864 GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c) 7865 7866 PROLOGUE 7867 7868 // zero accumulation registers 7869 7870 ZERO_ACC 7871 7872 7873 // call inner dgemm kernel nn 7874 7875 movq ARG1, %r10 // k 7876 movq ARG3, %r11 // A 7877 movq ARG4, %r12 // B 7878 movq ARG5, %r13 // ldb 7879 sall $3, %r13d 7880 7881#if MACRO_LEVEL>=1 7882 INNER_EDGE_DTRMM_NN_RL_ONE_4X4_LIB4C 7883#else 7884 CALL(inner_edge_dtrmm_nn_rl_one_4x4_lib4c) 7885#endif 7886 7887#if MACRO_LEVEL>=2 7888 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 7889#else 7890 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 7891#endif 7892 7893 7894 // call inner blend 7895 7896 movq ARG2, %r10 // alpha 7897 movq ARG6, %r11 // beta 7898 movq ARG7, %r12 // C 7899 7900#if MACRO_LEVEL>=1 7901 INNER_SCALE_AB_4X4_LIB4 7902#else 7903 CALL(inner_scale_ab_4x4_lib4) 7904#endif 7905 7906 7907 // store n 7908 7909 movq ARG8, %r10 // D 7910 
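	// store: the result tile is written transposed into column-major D; ldd is converted from
	// elements to bytes (sall $3) right below, before the tran store routine is invoked.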
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_TRAN_STORE_4X4_LIB
#else
	CALL(inner_tran_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_rl_one_4x4_tran_lib4c4c)




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RL_ONE_4X4_VS_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_rl_one_4x4_vs_lib4c)
#endif

	movq	ARG10, %r14 // m1
	cmpl	$1, %r14d
	jg	100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp	103f

100:

	movq	ARG10, %r14 // m1
	cmpl	$2, %r14d
	jg	101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp	103f

101:

	movq	ARG10, %r14 // m1
	cmpl	$3, %r14d
	jg	102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp	103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4
#else
	CALL(inner_scale_ab_4x4_lib4)
#endif


	// store n

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d
	movq	ARG10, %r12 // m1
	movq	ARG11, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_STORE_4X4_VS_LIB
#else
	CALL(inner_tran_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_rl_one_4x4_tran_vs_lib4c4c)




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dtrmm_nn_ru_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_4x4_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C
	movq	ARG8, %r13 // ldc
	sall	$3, %r13d

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	CALL(inner_scale_ab_4x4_lib)
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_4x4_lib4ccc)




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrmm_nn_ru_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_vs_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

	movq	ARG12, %r14 // n1
	cmpl	$1, %r14d
	jg	100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp	103f

100:

	movq	ARG12, %r14 // n1
	cmpl	$2, %r14d
	jg	101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp	103f

101:

	movq	ARG12, %r14 // n1
	cmpl	$3, %r14d
	jg	102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp	103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:

	movq	ARG12, %r14 // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C
	movq	ARG8, %r13 // ldc
	sall	$3, %r13d
	movq	ARG11, %r14 // m1
	movq	ARG12, %r15 // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	CALL(inner_scale_ab_4x4_vs_lib)
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // ldd
	sall	$3, %r11d
	movq	ARG11, %r12 // m1
	movq	ARG12, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_4x4_vs_lib4ccc)




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dtrmm_nn_ru_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_tran_lib4c4c)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_4X4_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_4x4_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4
#else
	CALL(inner_scale_ab_4x4_lib4)
#endif
	// store n

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_TRAN_STORE_4X4_LIB
#else
	CALL(inner_tran_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_4x4_tran_lib4c4c)




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

	movq	ARG10, %r14 // m1
	cmpl	$1, %r14d
	jg	100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp	103f

100:

	movq	ARG10, %r14 // m1
	cmpl	$2, %r14d
	jg	101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp	103f

101:

	movq	ARG10, %r14 // m1
	cmpl	$3, %r14d
	jg	102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp	103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:

	movq	ARG10, %r14 // m1

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_4X4_VS_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_4x4_vs_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4
#else
	CALL(inner_scale_ab_4x4_lib4)
#endif


	// store n

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d
	movq	ARG10, %r12 // m1
	movq	ARG11, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_STORE_4X4_VS_LIB
#else
	CALL(inner_tran_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_4x4_tran_vs_lib4c4c)




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dtrmm_nn_ru_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C
	movq	ARG8, %r13 // ldc
	sall	$3, %r13d

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	CALL(inner_scale_ab_4x4_lib)
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_one_4x4_lib4ccc)




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

	movq	ARG12, %r14 // n1
	cmpl	$1, %r14d
	jg	100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp	103f

100:

	movq	ARG12, %r14 // n1
	cmpl	$2, %r14d
	jg	101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp	103f

101:

	movq	ARG12, %r14 // n1
	cmpl	$3, %r14d
	jg	102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp	103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:

	movq	ARG12, %r14 // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
	movq	ARG7, %r12 // C
	movq	ARG8, %r13 // ldc
	sall	$3, %r13d
	movq	ARG11, %r14 // m1
	movq	ARG12, %r15 // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	CALL(inner_scale_ab_4x4_vs_lib)
#endif


	// store n

	movq	ARG9, %r10 // D
	movq	ARG10, %r11 // ldd
	sall	$3, %r11d
	movq	ARG11, %r12 // m1
	movq	ARG12, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrmm_nn_ru_one_4x4_vs_lib4ccc)




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nn

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B
	movq	ARG5, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRMM_NN_RU_ONE_4X4_LIB4C
#else
	CALL(inner_edge_dtrmm_nn_ru_one_4x4_lib4c)
#endif


	// call inner blend

	movq	ARG2, %r10 // alpha
	movq	ARG6, %r11 // beta
movq ARG7, %r12 // C 8661 8662#if MACRO_LEVEL>=1 8663 INNER_SCALE_AB_4X4_LIB4 8664#else 8665 CALL(inner_scale_ab_4x4_lib4) 8666#endif 8667 8668 8669 // store n 8670 8671 movq ARG8, %r10 // D 8672 movq ARG9, %r11 // ldd 8673 sall $3, %r11d 8674 8675#if MACRO_LEVEL>=1 8676 INNER_TRAN_STORE_4X4_LIB 8677#else 8678 CALL(inner_tran_store_4x4_lib) 8679#endif 8680 8681 8682 EPILOGUE 8683 8684 ret 8685 8686 FUN_END(kernel_dtrmm_nn_ru_one_4x4_tran_lib4c4c) 8687 8688 8689 8690 8691 8692// 1 2 3 4 5 6 7 8 9 10 11 12 8693// void kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 8694 8695 .p2align 4,,15 8696 GLOB_FUN_START(kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c) 8697 8698 PROLOGUE 8699 8700 // zero accumulation registers 8701 8702 ZERO_ACC 8703 8704 8705 // call inner dgemm kernel nn 8706 8707 movq ARG1, %r10 // k 8708 movq ARG3, %r11 // A 8709 movq ARG4, %r12 // B 8710 movq ARG5, %r13 // ldb 8711 sall $3, %r13d 8712 8713 movq ARG10, %r14 // m1 8714 cmpl $1, %r14d 8715 jg 100f 8716 8717#if MACRO_LEVEL>=2 8718 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 8719#else 8720 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 8721#endif 8722 8723 jmp 103f 8724 8725100: 8726 8727 movq ARG10, %r14 // m1 8728 cmpl $2, %r14d 8729 jg 101f 8730 8731#if MACRO_LEVEL>=2 8732 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 8733#else 8734 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 8735#endif 8736 8737 jmp 103f 8738 8739101: 8740 8741 movq ARG10, %r14 // m1 8742 cmpl $3, %r14d 8743 jg 102f 8744 8745#if MACRO_LEVEL>=2 8746 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 8747#else 8748 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 8749#endif 8750 8751 jmp 103f 8752 8753102: 8754 8755#if MACRO_LEVEL>=2 8756 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 8757#else 8758 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 8759#endif 8760 8761103: 8762 8763 movq ARG10, %r14 // m1 8764 8765#if MACRO_LEVEL>=1 8766 INNER_EDGE_DTRMM_NN_RU_ONE_4X4_VS_LIB4C 8767#else 8768 CALL(inner_edge_dtrmm_nn_ru_one_4x4_vs_lib4c) 8769#endif 8770 8771 8772 // call inner blend 8773 8774 movq ARG2, %r10 // alpha 8775 movq ARG6, %r11 // beta 8776 movq ARG7, %r12 // C 8777 8778#if MACRO_LEVEL>=1 8779 INNER_SCALE_AB_4X4_LIB4 8780#else 8781 CALL(inner_scale_ab_4x4_lib4) 8782#endif 8783 8784 8785 // store n 8786 8787 movq ARG8, %r10 // D 8788 movq ARG9, %r11 // ldd 8789 sall $3, %r11d 8790 movq ARG10, %r12 // m1 8791 movq ARG11, %r13 // n1 8792 8793#if MACRO_LEVEL>=1 8794 INNER_TRAN_STORE_4X4_VS_LIB 8795#else 8796 CALL(inner_tran_store_4x4_vs_lib) 8797#endif 8798 8799 8800 EPILOGUE 8801 8802 ret 8803 8804 FUN_END(kernel_dtrmm_nn_ru_one_4x4_tran_vs_lib4c4c) 8805 8806 8807 8808 8809 8810// 1 2 3 4 5 6 7 8 9 8811// void kernel_dtrmm_nt_rl_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 8812 8813 .p2align 4,,15 8814 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_lib44cc) 8815 8816 PROLOGUE 8817 8818 // zero accumulation registers 8819 8820 ZERO_ACC 8821 8822 8823 // call inner dgemm kernel nn 8824 8825 movq ARG1, %r10 // k 8826 movq ARG3, %r11 // A 8827 movq ARG4, %r12 // B 8828 8829#if MACRO_LEVEL>=1 8830// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 8831#else 8832// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 8833#endif 8834 8835#if MACRO_LEVEL>=2 8836 INNER_KERNEL_DGEMM_NT_4X4_LIB4 8837#else 8838 CALL(inner_kernel_dgemm_nt_4x4_lib4) 8839#endif 8840 8841 8842 // call inner blend 8843 8844#if MACRO_LEVEL>=1 8845 INNER_BLEND_4X4_LIB4 8846#else 8847 CALL(inner_blend_4x4_lib4) 8848#endif 8849 8850 8851 // final 
triangle 8852 8853// movq ARG1, %r10 8854// movq ARG3, %r11 8855// movq ARG4, %r12 8856 8857#if MACRO_LEVEL>=1 8858 INNER_EDGE_DTRMM_NT_RL_4X4_LIB4 8859#else 8860 CALL(inner_edge_dtrmm_nt_rl_4x4_lib4) 8861#endif 8862 8863 8864 // call inner blend 8865 8866 movq ARG2, %r10 // alpha 8867 movq ARG5, %r11 // beta 8868 movq ARG6, %r12 // C 8869 movq ARG7, %r13 // ldc 8870 sall $3, %r13d 8871 8872#if MACRO_LEVEL>=1 8873 INNER_SCALE_AB_4X4_LIB 8874#else 8875 CALL(inner_scale_ab_4x4_lib) 8876#endif 8877 8878 8879 // store n 8880 8881 movq ARG8, %r10 // D 8882 movq ARG9, %r11 // ldd 8883 sall $3, %r11d 8884 8885#if MACRO_LEVEL>=1 8886 INNER_STORE_4X4_LIB 8887#else 8888 CALL(inner_store_4x4_lib) 8889#endif 8890 8891 8892 EPILOGUE 8893 8894 ret 8895 8896 FUN_END(kernel_dtrmm_nt_rl_4x4_lib44cc) 8897 8898 8899 8900 8901 8902// 1 2 3 4 5 6 7 8 9 10 11 8903// void kernel_dtrmm_nt_rl_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 8904 8905 .p2align 4,,15 8906 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_vs_lib44cc) 8907 8908 PROLOGUE 8909 8910 // zero accumulation registers 8911 8912 ZERO_ACC 8913 8914 8915 // call inner dgemm kernel nn 8916 8917 movq ARG1, %r10 // k 8918// subl $4, %r10d 8919 movq ARG3, %r11 // A 8920// addq $128, %r11 8921 movq ARG4, %r12 // B 8922// addq $128, %r12 8923 8924#if MACRO_LEVEL>=1 8925// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 8926#else 8927// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 8928#endif 8929 8930#if MACRO_LEVEL>=2 8931 INNER_KERNEL_DGEMM_NT_4X4_LIB4 8932#else 8933 CALL(inner_kernel_dgemm_nt_4x4_lib4) 8934#endif 8935 8936 8937 // call inner blend 8938 8939#if MACRO_LEVEL>=1 8940 INNER_BLEND_4X4_LIB4 8941#else 8942 CALL(inner_blend_4x4_lib4) 8943#endif 8944 8945 8946 // initial triangle 8947 8948// movq ARG1, %r10 8949// movq ARG3, %r11 8950// movq ARG4, %r12 8951 movq ARG11, %r13 8952 8953#if MACRO_LEVEL>=1 8954 INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4 8955#else 8956 CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4) 8957#endif 8958 8959 8960 // call inner blend 8961 8962 movq ARG2, %r10 // alpha 8963 movq ARG5, %r11 // beta 8964 movq ARG6, %r12 // C 8965 movq ARG7, %r13 // ldc 8966 sall $3, %r13d 8967 movq ARG10, %r14 // m1 8968 movq ARG11, %r15 // n1 8969 8970#if MACRO_LEVEL>=1 8971 INNER_SCALE_AB_4X4_VS_LIB 8972#else 8973 CALL(inner_scale_ab_4x4_vs_lib) 8974#endif 8975 8976 8977 // store n 8978 8979 movq ARG8, %r10 // D 8980 movq ARG9, %r11 // ldd 8981 sall $3, %r11d 8982 movq ARG10, %r12 // m1 8983 movq ARG11, %r13 // n1 8984 8985#if MACRO_LEVEL>=1 8986 INNER_STORE_4X4_VS_LIB 8987#else 8988 CALL(inner_store_4x4_vs_lib) 8989#endif 8990 8991 8992 EPILOGUE 8993 8994 ret 8995 8996 FUN_END(kernel_dtrmm_nt_rl_4x4_vs_lib44cc) 8997 8998 8999 9000 9001 9002// 1 2 3 4 5 6 7 8 9 9003// void kernel_dtrmm_nt_rl_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd); 9004 9005 .p2align 4,,15 9006 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_lib444c) 9007 9008 PROLOGUE 9009 9010 // zero accumulation registers 9011 9012 ZERO_ACC 9013 9014 9015 // call inner dgemm kernel nn 9016 9017 movq ARG1, %r10 // k 9018 movq ARG3, %r11 // A 9019 movq ARG4, %r12 // B 9020 9021#if MACRO_LEVEL>=1 9022// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 9023#else 9024// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 9025#endif 9026 9027#if MACRO_LEVEL>=2 9028 INNER_KERNEL_DGEMM_NT_4X4_LIB4 9029#else 9030 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9031#endif 9032 9033 9034 // call inner blend 9035 9036#if MACRO_LEVEL>=1 9037 
INNER_BLEND_4X4_LIB4 9038#else 9039 CALL(inner_blend_4x4_lib4) 9040#endif 9041 9042 9043 // final triangle 9044 9045// movq ARG1, %r10 9046// movq ARG3, %r11 9047// movq ARG4, %r12 9048 9049#if MACRO_LEVEL>=1 9050 INNER_EDGE_DTRMM_NT_RL_4X4_LIB4 9051#else 9052 CALL(inner_edge_dtrmm_nt_rl_4x4_lib4) 9053#endif 9054 9055 9056 // call inner blend 9057 9058 movq ARG2, %r10 // alpha 9059 movq ARG5, %r11 // beta 9060 movq ARG6, %r12 // C 9061 9062#if MACRO_LEVEL>=1 9063 INNER_SCALE_AB_4X4_LIB4 9064#else 9065 CALL(inner_scale_ab_4x4_lib4) 9066#endif 9067 9068 9069 // store n 9070 9071 movq ARG7, %r10 // D 9072 movq ARG8, %r11 // ldd 9073 sall $3, %r11d 9074 9075#if MACRO_LEVEL>=1 9076 INNER_TRAN_STORE_4X4_LIB 9077#else 9078 CALL(inner_tran_store_4x4_lib) 9079#endif 9080 9081 9082 EPILOGUE 9083 9084 ret 9085 9086 FUN_END(kernel_dtrmm_nt_rl_4x4_tran_lib444c) 9087 9088 9089 9090 9091 9092// 1 2 3 4 5 6 7 8 9 10 11 9093// void kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1); 9094 9095 .p2align 4,,15 9096 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c) 9097 9098 PROLOGUE 9099 9100 // zero accumulation registers 9101 9102 ZERO_ACC 9103 9104 9105 // call inner dgemm kernel nn 9106 9107 movq ARG1, %r10 // k 9108// subl $4, %r10d 9109 movq ARG3, %r11 // A 9110// addq $128, %r11 9111 movq ARG4, %r12 // B 9112// addq $128, %r12 9113 9114#if MACRO_LEVEL>=1 9115// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 9116#else 9117// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 9118#endif 9119 9120#if MACRO_LEVEL>=2 9121 INNER_KERNEL_DGEMM_NT_4X4_LIB4 9122#else 9123 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9124#endif 9125 9126 9127 // call inner blend 9128 9129#if MACRO_LEVEL>=1 9130 INNER_BLEND_4X4_LIB4 9131#else 9132 CALL(inner_blend_4x4_lib4) 9133#endif 9134 9135 9136 // initial triangle 9137 9138// movq ARG1, %r10 9139// movq ARG3, %r11 9140// movq ARG4, %r12 9141 movq ARG9, %r13 // m1 9142 9143#if MACRO_LEVEL>=1 9144 INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4 9145#else 9146 CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4) 9147#endif 9148 9149 9150 // call inner blend 9151 9152 movq ARG2, %r10 // alpha 9153 movq ARG5, %r11 // beta 9154 movq ARG6, %r12 // C 9155 9156#if MACRO_LEVEL>=1 9157 INNER_SCALE_AB_4X4_LIB4 9158#else 9159 CALL(inner_scale_ab_4x4_lib4) 9160#endif 9161 9162 9163 // store n 9164 9165 movq ARG7, %r10 // D 9166 movq ARG8, %r11 // ldd 9167 sall $3, %r11d 9168 movq ARG9, %r12 // m1 9169 movq ARG10, %r13 // n1 9170 9171#if MACRO_LEVEL>=1 9172 INNER_TRAN_STORE_4X4_VS_LIB 9173#else 9174 CALL(inner_tran_store_4x4_vs_lib) 9175#endif 9176 9177 9178 EPILOGUE 9179 9180 ret 9181 9182 FUN_END(kernel_dtrmm_nt_rl_4x4_tran_vs_lib444c) 9183 9184 9185 9186 9187 9188// 1 2 3 4 5 6 7 8 9 9189// void kernel_dtrmm_nt_rl_one_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 9190 9191 .p2align 4,,15 9192 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_lib44cc) 9193 9194 PROLOGUE 9195 9196 // zero accumulation registers 9197 9198 ZERO_ACC 9199 9200 9201 // call inner dgemm kernel nn 9202 9203 movq ARG1, %r10 // k 9204 movq ARG3, %r11 // A 9205 movq ARG4, %r12 // B 9206 9207#if MACRO_LEVEL>=1 9208// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 9209#else 9210// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 9211#endif 9212 9213#if MACRO_LEVEL>=2 9214 INNER_KERNEL_DGEMM_NT_4X4_LIB4 9215#else 9216 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9217#endif 9218 9219 9220 // call inner blend 9221 9222#if MACRO_LEVEL>=1 9223 
INNER_BLEND_4X4_LIB4 9224#else 9225 CALL(inner_blend_4x4_lib4) 9226#endif 9227 9228 9229 // final triangle 9230 9231// movq ARG1, %r10 9232// movq ARG3, %r11 9233// movq ARG4, %r12 9234 9235#if MACRO_LEVEL>=1 9236 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4 9237#else 9238 CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4) 9239#endif 9240 9241 9242 // call inner blend 9243 9244 movq ARG2, %r10 // alpha 9245 movq ARG5, %r11 // beta 9246 movq ARG6, %r12 // C 9247 movq ARG7, %r13 // ldc 9248 sall $3, %r13d 9249 9250#if MACRO_LEVEL>=1 9251 INNER_SCALE_AB_4X4_LIB 9252#else 9253 CALL(inner_scale_ab_4x4_lib) 9254#endif 9255 9256 9257 // store n 9258 9259 movq ARG8, %r10 // D 9260 movq ARG9, %r11 // ldd 9261 sall $3, %r11d 9262 9263#if MACRO_LEVEL>=1 9264 INNER_STORE_4X4_LIB 9265#else 9266 CALL(inner_store_4x4_lib) 9267#endif 9268 9269 9270 EPILOGUE 9271 9272 ret 9273 9274 FUN_END(kernel_dtrmm_nt_rl_one_4x4_lib44cc) 9275 9276 9277 9278 9279 9280// 1 2 3 4 5 6 7 8 9 10 11 9281// void kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 9282 9283 .p2align 4,,15 9284 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc) 9285 9286 PROLOGUE 9287 9288 // zero accumulation registers 9289 9290 ZERO_ACC 9291 9292 9293 // call inner dgemm kernel nn 9294 9295 movq ARG1, %r10 // k 9296// subl $4, %r10d 9297 movq ARG3, %r11 // A 9298// addq $128, %r11 9299 movq ARG4, %r12 // B 9300// addq $128, %r12 9301 9302#if MACRO_LEVEL>=1 9303// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 9304#else 9305// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 9306#endif 9307 9308#if MACRO_LEVEL>=2 9309 INNER_KERNEL_DGEMM_NT_4X4_LIB4 9310#else 9311 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9312#endif 9313 9314 9315 // call inner blend 9316 9317#if MACRO_LEVEL>=1 9318 INNER_BLEND_4X4_LIB4 9319#else 9320 CALL(inner_blend_4x4_lib4) 9321#endif 9322 9323 9324 // initial triangle 9325 9326// movq ARG1, %r10 9327// movq ARG3, %r11 9328// movq ARG4, %r12 9329 movq ARG11, %r13 9330 9331#if MACRO_LEVEL>=1 9332 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4 9333#else 9334 CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4) 9335#endif 9336 9337 9338 // call inner blend 9339 9340 movq ARG2, %r10 // alpha 9341 movq ARG5, %r11 // beta 9342 movq ARG6, %r12 // C 9343 movq ARG7, %r13 // ldc 9344 sall $3, %r13d 9345 movq ARG10, %r14 // m1 9346 movq ARG11, %r15 // n1 9347 9348#if MACRO_LEVEL>=1 9349 INNER_SCALE_AB_4X4_VS_LIB 9350#else 9351 CALL(inner_scale_ab_4x4_vs_lib) 9352#endif 9353 9354 9355 // store n 9356 9357 movq ARG8, %r10 // D 9358 movq ARG9, %r11 // ldd 9359 sall $3, %r11d 9360 movq ARG10, %r12 // m1 9361 movq ARG11, %r13 // n1 9362 9363#if MACRO_LEVEL>=1 9364 INNER_STORE_4X4_VS_LIB 9365#else 9366 CALL(inner_store_4x4_vs_lib) 9367#endif 9368 9369 9370 EPILOGUE 9371 9372 ret 9373 9374 FUN_END(kernel_dtrmm_nt_rl_one_4x4_vs_lib44cc) 9375 9376 9377 9378 9379 9380// 1 2 3 4 5 6 7 8 9 9381// void kernel_dtrmm_nt_rl_one_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd); 9382 9383 .p2align 4,,15 9384 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_lib444c) 9385 9386 PROLOGUE 9387 9388 // zero accumulation registers 9389 9390 ZERO_ACC 9391 9392 9393 // call inner dgemm kernel nn 9394 9395 movq ARG1, %r10 // k 9396 movq ARG3, %r11 // A 9397 movq ARG4, %r12 // B 9398 9399#if MACRO_LEVEL>=1 9400// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 9401#else 9402// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 9403#endif 9404 9405#if MACRO_LEVEL>=2 9406 
INNER_KERNEL_DGEMM_NT_4X4_LIB4 9407#else 9408 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9409#endif 9410 9411 9412 // call inner blend 9413 9414#if MACRO_LEVEL>=1 9415 INNER_BLEND_4X4_LIB4 9416#else 9417 CALL(inner_blend_4x4_lib4) 9418#endif 9419 9420 9421 // final triangle 9422 9423// movq ARG1, %r10 9424// movq ARG3, %r11 9425// movq ARG4, %r12 9426 9427#if MACRO_LEVEL>=1 9428 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4 9429#else 9430 CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4) 9431#endif 9432 9433 9434 // call inner blend 9435 9436 movq ARG2, %r10 // alpha 9437 movq ARG5, %r11 // beta 9438 movq ARG6, %r12 // C 9439 9440#if MACRO_LEVEL>=1 9441 INNER_SCALE_AB_4X4_LIB4 9442#else 9443 CALL(inner_scale_ab_4x4_lib4) 9444#endif 9445 9446 9447 // store n 9448 9449 movq ARG7, %r10 // D 9450 movq ARG8, %r11 // ldd 9451 sall $3, %r11d 9452 9453#if MACRO_LEVEL>=1 9454 INNER_TRAN_STORE_4X4_LIB 9455#else 9456 CALL(inner_tran_store_4x4_lib) 9457#endif 9458 9459 9460 EPILOGUE 9461 9462 ret 9463 9464 FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_lib444c) 9465 9466 9467 9468 9469 9470// 1 2 3 4 5 6 7 8 9 10 11 9471// void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1); 9472 9473 .p2align 4,,15 9474 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c) 9475 9476 PROLOGUE 9477 9478 // zero accumulation registers 9479 9480 ZERO_ACC 9481 9482 9483 // call inner dgemm kernel nn 9484 9485 movq ARG1, %r10 // k 9486// subl $4, %r10d 9487 movq ARG3, %r11 // A 9488// addq $128, %r11 9489 movq ARG4, %r12 // B 9490// addq $128, %r12 9491 9492#if MACRO_LEVEL>=1 9493// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 9494#else 9495// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 9496#endif 9497 9498#if MACRO_LEVEL>=2 9499 INNER_KERNEL_DGEMM_NT_4X4_LIB4 9500#else 9501 CALL(inner_kernel_dgemm_nt_4x4_lib4) 9502#endif 9503 9504 9505 // call inner blend 9506 9507#if MACRO_LEVEL>=1 9508 INNER_BLEND_4X4_LIB4 9509#else 9510 CALL(inner_blend_4x4_lib4) 9511#endif 9512 9513 9514 // initial triangle 9515 9516// movq ARG1, %r10 9517// movq ARG3, %r11 9518// movq ARG4, %r12 9519 movq ARG9, %r13 // m1 9520 9521#if MACRO_LEVEL>=1 9522 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4 9523#else 9524 CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4) 9525#endif 9526 9527 9528 // call inner blend 9529 9530 movq ARG2, %r10 // alpha 9531 movq ARG5, %r11 // beta 9532 movq ARG6, %r12 // C 9533 9534#if MACRO_LEVEL>=1 9535 INNER_SCALE_AB_4X4_LIB4 9536#else 9537 CALL(inner_scale_ab_4x4_lib4) 9538#endif 9539 9540 9541 // store n 9542 9543 movq ARG7, %r10 // D 9544 movq ARG8, %r11 // ldd 9545 sall $3, %r11d 9546 movq ARG9, %r12 // m1 9547 movq ARG10, %r13 // n1 9548 9549#if MACRO_LEVEL>=1 9550 INNER_TRAN_STORE_4X4_VS_LIB 9551#else 9552 CALL(inner_tran_store_4x4_vs_lib) 9553#endif 9554 9555 9556 EPILOGUE 9557 9558 ret 9559 9560 FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib444c) 9561 9562 9563 9564 9565 9566// 1 2 3 4 5 6 7 8 9 10 9567// void kernel_dtrmm_nt_rl_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 9568 9569 .p2align 4,,15 9570 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_lib4ccc) 9571 9572 PROLOGUE 9573 9574 // zero accumulation registers 9575 9576 ZERO_ACC 9577 9578 9579 // call inner dgemm kernel nn 9580 9581 movq ARG1, %r10 // k 9582 movq ARG3, %r11 // A 9583 movq ARG4, %r12 // B 9584 movq ARG5, %r13 // ldb 9585 sall $3, %r13d 9586 9587#if MACRO_LEVEL>=2 9588 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 9589#else 9590 
CALL(inner_kernel_dgemm_nt_4x4_lib4c) 9591#endif 9592 9593#if MACRO_LEVEL>=1 9594 INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C 9595#else 9596 CALL(inner_edge_dtrmm_nt_rl_4x4_lib4c) 9597#endif 9598 9599 9600 // call inner blend 9601 9602 movq ARG2, %r10 // alpha 9603 movq ARG6, %r11 // beta 9604 movq ARG7, %r12 // C 9605 movq ARG8, %r13 // ldc 9606 sall $3, %r13d 9607 9608#if MACRO_LEVEL>=1 9609 INNER_SCALE_AB_4X4_LIB 9610#else 9611 CALL(inner_scale_ab_4x4_lib) 9612#endif 9613 9614 9615 // store n 9616 9617 movq ARG9, %r10 // D 9618 movq ARG10, %r11 // ldd 9619 sall $3, %r11d 9620 9621#if MACRO_LEVEL>=1 9622 INNER_STORE_4X4_LIB 9623#else 9624 CALL(inner_store_4x4_lib) 9625#endif 9626 9627 9628 EPILOGUE 9629 9630 ret 9631 9632 FUN_END(kernel_dtrmm_nt_rl_4x4_lib4ccc) 9633 9634 9635 9636 9637 9638// 1 2 3 4 5 6 7 8 9 10 11 12 9639// void kernel_dtrmm_nt_rl_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 9640 9641 .p2align 4,,15 9642 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_vs_lib4ccc) 9643 9644 PROLOGUE 9645 9646 // zero accumulation registers 9647 9648 ZERO_ACC 9649 9650 9651 // call inner dgemm kernel nn 9652 9653 movq ARG1, %r10 // k 9654 movq ARG3, %r11 // A 9655 movq ARG4, %r12 // B 9656 movq ARG5, %r13 // ldb 9657 sall $3, %r13d 9658 9659 movq ARG12, %r14 // n1 9660 cmpl $1, %r14d 9661 jg 100f 9662 9663#if MACRO_LEVEL>=2 9664 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 9665#else 9666 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 9667#endif 9668 9669 jmp 103f 9670 9671100: 9672 9673 movq ARG12, %r14 // n1 9674 cmpl $2, %r14d 9675 jg 101f 9676 9677#if MACRO_LEVEL>=2 9678 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 9679#else 9680 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 9681#endif 9682 9683 jmp 103f 9684 9685101: 9686 9687 movq ARG12, %r14 // n1 9688 cmpl $3, %r14d 9689 jg 102f 9690 9691#if MACRO_LEVEL>=2 9692 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 9693#else 9694 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 9695#endif 9696 9697 jmp 103f 9698 9699102: 9700 9701#if MACRO_LEVEL>=2 9702 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 9703#else 9704 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 9705#endif 9706 9707103: 9708 9709 movq ARG12, %r14 // n1 9710 9711#if MACRO_LEVEL>=1 9712 INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C 9713#else 9714 CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c) 9715#endif 9716 9717 9718 // call inner blend 9719 9720 movq ARG2, %r10 // alpha 9721 movq ARG6, %r11 // beta 9722 movq ARG7, %r12 // C 9723 movq ARG8, %r13 // ldc 9724 sall $3, %r13d 9725 movq ARG11, %r14 // m1 9726 movq ARG12, %r15 // n1 9727 9728#if MACRO_LEVEL>=1 9729 INNER_SCALE_AB_4X4_VS_LIB 9730#else 9731 CALL(inner_scale_ab_4x4_vs_lib) 9732#endif 9733 9734 9735 // store n 9736 9737 movq ARG9, %r10 // D 9738 movq ARG10, %r11 // ldd 9739 sall $3, %r11d 9740 movq ARG11, %r12 // m1 9741 movq ARG12, %r13 // n1 9742 9743#if MACRO_LEVEL>=1 9744 INNER_STORE_4X4_VS_LIB 9745#else 9746 CALL(inner_store_4x4_vs_lib) 9747#endif 9748 9749 9750 EPILOGUE 9751 9752 ret 9753 9754 FUN_END(kernel_dtrmm_nt_rl_4x4_vs_lib4ccc) 9755 9756 9757 9758 9759 9760// 1 2 3 4 5 6 7 8 9 9761// void kernel_dtrmm_nt_rl_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 9762 9763 .p2align 4,,15 9764 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_lib4c4c) 9765 9766 PROLOGUE 9767 9768 // zero accumulation registers 9769 9770 ZERO_ACC 9771 9772 9773 // call inner dgemm kernel nn 9774 9775 movq ARG1, %r10 // k 9776 movq ARG3, %r11 // A 9777 movq ARG4, %r12 // B 9778 movq ARG5, %r13 // ldb 9779 
sall $3, %r13d 9780 9781#if MACRO_LEVEL>=2 9782 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 9783#else 9784 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 9785#endif 9786 9787#if MACRO_LEVEL>=1 9788 INNER_EDGE_DTRMM_NT_RL_4X4_LIB4C 9789#else 9790 CALL(inner_edge_dtrmm_nt_rl_4x4_lib4c) 9791#endif 9792 9793 9794 // call inner blend 9795 9796 movq ARG2, %r10 // alpha 9797 movq ARG6, %r11 // beta 9798 movq ARG7, %r12 // C 9799 9800#if MACRO_LEVEL>=1 9801 INNER_SCALE_AB_4X4_LIB4 9802#else 9803 CALL(inner_scale_ab_4x4_lib4) 9804#endif 9805 9806 9807 // store n 9808 9809 movq ARG8, %r10 // D 9810 movq ARG9, %r11 // ldd 9811 sall $3, %r11d 9812 9813#if MACRO_LEVEL>=1 9814 INNER_TRAN_STORE_4X4_LIB 9815#else 9816 CALL(inner_tran_store_4x4_lib) 9817#endif 9818 9819 9820 EPILOGUE 9821 9822 ret 9823 9824 FUN_END(kernel_dtrmm_nt_rl_4x4_tran_lib4c4c) 9825 9826 9827 9828 9829 9830// 1 2 3 4 5 6 7 8 9 10 11 9831// void kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 9832 9833 .p2align 4,,15 9834 GLOB_FUN_START(kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c) 9835 9836 PROLOGUE 9837 9838 // zero accumulation registers 9839 9840 ZERO_ACC 9841 9842 9843 // call inner dgemm kernel nn 9844 9845 movq ARG1, %r10 // k 9846 movq ARG3, %r11 // A 9847 movq ARG4, %r12 // B 9848 movq ARG5, %r13 // ldb 9849 sall $3, %r13d 9850 9851 movq ARG10, %r14 // m1 9852 cmpl $1, %r14d 9853 jg 100f 9854 9855#if MACRO_LEVEL>=2 9856 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 9857#else 9858 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 9859#endif 9860 9861 jmp 103f 9862 9863100: 9864 9865 movq ARG10, %r14 // m1 9866 cmpl $2, %r14d 9867 jg 101f 9868 9869#if MACRO_LEVEL>=2 9870 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 9871#else 9872 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 9873#endif 9874 9875 jmp 103f 9876 9877101: 9878 9879 movq ARG10, %r14 // m1 9880 cmpl $3, %r14d 9881 jg 102f 9882 9883#if MACRO_LEVEL>=2 9884 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 9885#else 9886 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 9887#endif 9888 9889 jmp 103f 9890 9891102: 9892 9893#if MACRO_LEVEL>=2 9894 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 9895#else 9896 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 9897#endif 9898 9899103: 9900 9901 movq ARG10, %r14 // m1 9902 9903#if MACRO_LEVEL>=1 9904 INNER_EDGE_DTRMM_NT_RL_4X4_VS_LIB4C 9905#else 9906 CALL(inner_edge_dtrmm_nt_rl_4x4_vs_lib4c) 9907#endif 9908 9909 9910 // call inner blend 9911 9912 movq ARG2, %r10 // alpha 9913 movq ARG6, %r11 // beta 9914 movq ARG7, %r12 // C 9915 9916#if MACRO_LEVEL>=1 9917 INNER_SCALE_AB_4X4_LIB4 9918#else 9919 CALL(inner_scale_ab_4x4_lib4) 9920#endif 9921 9922 9923 // store n 9924 9925 movq ARG8, %r10 // D 9926 movq ARG9, %r11 // ldd 9927 sall $3, %r11d 9928 movq ARG10, %r12 // m1 9929 movq ARG11, %r13 // n1 9930 9931#if MACRO_LEVEL>=1 9932 INNER_TRAN_STORE_4X4_VS_LIB 9933#else 9934 CALL(inner_tran_store_4x4_vs_lib) 9935#endif 9936 9937 9938 EPILOGUE 9939 9940 ret 9941 9942 FUN_END(kernel_dtrmm_nt_rl_4x4_tran_vs_lib4c4c) 9943 9944 9945 9946 9947 9948// 1 2 3 4 5 6 7 8 9 10 9949// void kernel_dtrmm_nt_rl_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 9950 9951 .p2align 4,,15 9952 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_lib4ccc) 9953 9954 PROLOGUE 9955 9956 // zero accumulation registers 9957 9958 ZERO_ACC 9959 9960 9961 // call inner dgemm kernel nn 9962 9963 movq ARG1, %r10 // k 9964 movq ARG3, %r11 // A 9965 movq ARG4, %r12 // B 9966 movq ARG5, %r13 // ldb 9967 sall $3, %r13d 9968 
9969#if MACRO_LEVEL>=2 9970 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 9971#else 9972 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 9973#endif 9974 9975#if MACRO_LEVEL>=1 9976 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C 9977#else 9978 CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4c) 9979#endif 9980 9981 9982 // call inner blend 9983 9984 movq ARG2, %r10 // alpha 9985 movq ARG6, %r11 // beta 9986 movq ARG7, %r12 // C 9987 movq ARG8, %r13 // ldc 9988 sall $3, %r13d 9989 9990#if MACRO_LEVEL>=1 9991 INNER_SCALE_AB_4X4_LIB 9992#else 9993 CALL(inner_scale_ab_4x4_lib) 9994#endif 9995 9996 9997 // store n 9998 9999 movq ARG9, %r10 // D 10000 movq ARG10, %r11 // ldd 10001 sall $3, %r11d 10002 10003#if MACRO_LEVEL>=1 10004 INNER_STORE_4X4_LIB 10005#else 10006 CALL(inner_store_4x4_lib) 10007#endif 10008 10009 10010 EPILOGUE 10011 10012 ret 10013 10014 FUN_END(kernel_dtrmm_nt_rl_one_4x4_lib4ccc) 10015 10016 10017 10018 10019 10020// 1 2 3 4 5 6 7 8 9 10 11 12 10021// void kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 10022 10023 .p2align 4,,15 10024 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc) 10025 10026 PROLOGUE 10027 10028 // zero accumulation registers 10029 10030 ZERO_ACC 10031 10032 10033 // call inner dgemm kernel nn 10034 10035 movq ARG1, %r10 // k 10036 movq ARG3, %r11 // A 10037 movq ARG4, %r12 // B 10038 movq ARG5, %r13 // ldb 10039 sall $3, %r13d 10040 10041 movq ARG12, %r14 // n1 10042 cmpl $1, %r14d 10043 jg 100f 10044 10045#if MACRO_LEVEL>=2 10046 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 10047#else 10048 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 10049#endif 10050 10051 jmp 103f 10052 10053100: 10054 10055 movq ARG12, %r14 // n1 10056 cmpl $2, %r14d 10057 jg 101f 10058 10059#if MACRO_LEVEL>=2 10060 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 10061#else 10062 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 10063#endif 10064 10065 jmp 103f 10066 10067101: 10068 10069 movq ARG12, %r14 // n1 10070 cmpl $3, %r14d 10071 jg 102f 10072 10073#if MACRO_LEVEL>=2 10074 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 10075#else 10076 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 10077#endif 10078 10079 jmp 103f 10080 10081102: 10082 10083#if MACRO_LEVEL>=2 10084 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10085#else 10086 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10087#endif 10088 10089103: 10090 10091 movq ARG12, %r14 // n1 10092 10093#if MACRO_LEVEL>=1 10094 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C 10095#else 10096 CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c) 10097#endif 10098 10099 10100 // call inner blend 10101 10102 movq ARG2, %r10 // alpha 10103 movq ARG6, %r11 // beta 10104 movq ARG7, %r12 // C 10105 movq ARG8, %r13 // ldc 10106 sall $3, %r13d 10107 movq ARG11, %r14 // m1 10108 movq ARG12, %r15 // n1 10109 10110#if MACRO_LEVEL>=1 10111 INNER_SCALE_AB_4X4_VS_LIB 10112#else 10113 CALL(inner_scale_ab_4x4_vs_lib) 10114#endif 10115 10116 10117 // store n 10118 10119 movq ARG9, %r10 // D 10120 movq ARG10, %r11 // ldd 10121 sall $3, %r11d 10122 movq ARG11, %r12 // m1 10123 movq ARG12, %r13 // n1 10124 10125#if MACRO_LEVEL>=1 10126 INNER_STORE_4X4_VS_LIB 10127#else 10128 CALL(inner_store_4x4_vs_lib) 10129#endif 10130 10131 10132 EPILOGUE 10133 10134 ret 10135 10136 FUN_END(kernel_dtrmm_nt_rl_one_4x4_vs_lib4ccc) 10137 10138 10139 10140 10141 10142// 1 2 3 4 5 6 7 8 9 10143// void kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 10144 10145 .p2align 4,,15 10146 
GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c) 10147 10148 PROLOGUE 10149 10150 // zero accumulation registers 10151 10152 ZERO_ACC 10153 10154 10155 // call inner dgemm kernel nn 10156 10157 movq ARG1, %r10 // k 10158 movq ARG3, %r11 // A 10159 movq ARG4, %r12 // B 10160 movq ARG5, %r13 // ldb 10161 sall $3, %r13d 10162 10163#if MACRO_LEVEL>=2 10164 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10165#else 10166 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10167#endif 10168 10169#if MACRO_LEVEL>=1 10170 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_LIB4C 10171#else 10172 CALL(inner_edge_dtrmm_nt_rl_one_4x4_lib4c) 10173#endif 10174 10175 10176 // call inner blend 10177 10178 movq ARG2, %r10 // alpha 10179 movq ARG6, %r11 // beta 10180 movq ARG7, %r12 // C 10181 10182#if MACRO_LEVEL>=1 10183 INNER_SCALE_AB_4X4_LIB4 10184#else 10185 CALL(inner_scale_ab_4x4_lib4) 10186#endif 10187 10188 10189 // store n 10190 10191 movq ARG8, %r10 // D 10192 movq ARG9, %r11 // ldd 10193 sall $3, %r11d 10194 10195#if MACRO_LEVEL>=1 10196 INNER_TRAN_STORE_4X4_LIB 10197#else 10198 CALL(inner_tran_store_4x4_lib) 10199#endif 10200 10201 10202 EPILOGUE 10203 10204 ret 10205 10206 FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_lib4c4c) 10207 10208 10209 10210 10211 10212// 1 2 3 4 5 6 7 8 9 10 11 10213// void kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 10214 10215 .p2align 4,,15 10216 GLOB_FUN_START(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c) 10217 10218 PROLOGUE 10219 10220 // zero accumulation registers 10221 10222 ZERO_ACC 10223 10224 10225 // call inner dgemm kernel nn 10226 10227 movq ARG1, %r10 // k 10228 movq ARG3, %r11 // A 10229 movq ARG4, %r12 // B 10230 movq ARG5, %r13 // ldb 10231 sall $3, %r13d 10232 10233 movq ARG10, %r14 // m1 10234 cmpl $1, %r14d 10235 jg 100f 10236 10237#if MACRO_LEVEL>=2 10238 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 10239#else 10240 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 10241#endif 10242 10243 jmp 103f 10244 10245100: 10246 10247 movq ARG10, %r14 // m1 10248 cmpl $2, %r14d 10249 jg 101f 10250 10251#if MACRO_LEVEL>=2 10252 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 10253#else 10254 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 10255#endif 10256 10257 jmp 103f 10258 10259101: 10260 10261 movq ARG10, %r14 // m1 10262 cmpl $3, %r14d 10263 jg 102f 10264 10265#if MACRO_LEVEL>=2 10266 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 10267#else 10268 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 10269#endif 10270 10271 jmp 103f 10272 10273102: 10274 10275#if MACRO_LEVEL>=2 10276 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10277#else 10278 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10279#endif 10280 10281103: 10282 10283 movq ARG10, %r14 // m1 10284 10285#if MACRO_LEVEL>=1 10286 INNER_EDGE_DTRMM_NT_RL_ONE_4X4_VS_LIB4C 10287#else 10288 CALL(inner_edge_dtrmm_nt_rl_one_4x4_vs_lib4c) 10289#endif 10290 10291 10292 // call inner blend 10293 10294 movq ARG2, %r10 // alpha 10295 movq ARG6, %r11 // beta 10296 movq ARG7, %r12 // C 10297 10298#if MACRO_LEVEL>=1 10299 INNER_SCALE_AB_4X4_LIB4 10300#else 10301 CALL(inner_scale_ab_4x4_lib4) 10302#endif 10303 10304 10305 // store n 10306 10307 movq ARG8, %r10 // D 10308 movq ARG9, %r11 // ldd 10309 sall $3, %r11d 10310 movq ARG10, %r12 // m1 10311 movq ARG11, %r13 // n1 10312 10313#if MACRO_LEVEL>=1 10314 INNER_TRAN_STORE_4X4_VS_LIB 10315#else 10316 CALL(inner_tran_store_4x4_vs_lib) 10317#endif 10318 10319 10320 EPILOGUE 10321 10322 ret 10323 10324 FUN_END(kernel_dtrmm_nt_rl_one_4x4_tran_vs_lib4c4c) 10325 10326 10327 10328 10329 10330// 1 2 3 4 5 
6 7 8 9 10331// void kernel_dtrmm_nt_ru_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 10332 10333 .p2align 4,,15 10334 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_lib44cc) 10335 10336 PROLOGUE 10337 10338 // zero accumulation registers 10339 10340 ZERO_ACC 10341 10342 10343 // call inner dgemm kernel nn 10344 10345 movq ARG1, %r10 // k 10346 subl $4, %r10d 10347 movq ARG3, %r11 // A 10348 addq $128, %r11 10349 movq ARG4, %r12 // B 10350 addq $128, %r12 10351 10352#if MACRO_LEVEL>=1 10353// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 10354#else 10355// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 10356#endif 10357 10358#if MACRO_LEVEL>=2 10359 INNER_KERNEL_DGEMM_NT_4X4_LIB4 10360#else 10361 CALL(inner_kernel_dgemm_nt_4x4_lib4) 10362#endif 10363 10364 10365 // call inner blend 10366 10367#if MACRO_LEVEL>=1 10368 INNER_BLEND_4X4_LIB4 10369#else 10370 CALL(inner_blend_4x4_lib4) 10371#endif 10372 10373 10374 // initial triangle 10375 10376 movq ARG1, %r10 10377 movq ARG3, %r11 10378 movq ARG4, %r12 10379 10380#if MACRO_LEVEL>=1 10381 INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 10382#else 10383 CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 10384#endif 10385 10386 10387 // call inner blend 10388 10389 movq ARG2, %r10 // alpha 10390 movq ARG5, %r11 // beta 10391 movq ARG6, %r12 // C 10392 movq ARG7, %r13 // ldc 10393 sall $3, %r13d 10394 10395#if MACRO_LEVEL>=1 10396 INNER_SCALE_AB_4X4_LIB 10397#else 10398 CALL(inner_scale_ab_4x4_lib) 10399#endif 10400 10401 10402 // store n 10403 10404 movq ARG8, %r10 // D 10405 movq ARG9, %r11 // ldd 10406 sall $3, %r11d 10407 10408#if MACRO_LEVEL>=1 10409 INNER_STORE_4X4_LIB 10410#else 10411 CALL(inner_store_4x4_lib) 10412#endif 10413 10414 10415 EPILOGUE 10416 10417 ret 10418 10419 FUN_END(kernel_dtrmm_nt_ru_4x4_lib44cc) 10420 10421 10422 10423 10424 10425// 1 2 3 4 5 6 7 8 9 10 11 10426// void kernel_dtrmm_nt_ru_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 10427 10428 .p2align 4,,15 10429 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_vs_lib44cc) 10430 10431 PROLOGUE 10432 10433 // zero accumulation registers 10434 10435 ZERO_ACC 10436 10437 10438 // call inner dgemm kernel nn 10439 10440 movq ARG1, %r10 // k 10441 subl $4, %r10d 10442 movq ARG3, %r11 // A 10443 addq $128, %r11 10444 movq ARG4, %r12 // B 10445 addq $128, %r12 10446 10447#if MACRO_LEVEL>=1 10448// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 10449#else 10450// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 10451#endif 10452 10453#if MACRO_LEVEL>=2 10454 INNER_KERNEL_DGEMM_NT_4X4_LIB4 10455#else 10456 CALL(inner_kernel_dgemm_nt_4x4_lib4) 10457#endif 10458 10459 10460 // call inner blend 10461 10462#if MACRO_LEVEL>=1 10463 INNER_BLEND_4X4_LIB4 10464#else 10465 CALL(inner_blend_4x4_lib4) 10466#endif 10467 10468 10469 // initial triangle 10470 10471 movq ARG1, %r10 10472 movq ARG3, %r11 10473 movq ARG4, %r12 10474 10475#if MACRO_LEVEL>=1 10476 INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 10477#else 10478 CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 10479#endif 10480 10481 10482 // call inner blend 10483 10484 movq ARG2, %r10 // alpha 10485 movq ARG5, %r11 // beta 10486 movq ARG6, %r12 // C 10487 movq ARG7, %r13 // ldc 10488 sall $3, %r13d 10489 movq ARG10, %r14 // m1 10490 movq ARG11, %r15 // n1 10491 10492#if MACRO_LEVEL>=1 10493 INNER_SCALE_AB_4X4_VS_LIB 10494#else 10495 CALL(inner_scale_ab_4x4_vs_lib) 10496#endif 10497 10498 10499 // store n 10500 10501 movq ARG8, %r10 // D 10502 movq ARG9, %r11 // ldd 10503 sall $3, %r11d 10504 
movq ARG10, %r12 // m1 10505 movq ARG11, %r13 // n1 10506 10507#if MACRO_LEVEL>=1 10508 INNER_STORE_4X4_VS_LIB 10509#else 10510 CALL(inner_store_4x4_vs_lib) 10511#endif 10512 10513 10514 EPILOGUE 10515 10516 ret 10517 10518 FUN_END(kernel_dtrmm_nt_ru_4x4_vs_lib44cc) 10519 10520 10521 10522 10523 10524// 1 2 3 4 5 6 7 8 9 10525// void kernel_dtrmm_nt_ru_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd); 10526 10527 .p2align 4,,15 10528 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_lib444c) 10529 10530 PROLOGUE 10531 10532 // zero accumulation registers 10533 10534 ZERO_ACC 10535 10536 10537 // call inner dgemm kernel nn 10538 10539 movq ARG1, %r10 // k 10540 subl $4, %r10d 10541 movq ARG3, %r11 // A 10542 addq $128, %r11 10543 movq ARG4, %r12 // B 10544 addq $128, %r12 10545 10546#if MACRO_LEVEL>=1 10547// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 10548#else 10549// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 10550#endif 10551 10552#if MACRO_LEVEL>=2 10553 INNER_KERNEL_DGEMM_NT_4X4_LIB4 10554#else 10555 CALL(inner_kernel_dgemm_nt_4x4_lib4) 10556#endif 10557 10558 10559 // call inner blend 10560 10561#if MACRO_LEVEL>=1 10562 INNER_BLEND_4X4_LIB4 10563#else 10564 CALL(inner_blend_4x4_lib4) 10565#endif 10566 10567 10568 // initial triangle 10569 10570 movq ARG1, %r10 10571 movq ARG3, %r11 10572 movq ARG4, %r12 10573 10574#if MACRO_LEVEL>=1 10575 INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 10576#else 10577 CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 10578#endif 10579 10580 10581 // call inner blend 10582 10583 movq ARG2, %r10 // alpha 10584 movq ARG5, %r11 // beta 10585 movq ARG6, %r12 // C 10586 10587#if MACRO_LEVEL>=1 10588 INNER_SCALE_AB_4X4_LIB4 10589#else 10590 CALL(inner_scale_ab_4x4_lib4) 10591#endif 10592 10593 10594 // store n 10595 10596 movq ARG7, %r10 // D 10597 movq ARG8, %r11 // ldd 10598 sall $3, %r11d 10599 10600#if MACRO_LEVEL>=1 10601 INNER_TRAN_STORE_4X4_LIB 10602#else 10603 CALL(inner_tran_store_4x4_lib) 10604#endif 10605 10606 10607 EPILOGUE 10608 10609 ret 10610 10611 FUN_END(kernel_dtrmm_nt_ru_4x4_tran_lib444c) 10612 10613 10614 10615 10616 10617// 1 2 3 4 5 6 7 8 9 10 11 10618// void kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1); 10619 10620 .p2align 4,,15 10621 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c) 10622 10623 PROLOGUE 10624 10625 // zero accumulation registers 10626 10627 ZERO_ACC 10628 10629 10630 // call inner dgemm kernel nn 10631 10632 movq ARG1, %r10 // k 10633 subl $4, %r10d 10634 movq ARG3, %r11 // A 10635 addq $128, %r11 10636 movq ARG4, %r12 // B 10637 addq $128, %r12 10638 10639#if MACRO_LEVEL>=1 10640// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 10641#else 10642// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 10643#endif 10644 10645#if MACRO_LEVEL>=2 10646 INNER_KERNEL_DGEMM_NT_4X4_LIB4 10647#else 10648 CALL(inner_kernel_dgemm_nt_4x4_lib4) 10649#endif 10650 10651 10652 // call inner blend 10653 10654#if MACRO_LEVEL>=1 10655 INNER_BLEND_4X4_LIB4 10656#else 10657 CALL(inner_blend_4x4_lib4) 10658#endif 10659 10660 10661 // initial triangle 10662 10663 movq ARG1, %r10 10664 movq ARG3, %r11 10665 movq ARG4, %r12 10666 10667#if MACRO_LEVEL>=1 10668 INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 10669#else 10670 CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 10671#endif 10672 10673 10674 // call inner blend 10675 10676 movq ARG2, %r10 // alpha 10677 movq ARG5, %r11 // beta 10678 movq ARG6, %r12 // C 10679 10680#if MACRO_LEVEL>=1 10681 INNER_SCALE_AB_4X4_LIB4 
10682#else 10683 CALL(inner_scale_ab_4x4_lib4) 10684#endif 10685 10686 10687 // store n 10688 10689 movq ARG7, %r10 // D 10690 movq ARG8, %r11 // ldd 10691 sall $3, %r11d 10692 movq ARG9, %r12 // m1 10693 movq ARG10, %r13 // n1 10694 10695#if MACRO_LEVEL>=1 10696 INNER_TRAN_STORE_4X4_VS_LIB 10697#else 10698 CALL(inner_tran_store_4x4_vs_lib) 10699#endif 10700 10701 10702 EPILOGUE 10703 10704 ret 10705 10706 FUN_END(kernel_dtrmm_nt_ru_4x4_tran_vs_lib444c) 10707 10708 10709 10710 10711 10712// 1 2 3 4 5 6 7 8 9 10 10713// void kernel_dtrmm_nt_ru_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 10714 10715 .p2align 4,,15 10716 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_lib4ccc) 10717 10718 PROLOGUE 10719 10720 // zero accumulation registers 10721 10722 ZERO_ACC 10723 10724 10725 // call inner dgemm kernel nn 10726 10727 movq ARG1, %r10 // k 10728 movq ARG3, %r11 // A 10729 movq ARG4, %r12 // B 10730 movq ARG5, %r13 // ldb 10731 sall $3, %r13d 10732 10733#if MACRO_LEVEL>=1 10734 INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C 10735#else 10736 CALL(inner_edge_dtrmm_nt_ru_4x4_lib4c) 10737#endif 10738 10739#if MACRO_LEVEL>=2 10740 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10741#else 10742 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10743#endif 10744 10745 10746 // call inner blend 10747 10748 movq ARG2, %r10 // alpha 10749 movq ARG6, %r11 // beta 10750 movq ARG7, %r12 // C 10751 movq ARG8, %r13 // ldc 10752 sall $3, %r13d 10753 10754#if MACRO_LEVEL>=1 10755 INNER_SCALE_AB_4X4_LIB 10756#else 10757 CALL(inner_scale_ab_4x4_lib) 10758#endif 10759 10760 10761 // store n 10762 10763 movq ARG9, %r10 // D 10764 movq ARG10, %r11 // ldd 10765 sall $3, %r11d 10766 10767#if MACRO_LEVEL>=1 10768 INNER_STORE_4X4_LIB 10769#else 10770 CALL(inner_store_4x4_lib) 10771#endif 10772 10773 10774 EPILOGUE 10775 10776 ret 10777 10778 FUN_END(kernel_dtrmm_nt_ru_4x4_lib4ccc) 10779 10780 10781 10782 10783 10784// 1 2 3 4 5 6 7 8 9 10 11 12 10785// void kernel_dtrmm_nt_ru_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 10786 10787 .p2align 4,,15 10788 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_vs_lib4ccc) 10789 10790 PROLOGUE 10791 10792 // zero accumulation registers 10793 10794 ZERO_ACC 10795 10796 10797 // call inner dgemm kernel nn 10798 10799 movq ARG1, %r10 // k 10800 movq ARG3, %r11 // A 10801 movq ARG4, %r12 // B 10802 movq ARG5, %r13 // ldb 10803 sall $3, %r13d 10804 10805#if MACRO_LEVEL>=1 10806 INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C 10807#else 10808 CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c) 10809#endif 10810 10811 movq ARG12, %r14 // n1 10812 cmpl $1, %r14d 10813 jg 100f 10814 10815#if MACRO_LEVEL>=2 10816 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 10817#else 10818 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 10819#endif 10820 10821 jmp 103f 10822 10823100: 10824 10825 movq ARG12, %r14 // n1 10826 cmpl $2, %r14d 10827 jg 101f 10828 10829#if MACRO_LEVEL>=2 10830 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 10831#else 10832 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 10833#endif 10834 10835 jmp 103f 10836 10837101: 10838 10839 movq ARG12, %r14 // n1 10840 cmpl $3, %r14d 10841 jg 102f 10842 10843#if MACRO_LEVEL>=2 10844 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 10845#else 10846 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 10847#endif 10848 10849 jmp 103f 10850 10851102: 10852 10853#if MACRO_LEVEL>=2 10854 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10855#else 10856 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10857#endif 10858 10859103: 10860 10861 10862 // call inner 
blend 10863 10864 movq ARG2, %r10 // alpha 10865 movq ARG6, %r11 // beta 10866 movq ARG7, %r12 // C 10867 movq ARG8, %r13 // ldc 10868 sall $3, %r13d 10869 movq ARG11, %r14 // m1 10870 movq ARG12, %r15 // n1 10871 10872#if MACRO_LEVEL>=1 10873 INNER_SCALE_AB_4X4_VS_LIB 10874#else 10875 CALL(inner_scale_ab_4x4_vs_lib) 10876#endif 10877 10878 10879 // store n 10880 10881 movq ARG9, %r10 // D 10882 movq ARG10, %r11 // ldd 10883 sall $3, %r11d 10884 movq ARG11, %r12 // m1 10885 movq ARG12, %r13 // n1 10886 10887#if MACRO_LEVEL>=1 10888 INNER_STORE_4X4_VS_LIB 10889#else 10890 CALL(inner_store_4x4_vs_lib) 10891#endif 10892 10893 10894 EPILOGUE 10895 10896 ret 10897 10898 FUN_END(kernel_dtrmm_nt_ru_4x4_vs_lib4ccc) 10899 10900 10901 10902 10903 10904// 1 2 3 4 5 6 7 8 9 10 10905// void kernel_dtrmm_nt_ru_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 10906 10907 .p2align 4,,15 10908 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_lib4c4c) 10909 10910 PROLOGUE 10911 10912 // zero accumulation registers 10913 10914 ZERO_ACC 10915 10916 10917 // call inner dgemm kernel nn 10918 10919 movq ARG1, %r10 // k 10920 movq ARG3, %r11 // A 10921 movq ARG4, %r12 // B 10922 movq ARG5, %r13 // ldb 10923 sall $3, %r13d 10924 10925#if MACRO_LEVEL>=1 10926 INNER_EDGE_DTRMM_NT_RU_4X4_LIB4C 10927#else 10928 CALL(inner_edge_dtrmm_nt_ru_4x4_lib4c) 10929#endif 10930 10931#if MACRO_LEVEL>=2 10932 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 10933#else 10934 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 10935#endif 10936 10937 10938 // call inner blend 10939 10940 movq ARG2, %r10 // alpha 10941 movq ARG6, %r11 // beta 10942 movq ARG7, %r12 // C 10943 10944#if MACRO_LEVEL>=1 10945 INNER_SCALE_AB_4X4_LIB4 10946#else 10947 CALL(inner_scale_ab_4x4_lib4) 10948#endif 10949 10950 10951 // store n 10952 10953 movq ARG8, %r10 // D 10954 movq ARG9, %r11 // ldd 10955 sall $3, %r11d 10956 10957#if MACRO_LEVEL>=1 10958 INNER_TRAN_STORE_4X4_LIB 10959#else 10960 CALL(inner_tran_store_4x4_lib) 10961#endif 10962 10963 10964 EPILOGUE 10965 10966 ret 10967 10968 FUN_END(kernel_dtrmm_nt_ru_4x4_tran_lib4c4c) 10969 10970 10971 10972 10973 10974// 1 2 3 4 5 6 7 8 9 10 11 12 10975// void kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 10976 10977 .p2align 4,,15 10978 GLOB_FUN_START(kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c) 10979 10980 PROLOGUE 10981 10982 // zero accumulation registers 10983 10984 ZERO_ACC 10985 10986 10987 // call inner dgemm kernel nn 10988 10989 movq ARG1, %r10 // k 10990 movq ARG3, %r11 // A 10991 movq ARG4, %r12 // B 10992 movq ARG5, %r13 // ldb 10993 sall $3, %r13d 10994 10995#if MACRO_LEVEL>=1 10996 INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4C 10997#else 10998 CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4c) 10999#endif 11000 11001 movq ARG10, %r14 // m1 11002 cmpl $1, %r14d 11003 jg 100f 11004 11005#if MACRO_LEVEL>=2 11006 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 11007#else 11008 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 11009#endif 11010 11011 jmp 103f 11012 11013100: 11014 11015 movq ARG10, %r14 // m1 11016 cmpl $2, %r14d 11017 jg 101f 11018 11019#if MACRO_LEVEL>=2 11020 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 11021#else 11022 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 11023#endif 11024 11025 jmp 103f 11026 11027101: 11028 11029 movq ARG10, %r14 // m1 11030 cmpl $3, %r14d 11031 jg 102f 11032 11033#if MACRO_LEVEL>=2 11034 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 11035#else 11036 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 
11037#endif 11038 11039 jmp 103f 11040 11041102: 11042 11043#if MACRO_LEVEL>=2 11044 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 11045#else 11046 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 11047#endif 11048 11049103: 11050 11051 11052 // call inner blend 11053 11054 movq ARG2, %r10 // alpha 11055 movq ARG6, %r11 // beta 11056 movq ARG7, %r12 // C 11057 11058#if MACRO_LEVEL>=1 11059 INNER_SCALE_AB_4X4_LIB4 11060#else 11061 CALL(inner_scale_ab_4x4_lib4) 11062#endif 11063 11064 11065 // store n 11066 11067 movq ARG8, %r10 // D 11068 movq ARG9, %r11 // ldd 11069 sall $3, %r11d 11070 movq ARG10, %r12 // m1 11071 movq ARG11, %r13 // n1 11072 11073#if MACRO_LEVEL>=1 11074 INNER_TRAN_STORE_4X4_VS_LIB 11075#else 11076 CALL(inner_tran_store_4x4_vs_lib) 11077#endif 11078 11079 11080 EPILOGUE 11081 11082 ret 11083 11084 FUN_END(kernel_dtrmm_nt_ru_4x4_tran_vs_lib4c4c) 11085 11086 11087 11088 11089 11090// 1 2 3 4 5 6 7 8 9 11091// void kernel_dtrmm_nt_ru_one_4x4_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd); 11092 11093 .p2align 4,,15 11094 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_lib44cc) 11095 11096 PROLOGUE 11097 11098 // zero accumulation registers 11099 11100 ZERO_ACC 11101 11102 11103 // call inner dgemm kernel nn 11104 11105 movq ARG1, %r10 // k 11106 subl $4, %r10d 11107 movq ARG3, %r11 // A 11108 addq $128, %r11 11109 movq ARG4, %r12 // B 11110 addq $128, %r12 11111 11112#if MACRO_LEVEL>=1 11113// INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4 11114#else 11115// CALL(inner_edge_dtrmm_one_nt_ru_4x4_lib4) 11116#endif 11117 11118#if MACRO_LEVEL>=2 11119 INNER_KERNEL_DGEMM_NT_4X4_LIB4 11120#else 11121 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11122#endif 11123 11124 11125 // call inner blend 11126 11127#if MACRO_LEVEL>=1 11128 INNER_BLEND_4X4_LIB4 11129#else 11130 CALL(inner_blend_4x4_lib4) 11131#endif 11132 11133 11134 // initial triangle 11135 11136 movq ARG1, %r10 11137 movq ARG3, %r11 11138 movq ARG4, %r12 11139 11140#if MACRO_LEVEL>=1 11141 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4 11142#else 11143 CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4) 11144#endif 11145 11146 11147 // call inner blend 11148 11149 movq ARG2, %r10 // alpha 11150 movq ARG5, %r11 // beta 11151 movq ARG6, %r12 // C 11152 movq ARG7, %r13 // ldc 11153 sall $3, %r13d 11154 11155#if MACRO_LEVEL>=1 11156 INNER_SCALE_AB_4X4_LIB 11157#else 11158 CALL(inner_scale_ab_4x4_lib) 11159#endif 11160 11161 11162 // store n 11163 11164 movq ARG8, %r10 // D 11165 movq ARG9, %r11 // ldd 11166 sall $3, %r11d 11167 11168#if MACRO_LEVEL>=1 11169 INNER_STORE_4X4_LIB 11170#else 11171 CALL(inner_store_4x4_lib) 11172#endif 11173 11174 11175 EPILOGUE 11176 11177 ret 11178 11179 FUN_END(kernel_dtrmm_nt_ru_one_4x4_lib44cc) 11180 11181 11182 11183 11184 11185// 1 2 3 4 5 6 7 8 9 10 11 11186// void kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc(int k, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 11187 11188 .p2align 4,,15 11189 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc) 11190 11191 PROLOGUE 11192 11193 // zero accumulation registers 11194 11195 ZERO_ACC 11196 11197 11198 // call inner dgemm kernel nn 11199 11200 movq ARG1, %r10 // k 11201 subl $4, %r10d 11202 movq ARG3, %r11 // A 11203 addq $128, %r11 11204 movq ARG4, %r12 // B 11205 addq $128, %r12 11206 11207#if MACRO_LEVEL>=1 11208// INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4 11209#else 11210// CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4) 11211#endif 11212 11213#if MACRO_LEVEL>=2 11214 INNER_KERNEL_DGEMM_NT_4X4_LIB4 
11215#else 11216 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11217#endif 11218 11219 11220 // call inner blend 11221 11222#if MACRO_LEVEL>=1 11223 INNER_BLEND_4X4_LIB4 11224#else 11225 CALL(inner_blend_4x4_lib4) 11226#endif 11227 11228 11229 // initial triangle 11230 11231 movq ARG1, %r10 11232 movq ARG3, %r11 11233 movq ARG4, %r12 11234 11235#if MACRO_LEVEL>=1 11236 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4 11237#else 11238 CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4) 11239#endif 11240 11241 11242 // call inner blend 11243 11244 movq ARG2, %r10 // alpha 11245 movq ARG5, %r11 // beta 11246 movq ARG6, %r12 // C 11247 movq ARG7, %r13 // ldc 11248 sall $3, %r13d 11249 movq ARG10, %r14 // m1 11250 movq ARG11, %r15 // n1 11251 11252#if MACRO_LEVEL>=1 11253 INNER_SCALE_AB_4X4_VS_LIB 11254#else 11255 CALL(inner_scale_ab_4x4_vs_lib) 11256#endif 11257 11258 11259 // store n 11260 11261 movq ARG8, %r10 // D 11262 movq ARG9, %r11 // ldd 11263 sall $3, %r11d 11264 movq ARG10, %r12 // m1 11265 movq ARG11, %r13 // n1 11266 11267#if MACRO_LEVEL>=1 11268 INNER_STORE_4X4_LIB 11269#else 11270 CALL(inner_store_4x4_lib) 11271#endif 11272 11273 11274 EPILOGUE 11275 11276 ret 11277 11278 FUN_END(kernel_dtrmm_nt_ru_one_4x4_vs_lib44cc) 11279 11280 11281 11282 11283 11284// 1 2 3 4 5 6 7 8 9 11285// void kernel_dtrmm_nt_ru_one_4x4_tran_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd); 11286 11287 .p2align 4,,15 11288 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_lib444c) 11289 11290 PROLOGUE 11291 11292 // zero accumulation registers 11293 11294 ZERO_ACC 11295 11296 11297 // call inner dgemm kernel nn 11298 11299 movq ARG1, %r10 // k 11300 subl $4, %r10d 11301 movq ARG3, %r11 // A 11302 addq $128, %r11 11303 movq ARG4, %r12 // B 11304 addq $128, %r12 11305 11306#if MACRO_LEVEL>=1 11307// INNER_EDGE_DTRMM_NT_RU_4X4_LIB4 11308#else 11309// CALL(inner_edge_dtrmm_nt_ru_4x4_lib4) 11310#endif 11311 11312#if MACRO_LEVEL>=2 11313 INNER_KERNEL_DGEMM_NT_4X4_LIB4 11314#else 11315 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11316#endif 11317 11318 11319 // call inner blend 11320 11321#if MACRO_LEVEL>=1 11322 INNER_BLEND_4X4_LIB4 11323#else 11324 CALL(inner_blend_4x4_lib4) 11325#endif 11326 11327 11328 // initial triangle 11329 11330 movq ARG1, %r10 11331 movq ARG3, %r11 11332 movq ARG4, %r12 11333 11334#if MACRO_LEVEL>=1 11335 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4 11336#else 11337 CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4) 11338#endif 11339 11340 11341 // call inner blend 11342 11343 movq ARG2, %r10 // alpha 11344 movq ARG5, %r11 // beta 11345 movq ARG6, %r12 // C 11346 11347#if MACRO_LEVEL>=1 11348 INNER_SCALE_AB_4X4_LIB4 11349#else 11350 CALL(inner_scale_ab_4x4_lib4) 11351#endif 11352 11353 11354 // store n 11355 11356 movq ARG7, %r10 // D 11357 movq ARG8, %r11 // ldd 11358 sall $3, %r11d 11359 11360#if MACRO_LEVEL>=1 11361 INNER_TRAN_STORE_4X4_LIB 11362#else 11363 CALL(inner_tran_store_4x4_lib) 11364#endif 11365 11366 11367 EPILOGUE 11368 11369 ret 11370 11371 FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_lib444c) 11372 11373 11374 11375 11376 11377// 1 2 3 4 5 6 7 8 9 10 11 11378// void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c(int k, double *alpha, double *A, double *B, double *beta, double *C, double *D, int ldd, int m1, int n1); 11379 11380 .p2align 4,,15 11381 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c) 11382 11383 PROLOGUE 11384 11385 // zero accumulation registers 11386 11387 ZERO_ACC 11388 11389 11390 // call inner dgemm kernel nn 11391 11392 movq ARG1, %r10 // k 11393 subl 
$4, %r10d 11394 movq ARG3, %r11 // A 11395 addq $128, %r11 11396 movq ARG4, %r12 // B 11397 addq $128, %r12 11398 11399#if MACRO_LEVEL>=1 11400// INNER_EDGE_DTRMM_NT_RU_4X4_VS_LIB4 11401#else 11402// CALL(inner_edge_dtrmm_nt_ru_4x4_vs_lib4) 11403#endif 11404 11405#if MACRO_LEVEL>=2 11406 INNER_KERNEL_DGEMM_NT_4X4_LIB4 11407#else 11408 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11409#endif 11410 11411 11412 // call inner blend 11413 11414#if MACRO_LEVEL>=1 11415 INNER_BLEND_4X4_LIB4 11416#else 11417 CALL(inner_blend_4x4_lib4) 11418#endif 11419 11420 11421 // initial triangle 11422 11423 movq ARG1, %r10 11424 movq ARG3, %r11 11425 movq ARG4, %r12 11426 11427#if MACRO_LEVEL>=1 11428 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4 11429#else 11430 CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4) 11431#endif 11432 11433 11434 // call inner blend 11435 11436 movq ARG2, %r10 // alpha 11437 movq ARG5, %r11 // beta 11438 movq ARG6, %r12 // C 11439 11440#if MACRO_LEVEL>=1 11441 INNER_SCALE_AB_4X4_LIB4 11442#else 11443 CALL(inner_scale_ab_4x4_lib4) 11444#endif 11445 11446 11447 // store n 11448 11449 movq ARG7, %r10 // D 11450 movq ARG8, %r11 // ldd 11451 sall $3, %r11d 11452 movq ARG9, %r12 // m1 11453 movq ARG10, %r13 // n1 11454 11455#if MACRO_LEVEL>=1 11456 INNER_TRAN_STORE_4X4_VS_LIB 11457#else 11458 CALL(inner_tran_store_4x4_vs_lib) 11459#endif 11460 11461 11462 EPILOGUE 11463 11464 ret 11465 11466 FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib444c) 11467 11468 11469 11470 11471 11472// 1 2 3 4 5 6 7 8 9 10 11473// void kernel_dtrmm_nt_ru_one_4x4_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd); 11474 11475 .p2align 4,,15 11476 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_lib4ccc) 11477 11478 PROLOGUE 11479 11480 // zero accumulation registers 11481 11482 ZERO_ACC 11483 11484 11485 // call inner dgemm kernel nn 11486 11487 movq ARG1, %r10 // k 11488 movq ARG3, %r11 // A 11489 movq ARG4, %r12 // B 11490 movq ARG5, %r13 // ldb 11491 sall $3, %r13d 11492 11493#if MACRO_LEVEL>=1 11494 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C 11495#else 11496 CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4c) 11497#endif 11498 11499#if MACRO_LEVEL>=2 11500 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 11501#else 11502 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 11503#endif 11504 11505 11506 // call inner blend 11507 11508 movq ARG2, %r10 // alpha 11509 movq ARG6, %r11 // beta 11510 movq ARG7, %r12 // C 11511 movq ARG8, %r13 // ldc 11512 sall $3, %r13d 11513 11514#if MACRO_LEVEL>=1 11515 INNER_SCALE_AB_4X4_LIB 11516#else 11517 CALL(inner_scale_ab_4x4_lib) 11518#endif 11519 11520 11521 // store n 11522 11523 movq ARG9, %r10 // D 11524 movq ARG10, %r11 // ldd 11525 sall $3, %r11d 11526 11527#if MACRO_LEVEL>=1 11528 INNER_STORE_4X4_LIB 11529#else 11530 CALL(inner_store_4x4_lib) 11531#endif 11532 11533 11534 EPILOGUE 11535 11536 ret 11537 11538 FUN_END(kernel_dtrmm_nt_ru_one_4x4_lib4ccc) 11539 11540 11541 11542 11543 11544// 1 2 3 4 5 6 7 8 9 10 11 12 11545// void kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1); 11546 11547 .p2align 4,,15 11548 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc) 11549 11550 PROLOGUE 11551 11552 // zero accumulation registers 11553 11554 ZERO_ACC 11555 11556 11557 // call inner dgemm kernel nn 11558 11559 movq ARG1, %r10 // k 11560 movq ARG3, %r11 // A 11561 movq ARG4, %r12 // B 11562 movq ARG5, %r13 // ldb 11563 sall $3, %r13d 11564 11565#if MACRO_LEVEL>=1 11566 
INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C 11567#else 11568 CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c) 11569#endif 11570 11571 movq ARG12, %r14 // n1 11572 cmpl $1, %r14d 11573 jg 100f 11574 11575#if MACRO_LEVEL>=2 11576 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 11577#else 11578 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 11579#endif 11580 11581 jmp 103f 11582 11583100: 11584 11585 movq ARG12, %r14 // n1 11586 cmpl $2, %r14d 11587 jg 101f 11588 11589#if MACRO_LEVEL>=2 11590 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 11591#else 11592 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 11593#endif 11594 11595 jmp 103f 11596 11597101: 11598 11599 movq ARG12, %r14 // n1 11600 cmpl $3, %r14d 11601 jg 102f 11602 11603#if MACRO_LEVEL>=2 11604 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 11605#else 11606 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 11607#endif 11608 11609 jmp 103f 11610 11611102: 11612 11613#if MACRO_LEVEL>=2 11614 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 11615#else 11616 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 11617#endif 11618 11619103: 11620 11621 11622 // call inner blend 11623 11624 movq ARG2, %r10 // alpha 11625 movq ARG6, %r11 // beta 11626 movq ARG7, %r12 // C 11627 movq ARG8, %r13 // ldc 11628 sall $3, %r13d 11629 movq ARG11, %r14 // m1 11630 movq ARG12, %r15 // n1 11631 11632#if MACRO_LEVEL>=1 11633 INNER_SCALE_AB_4X4_VS_LIB 11634#else 11635 CALL(inner_scale_ab_4x4_vs_lib) 11636#endif 11637 11638 11639 // store n 11640 11641 movq ARG9, %r10 // D 11642 movq ARG10, %r11 // ldd 11643 sall $3, %r11d 11644 movq ARG11, %r12 // m1 11645 movq ARG12, %r13 // n1 11646 11647#if MACRO_LEVEL>=1 11648 INNER_STORE_4X4_VS_LIB 11649#else 11650 CALL(inner_store_4x4_vs_lib) 11651#endif 11652 11653 11654 EPILOGUE 11655 11656 ret 11657 11658 FUN_END(kernel_dtrmm_nt_ru_one_4x4_vs_lib4ccc) 11659 11660 11661 11662 11663 11664// 1 2 3 4 5 6 7 8 9 10 11665// void kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd); 11666 11667 .p2align 4,,15 11668 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c) 11669 11670 PROLOGUE 11671 11672 // zero accumulation registers 11673 11674 ZERO_ACC 11675 11676 11677 // call inner dgemm kernel nn 11678 11679 movq ARG1, %r10 // k 11680 movq ARG3, %r11 // A 11681 movq ARG4, %r12 // B 11682 movq ARG5, %r13 // ldb 11683 sall $3, %r13d 11684 11685#if MACRO_LEVEL>=1 11686 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_LIB4C 11687#else 11688 CALL(inner_edge_dtrmm_nt_ru_one_4x4_lib4c) 11689#endif 11690 11691#if MACRO_LEVEL>=2 11692 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 11693#else 11694 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 11695#endif 11696 11697 11698 // call inner blend 11699 11700 movq ARG2, %r10 // alpha 11701 movq ARG6, %r11 // beta 11702 movq ARG7, %r12 // C 11703 11704#if MACRO_LEVEL>=1 11705 INNER_SCALE_AB_4X4_LIB4 11706#else 11707 CALL(inner_scale_ab_4x4_lib4) 11708#endif 11709 11710 11711 // store n 11712 11713 movq ARG8, %r10 // D 11714 movq ARG9, %r11 // ldd 11715 sall $3, %r11d 11716 11717#if MACRO_LEVEL>=1 11718 INNER_TRAN_STORE_4X4_LIB 11719#else 11720 CALL(inner_tran_store_4x4_lib) 11721#endif 11722 11723 11724 EPILOGUE 11725 11726 ret 11727 11728 FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_lib4c4c) 11729 11730 11731 11732 11733 11734// 1 2 3 4 5 6 7 8 9 10 11 12 11735// void kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c(int k, double *alpha, double *A, double *B, int ldb, double *beta, double *C, double *D, int ldd, int m1, int n1); 11736 11737 .p2align 4,,15 11738 GLOB_FUN_START(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c) 11739 11740 PROLOGUE 11741 11742 // zero 
accumulation registers 11743 11744 ZERO_ACC 11745 11746 11747 // call inner dgemm kernel nn 11748 11749 movq ARG1, %r10 // k 11750 movq ARG3, %r11 // A 11751 movq ARG4, %r12 // B 11752 movq ARG5, %r13 // ldb 11753 sall $3, %r13d 11754 11755#if MACRO_LEVEL>=1 11756 INNER_EDGE_DTRMM_NT_RU_ONE_4X4_VS_LIB4C 11757#else 11758 CALL(inner_edge_dtrmm_nt_ru_one_4x4_vs_lib4c) 11759#endif 11760 11761 movq ARG10, %r14 // m1 11762 cmpl $1, %r14d 11763 jg 100f 11764 11765#if MACRO_LEVEL>=2 11766 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 11767#else 11768 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 11769#endif 11770 11771 jmp 103f 11772 11773100: 11774 11775 movq ARG10, %r14 // m1 11776 cmpl $2, %r14d 11777 jg 101f 11778 11779#if MACRO_LEVEL>=2 11780 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 11781#else 11782 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 11783#endif 11784 11785 jmp 103f 11786 11787101: 11788 11789 movq ARG10, %r14 // m1 11790 cmpl $3, %r14d 11791 jg 102f 11792 11793#if MACRO_LEVEL>=2 11794 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 11795#else 11796 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 11797#endif 11798 11799 jmp 103f 11800 11801102: 11802 11803#if MACRO_LEVEL>=2 11804 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 11805#else 11806 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 11807#endif 11808 11809103: 11810 11811 11812 // call inner blend 11813 11814 movq ARG2, %r10 // alpha 11815 movq ARG6, %r11 // beta 11816 movq ARG7, %r12 // C 11817 11818#if MACRO_LEVEL>=1 11819 INNER_SCALE_AB_4X4_LIB4 11820#else 11821 CALL(inner_scale_ab_4x4_lib4) 11822#endif 11823 11824 11825 // store n 11826 11827 movq ARG8, %r10 // D 11828 movq ARG9, %r11 // ldd 11829 sall $3, %r11d 11830 movq ARG10, %r12 // m1 11831 movq ARG11, %r13 // n1 11832 11833#if MACRO_LEVEL>=1 11834 INNER_TRAN_STORE_4X4_VS_LIB 11835#else 11836 CALL(inner_tran_store_4x4_vs_lib) 11837#endif 11838 11839 11840 EPILOGUE 11841 11842 ret 11843 11844 FUN_END(kernel_dtrmm_nt_ru_one_4x4_tran_vs_lib4c4c) 11845 11846 11847 11848 11849 11850// 1 2 3 4 5 6 7 8 11851// void kernel_dpotrf_nt_l_4x4_lib44cc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D); 11852 11853 .p2align 4,,15 11854 GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_lib44cc) 11855 11856 PROLOGUE 11857 11858 // zero accumulation registers 11859 11860 ZERO_ACC 11861 11862 11863 // call inner dgemm kernel nt 11864 11865 movq ARG1, %r10 // kmax 11866 movq ARG2, %r11 // A 11867 movq ARG3, %r12 // B 11868 11869#if MACRO_LEVEL>=2 11870 INNER_KERNEL_DGEMM_NT_4X4_LIB4 11871#else 11872 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11873#endif 11874 11875 11876 // call inner blender_loader nn 11877 11878 movq ARG4, %r10 // C 11879 movq ARG5, %r11 // ldc 11880 sall $3, %r11d 11881 11882#if MACRO_LEVEL>=1 11883 INNER_BLEND_SCALE_M11_4X4_LIB 11884#else 11885 CALL(inner_blend_scale_m11_4x4_lib) 11886#endif 11887 11888 11889 // factorization 11890 11891 movq ARG8, %r10 // inv_diag_D 11892 movl $4, %r11d 11893 11894#if MACRO_LEVEL>=1 11895 INNER_EDGE_DPOTRF_4X4_VS_LIB4 11896#else 11897 CALL(inner_edge_dpotrf_4x4_vs_lib4) 11898#endif 11899 11900 11901 // store 11902 11903 movq ARG6, %r10 // D 11904 movq ARG7, %r11 // ldd 11905 sall $3, %r11d 11906 11907#if MACRO_LEVEL>=1 11908 INNER_STORE_L_4X4_LIB 11909#else 11910 CALL(inner_store_l_4x4_lib) 11911#endif 11912 11913 11914 EPILOGUE 11915 11916 ret 11917 11918 FUN_END(kernel_dpotrf_nt_l_4x4_lib44cc) 11919 11920 11921 11922 11923 11924// 1 2 3 4 5 6 7 8 9 10 11925// void kernel_dpotrf_nt_l_4x4_vs_lib44cc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, 
int n1); 11926 11927 .p2align 4,,15 11928 GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_vs_lib44cc) 11929 11930 PROLOGUE 11931 11932 // zero accumulation registers 11933 11934 ZERO_ACC 11935 11936 11937 // call inner dgemm kernel nt 11938 11939 movq ARG1, %r10 // kmax 11940 movq ARG2, %r11 // A 11941 movq ARG3, %r12 // B 11942 11943#if MACRO_LEVEL>=2 11944 INNER_KERNEL_DGEMM_NT_4X4_LIB4 11945#else 11946 CALL(inner_kernel_dgemm_nt_4x4_lib4) 11947#endif 11948 11949 11950 // call inner blender_loader nn 11951 11952 movq ARG4, %r10 // C 11953 movq ARG5, %r11 // ldc 11954 sall $3, %r11d 11955 movq ARG9, %r12 // m1 11956 movq ARG10, %r13 // n1 11957 11958#if MACRO_LEVEL>=1 11959 INNER_BLEND_SCALE_M11_4X4_VS_LIB 11960#else 11961 CALL(inner_blend_scale_m11_4x4_vs_lib) 11962#endif 11963 11964 11965 // factorization 11966 11967 movq ARG8, %r10 // inv_diag_D 11968 movq ARG10, %r11 // n1 11969 11970#if MACRO_LEVEL>=1 11971 INNER_EDGE_DPOTRF_4X4_VS_LIB4 11972#else 11973 CALL(inner_edge_dpotrf_4x4_vs_lib4) 11974#endif 11975 11976 11977 // store 11978 11979 movq ARG6, %r10 // D 11980 movq ARG7, %r11 // ldd 11981 sall $3, %r11d 11982 movq ARG9, %r12 // m1 11983 movq ARG10, %r13 // n1 11984 11985#if MACRO_LEVEL>=1 11986 INNER_STORE_L_4X4_VS_LIB 11987#else 11988 CALL(inner_store_l_4x4_vs_lib) 11989#endif 11990 11991 11992 EPILOGUE 11993 11994 ret 11995 11996 FUN_END(kernel_dpotrf_nt_l_4x4_vs_lib44cc) 11997 11998 11999 12000 12001 12002// 1 2 3 4 5 6 7 8 9 10 12003// void kernel_dtrsm_nn_rl_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E); 12004 12005 .p2align 4,,15 12006 GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_lib4c44c) 12007 12008 PROLOGUE 12009 12010 // zero accumulation registers 12011 12012 ZERO_ACC 12013 12014 12015 // call inner dgemm kernel nt 12016 12017 movq ARG1, %r10 // kmax 12018 movq ARG2, %r11 // A 12019 movq ARG3, %r12 // B 12020 movq ARG4, %r13 // ldb 12021 sall $3, %r13d 12022 12023#if MACRO_LEVEL>=2 12024 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12025#else 12026 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12027#endif 12028 12029 12030 // call inner blender_loader nn 12031 12032 movq ARG5, %r10 // beta 12033 movq ARG6, %r11 // C 12034 12035#if MACRO_LEVEL>=1 12036 INNER_SCALE_M1B_4X4_LIB4 12037#else 12038 CALL(inner_scale_m1b_4x4_lib4) 12039#endif 12040 12041 12042 // solve 12043 12044 movq ARG8, %r10 // E 12045 movq ARG9, %r11 // lde 12046 sall $3, %r11d 12047 movq ARG10, %r12 // inv_diag_E 12048 12049#if MACRO_LEVEL>=1 12050 INNER_EDGE_DTRSM_RLN_INV_4X4_LIB 12051#else 12052 CALL(inner_edge_dtrsm_rln_inv_4x4_lib) 12053#endif 12054 12055 12056 // store 12057 12058 movq ARG7, %r10 // D 12059 12060#if MACRO_LEVEL>=1 12061 INNER_STORE_4X4_LIB4 12062#else 12063 CALL(inner_store_4x4_lib4) 12064#endif 12065 12066 12067 EPILOGUE 12068 12069 ret 12070 12071 FUN_END(kernel_dtrsm_nn_rl_inv_4x4_lib4c44c) 12072 12073 12074 12075 12076 12077// 1 2 3 4 5 6 7 8 9 10 11 12 12078// void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1); 12079 12080 .p2align 4,,15 12081 GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c) 12082 12083 PROLOGUE 12084 12085 // zero accumulation registers 12086 12087 ZERO_ACC 12088 12089 12090 // call inner dgemm kernel nt 12091 12092 movq ARG1, %r10 // kmax 12093 movq ARG2, %r11 // A 12094 movq ARG3, %r12 // B 12095 movq ARG4, %r13 // ldb 12096 sall $3, %r13d 12097 12098 movq ARG12, %r14 // n1 12099 
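	// variable-size dispatch: n1 is compared against 1, 2 and 3 below to select the 4x1, 4x2, 4x3 or full 4x4 inner dgemm nn kernel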
cmpl $1, %r14d 12100 jg 100f 12101 12102#if MACRO_LEVEL>=2 12103 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 12104#else 12105 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 12106#endif 12107 12108 jmp 103f 12109 12110100: 12111 12112 movq ARG12, %r14 // n1 12113 cmpl $2, %r14d 12114 jg 101f 12115 12116#if MACRO_LEVEL>=2 12117 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 12118#else 12119 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 12120#endif 12121 12122 jmp 103f 12123 12124101: 12125 12126 movq ARG12, %r14 // n1 12127 cmpl $3, %r14d 12128 jg 102f 12129 12130#if MACRO_LEVEL>=2 12131 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 12132#else 12133 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 12134#endif 12135 12136 jmp 103f 12137 12138102: 12139 12140#if MACRO_LEVEL>=2 12141 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12142#else 12143 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12144#endif 12145 12146103: 12147 12148 12149 // call inner blender_loader nn 12150 12151 movq ARG5, %r10 // beta 12152 movq ARG6, %r11 // C 12153 12154#if MACRO_LEVEL>=1 12155 INNER_SCALE_M1B_4X4_LIB4 12156#else 12157 CALL(inner_scale_m1b_4x4_lib4) 12158#endif 12159 12160 12161 // solve 12162 12163 movq ARG8, %r10 // E 12164 movq ARG9, %r11 // lde 12165 sall $3, %r11d 12166 movq ARG10, %r12 // inv_diag_E 12167 movq ARG12, %r13 // n1 12168 12169#if MACRO_LEVEL>=1 12170 INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB 12171#else 12172 CALL(inner_edge_dtrsm_rln_inv_4x4_vs_lib) 12173#endif 12174 12175 12176 // store 12177 12178 movq ARG7, %r10 // D 12179 movq ARG11, %r11 // m1 12180 movq ARG12, %r12 // n1 12181 12182#if MACRO_LEVEL>=1 12183 INNER_STORE_4X4_VS_LIB4 12184#else 12185 CALL(inner_store_4x4_vs_lib4) 12186#endif 12187 12188 12189 EPILOGUE 12190 12191 ret 12192 12193 FUN_END(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4c44c) 12194 12195 12196 12197 12198 12199// 1 2 3 4 5 6 7 8 9 10 11 12 12200// void kernel_dtrsm_nn_rl_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E); 12201 12202 .p2align 4,,15 12203 GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_lib4cccc) 12204 12205 PROLOGUE 12206 12207 // zero accumulation registers 12208 12209 ZERO_ACC 12210 12211 12212 // call inner dgemm kernel nt 12213 12214 movq ARG1, %r10 12215 movq ARG2, %r11 12216 movq ARG3, %r12 12217 movq ARG4, %r13 // ldb 12218 sall $3, %r13d 12219 12220#if MACRO_LEVEL>=2 12221 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12222#else 12223 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12224#endif 12225 12226 12227 // call inner blender_loader nn 12228 12229 movq ARG5, %r10 // beta 12230 movq ARG6, %r11 // C 12231 movq ARG7, %r12 // ldc 12232 sall $3, %r12d 12233 12234#if MACRO_LEVEL>=1 12235 INNER_SCALE_M1B_4X4_LIB 12236#else 12237 CALL(inner_scale_m1b_4x4_lib) 12238#endif 12239 12240 12241 // solve 12242 12243 movq ARG10, %r10 // E 12244 movq ARG11, %r11 // lde 12245 sall $3, %r11d 12246 movq ARG12, %r12 // inv_diag_E 12247 12248#if MACRO_LEVEL>=1 12249 INNER_EDGE_DTRSM_RLN_INV_4X4_LIB 12250#else 12251 CALL(inner_edge_dtrsm_rln_inv_4x4_lib) 12252#endif 12253 12254 12255 // store 12256 12257 movq ARG8, %r10 // D 12258 movq ARG9, %r11 // ldd 12259 sall $3, %r11d 12260 12261#if MACRO_LEVEL>=1 12262 INNER_STORE_4X4_LIB 12263#else 12264 CALL(inner_store_4x4_lib) 12265#endif 12266 12267 12268 EPILOGUE 12269 12270 ret 12271 12272 FUN_END(kernel_dtrsm_nn_rl_inv_4x4_lib4cccc) 12273 12274 12275 12276 12277 12278// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 12279// void kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, 
double *E, int lde, double *inv_diag_E, int m1, int n1); 12280 12281 .p2align 4,,15 12282 GLOB_FUN_START(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc) 12283 12284 PROLOGUE 12285 12286 // zero accumulation registers 12287 12288 ZERO_ACC 12289 12290 12291 // call inner dgemm kernel nt 12292 12293 movq ARG1, %r10 12294 movq ARG2, %r11 12295 movq ARG3, %r12 12296 movq ARG4, %r13 // ldb 12297 sall $3, %r13d 12298 12299 12300 movq ARG14, %r14 // n1 12301 cmpl $1, %r14d 12302 jg 100f 12303 12304#if MACRO_LEVEL>=2 12305 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 12306#else 12307 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 12308#endif 12309 12310 jmp 103f 12311 12312100: 12313 12314 movq ARG14, %r14 // n1 12315 cmpl $2, %r14d 12316 jg 101f 12317 12318#if MACRO_LEVEL>=2 12319 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 12320#else 12321 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 12322#endif 12323 12324 jmp 103f 12325 12326101: 12327 12328 movq ARG14, %r14 // n1 12329 cmpl $3, %r14d 12330 jg 102f 12331 12332#if MACRO_LEVEL>=2 12333 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 12334#else 12335 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 12336#endif 12337 12338 jmp 103f 12339 12340102: 12341 12342#if MACRO_LEVEL>=2 12343 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12344#else 12345 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12346#endif 12347 12348103: 12349 12350 12351 // call inner blender_loader nn 12352 12353 movq ARG5, %r10 // beta 12354 movq ARG6, %r11 // C 12355 movq ARG7, %r12 // ldc 12356 sall $3, %r12d 12357 movq ARG13, %r13 // m1 12358 movq ARG14, %r14 // n1 12359 12360#if MACRO_LEVEL>=1 12361 INNER_SCALE_M1B_4X4_VS_LIB 12362#else 12363 CALL(inner_scale_m1b_4x4_vs_lib) 12364#endif 12365 12366 12367 // solve 12368 12369 movq ARG10, %r10 // E 12370 movq ARG11, %r11 // lde 12371 sall $3, %r11d 12372 movq ARG12, %r12 // inv_diag_E 12373 movq ARG14, %r13 // n1 12374 12375#if MACRO_LEVEL>=1 12376 INNER_EDGE_DTRSM_RLN_INV_4X4_VS_LIB 12377#else 12378 CALL(inner_edge_dtrsm_rln_inv_4x4_vs_lib) 12379#endif 12380 12381 12382 // store 12383 12384 movq ARG8, %r10 // D 12385 movq ARG9, %r11 // ldd 12386 sall $3, %r11d 12387 movq ARG13, %r12 // m1 12388 movq ARG14, %r13 // n1 12389 12390#if MACRO_LEVEL>=1 12391 INNER_STORE_4X4_VS_LIB 12392#else 12393 CALL(inner_store_4x4_vs_lib) 12394#endif 12395 12396 12397 EPILOGUE 12398 12399 ret 12400 12401 FUN_END(kernel_dtrsm_nn_rl_inv_4x4_vs_lib4cccc) 12402 12403 12404 12405 12406 12407// 1 2 3 4 5 6 7 8 9 12408// void kernel_dtrsm_nn_rl_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde); 12409 12410 .p2align 4,,15 12411 GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_lib4c44c) 12412 12413 PROLOGUE 12414 12415 // zero accumulation registers 12416 12417 ZERO_ACC 12418 12419 12420 // call inner dgemm kernel nt 12421 12422 movq ARG1, %r10 // kmax 12423 movq ARG2, %r11 // A 12424 movq ARG3, %r12 // B 12425 movq ARG4, %r13 // ldb 12426 sall $3, %r13d 12427 12428#if MACRO_LEVEL>=2 12429 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12430#else 12431 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12432#endif 12433 12434 12435 // call inner blender_loader nn 12436 12437 movq ARG5, %r10 // beta 12438 movq ARG6, %r11 // C 12439 12440#if MACRO_LEVEL>=1 12441 INNER_SCALE_M1B_4X4_LIB4 12442#else 12443 CALL(inner_scale_m1b_4x4_lib4) 12444#endif 12445 12446 12447 // solve 12448 12449 movq ARG8, %r10 // E 12450 movq ARG9, %r11 // lde 12451 sall $3, %r11d 12452 12453#if MACRO_LEVEL>=1 12454 INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB 12455#else 12456 CALL(inner_edge_dtrsm_rln_one_4x4_lib) 12457#endif 12458 12459 12460 // store 12461 12462 
movq ARG7, %r10 // D 12463 12464#if MACRO_LEVEL>=1 12465 INNER_STORE_4X4_LIB4 12466#else 12467 CALL(inner_store_4x4_lib4) 12468#endif 12469 12470 12471 EPILOGUE 12472 12473 ret 12474 12475 FUN_END(kernel_dtrsm_nn_rl_one_4x4_lib4c44c) 12476 12477 12478 12479 12480 12481// 1 2 3 4 5 6 7 8 9 10 11 12482// void kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1); 12483 12484 .p2align 4,,15 12485 GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c) 12486 12487 PROLOGUE 12488 12489 // zero accumulation registers 12490 12491 ZERO_ACC 12492 12493 12494 // call inner dgemm kernel nt 12495 12496 movq ARG1, %r10 // kmax 12497 movq ARG2, %r11 // A 12498 movq ARG3, %r12 // B 12499 movq ARG4, %r13 // ldb 12500 sall $3, %r13d 12501 12502 movq ARG11, %r14 // n1 12503 cmpl $1, %r14d 12504 jg 100f 12505 12506#if MACRO_LEVEL>=2 12507 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 12508#else 12509 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 12510#endif 12511 12512 jmp 103f 12513 12514100: 12515 12516 movq ARG11, %r14 // n1 12517 cmpl $2, %r14d 12518 jg 101f 12519 12520#if MACRO_LEVEL>=2 12521 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 12522#else 12523 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 12524#endif 12525 12526 jmp 103f 12527 12528101: 12529 12530 movq ARG11, %r14 // n1 12531 cmpl $3, %r14d 12532 jg 102f 12533 12534#if MACRO_LEVEL>=2 12535 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 12536#else 12537 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 12538#endif 12539 12540 jmp 103f 12541 12542102: 12543 12544#if MACRO_LEVEL>=2 12545 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12546#else 12547 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12548#endif 12549 12550103: 12551 12552 12553 // call inner blender_loader nn 12554 12555 movq ARG5, %r10 // beta 12556 movq ARG6, %r11 // C 12557 12558#if MACRO_LEVEL>=1 12559 INNER_SCALE_M1B_4X4_LIB4 12560#else 12561 CALL(inner_scale_m1b_4x4_lib4) 12562#endif 12563 12564 12565 // solve 12566 12567 movq ARG8, %r10 // E 12568 movq ARG9, %r11 // lde 12569 sall $3, %r11d 12570 movq ARG11, %r12 // n1 12571 12572#if MACRO_LEVEL>=1 12573 INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB 12574#else 12575 CALL(inner_edge_dtrsm_rln_one_4x4_vs_lib) 12576#endif 12577 12578 12579 // store 12580 12581 movq ARG7, %r10 // D 12582 movq ARG10, %r11 // m1 12583 movq ARG11, %r12 // n1 12584 12585#if MACRO_LEVEL>=1 12586 INNER_STORE_4X4_VS_LIB4 12587#else 12588 CALL(inner_store_4x4_vs_lib4) 12589#endif 12590 12591 12592 EPILOGUE 12593 12594 ret 12595 12596 FUN_END(kernel_dtrsm_nn_rl_one_4x4_vs_lib4c44c) 12597 12598 12599 12600 12601 12602// 1 2 3 4 5 6 7 8 9 10 11 12603// void kernel_dtrsm_nn_rl_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde); 12604 12605 .p2align 4,,15 12606 GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_lib4cccc) 12607 12608 PROLOGUE 12609 12610 // zero accumulation registers 12611 12612 ZERO_ACC 12613 12614 12615 // call inner dgemm kernel nt 12616 12617 movq ARG1, %r10 12618 movq ARG2, %r11 12619 movq ARG3, %r12 12620 movq ARG4, %r13 // ldb 12621 sall $3, %r13d 12622 12623#if MACRO_LEVEL>=2 12624 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12625#else 12626 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12627#endif 12628 12629 12630 // call inner blender_loader nn 12631 12632 movq ARG5, %r10 // beta 12633 movq ARG6, %r11 // C 12634 movq ARG7, %r12 // ldc 12635 sall $3, %r12d 12636 12637#if MACRO_LEVEL>=1 12638 INNER_SCALE_M1B_4X4_LIB 12639#else 12640 CALL(inner_scale_m1b_4x4_lib) 12641#endif 12642 12643 12644 // 
solve 12645 12646 movq ARG10, %r10 // E 12647 movq ARG11, %r11 // lde 12648 sall $3, %r11d 12649 12650#if MACRO_LEVEL>=1 12651 INNER_EDGE_DTRSM_RLN_ONE_4X4_LIB 12652#else 12653 CALL(inner_edge_dtrsm_rln_one_4x4_lib) 12654#endif 12655 12656 12657 // store 12658 12659 movq ARG8, %r10 // D 12660 movq ARG9, %r11 // ldd 12661 sall $3, %r11d 12662 12663#if MACRO_LEVEL>=1 12664 INNER_STORE_4X4_LIB 12665#else 12666 CALL(inner_store_4x4_lib) 12667#endif 12668 12669 12670 EPILOGUE 12671 12672 ret 12673 12674 FUN_END(kernel_dtrsm_nn_rl_one_4x4_lib4cccc) 12675 12676 12677 12678 12679 12680// 1 2 3 4 5 6 7 8 9 10 11 12 13 12681// void kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1); 12682 12683 .p2align 4,,15 12684 GLOB_FUN_START(kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc) 12685 12686 PROLOGUE 12687 12688 // zero accumulation registers 12689 12690 ZERO_ACC 12691 12692 12693 // call inner dgemm kernel nt 12694 12695 movq ARG1, %r10 12696 movq ARG2, %r11 12697 movq ARG3, %r12 12698 movq ARG4, %r13 // ldb 12699 sall $3, %r13d 12700 12701 12702 movq ARG13, %r14 // n1 12703 cmpl $1, %r14d 12704 jg 100f 12705 12706#if MACRO_LEVEL>=2 12707 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 12708#else 12709 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 12710#endif 12711 12712 jmp 103f 12713 12714100: 12715 12716 movq ARG13, %r14 // n1 12717 cmpl $2, %r14d 12718 jg 101f 12719 12720#if MACRO_LEVEL>=2 12721 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 12722#else 12723 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 12724#endif 12725 12726 jmp 103f 12727 12728101: 12729 12730 movq ARG13, %r14 // n1 12731 cmpl $3, %r14d 12732 jg 102f 12733 12734#if MACRO_LEVEL>=2 12735 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 12736#else 12737 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 12738#endif 12739 12740 jmp 103f 12741 12742102: 12743 12744#if MACRO_LEVEL>=2 12745 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 12746#else 12747 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 12748#endif 12749 12750103: 12751 12752 12753 // call inner blender_loader nn 12754 12755 movq ARG5, %r10 // beta 12756 movq ARG6, %r11 // C 12757 movq ARG7, %r12 // ldc 12758 sall $3, %r12d 12759 movq ARG12, %r13 // m1 12760 movq ARG13, %r14 // n1 12761 12762#if MACRO_LEVEL>=1 12763 INNER_SCALE_M1B_4X4_VS_LIB 12764#else 12765 CALL(inner_scale_m1b_4x4_vs_lib) 12766#endif 12767 12768 12769 // solve 12770 12771 movq ARG10, %r10 // E 12772 movq ARG11, %r11 // lde 12773 sall $3, %r11d 12774 movq ARG13, %r13 // n1 12775 12776#if MACRO_LEVEL>=1 12777 INNER_EDGE_DTRSM_RLN_ONE_4X4_VS_LIB 12778#else 12779 CALL(inner_edge_dtrsm_rln_one_4x4_vs_lib) 12780#endif 12781 12782 12783 // store 12784 12785 movq ARG8, %r10 // D 12786 movq ARG9, %r11 // ldd 12787 sall $3, %r11d 12788 movq ARG12, %r12 // m1 12789 movq ARG13, %r13 // n1 12790 12791#if MACRO_LEVEL>=1 12792 INNER_STORE_4X4_VS_LIB 12793#else 12794 CALL(inner_store_4x4_vs_lib) 12795#endif 12796 12797 12798 EPILOGUE 12799 12800 ret 12801 12802 FUN_END(kernel_dtrsm_nn_rl_one_4x4_vs_lib4cccc) 12803 12804 12805 12806 12807 12808// 1 2 3 4 5 6 7 8 9 10 12809// void kernel_dtrsm_nt_rl_inv_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E); 12810 12811 .p2align 4,,15 12812 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44cc4) 12813 12814 PROLOGUE 12815 12816 // zero accumulation registers 12817 12818 ZERO_ACC 12819 12820 12821 // call inner dgemm kernel nt 12822 12823 movq ARG1, %r10 12824 movq ARG2, %r11 12825 movq ARG3, %r12 
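	// both A and B are panel-major (lib4) in this variant, so only k, A and B are passed to the inner nt kernel (no leading dimension needed)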
12826 12827#if MACRO_LEVEL>=2 12828 INNER_KERNEL_DGEMM_NT_4X4_LIB4 12829#else 12830 CALL(inner_kernel_dgemm_nt_4x4_lib4) 12831#endif 12832 12833 12834 // call inner blender_loader nn 12835 12836 movq ARG4, %r10 // beta 12837 movq ARG5, %r11 // C 12838 movq ARG6, %r12 // ldc 12839 sall $3, %r12d 12840 12841#if MACRO_LEVEL>=1 12842 INNER_BLEND_SCALE_M1B_4X4_LIB 12843#else 12844 CALL(inner_blend_scale_m1b_4x4_lib) 12845#endif 12846 12847 12848 // solve 12849 12850 movq ARG9, %r10 // E 12851 movq ARG10, %r11 // inv_diag_E 12852 12853#if MACRO_LEVEL>=1 12854 INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4 12855#else 12856 CALL(inner_edge_dtrsm_rlt_inv_4x4_lib4) 12857#endif 12858 12859 12860 // store 12861 12862 movq ARG7, %r10 // D 12863 movq ARG8, %r11 // ldd 12864 sall $3, %r11d 12865 12866#if MACRO_LEVEL>=1 12867 INNER_STORE_4X4_LIB 12868#else 12869 CALL(inner_store_4x4_lib) 12870#endif 12871 12872 12873 EPILOGUE 12874 12875 ret 12876 12877 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44cc4) 12878 12879 12880 12881 12882 12883// 1 2 3 4 5 6 7 8 9 10 11 12 12884// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, int n1); 12885 12886 .p2align 4,,15 12887 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4) 12888 12889 PROLOGUE 12890 12891 // zero accumulation registers 12892 12893 ZERO_ACC 12894 12895 12896 // call inner dgemm kernel nt 12897 12898 movq ARG1, %r10 12899 movq ARG2, %r11 12900 movq ARG3, %r12 12901 12902#if MACRO_LEVEL>=2 12903 INNER_KERNEL_DGEMM_NT_4X4_LIB4 12904#else 12905 CALL(inner_kernel_dgemm_nt_4x4_lib4) 12906#endif 12907 12908 12909 // call inner blender_loader nn 12910 12911 movq ARG4, %r10 // beta 12912 movq ARG5, %r11 // C 12913 movq ARG6, %r12 // ldc 12914 sall $3, %r12d 12915 movq ARG11, %r13 // m1 12916 movq ARG12, %r14 // n1 12917 12918#if MACRO_LEVEL>=1 12919 INNER_BLEND_SCALE_M1B_4X4_VS_LIB 12920#else 12921 CALL(inner_blend_scale_m1b_4x4_vs_lib) 12922#endif 12923 12924 12925 // solve 12926 12927 movq ARG9, %r10 // E 12928 movq ARG10, %r11 // inv_diag_E 12929 12930#if MACRO_LEVEL>=1 12931 INNER_EDGE_DTRSM_RLT_INV_4X4_LIB4 12932#else 12933 CALL(inner_edge_dtrsm_rlt_inv_4x4_lib4) 12934#endif 12935 12936 12937 // store 12938 12939 movq ARG7, %r10 // D 12940 movq ARG8, %r11 // ldd 12941 sall $3, %r11d 12942 movq ARG11, %r12 // m1 12943 movq ARG12, %r13 // n1 12944 12945#if MACRO_LEVEL>=1 12946 INNER_STORE_4X4_VS_LIB 12947#else 12948 CALL(inner_store_4x4_vs_lib) 12949#endif 12950 12951 12952 EPILOGUE 12953 12954 ret 12955 12956 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44cc4) 12957 12958 12959 12960 12961 12962 12963// 1 2 3 4 5 6 7 8 9 10 12964// void kernel_dtrsm_nt_rl_inv_4x4_lib44ccc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E); 12965 12966 .p2align 4,,15 12967 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc) 12968 12969 PROLOGUE 12970 12971 // zero accumulation registers 12972 12973 ZERO_ACC 12974 12975 12976 // call inner dgemm kernel nt 12977 12978 movq ARG1, %r10 12979 movq ARG2, %r11 12980 movq ARG3, %r12 12981 12982#if MACRO_LEVEL>=2 12983 INNER_KERNEL_DGEMM_NT_4X4_LIB4 12984#else 12985 CALL(inner_kernel_dgemm_nt_4x4_lib4) 12986#endif 12987 12988 12989 // call inner blender_loader nn 12990 12991 movq ARG4, %r10 // C 12992 movq ARG5, %r11 // ldc 12993 sall $3, %r11d 12994 12995#if MACRO_LEVEL>=1 12996 INNER_BLEND_SCALE_M11_4X4_LIB 12997#else 12998 CALL(inner_blend_scale_m11_4x4_lib) 12999#endif 
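	// the m11 blend/scale takes no alpha/beta arguments; the suffix suggests alpha=-1, beta=1, i.e. the accumulator holds C minus the product before the triangular solve below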
13000 13001 13002 // solve 13003 13004 movq ARG8, %r10 // E 13005 movq ARG9, %r11 // lde 13006 sall $3, %r11d 13007 movq ARG10, %r12 // inv_diag_E 13008 13009#if MACRO_LEVEL>=1 13010 INNER_EDGE_DTRSM_RLT_INV_4X4_LIB 13011#else 13012 CALL(inner_edge_dtrsm_rlt_inv_4x4_lib) 13013#endif 13014 13015 13016 // store 13017 13018 movq ARG6, %r10 // D 13019 movq ARG7, %r11 // ldd 13020 sall $3, %r11d 13021 13022#if MACRO_LEVEL>=1 13023 INNER_STORE_4X4_LIB 13024#else 13025 CALL(inner_store_4x4_lib) 13026#endif 13027 13028 13029 EPILOGUE 13030 13031 ret 13032 13033 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc) 13034 13035 13036 13037 13038// 1 2 3 4 5 6 7 8 9 10 11 12 13039// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc(int k, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1); 13040 13041 .p2align 4,,15 13042 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc) 13043 13044 PROLOGUE 13045 13046 // zero accumulation registers 13047 13048 ZERO_ACC 13049 13050 13051 // call inner dgemm kernel nt 13052 13053 movq ARG1, %r10 13054 movq ARG2, %r11 13055 movq ARG3, %r12 13056 13057#if MACRO_LEVEL>=2 13058 INNER_KERNEL_DGEMM_NT_4X4_LIB4 13059#else 13060 CALL(inner_kernel_dgemm_nt_4x4_lib4) 13061#endif 13062 13063 13064 // call inner blender_loader nn 13065 13066 movq ARG4, %r10 // C 13067 movq ARG5, %r11 // ldc 13068 sall $3, %r11d 13069 movq ARG11, %r12 // m1 13070 movq ARG12, %r13 // n1 13071 13072#if MACRO_LEVEL>=1 13073 INNER_BLEND_SCALE_M11_4X4_VS_LIB 13074#else 13075 CALL(inner_blend_scale_m11_4x4_vs_lib) 13076#endif 13077 13078 13079 // solve 13080 13081 movq ARG8, %r10 // E 13082 movq ARG9, %r11 // lde 13083 sall $3, %r11d 13084 movq ARG10, %r12 // inv_diag_E 13085 movq ARG12, %r13 // n1 13086 13087#if MACRO_LEVEL>=1 13088 INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB 13089#else 13090 CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib) 13091#endif 13092 13093 13094 // store 13095 13096 movq ARG6, %r10 // D 13097 movq ARG7, %r11 // ldd 13098 sall $3, %r11d 13099 movq ARG11, %r12 // m1 13100 movq ARG12, %r13 // n1 13101 13102#if MACRO_LEVEL>=1 13103 INNER_STORE_4X4_VS_LIB 13104#else 13105 CALL(inner_store_4x4_vs_lib) 13106#endif 13107 13108 13109 EPILOGUE 13110 13111 ret 13112 13113 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc) 13114 13115 13116 13117 13118 13119// 1 2 3 4 5 6 7 8 9 10 13120// void kernel_dtrsm_nt_rl_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E); 13121 13122 .p2align 4,,15 13123 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib4c44c) 13124 13125 PROLOGUE 13126 13127 // zero accumulation registers 13128 13129 ZERO_ACC 13130 13131 13132 // call inner dgemm kernel nt 13133 13134 movq ARG1, %r10 // kmax 13135 movq ARG2, %r11 // A 13136 movq ARG3, %r12 // B 13137 movq ARG4, %r13 // ldb 13138 sall $3, %r13d 13139 13140#if MACRO_LEVEL>=2 13141 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13142#else 13143 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13144#endif 13145 13146 13147 // call inner blender_loader nn 13148 13149 movq ARG5, %r10 // beta 13150 movq ARG6, %r11 // C 13151 13152#if MACRO_LEVEL>=1 13153 INNER_SCALE_M1B_4X4_LIB4 13154#else 13155 CALL(inner_scale_m1b_4x4_lib4) 13156#endif 13157 13158 13159 // solve 13160 13161 movq ARG8, %r10 // E 13162 movq ARG9, %r11 // lde 13163 sall $3, %r11d 13164 movq ARG10, %r12 // inv_diag_E 13165 13166#if MACRO_LEVEL>=1 13167 INNER_EDGE_DTRSM_RLT_INV_4X4_LIB 13168#else 13169 CALL(inner_edge_dtrsm_rlt_inv_4x4_lib) 13170#endif 13171 13172 13173 
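	// D is panel-major (lib4) in this variant, so the store below only needs the D pointer (no ldd, no byte scaling)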
// store 13174 13175 movq ARG7, %r10 // D 13176 13177#if MACRO_LEVEL>=1 13178 INNER_STORE_4X4_LIB4 13179#else 13180 CALL(inner_store_4x4_lib4) 13181#endif 13182 13183 13184 EPILOGUE 13185 13186 ret 13187 13188 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib4c44c) 13189 13190 13191 13192 13193 13194// 1 2 3 4 5 6 7 8 9 10 11 12 13195// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1); 13196 13197 .p2align 4,,15 13198 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c) 13199 13200 PROLOGUE 13201 13202 // zero accumulation registers 13203 13204 ZERO_ACC 13205 13206 13207 // call inner dgemm kernel nt 13208 13209 movq ARG1, %r10 // kmax 13210 movq ARG2, %r11 // A 13211 movq ARG3, %r12 // B 13212 movq ARG4, %r13 // ldb 13213 sall $3, %r13d 13214 13215 movq ARG12, %r14 // n1 13216 cmpl $1, %r14d 13217 jg 100f 13218 13219#if MACRO_LEVEL>=2 13220 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 13221#else 13222 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 13223#endif 13224 13225 jmp 103f 13226 13227100: 13228 13229 movq ARG12, %r14 // n1 13230 cmpl $2, %r14d 13231 jg 101f 13232 13233#if MACRO_LEVEL>=2 13234 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 13235#else 13236 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 13237#endif 13238 13239 jmp 103f 13240 13241101: 13242 13243 movq ARG12, %r14 // n1 13244 cmpl $3, %r14d 13245 jg 102f 13246 13247#if MACRO_LEVEL>=2 13248 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 13249#else 13250 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 13251#endif 13252 13253 jmp 103f 13254 13255102: 13256 13257#if MACRO_LEVEL>=2 13258 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13259#else 13260 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13261#endif 13262 13263103: 13264 13265 13266 // call inner blender_loader nn 13267 13268 movq ARG5, %r10 // beta 13269 movq ARG6, %r11 // C 13270 13271#if MACRO_LEVEL>=1 13272 INNER_SCALE_M1B_4X4_LIB4 13273#else 13274 CALL(inner_scale_m1b_4x4_lib4) 13275#endif 13276 13277 13278 // solve 13279 13280 movq ARG8, %r10 // E 13281 movq ARG9, %r11 // lde 13282 sall $3, %r11d 13283 movq ARG10, %r12 // inv_diag_E 13284 movq ARG12, %r13 // n1 13285 13286#if MACRO_LEVEL>=1 13287 INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB 13288#else 13289 CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib) 13290#endif 13291 13292 13293 // store 13294 13295 movq ARG7, %r10 // D 13296 movq ARG11, %r11 // m1 13297 movq ARG12, %r12 // n1 13298 13299#if MACRO_LEVEL>=1 13300 INNER_STORE_4X4_VS_LIB4 13301#else 13302 CALL(inner_store_4x4_vs_lib4) 13303#endif 13304 13305 13306 EPILOGUE 13307 13308 ret 13309 13310 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4c44c) 13311 13312 13313 13314 13315 13316// 1 2 3 4 5 6 7 8 9 10 11 12 13317// void kernel_dtrsm_nt_rl_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E); 13318 13319 .p2align 4,,15 13320 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib4cccc) 13321 13322 PROLOGUE 13323 13324 // zero accumulation registers 13325 13326 ZERO_ACC 13327 13328 13329 // call inner dgemm kernel nt 13330 13331 movq ARG1, %r10 13332 movq ARG2, %r11 13333 movq ARG3, %r12 13334 movq ARG4, %r13 // ldb 13335 sall $3, %r13d 13336 13337#if MACRO_LEVEL>=2 13338 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13339#else 13340 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13341#endif 13342 13343 13344 // call inner blender_loader nn 13345 13346 movq ARG5, %r10 // beta 13347 movq ARG6, %r11 // C 13348 movq ARG7, %r12 // ldc 13349 sall $3, %r12d 13350 13351#if MACRO_LEVEL>=1 13352 
INNER_SCALE_M1B_4X4_LIB 13353#else 13354 CALL(inner_scale_m1b_4x4_lib) 13355#endif 13356 13357 13358 // solve 13359 13360 movq ARG10, %r10 // E 13361 movq ARG11, %r11 // lde 13362 sall $3, %r11d 13363 movq ARG12, %r12 // inv_diag_E 13364 13365#if MACRO_LEVEL>=1 13366 INNER_EDGE_DTRSM_RLT_INV_4X4_LIB 13367#else 13368 CALL(inner_edge_dtrsm_rlt_inv_4x4_lib) 13369#endif 13370 13371 13372 // store 13373 13374 movq ARG8, %r10 // D 13375 movq ARG9, %r11 // ldd 13376 sall $3, %r11d 13377 13378#if MACRO_LEVEL>=1 13379 INNER_STORE_4X4_LIB 13380#else 13381 CALL(inner_store_4x4_lib) 13382#endif 13383 13384 13385 EPILOGUE 13386 13387 ret 13388 13389 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib4cccc) 13390 13391 13392 13393 13394 13395// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 13396// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1); 13397 13398 .p2align 4,,15 13399 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc) 13400 13401 PROLOGUE 13402 13403 // zero accumulation registers 13404 13405 ZERO_ACC 13406 13407 13408 // call inner dgemm kernel nt 13409 13410 movq ARG1, %r10 13411 movq ARG2, %r11 13412 movq ARG3, %r12 13413 movq ARG4, %r13 // ldb 13414 sall $3, %r13d 13415 13416 13417 movq ARG14, %r14 // n1 13418 cmpl $1, %r14d 13419 jg 100f 13420 13421#if MACRO_LEVEL>=2 13422 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 13423#else 13424 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 13425#endif 13426 13427 jmp 103f 13428 13429100: 13430 13431 movq ARG14, %r14 // n1 13432 cmpl $2, %r14d 13433 jg 101f 13434 13435#if MACRO_LEVEL>=2 13436 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 13437#else 13438 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 13439#endif 13440 13441 jmp 103f 13442 13443101: 13444 13445 movq ARG14, %r14 // n1 13446 cmpl $3, %r14d 13447 jg 102f 13448 13449#if MACRO_LEVEL>=2 13450 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 13451#else 13452 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 13453#endif 13454 13455 jmp 103f 13456 13457102: 13458 13459#if MACRO_LEVEL>=2 13460 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13461#else 13462 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13463#endif 13464 13465103: 13466 13467 13468 // call inner blender_loader nn 13469 13470 movq ARG5, %r10 // beta 13471 movq ARG6, %r11 // C 13472 movq ARG7, %r12 // ldc 13473 sall $3, %r12d 13474 movq ARG13, %r13 // m1 13475 movq ARG14, %r14 // n1 13476 13477#if MACRO_LEVEL>=1 13478 INNER_SCALE_M1B_4X4_VS_LIB 13479#else 13480 CALL(inner_scale_m1b_4x4_vs_lib) 13481#endif 13482 13483 13484 // solve 13485 13486 movq ARG10, %r10 // E 13487 movq ARG11, %r11 // lde 13488 sall $3, %r11d 13489 movq ARG12, %r12 // inv_diag_E 13490 movq ARG14, %r13 // n1 13491 13492#if MACRO_LEVEL>=1 13493 INNER_EDGE_DTRSM_RLT_INV_4X4_VS_LIB 13494#else 13495 CALL(inner_edge_dtrsm_rlt_inv_4x4_vs_lib) 13496#endif 13497 13498 13499 // store 13500 13501 movq ARG8, %r10 // D 13502 movq ARG9, %r11 // ldd 13503 sall $3, %r11d 13504 movq ARG13, %r12 // m1 13505 movq ARG14, %r13 // n1 13506 13507#if MACRO_LEVEL>=1 13508 INNER_STORE_4X4_VS_LIB 13509#else 13510 CALL(inner_store_4x4_vs_lib) 13511#endif 13512 13513 13514 EPILOGUE 13515 13516 ret 13517 13518 FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib4cccc) 13519 13520 13521 13522 13523 13524// 1 2 3 4 5 6 7 8 9 13525// void kernel_dtrsm_nt_rl_one_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E); 13526 13527 .p2align 4,,15 13528 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib44cc4) 13529 13530 PROLOGUE 13531 
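	// the _one_ trsm variants assume a unit diagonal in E, so no inv_diag_E argument is passed to this kernel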
13532 // zero accumulation registers 13533 13534 ZERO_ACC 13535 13536 13537 // call inner dgemm kernel nt 13538 13539 movq ARG1, %r10 13540 movq ARG2, %r11 13541 movq ARG3, %r12 13542 13543#if MACRO_LEVEL>=2 13544 INNER_KERNEL_DGEMM_NT_4X4_LIB4 13545#else 13546 CALL(inner_kernel_dgemm_nt_4x4_lib4) 13547#endif 13548 13549 13550 // call inner blender_loader nn 13551 13552 movq ARG4, %r10 // beta 13553 movq ARG5, %r11 // C 13554 movq ARG6, %r12 // ldc 13555 sall $3, %r12d 13556 13557#if MACRO_LEVEL>=1 13558 INNER_BLEND_SCALE_M1B_4X4_LIB 13559#else 13560 CALL(inner_blend_scale_m1b_4x4_lib) 13561#endif 13562 13563 13564 // solve 13565 13566 movq ARG9, %r10 // E 13567 13568#if MACRO_LEVEL>=1 13569 INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4 13570#else 13571 CALL(inner_edge_dtrsm_rlt_one_4x4_lib4) 13572#endif 13573 13574 13575 // store 13576 13577 movq ARG7, %r10 // D 13578 movq ARG8, %r11 // ldd 13579 sall $3, %r11d 13580 13581#if MACRO_LEVEL>=1 13582 INNER_STORE_4X4_LIB 13583#else 13584 CALL(inner_store_4x4_lib) 13585#endif 13586 13587 13588 EPILOGUE 13589 13590 ret 13591 13592 FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib44cc4) 13593 13594 13595 13596 13597 13598// 1 2 3 4 5 6 7 8 9 10 11 13599// void kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1); 13600 13601 .p2align 4,,15 13602 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4) 13603 13604 PROLOGUE 13605 13606 // zero accumulation registers 13607 13608 ZERO_ACC 13609 13610 13611 // call inner dgemm kernel nt 13612 13613 movq ARG1, %r10 13614 movq ARG2, %r11 13615 movq ARG3, %r12 13616 13617#if MACRO_LEVEL>=2 13618 INNER_KERNEL_DGEMM_NT_4X4_LIB4 13619#else 13620 CALL(inner_kernel_dgemm_nt_4x4_lib4) 13621#endif 13622 13623 13624 // call inner blender_loader nn 13625 13626 movq ARG4, %r10 // beta 13627 movq ARG5, %r11 // C 13628 movq ARG6, %r12 // ldc 13629 sall $3, %r12d 13630 movq ARG10, %r13 // m1 13631 movq ARG11, %r14 // n1 13632 13633#if MACRO_LEVEL>=1 13634 INNER_BLEND_SCALE_M1B_4X4_VS_LIB 13635#else 13636 CALL(inner_blend_scale_m1b_4x4_vs_lib) 13637#endif 13638 13639 13640 // solve 13641 13642 movq ARG9, %r10 // E 13643 13644#if MACRO_LEVEL>=1 13645 INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB4 13646#else 13647 CALL(inner_edge_dtrsm_rlt_one_4x4_lib4) 13648#endif 13649 13650 13651 // store 13652 13653 movq ARG7, %r10 // D 13654 movq ARG8, %r11 // ldd 13655 sall $3, %r11d 13656 movq ARG10, %r12 // m1 13657 movq ARG11, %r13 // n1 13658 13659#if MACRO_LEVEL>=1 13660 INNER_STORE_4X4_VS_LIB 13661#else 13662 CALL(inner_store_4x4_vs_lib) 13663#endif 13664 13665 13666 EPILOGUE 13667 13668 ret 13669 13670 FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4) 13671 13672 13673 13674 13675 13676 13677// 1 2 3 4 5 6 7 8 9 13678// void kernel_dtrsm_nt_rl_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde); 13679 13680 .p2align 4,,15 13681 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib4c44c) 13682 13683 PROLOGUE 13684 13685 // zero accumulation registers 13686 13687 ZERO_ACC 13688 13689 13690 // call inner dgemm kernel nt 13691 13692 movq ARG1, %r10 // kmax 13693 movq ARG2, %r11 // A 13694 movq ARG3, %r12 // B 13695 movq ARG4, %r13 // ldb 13696 sall $3, %r13d 13697 13698#if MACRO_LEVEL>=2 13699 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13700#else 13701 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13702#endif 13703 13704 13705 // call inner blender_loader nn 13706 13707 movq ARG5, %r10 // beta 13708 movq ARG6, %r11 // C 13709 13710#if 
MACRO_LEVEL>=1 13711 INNER_SCALE_M1B_4X4_LIB4 13712#else 13713 CALL(inner_scale_m1b_4x4_lib4) 13714#endif 13715 13716 13717 // solve 13718 13719 movq ARG8, %r10 // E 13720 movq ARG9, %r11 // lde 13721 sall $3, %r11d 13722 13723#if MACRO_LEVEL>=1 13724 INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB 13725#else 13726 CALL(inner_edge_dtrsm_rlt_one_4x4_lib) 13727#endif 13728 13729 13730 // store 13731 13732 movq ARG7, %r10 // D 13733 13734#if MACRO_LEVEL>=1 13735 INNER_STORE_4X4_LIB4 13736#else 13737 CALL(inner_store_4x4_lib4) 13738#endif 13739 13740 13741 EPILOGUE 13742 13743 ret 13744 13745 FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib4c44c) 13746 13747 13748 13749 13750 13751// 1 2 3 4 5 6 7 8 9 10 11 13752// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1); 13753 13754 .p2align 4,,15 13755 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c) 13756 13757 PROLOGUE 13758 13759 // zero accumulation registers 13760 13761 ZERO_ACC 13762 13763 13764 // call inner dgemm kernel nt 13765 13766 movq ARG1, %r10 // kmax 13767 movq ARG2, %r11 // A 13768 movq ARG3, %r12 // B 13769 movq ARG4, %r13 // ldb 13770 sall $3, %r13d 13771 13772 movq ARG11, %r14 // n1 13773 cmpl $1, %r14d 13774 jg 100f 13775 13776#if MACRO_LEVEL>=2 13777 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 13778#else 13779 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 13780#endif 13781 13782 jmp 103f 13783 13784100: 13785 13786 movq ARG11, %r14 // n1 13787 cmpl $2, %r14d 13788 jg 101f 13789 13790#if MACRO_LEVEL>=2 13791 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 13792#else 13793 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 13794#endif 13795 13796 jmp 103f 13797 13798101: 13799 13800 movq ARG11, %r14 // n1 13801 cmpl $3, %r14d 13802 jg 102f 13803 13804#if MACRO_LEVEL>=2 13805 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 13806#else 13807 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 13808#endif 13809 13810 jmp 103f 13811 13812102: 13813 13814#if MACRO_LEVEL>=2 13815 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13816#else 13817 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13818#endif 13819 13820103: 13821 13822 13823 // call inner blender_loader nn 13824 13825 movq ARG5, %r10 // beta 13826 movq ARG6, %r11 // C 13827 13828#if MACRO_LEVEL>=1 13829 INNER_SCALE_M1B_4X4_LIB4 13830#else 13831 CALL(inner_scale_m1b_4x4_lib4) 13832#endif 13833 13834 13835 // solve 13836 13837 movq ARG8, %r10 // E 13838 movq ARG9, %r11 // lde 13839 sall $3, %r11d 13840 movq ARG11, %r12 // n1 13841 13842#if MACRO_LEVEL>=1 13843 INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB 13844#else 13845 CALL(inner_edge_dtrsm_rlt_one_4x4_vs_lib) 13846#endif 13847 13848 13849 // store 13850 13851 movq ARG7, %r10 // D 13852 movq ARG10, %r11 // m1 13853 movq ARG11, %r12 // n1 13854 13855#if MACRO_LEVEL>=1 13856 INNER_STORE_4X4_VS_LIB4 13857#else 13858 CALL(inner_store_4x4_vs_lib4) 13859#endif 13860 13861 13862 EPILOGUE 13863 13864 ret 13865 13866 FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib4c44c) 13867 13868 13869 13870 13871 13872// 1 2 3 4 5 6 7 8 9 10 11 13873// void kernel_dtrsm_nt_rl_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde); 13874 13875 .p2align 4,,15 13876 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib4cccc) 13877 13878 PROLOGUE 13879 13880 // zero accumulation registers 13881 13882 ZERO_ACC 13883 13884 13885 // call inner dgemm kernel nt 13886 13887 movq ARG1, %r10 13888 movq ARG2, %r11 13889 movq ARG3, %r12 13890 movq ARG4, %r13 // ldb 13891 sall $3, %r13d 13892 13893#if MACRO_LEVEL>=2 13894 
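	// B is column-major here: ldb was converted to bytes above (sall $3) and is carried in r13 for the lib4c inner kernel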
INNER_KERNEL_DGEMM_NT_4X4_LIB4C 13895#else 13896 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 13897#endif 13898 13899 13900 // call inner blender_loader nn 13901 13902 movq ARG5, %r10 // beta 13903 movq ARG6, %r11 // C 13904 movq ARG7, %r12 // ldc 13905 sall $3, %r12d 13906 13907#if MACRO_LEVEL>=1 13908 INNER_SCALE_M1B_4X4_LIB 13909#else 13910 CALL(inner_scale_m1b_4x4_lib) 13911#endif 13912 13913 13914 // solve 13915 13916 movq ARG10, %r10 // E 13917 movq ARG11, %r11 // lde 13918 sall $3, %r11d 13919 13920#if MACRO_LEVEL>=1 13921 INNER_EDGE_DTRSM_RLT_ONE_4X4_LIB 13922#else 13923 CALL(inner_edge_dtrsm_rlt_one_4x4_lib) 13924#endif 13925 13926 13927 // store 13928 13929 movq ARG8, %r10 // D 13930 movq ARG9, %r11 // ldd 13931 sall $3, %r11d 13932 13933#if MACRO_LEVEL>=1 13934 INNER_STORE_4X4_LIB 13935#else 13936 CALL(inner_store_4x4_lib) 13937#endif 13938 13939 13940 EPILOGUE 13941 13942 ret 13943 13944 FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib4cccc) 13945 13946 13947 13948 13949 13950// 1 2 3 4 5 6 7 8 9 10 11 12 13 13951// void kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1); 13952 13953 .p2align 4,,15 13954 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc) 13955 13956 PROLOGUE 13957 13958 // zero accumulation registers 13959 13960 ZERO_ACC 13961 13962 13963 // call inner dgemm kernel nt 13964 13965 movq ARG1, %r10 13966 movq ARG2, %r11 13967 movq ARG3, %r12 13968 movq ARG4, %r13 // ldb 13969 sall $3, %r13d 13970 13971 13972 movq ARG13, %r14 // n1 13973 cmpl $1, %r14d 13974 jg 100f 13975 13976#if MACRO_LEVEL>=2 13977 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 13978#else 13979 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 13980#endif 13981 13982 jmp 103f 13983 13984100: 13985 13986 movq ARG13, %r14 // n1 13987 cmpl $2, %r14d 13988 jg 101f 13989 13990#if MACRO_LEVEL>=2 13991 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 13992#else 13993 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 13994#endif 13995 13996 jmp 103f 13997 13998101: 13999 14000 movq ARG13, %r14 // n1 14001 cmpl $3, %r14d 14002 jg 102f 14003 14004#if MACRO_LEVEL>=2 14005 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 14006#else 14007 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 14008#endif 14009 14010 jmp 103f 14011 14012102: 14013 14014#if MACRO_LEVEL>=2 14015 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 14016#else 14017 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 14018#endif 14019 14020103: 14021 14022 14023 // call inner blender_loader nn 14024 14025 movq ARG5, %r10 // beta 14026 movq ARG6, %r11 // C 14027 movq ARG7, %r12 // ldc 14028 sall $3, %r12d 14029 movq ARG12, %r13 // m1 14030 movq ARG13, %r14 // n1 14031 14032#if MACRO_LEVEL>=1 14033 INNER_SCALE_M1B_4X4_VS_LIB 14034#else 14035 CALL(inner_scale_m1b_4x4_vs_lib) 14036#endif 14037 14038 14039 // solve 14040 14041 movq ARG10, %r10 // E 14042 movq ARG11, %r11 // lde 14043 sall $3, %r11d 14044 movq ARG13, %r12 // n1 14045 14046#if MACRO_LEVEL>=1 14047 INNER_EDGE_DTRSM_RLT_ONE_4X4_VS_LIB 14048#else 14049 CALL(inner_edge_dtrsm_rlt_one_4x4_vs_lib) 14050#endif 14051 14052 14053 // store 14054 14055 movq ARG8, %r10 // D 14056 movq ARG9, %r11 // ldd 14057 sall $3, %r11d 14058 movq ARG12, %r12 // m1 14059 movq ARG13, %r13 // n1 14060 14061#if MACRO_LEVEL>=1 14062 INNER_STORE_4X4_VS_LIB 14063#else 14064 CALL(inner_store_4x4_vs_lib) 14065#endif 14066 14067 14068 EPILOGUE 14069 14070 ret 14071 14072 FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib4cccc) 14073 14074 14075 14076 14077 14078// 1 2 3 4 5 6 7 8 9 10 14079// void 
kernel_dtrsm_nn_ru_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E); 14080 14081 .p2align 4,,15 14082 GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_lib4c44c) 14083 14084 PROLOGUE 14085 14086 // zero accumulation registers 14087 14088 ZERO_ACC 14089 14090 14091 // call inner dgemm kernel nt 14092 14093 movq ARG1, %r10 // kmax 14094 movq ARG2, %r11 // A 14095 movq ARG3, %r12 // B 14096 movq ARG4, %r13 // ldb 14097 sall $3, %r13d 14098 14099#if MACRO_LEVEL>=2 14100 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14101#else 14102 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14103#endif 14104 14105 14106 // call inner blender_loader nn 14107 14108 movq ARG5, %r10 // beta 14109 movq ARG6, %r11 // C 14110 14111#if MACRO_LEVEL>=1 14112 INNER_SCALE_M1B_4X4_LIB4 14113#else 14114 CALL(inner_scale_m1b_4x4_lib4) 14115#endif 14116 14117 14118 // solve 14119 14120 movq ARG8, %r10 // E 14121 movq ARG9, %r11 // lde 14122 sall $3, %r11d 14123 movq ARG10, %r12 // inv_diag_E 14124 14125#if MACRO_LEVEL>=1 14126 INNER_EDGE_DTRSM_RUN_INV_4X4_LIB 14127#else 14128 CALL(inner_edge_dtrsm_run_inv_4x4_lib) 14129#endif 14130 14131 14132 // store 14133 14134 movq ARG7, %r10 // D 14135 14136#if MACRO_LEVEL>=1 14137 INNER_STORE_4X4_LIB4 14138#else 14139 CALL(inner_store_4x4_lib4) 14140#endif 14141 14142 14143 EPILOGUE 14144 14145 ret 14146 14147 FUN_END(kernel_dtrsm_nn_ru_inv_4x4_lib4c44c) 14148 14149 14150 14151 14152 14153// 1 2 3 4 5 6 7 8 9 10 11 12 14154// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1); 14155 14156 .p2align 4,,15 14157 GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c) 14158 14159 PROLOGUE 14160 14161 // zero accumulation registers 14162 14163 ZERO_ACC 14164 14165 14166 // call inner dgemm kernel nt 14167 14168 movq ARG1, %r10 // kmax 14169 movq ARG2, %r11 // A 14170 movq ARG3, %r12 // B 14171 movq ARG4, %r13 // ldb 14172 sall $3, %r13d 14173 14174 movq ARG12, %r14 // n1 14175 cmpl $1, %r14d 14176 jg 100f 14177 14178#if MACRO_LEVEL>=2 14179 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 14180#else 14181 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 14182#endif 14183 14184 jmp 103f 14185 14186100: 14187 14188 movq ARG12, %r14 // n1 14189 cmpl $2, %r14d 14190 jg 101f 14191 14192#if MACRO_LEVEL>=2 14193 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 14194#else 14195 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 14196#endif 14197 14198 jmp 103f 14199 14200101: 14201 14202 movq ARG12, %r14 // n1 14203 cmpl $3, %r14d 14204 jg 102f 14205 14206#if MACRO_LEVEL>=2 14207 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 14208#else 14209 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 14210#endif 14211 14212 jmp 103f 14213 14214102: 14215 14216#if MACRO_LEVEL>=2 14217 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14218#else 14219 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14220#endif 14221 14222103: 14223 14224 14225 // call inner blender_loader nn 14226 14227 movq ARG5, %r10 // beta 14228 movq ARG6, %r11 // C 14229 14230#if MACRO_LEVEL>=1 14231 INNER_SCALE_M1B_4X4_LIB4 14232#else 14233 CALL(inner_scale_m1b_4x4_lib4) 14234#endif 14235 14236 14237 // solve 14238 14239 movq ARG8, %r10 // E 14240 movq ARG9, %r11 // lde 14241 sall $3, %r11d 14242 movq ARG10, %r12 // inv_diag_E 14243 movq ARG12, %r13 // n1 14244 14245#if MACRO_LEVEL>=1 14246 INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB 14247#else 14248 CALL(inner_edge_dtrsm_run_inv_4x4_vs_lib) 14249#endif 14250 14251 14252 // store 14253 14254 movq ARG7, %r10 // D 14255 movq ARG11, 
%r11 // m1 14256 movq ARG12, %r12 // n1 14257 14258#if MACRO_LEVEL>=1 14259 INNER_STORE_4X4_VS_LIB4 14260#else 14261 CALL(inner_store_4x4_vs_lib4) 14262#endif 14263 14264 14265 EPILOGUE 14266 14267 ret 14268 14269 FUN_END(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4c44c) 14270 14271 14272 14273 14274 14275// 1 2 3 4 5 6 7 8 9 10 11 12 14276// void kernel_dtrsm_nn_ru_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E); 14277 14278 .p2align 4,,15 14279 GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_lib4cccc) 14280 14281 PROLOGUE 14282 14283 // zero accumulation registers 14284 14285 ZERO_ACC 14286 14287 14288 // call inner dgemm kernel nt 14289 14290 movq ARG1, %r10 // kmax 14291 movq ARG2, %r11 // A 14292 movq ARG3, %r12 // B 14293 movq ARG4, %r13 // ldb 14294 sall $3, %r13d 14295 14296#if MACRO_LEVEL>=2 14297 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14298#else 14299 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14300#endif 14301 14302 14303 // call inner blender_loader nn 14304 14305 movq ARG5, %r10 // beta 14306 movq ARG6, %r11 // C 14307 movq ARG7, %r12 // ldc 14308 sall $3, %r12d 14309 14310#if MACRO_LEVEL>=1 14311 INNER_SCALE_M1B_4X4_LIB 14312#else 14313 CALL(inner_scale_m1b_4x4_lib) 14314#endif 14315 14316 14317 // solve 14318 14319 movq ARG10, %r10 // E 14320 movq ARG11, %r11 // lde 14321 sall $3, %r11d 14322 movq ARG12, %r12 // inv_diag_E 14323 14324#if MACRO_LEVEL>=1 14325 INNER_EDGE_DTRSM_RUN_INV_4X4_LIB 14326#else 14327 CALL(inner_edge_dtrsm_run_inv_4x4_lib) 14328#endif 14329 14330 14331 // store 14332 14333 movq ARG8, %r10 // D 14334 movq ARG9, %r11 // ldd 14335 sall $3, %r11d 14336 14337#if MACRO_LEVEL>=1 14338 INNER_STORE_4X4_LIB 14339#else 14340 CALL(inner_store_4x4_lib) 14341#endif 14342 14343 14344 EPILOGUE 14345 14346 ret 14347 14348 FUN_END(kernel_dtrsm_nn_ru_inv_4x4_lib4cccc) 14349 14350 14351 14352 14353 14354// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 14355// void kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1); 14356 14357 .p2align 4,,15 14358 GLOB_FUN_START(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc) 14359 14360 PROLOGUE 14361 14362 // zero accumulation registers 14363 14364 ZERO_ACC 14365 14366 14367 // call inner dgemm kernel nt 14368 14369 movq ARG1, %r10 // kmax 14370 movq ARG2, %r11 // A 14371 movq ARG3, %r12 // B 14372 movq ARG4, %r13 // ldb 14373 sall $3, %r13d 14374 14375 movq ARG14, %r14 // n1 14376 cmpl $1, %r14d 14377 jg 100f 14378 14379#if MACRO_LEVEL>=2 14380 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 14381#else 14382 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 14383#endif 14384 14385 jmp 103f 14386 14387100: 14388 14389 movq ARG14, %r14 // n1 14390 cmpl $2, %r14d 14391 jg 101f 14392 14393#if MACRO_LEVEL>=2 14394 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 14395#else 14396 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 14397#endif 14398 14399 jmp 103f 14400 14401101: 14402 14403 movq ARG14, %r14 // n1 14404 cmpl $3, %r14d 14405 jg 102f 14406 14407#if MACRO_LEVEL>=2 14408 INNER_KERNEL_DGEMM_NN_4X3_LIB4C 14409#else 14410 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 14411#endif 14412 14413 jmp 103f 14414 14415102: 14416 14417#if MACRO_LEVEL>=2 14418 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14419#else 14420 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14421#endif 14422 14423103: 14424 14425 14426 // call inner blender_loader nn 14427 14428 movq ARG5, %r10 // beta 14429 movq ARG6, %r11 // C 14430 movq ARG7, %r12 // ldc 14431 sall 
$3, %r12d 14432 movq ARG13, %r13 // m1 14433 movq ARG14, %r14 // n1 14434 14435#if MACRO_LEVEL>=1 14436 INNER_SCALE_M1B_4X4_VS_LIB 14437#else 14438 CALL(inner_scale_m1b_4x4_vs_lib) 14439#endif 14440 14441 14442 // solve 14443 14444 movq ARG10, %r10 // E 14445 movq ARG11, %r11 // lde 14446 sall $3, %r11d 14447 movq ARG12, %r12 // inv_diag_E 14448 movq ARG14, %r13 // n1 14449 14450#if MACRO_LEVEL>=1 14451 INNER_EDGE_DTRSM_RUN_INV_4X4_VS_LIB 14452#else 14453 CALL(inner_edge_dtrsm_run_inv_4x4_vs_lib) 14454#endif 14455 14456 14457 // store 14458 14459 movq ARG8, %r10 // D 14460 movq ARG9, %r11 // ldd 14461 sall $3, %r11d 14462 movq ARG13, %r12 // m1 14463 movq ARG14, %r13 // n1 14464 14465#if MACRO_LEVEL>=1 14466 INNER_STORE_4X4_VS_LIB 14467#else 14468 CALL(inner_store_4x4_vs_lib) 14469#endif 14470 14471 14472 EPILOGUE 14473 14474 ret 14475 14476 FUN_END(kernel_dtrsm_nn_ru_inv_4x4_vs_lib4cccc) 14477 14478 14479 14480 14481 14482// 1 2 3 4 5 6 7 8 9 14483// void kernel_dtrsm_nn_ru_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde); 14484 14485 .p2align 4,,15 14486 GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_lib4c44c) 14487 14488 PROLOGUE 14489 14490 // zero accumulation registers 14491 14492 ZERO_ACC 14493 14494 14495 // call inner dgemm kernel nt 14496 14497 movq ARG1, %r10 // kmax 14498 movq ARG2, %r11 // A 14499 movq ARG3, %r12 // B 14500 movq ARG4, %r13 // ldb 14501 sall $3, %r13d 14502 14503#if MACRO_LEVEL>=2 14504 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14505#else 14506 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14507#endif 14508 14509 14510 // call inner blender_loader nn 14511 14512 movq ARG5, %r10 // beta 14513 movq ARG6, %r11 // C 14514 14515#if MACRO_LEVEL>=1 14516 INNER_SCALE_M1B_4X4_LIB4 14517#else 14518 CALL(inner_scale_m1b_4x4_lib4) 14519#endif 14520 14521 14522 // solve 14523 14524 movq ARG8, %r10 // E 14525 movq ARG9, %r11 // lde 14526 sall $3, %r11d 14527 14528#if MACRO_LEVEL>=1 14529 INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB 14530#else 14531 CALL(inner_edge_dtrsm_run_one_4x4_lib) 14532#endif 14533 14534 14535 // store 14536 14537 movq ARG7, %r10 // D 14538 14539#if MACRO_LEVEL>=1 14540 INNER_STORE_4X4_LIB4 14541#else 14542 CALL(inner_store_4x4_lib4) 14543#endif 14544 14545 14546 EPILOGUE 14547 14548 ret 14549 14550 FUN_END(kernel_dtrsm_nn_ru_one_4x4_lib4c44c) 14551 14552 14553 14554 14555 14556// 1 2 3 4 5 6 7 8 9 10 11 14557// void kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1); 14558 14559 .p2align 4,,15 14560 GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c) 14561 14562 PROLOGUE 14563 14564 // zero accumulation registers 14565 14566 ZERO_ACC 14567 14568 14569 // call inner dgemm kernel nt 14570 14571 movq ARG1, %r10 // kmax 14572 movq ARG2, %r11 // A 14573 movq ARG3, %r12 // B 14574 movq ARG4, %r13 // ldb 14575 sall $3, %r13d 14576 14577 movq ARG11, %r14 // n1 14578 cmpl $1, %r14d 14579 jg 100f 14580 14581#if MACRO_LEVEL>=2 14582 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 14583#else 14584 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 14585#endif 14586 14587 jmp 103f 14588 14589100: 14590 14591 movq ARG11, %r14 // n1 14592 cmpl $2, %r14d 14593 jg 101f 14594 14595#if MACRO_LEVEL>=2 14596 INNER_KERNEL_DGEMM_NN_4X2_LIB4C 14597#else 14598 CALL(inner_kernel_dgemm_nn_4x2_lib4c) 14599#endif 14600 14601 jmp 103f 14602 14603101: 14604 14605 movq ARG11, %r14 // n1 14606 cmpl $3, %r14d 14607 jg 102f 14608 14609#if MACRO_LEVEL>=2 14610 
INNER_KERNEL_DGEMM_NN_4X3_LIB4C 14611#else 14612 CALL(inner_kernel_dgemm_nn_4x3_lib4c) 14613#endif 14614 14615 jmp 103f 14616 14617102: 14618 14619#if MACRO_LEVEL>=2 14620 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14621#else 14622 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14623#endif 14624 14625103: 14626 14627 14628 // call inner blender_loader nn 14629 14630 movq ARG5, %r10 // beta 14631 movq ARG6, %r11 // C 14632 14633#if MACRO_LEVEL>=1 14634 INNER_SCALE_M1B_4X4_LIB4 14635#else 14636 CALL(inner_scale_m1b_4x4_lib4) 14637#endif 14638 14639 14640 // solve 14641 14642 movq ARG8, %r10 // E 14643 movq ARG9, %r11 // lde 14644 sall $3, %r11d 14645 movq ARG11, %r12 // n1 14646 14647#if MACRO_LEVEL>=1 14648 INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB 14649#else 14650 CALL(inner_edge_dtrsm_run_one_4x4_vs_lib) 14651#endif 14652 14653 14654 // store 14655 14656 movq ARG7, %r10 // D 14657 movq ARG10, %r11 // m1 14658 movq ARG11, %r12 // n1 14659 14660#if MACRO_LEVEL>=1 14661 INNER_STORE_4X4_VS_LIB4 14662#else 14663 CALL(inner_store_4x4_vs_lib4) 14664#endif 14665 14666 14667 EPILOGUE 14668 14669 ret 14670 14671 FUN_END(kernel_dtrsm_nn_ru_one_4x4_vs_lib4c44c) 14672 14673 14674 14675 14676 14677// 1 2 3 4 5 6 7 8 9 10 11 14678// void kernel_dtrsm_nn_ru_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde); 14679 14680 .p2align 4,,15 14681 GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_lib4cccc) 14682 14683 PROLOGUE 14684 14685 // zero accumulation registers 14686 14687 ZERO_ACC 14688 14689 14690 // call inner dgemm kernel nt 14691 14692 movq ARG1, %r10 // kmax 14693 movq ARG2, %r11 // A 14694 movq ARG3, %r12 // B 14695 movq ARG4, %r13 // ldb 14696 sall $3, %r13d 14697 14698#if MACRO_LEVEL>=2 14699 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 14700#else 14701 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 14702#endif 14703 14704 14705 // call inner blender_loader nn 14706 14707 movq ARG5, %r10 // beta 14708 movq ARG6, %r11 // C 14709 movq ARG7, %r12 // ldc 14710 sall $3, %r12d 14711 14712#if MACRO_LEVEL>=1 14713 INNER_SCALE_M1B_4X4_LIB 14714#else 14715 CALL(inner_scale_m1b_4x4_lib) 14716#endif 14717 14718 14719 // solve 14720 14721 movq ARG10, %r10 // E 14722 movq ARG11, %r11 // lde 14723 sall $3, %r11d 14724 14725#if MACRO_LEVEL>=1 14726 INNER_EDGE_DTRSM_RUN_ONE_4X4_LIB 14727#else 14728 CALL(inner_edge_dtrsm_run_one_4x4_lib) 14729#endif 14730 14731 14732 // store 14733 14734 movq ARG8, %r10 // D 14735 movq ARG9, %r11 // ldd 14736 sall $3, %r11d 14737 14738#if MACRO_LEVEL>=1 14739 INNER_STORE_4X4_LIB 14740#else 14741 CALL(inner_store_4x4_lib) 14742#endif 14743 14744 14745 EPILOGUE 14746 14747 ret 14748 14749 FUN_END(kernel_dtrsm_nn_ru_one_4x4_lib4cccc) 14750 14751 14752 14753 14754 14755// 1 2 3 4 5 6 7 8 9 10 11 12 13 14756// void kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1); 14757 14758 .p2align 4,,15 14759 GLOB_FUN_START(kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc) 14760 14761 PROLOGUE 14762 14763 // zero accumulation registers 14764 14765 ZERO_ACC 14766 14767 14768 // call inner dgemm kernel nt 14769 14770 movq ARG1, %r10 // kmax 14771 movq ARG2, %r11 // A 14772 movq ARG3, %r12 // B 14773 movq ARG4, %r13 // ldb 14774 sall $3, %r13d 14775 14776 movq ARG13, %r14 // n1 14777 cmpl $1, %r14d 14778 jg 100f 14779 14780#if MACRO_LEVEL>=2 14781 INNER_KERNEL_DGEMM_NN_4X1_LIB4C 14782#else 14783 CALL(inner_kernel_dgemm_nn_4x1_lib4c) 14784#endif 14785 14786 jmp 103f 14787 
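
	// The n1-based dispatch in the *_vs_* kernels picks the narrowest inner dgemm
	// routine that still covers the requested number of columns, falling through to
	// the full 4x4 routine once n1 reaches 4.  A C-like sketch of the logic
	// (illustrative only; the inner routines actually take their arguments in
	// r10-r13, not as a C call):
	//
	//   if(n1==1)      inner_kernel_dgemm_nn_4x1_lib4c(k, A, B, ldb);
	//   else if(n1==2) inner_kernel_dgemm_nn_4x2_lib4c(k, A, B, ldb);
	//   else if(n1==3) inner_kernel_dgemm_nn_4x3_lib4c(k, A, B, ldb);
	//   else           inner_kernel_dgemm_nn_4x4_lib4c(k, A, B, ldb);
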
100:

	movq	ARG13, %r14 // n1
	cmpl	$2, %r14d
	jg		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp		103f

101:

	movq	ARG13, %r14 // n1
	cmpl	$3, %r14d
	jg		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:


	// call inner blender_loader nn

	movq	ARG5, %r10 // beta
	movq	ARG6, %r11 // C
	movq	ARG7, %r12 // ldc
	sall	$3, %r12d
	movq	ARG12, %r13 // m1
	movq	ARG13, %r14 // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_VS_LIB
#else
	CALL(inner_scale_m1b_4x4_vs_lib)
#endif


	// solve

	movq	ARG10, %r10 // E
	movq	ARG11, %r11 // lde
	sall	$3, %r11d
	movq	ARG13, %r12 // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RUN_ONE_4X4_VS_LIB
#else
	CALL(inner_edge_dtrsm_run_one_4x4_vs_lib)
#endif


	// store

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d
	movq	ARG12, %r12 // m1
	movq	ARG13, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nn_ru_one_4x4_vs_lib4cccc)




// 1 2 3 4 5 6 7 8 9 10
// void kernel_dtrsm_nt_ru_inv_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib44cc4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // kmax
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NT_4X4_LIB4
#else
	CALL(inner_kernel_dgemm_nt_4x4_lib4)
#endif


	// call inner blender_loader nn

	movq	ARG4, %r10 // beta
	movq	ARG5, %r11 // C
	movq	ARG6, %r12 // ldc
	sall	$3, %r12d

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_M1B_4X4_LIB
#else
	CALL(inner_blend_scale_m1b_4x4_lib)
#endif


	// solve

	movq	ARG9, %r10 // E
	movq	ARG10, %r11 // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RUT_INV_4X4_LIB4
#else
	CALL(inner_edge_dtrsm_rut_inv_4x4_lib4)
#endif


	// store

	movq	ARG7, %r10 // D
	movq	ARG8, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib44cc4)




// 1 2 3 4 5 6 7 8 9 10 11 12
// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, double *inv_diag_E, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4)

	PROLOGUE

	// zero
accumulation registers 14966 14967 ZERO_ACC 14968 14969 14970 // call inner dgemm kernel nt 14971 14972 movq ARG1, %r10 // kmax 14973 movq ARG2, %r11 // A 14974 movq ARG3, %r12 // B 14975 14976#if MACRO_LEVEL>=2 14977 INNER_KERNEL_DGEMM_NT_4X4_LIB4 14978#else 14979 CALL(inner_kernel_dgemm_nt_4x4_lib4) 14980#endif 14981 14982 14983 // call inner blender_loader nn 14984 14985 movq ARG4, %r10 // beta 14986 movq ARG5, %r11 // C 14987 movq ARG6, %r12 // ldc 14988 sall $3, %r12d 14989 movq ARG11, %r13 // m1 14990 movq ARG12, %r14 // n1 14991 14992#if MACRO_LEVEL>=1 14993 INNER_BLEND_SCALE_M1B_4X4_VS_LIB 14994#else 14995 CALL(inner_blend_scale_m1b_4x4_vs_lib) 14996#endif 14997 14998 14999 // solve 15000 15001 movq ARG9, %r10 // E 15002 movq ARG10, %r11 // inv_diag_E 15003 movq ARG12, %r12 // n1 15004 15005#if MACRO_LEVEL>=1 15006 INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB4 15007#else 15008 CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib4) 15009#endif 15010 15011 15012 // store 15013 15014 movq ARG7, %r10 // D 15015 movq ARG8, %r11 // ldd 15016 sall $3, %r11d 15017 movq ARG11, %r12 // m1 15018 movq ARG12, %r13 // n1 15019 15020#if MACRO_LEVEL>=1 15021 INNER_STORE_4X4_VS_LIB 15022#else 15023 CALL(inner_store_4x4_vs_lib) 15024#endif 15025 15026 15027 EPILOGUE 15028 15029 ret 15030 15031 FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib44cc4) 15032 15033 15034 15035 15036 15037// 1 2 3 4 5 6 7 8 9 10 15038// void kernel_dtrsm_nt_ru_inv_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E); 15039 15040 .p2align 4,,15 15041 GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib4c44c) 15042 15043 PROLOGUE 15044 15045 // zero accumulation registers 15046 15047 ZERO_ACC 15048 15049 15050 // call inner dgemm kernel nt 15051 15052 movq ARG1, %r10 // kmax 15053 movq ARG2, %r11 // A 15054 movq ARG3, %r12 // B 15055 movq ARG4, %r13 // ldb 15056 sall $3, %r13d 15057 15058#if MACRO_LEVEL>=2 15059 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15060#else 15061 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15062#endif 15063 15064 15065 // call inner blender_loader nn 15066 15067 movq ARG5, %r10 // beta 15068 movq ARG6, %r11 // C 15069 15070#if MACRO_LEVEL>=1 15071 INNER_SCALE_M1B_4X4_LIB4 15072#else 15073 CALL(inner_scale_m1b_4x4_lib4) 15074#endif 15075 15076 15077 // solve 15078 15079 movq ARG8, %r10 // E 15080 movq ARG9, %r11 // lde 15081 sall $3, %r11d 15082 movq ARG10, %r12 // inv_diag_E 15083 15084#if MACRO_LEVEL>=1 15085 INNER_EDGE_DTRSM_RUT_INV_4X4_LIB 15086#else 15087 CALL(inner_edge_dtrsm_rut_inv_4x4_lib) 15088#endif 15089 15090 15091 // store 15092 15093 movq ARG7, %r10 // D 15094 15095#if MACRO_LEVEL>=1 15096 INNER_STORE_4X4_LIB4 15097#else 15098 CALL(inner_store_4x4_lib4) 15099#endif 15100 15101 15102 EPILOGUE 15103 15104 ret 15105 15106 FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib4c44c) 15107 15108 15109 15110 15111 15112// 1 2 3 4 5 6 7 8 9 10 11 12 15113// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, double *inv_diag_E, int m1, int n1); 15114 15115 .p2align 4,,15 15116 GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c) 15117 15118 PROLOGUE 15119 15120 // zero accumulation registers 15121 15122 ZERO_ACC 15123 15124 15125 // call inner dgemm kernel nt 15126 15127 movq ARG1, %r10 // kmax 15128 movq ARG2, %r11 // A 15129 movq ARG3, %r12 // B 15130 movq ARG4, %r13 // ldb 15131 sall $3, %r13d 15132 15133 movq ARG12, %r14 // n1 15134 cmpl $1, %r14d 15135 jg 100f 15136 15137#if MACRO_LEVEL>=2 15138 
INNER_KERNEL_DGEMM_NT_4X1_LIB4C 15139#else 15140 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 15141#endif 15142 15143 jmp 103f 15144 15145100: 15146 15147 movq ARG12, %r14 // n1 15148 cmpl $2, %r14d 15149 jg 101f 15150 15151#if MACRO_LEVEL>=2 15152 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 15153#else 15154 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 15155#endif 15156 15157 jmp 103f 15158 15159101: 15160 15161 movq ARG12, %r14 // n1 15162 cmpl $3, %r14d 15163 jg 102f 15164 15165#if MACRO_LEVEL>=2 15166 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 15167#else 15168 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 15169#endif 15170 15171 jmp 103f 15172 15173102: 15174 15175#if MACRO_LEVEL>=2 15176 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15177#else 15178 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15179#endif 15180 15181103: 15182 15183 15184 // call inner blender_loader nn 15185 15186 movq ARG5, %r10 // beta 15187 movq ARG6, %r11 // C 15188 15189#if MACRO_LEVEL>=1 15190 INNER_SCALE_M1B_4X4_LIB4 15191#else 15192 CALL(inner_scale_m1b_4x4_lib4) 15193#endif 15194 15195 15196 // solve 15197 15198 movq ARG8, %r10 // E 15199 movq ARG9, %r11 // lde 15200 sall $3, %r11d 15201 movq ARG10, %r12 // inv_diag_E 15202 movq ARG12, %r13 // n1 15203 15204#if MACRO_LEVEL>=1 15205 INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB 15206#else 15207 CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib) 15208#endif 15209 15210 15211 // store 15212 15213 movq ARG7, %r10 // D 15214 movq ARG11, %r11 // m1 15215 movq ARG12, %r12 // n1 15216 15217#if MACRO_LEVEL>=1 15218 INNER_STORE_4X4_VS_LIB4 15219#else 15220 CALL(inner_store_4x4_vs_lib4) 15221#endif 15222 15223 15224 EPILOGUE 15225 15226 ret 15227 15228 FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4c44c) 15229 15230 15231 15232 15233// 1 2 3 4 5 6 7 8 9 10 11 12 15234// void kernel_dtrsm_nt_ru_inv_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E); 15235 15236 .p2align 4,,15 15237 GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_lib4cccc) 15238 15239 PROLOGUE 15240 15241 // zero accumulation registers 15242 15243 ZERO_ACC 15244 15245 15246 // call inner dgemm kernel nt 15247 15248 movq ARG1, %r10 // kmax 15249 movq ARG2, %r11 // A 15250 movq ARG3, %r12 // B 15251 movq ARG4, %r13 // ldb 15252 sall $3, %r13d 15253 15254#if MACRO_LEVEL>=2 15255 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15256#else 15257 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15258#endif 15259 15260 15261 // call inner blender_loader nn 15262 15263 movq ARG5, %r10 // beta 15264 movq ARG6, %r11 // C 15265 movq ARG7, %r12 // ldc 15266 sall $3, %r12d 15267 15268#if MACRO_LEVEL>=1 15269 INNER_SCALE_M1B_4X4_LIB 15270#else 15271 CALL(inner_scale_m1b_4x4_lib) 15272#endif 15273 15274 15275 // solve 15276 15277 movq ARG10, %r10 // E 15278 movq ARG11, %r11 // lde 15279 sall $3, %r11d 15280 movq ARG12, %r12 // inv_diag_E 15281 15282#if MACRO_LEVEL>=1 15283 INNER_EDGE_DTRSM_RUT_INV_4X4_LIB 15284#else 15285 CALL(inner_edge_dtrsm_rut_inv_4x4_lib) 15286#endif 15287 15288 15289 // store 15290 15291 movq ARG8, %r10 // D 15292 movq ARG9, %r11 // ldd 15293 sall $3, %r11d 15294 15295#if MACRO_LEVEL>=1 15296 INNER_STORE_4X4_LIB 15297#else 15298 CALL(inner_store_4x4_lib) 15299#endif 15300 15301 15302 EPILOGUE 15303 15304 ret 15305 15306 FUN_END(kernel_dtrsm_nt_ru_inv_4x4_lib4cccc) 15307 15308 15309 15310 15311 15312// 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15313// void kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int 
n1); 15314 15315 .p2align 4,,15 15316 GLOB_FUN_START(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc) 15317 15318 PROLOGUE 15319 15320 // zero accumulation registers 15321 15322 ZERO_ACC 15323 15324 15325 // call inner dgemm kernel nt 15326 15327 movq ARG1, %r10 // kmax 15328 movq ARG2, %r11 // A 15329 movq ARG3, %r12 // B 15330 movq ARG4, %r13 // ldb 15331 sall $3, %r13d 15332 15333 movq ARG14, %r14 // n1 15334 cmpl $1, %r14d 15335 jg 100f 15336 15337#if MACRO_LEVEL>=2 15338 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 15339#else 15340 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 15341#endif 15342 15343 jmp 103f 15344 15345100: 15346 15347 movq ARG14, %r14 // n1 15348 cmpl $2, %r14d 15349 jg 101f 15350 15351#if MACRO_LEVEL>=2 15352 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 15353#else 15354 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 15355#endif 15356 15357 jmp 103f 15358 15359101: 15360 15361 movq ARG14, %r14 // n1 15362 cmpl $3, %r14d 15363 jg 102f 15364 15365#if MACRO_LEVEL>=2 15366 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 15367#else 15368 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 15369#endif 15370 15371 jmp 103f 15372 15373102: 15374 15375#if MACRO_LEVEL>=2 15376 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15377#else 15378 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15379#endif 15380 15381103: 15382 15383 15384 // call inner blender_loader nn 15385 15386 movq ARG5, %r10 // beta 15387 movq ARG6, %r11 // C 15388 movq ARG7, %r12 // ldc 15389 sall $3, %r12d 15390 movq ARG13, %r13 // m1 15391 movq ARG14, %r14 // n1 15392 15393#if MACRO_LEVEL>=1 15394 INNER_SCALE_M1B_4X4_VS_LIB 15395#else 15396 CALL(inner_scale_m1b_4x4_vs_lib) 15397#endif 15398 15399 15400 // solve 15401 15402 movq ARG10, %r10 // E 15403 movq ARG11, %r11 // lde 15404 sall $3, %r11d 15405 movq ARG12, %r12 // inv_diag_E 15406 movq ARG14, %r13 // n1 15407 15408#if MACRO_LEVEL>=1 15409 INNER_EDGE_DTRSM_RUT_INV_4X4_VS_LIB 15410#else 15411 CALL(inner_edge_dtrsm_rut_inv_4x4_vs_lib) 15412#endif 15413 15414 15415 // store 15416 15417 movq ARG8, %r10 // D 15418 movq ARG9, %r11 // ldd 15419 sall $3, %r11d 15420 movq ARG13, %r12 // m1 15421 movq ARG14, %r13 // n1 15422 15423#if MACRO_LEVEL>=1 15424 INNER_STORE_4X4_VS_LIB 15425#else 15426 CALL(inner_store_4x4_vs_lib) 15427#endif 15428 15429 15430 EPILOGUE 15431 15432 ret 15433 15434 FUN_END(kernel_dtrsm_nt_ru_inv_4x4_vs_lib4cccc) 15435 15436 15437 15438 15439 15440// 1 2 3 4 5 6 7 8 9 15441// void kernel_dtrsm_nt_ru_one_4x4_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E); 15442 15443 .p2align 4,,15 15444 GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib44cc4) 15445 15446 PROLOGUE 15447 15448 // zero accumulation registers 15449 15450 ZERO_ACC 15451 15452 15453 // call inner dgemm kernel nt 15454 15455 movq ARG1, %r10 // kmax 15456 movq ARG2, %r11 // A 15457 movq ARG3, %r12 // B 15458 15459#if MACRO_LEVEL>=2 15460 INNER_KERNEL_DGEMM_NT_4X4_LIB4 15461#else 15462 CALL(inner_kernel_dgemm_nt_4x4_lib4) 15463#endif 15464 15465 15466 // call inner blender_loader nn 15467 15468 movq ARG4, %r10 // beta 15469 movq ARG5, %r11 // C 15470 movq ARG6, %r12 // ldc 15471 sall $3, %r12d 15472 15473#if MACRO_LEVEL>=1 15474 INNER_BLEND_SCALE_M1B_4X4_LIB 15475#else 15476 CALL(inner_blend_scale_m1b_4x4_lib) 15477#endif 15478 15479 15480 // solve 15481 15482 movq ARG9, %r10 // E 15483 15484#if MACRO_LEVEL>=1 15485 INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB4 15486#else 15487 CALL(inner_edge_dtrsm_rut_one_4x4_lib4) 15488#endif 15489 15490 15491 // store 15492 15493 movq ARG7, %r10 // D 15494 movq ARG8, %r11 // ldd 15495 sall $3, %r11d 15496 
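
	// Leading dimensions are passed in elements and converted to byte strides by
	// the shift above: sall $3 multiplies by 8 = sizeof(double).  Roughly, in C
	// (illustrative naming):
	//
	//   ldd_bytes = ldd * sizeof(double); // ldd << 3
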
#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib44cc4)




// 1 2 3 4 5 6 7 8 9 10 11
// void kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4(int k, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // kmax
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NT_4X4_LIB4
#else
	CALL(inner_kernel_dgemm_nt_4x4_lib4)
#endif


	// call inner blender_loader nn

	movq	ARG4, %r10 // beta
	movq	ARG5, %r11 // C
	movq	ARG6, %r12 // ldc
	sall	$3, %r12d
	movq	ARG10, %r13 // m1
	movq	ARG11, %r14 // n1

#if MACRO_LEVEL>=1
	INNER_BLEND_SCALE_M1B_4X4_VS_LIB
#else
	CALL(inner_blend_scale_m1b_4x4_vs_lib)
#endif


	// solve

	movq	ARG9, %r10 // E
	movq	ARG11, %r11 // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB4
#else
	CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib4)
#endif


	// store

	movq	ARG7, %r10 // D
	movq	ARG8, %r11 // ldd
	sall	$3, %r11d
	movq	ARG10, %r12 // m1
	movq	ARG11, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib44cc4)




// 1 2 3 4 5 6 7 8 9
// void kernel_dtrsm_nt_ru_one_4x4_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib4c44c)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // kmax
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B
	movq	ARG4, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NT_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nt_4x4_lib4c)
#endif


	// call inner blender_loader nn

	movq	ARG5, %r10 // beta
	movq	ARG6, %r11 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB4
#else
	CALL(inner_scale_m1b_4x4_lib4)
#endif


	// solve

	movq	ARG8, %r10 // E
	movq	ARG9, %r11 // lde
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB
#else
	CALL(inner_edge_dtrsm_rut_one_4x4_lib)
#endif


	// store

	movq	ARG7, %r10 // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
	CALL(inner_store_4x4_lib4)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib4c44c)




// 1 2 3 4 5 6 7 8 9 10 11
// void kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c(int k, double *A, double *B, int ldb, double *beta, double *C, double *D, double *E, int lde, int m1, int n1);

	.p2align 4,,15
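
// In the *_vs_* ("variable size") kernels, m1 and n1 give how many of the 4 rows
// and 4 columns of the block are actually valid, and the vs store routines write
// only that leading m1-by-n1 part of the accumulator.  For the column-major store
// (inner_store_4x4_vs_lib) the net effect is roughly, in C (illustrative sketch;
// acc is the 4x4 accumulator, D has leading dimension ldd):
//
//   for(jj=0; jj<n1; jj++)
//     for(ii=0; ii<m1; ii++)
//       D[ii+ldd*jj] = acc[ii+4*jj];
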
15671 GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c) 15672 15673 PROLOGUE 15674 15675 // zero accumulation registers 15676 15677 ZERO_ACC 15678 15679 15680 // call inner dgemm kernel nt 15681 15682 movq ARG1, %r10 // kmax 15683 movq ARG2, %r11 // A 15684 movq ARG3, %r12 // B 15685 movq ARG4, %r13 // ldb 15686 sall $3, %r13d 15687 15688 movq ARG11, %r14 // n1 15689 cmpl $1, %r14d 15690 jg 100f 15691 15692#if MACRO_LEVEL>=2 15693 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 15694#else 15695 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 15696#endif 15697 15698 jmp 103f 15699 15700100: 15701 15702 movq ARG11, %r14 // n1 15703 cmpl $2, %r14d 15704 jg 101f 15705 15706#if MACRO_LEVEL>=2 15707 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 15708#else 15709 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 15710#endif 15711 15712 jmp 103f 15713 15714101: 15715 15716 movq ARG11, %r14 // n1 15717 cmpl $3, %r14d 15718 jg 102f 15719 15720#if MACRO_LEVEL>=2 15721 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 15722#else 15723 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 15724#endif 15725 15726 jmp 103f 15727 15728102: 15729 15730#if MACRO_LEVEL>=2 15731 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15732#else 15733 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15734#endif 15735 15736103: 15737 15738 15739 // call inner blender_loader nn 15740 15741 movq ARG5, %r10 // beta 15742 movq ARG6, %r11 // C 15743 15744#if MACRO_LEVEL>=1 15745 INNER_SCALE_M1B_4X4_LIB4 15746#else 15747 CALL(inner_scale_m1b_4x4_lib4) 15748#endif 15749 15750 15751 // solve 15752 15753 movq ARG8, %r10 // E 15754 movq ARG9, %r11 // lde 15755 sall $3, %r11d 15756 movq ARG11, %r12 // n1 15757 15758#if MACRO_LEVEL>=1 15759 INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB 15760#else 15761 CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib) 15762#endif 15763 15764 15765 // store 15766 15767 movq ARG7, %r10 // D 15768 movq ARG10, %r11 // m1 15769 movq ARG11, %r12 // n1 15770 15771#if MACRO_LEVEL>=1 15772 INNER_STORE_4X4_VS_LIB4 15773#else 15774 CALL(inner_store_4x4_vs_lib4) 15775#endif 15776 15777 15778 EPILOGUE 15779 15780 ret 15781 15782 FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib4c44c) 15783 15784 15785 15786 15787 15788// 1 2 3 4 5 6 7 8 9 10 11 15789// void kernel_dtrsm_nt_ru_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde); 15790 15791 .p2align 4,,15 15792 GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_lib4cccc) 15793 15794 PROLOGUE 15795 15796 // zero accumulation registers 15797 15798 ZERO_ACC 15799 15800 15801 // call inner dgemm kernel nt 15802 15803 movq ARG1, %r10 // kmax 15804 movq ARG2, %r11 // A 15805 movq ARG3, %r12 // B 15806 movq ARG4, %r13 // ldb 15807 sall $3, %r13d 15808 15809#if MACRO_LEVEL>=2 15810 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15811#else 15812 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15813#endif 15814 15815 15816 // call inner blender_loader nn 15817 15818 movq ARG5, %r10 // beta 15819 movq ARG6, %r11 // C 15820 movq ARG7, %r12 // ldc 15821 sall $3, %r12d 15822 15823#if MACRO_LEVEL>=1 15824 INNER_SCALE_M1B_4X4_LIB 15825#else 15826 CALL(inner_scale_m1b_4x4_lib) 15827#endif 15828 15829 15830 // solve 15831 15832 movq ARG10, %r10 // E 15833 movq ARG11, %r11 // lde 15834 sall $3, %r11d 15835 15836#if MACRO_LEVEL>=1 15837 INNER_EDGE_DTRSM_RUT_ONE_4X4_LIB 15838#else 15839 CALL(inner_edge_dtrsm_rut_one_4x4_lib) 15840#endif 15841 15842 15843 // store 15844 15845 movq ARG8, %r10 // D 15846 movq ARG9, %r11 // ldd 15847 sall $3, %r11d 15848 15849#if MACRO_LEVEL>=1 15850 INNER_STORE_4X4_LIB 15851#else 15852 CALL(inner_store_4x4_lib) 15853#endif 15854 15855 15856 
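
	// Summary of the operation performed by this kernel (illustrative reference in
	// plain C, assuming the RUT_ONE edge solves X * E^T = M with E unit-diagonal
	// upper triangular; column-major B, C, D, E with the given leading dimensions,
	// panel-major 4 x k A; acc is a local 4x4 buffer):
	//
	//   // acc = beta*C - A*B^T
	//   for(jj=0; jj<4; jj++)
	//     for(ii=0; ii<4; ii++)
	//       {
	//       acc[ii+4*jj] = beta[0]*C[ii+ldc*jj];
	//       for(kk=0; kk<k; kk++)
	//         acc[ii+4*jj] -= A[ii+4*kk]*B[jj+ldb*kk];
	//       }
	//   // D = acc * inv(E^T); unit diagonal, so no divisions are needed
	//   for(jj=3; jj>=0; jj--)
	//     for(ii=0; ii<4; ii++)
	//       {
	//       for(ll=jj+1; ll<4; ll++)
	//         acc[ii+4*jj] -= acc[ii+4*ll]*E[jj+lde*ll];
	//       D[ii+ldd*jj] = acc[ii+4*jj];
	//       }
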
EPILOGUE 15857 15858 ret 15859 15860 FUN_END(kernel_dtrsm_nt_ru_one_4x4_lib4cccc) 15861 15862 15863 15864 15865 15866// 1 2 3 4 5 6 7 8 9 10 11 12 13 15867// void kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1); 15868 15869 .p2align 4,,15 15870 GLOB_FUN_START(kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc) 15871 15872 PROLOGUE 15873 15874 // zero accumulation registers 15875 15876 ZERO_ACC 15877 15878 15879 // call inner dgemm kernel nt 15880 15881 movq ARG1, %r10 // kmax 15882 movq ARG2, %r11 // A 15883 movq ARG3, %r12 // B 15884 movq ARG4, %r13 // ldb 15885 sall $3, %r13d 15886 15887 movq ARG13, %r14 // n1 15888 cmpl $1, %r14d 15889 jg 100f 15890 15891#if MACRO_LEVEL>=2 15892 INNER_KERNEL_DGEMM_NT_4X1_LIB4C 15893#else 15894 CALL(inner_kernel_dgemm_nt_4x1_lib4c) 15895#endif 15896 15897 jmp 103f 15898 15899100: 15900 15901 movq ARG13, %r14 // n1 15902 cmpl $2, %r14d 15903 jg 101f 15904 15905#if MACRO_LEVEL>=2 15906 INNER_KERNEL_DGEMM_NT_4X2_LIB4C 15907#else 15908 CALL(inner_kernel_dgemm_nt_4x2_lib4c) 15909#endif 15910 15911 jmp 103f 15912 15913101: 15914 15915 movq ARG13, %r14 // n1 15916 cmpl $3, %r14d 15917 jg 102f 15918 15919#if MACRO_LEVEL>=2 15920 INNER_KERNEL_DGEMM_NT_4X3_LIB4C 15921#else 15922 CALL(inner_kernel_dgemm_nt_4x3_lib4c) 15923#endif 15924 15925 jmp 103f 15926 15927102: 15928 15929#if MACRO_LEVEL>=2 15930 INNER_KERNEL_DGEMM_NT_4X4_LIB4C 15931#else 15932 CALL(inner_kernel_dgemm_nt_4x4_lib4c) 15933#endif 15934 15935103: 15936 15937 15938 // call inner blender_loader nn 15939 15940 movq ARG5, %r10 // beta 15941 movq ARG6, %r11 // C 15942 movq ARG7, %r12 // ldc 15943 sall $3, %r12d 15944 movq ARG12, %r13 // m1 15945 movq ARG13, %r14 // n1 15946 15947#if MACRO_LEVEL>=1 15948 INNER_SCALE_M1B_4X4_VS_LIB 15949#else 15950 CALL(inner_scale_m1b_4x4_vs_lib) 15951#endif 15952 15953 15954 // solve 15955 15956 movq ARG10, %r10 // E 15957 movq ARG11, %r11 // lde 15958 sall $3, %r11d 15959 movq ARG13, %r12 // n1 15960 15961#if MACRO_LEVEL>=1 15962 INNER_EDGE_DTRSM_RUT_ONE_4X4_VS_LIB 15963#else 15964 CALL(inner_edge_dtrsm_rut_one_4x4_vs_lib) 15965#endif 15966 15967 15968 // store 15969 15970 movq ARG8, %r10 // D 15971 movq ARG9, %r11 // ldd 15972 sall $3, %r11d 15973 movq ARG12, %r12 // m1 15974 movq ARG13, %r13 // n1 15975 15976#if MACRO_LEVEL>=1 15977 INNER_STORE_4X4_VS_LIB 15978#else 15979 CALL(inner_store_4x4_vs_lib) 15980#endif 15981 15982 15983 EPILOGUE 15984 15985 ret 15986 15987 FUN_END(kernel_dtrsm_nt_ru_one_4x4_vs_lib4cccc) 15988 15989 15990 15991 15992 15993// 1 2 3 4 5 6 7 8 9 10 11 15994// void kernel_dtrsm_nn_ll_one_4x4_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde); 15995 15996 .p2align 4,,15 15997 GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_lib4cccc) 15998 15999 PROLOGUE 16000 16001 // zero accumulation registers 16002 16003 ZERO_ACC 16004 16005 16006 // call inner dgemm kernel nt 16007 16008 movq ARG1, %r10 // k 16009 movq ARG2, %r11 // A 16010 movq ARG3, %r12 // B 16011 movq ARG4, %r13 // ldb 16012 sall $3, %r13d // ldb*sizeof(double) 16013 16014#if MACRO_LEVEL>=2 16015 INNER_KERNEL_DGEMM_NN_4X4_LIB4C 16016#else 16017 CALL(inner_kernel_dgemm_nn_4x4_lib4c) 16018#endif 16019 16020 16021 // call inner blender_loader nn 16022 16023 movq ARG5, %r10 // beta 16024 movq ARG6, %r11 // C 16025 movq ARG7, %r12 // ldc 16026 sall $3, %r12d 16027 16028#if MACRO_LEVEL>=1 16029 INNER_SCALE_M1B_4X4_LIB 
#else
	CALL(inner_scale_m1b_4x4_lib)
#endif


	// solve

	movq	ARG10, %r10 // E
	movq	ARG11, %r11 // lde
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB
#else
	CALL(inner_edge_dtrsm_lln_one_4x4_lib)
#endif


	// store

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)




// 1 2 3 4 5 6 7 8 9 10 11 12 13
// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int k, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B
	movq	ARG4, %r13 // ldb
	sall	$3, %r13d


	movq	ARG13, %r14 // n1
	cmpl	$1, %r14d
	jg		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp		103f

100:

	movq	ARG13, %r14 // n1
	cmpl	$2, %r14d
	jg		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp		103f

101:

	movq	ARG13, %r14 // n1
	cmpl	$3, %r14d
	jg		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:


	// call inner blender_loader nn

	movq	ARG5, %r10 // beta
	movq	ARG6, %r11 // C
	movq	ARG7, %r12 // ldc
	sall	$3, %r12d
	movq	ARG12, %r13 // m1
	movq	ARG13, %r14 // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_VS_LIB
#else
	CALL(inner_scale_m1b_4x4_vs_lib)
#endif


	// solve

	movq	ARG10, %r10 // E
	movq	ARG11, %r11 // lde
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_EDGE_DTRSM_LLN_ONE_4X4_LIB
#else
	CALL(inner_edge_dtrsm_lln_one_4x4_lib)
#endif


	// store

	movq	ARG8, %r10 // D
	movq	ARG9, %r11 // ldd
	sall	$3, %r11d
	movq	ARG12, %r12 // m1
	movq	ARG13, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)




// 1 2 3 4 5 6 7 8 9
// void kernel_dgetrf_nn_4x4_lib4ccc(int k, double *A, double *B, int ldb, double *C, int ldc, double *D, int ldd, double *inv_diag_D);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgetrf_nn_4x4_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B
	movq	ARG4, %r13 // ldb
	sall	$3, %r13d

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif


	// call inner blender_loader nn

	movq	ARG5, %r10 // C
	movq	ARG6, %r11 // ldc
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB
#else
	CALL(inner_scale_m11_4x4_lib)
#endif


	// factorization

	movq	ARG9, %r10 // inv_diag_D

#if MACRO_LEVEL>=1
	INNER_EDGE_DGETRF_4X4_LIB4
#else
	CALL(inner_edge_dgetrf_4x4_lib4)
#endif


	// store

	movq	ARG7, %r10 // D
	movq	ARG8, %r11 // ldd
	sall	$3, %r11d

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	CALL(inner_store_4x4_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgetrf_nn_4x4_lib4ccc)




// 1 2 3 4 5 6 7 8 9 10 11
// void kernel_dgetrf_nn_4x4_vs_lib4ccc(int k, double *A, double *B, int ldb, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_dgetrf_nn_4x4_vs_lib4ccc)

	PROLOGUE

	// zero accumulation registers

	ZERO_ACC


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG2, %r11 // A
	movq	ARG3, %r12 // B
	movq	ARG4, %r13 // ldb
	sall	$3, %r13d

	movq	ARG11, %r14 // n1
	cmpl	$1, %r14d
	jg		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X1_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x1_lib4c)
#endif

	jmp		103f

100:

	movq	ARG11, %r14 // n1
	cmpl	$2, %r14d
	jg		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X2_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x2_lib4c)
#endif

	jmp		103f

101:

	movq	ARG11, %r14 // n1
	cmpl	$3, %r14d
	jg		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X3_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x3_lib4c)
#endif

	jmp		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_DGEMM_NN_4X4_LIB4C
#else
	CALL(inner_kernel_dgemm_nn_4x4_lib4c)
#endif

103:


	// call inner blender_loader nn

	movq	ARG5, %r10 // C
	movq	ARG6, %r11 // ldc
	sall	$3, %r11d
	movq	ARG10, %r12 // m1
	movq	ARG11, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_VS_LIB
#else
	CALL(inner_scale_m11_4x4_vs_lib)
#endif


	// factorization

	movq	ARG9, %r10 // inv_diag_D

#if MACRO_LEVEL>=1
	INNER_EDGE_DGETRF_4X4_LIB4
#else
	CALL(inner_edge_dgetrf_4x4_lib4)
#endif


	// store

	movq	ARG7, %r10 // D
	movq	ARG8, %r11 // ldd
	sall	$3, %r11d
	movq	ARG10, %r12 // m1
	movq	ARG11, %r13 // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	CALL(inner_store_4x4_vs_lib)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_dgetrf_nn_4x4_vs_lib4ccc)

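
// Illustrative reference (plain C sketch, not part of the library) for the
// factorization step shared by the two dgetrf kernels above: after the update
// acc = C - A*B, the 4x4 block is factorized in place as L*U without pivoting,
// and the reciprocals of the U diagonal are written to inv_diag_D:
//
//   for(jj=0; jj<4; jj++)
//     {
//     inv_diag_D[jj] = 1.0/acc[jj+4*jj];
//     for(ii=jj+1; ii<4; ii++)
//       acc[ii+4*jj] *= inv_diag_D[jj];                  // L column jj, unit diagonal implicit
//     for(kk=jj+1; kk<4; kk++)
//       for(ii=jj+1; ii<4; ii++)
//         acc[ii+4*kk] -= acc[ii+4*jj]*acc[jj+4*kk];     // trailing submatrix update
//     }
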