/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

// ABI abstraction layer: argument accessors, stack-frame size, prologue/epilogue and
// symbol/section macros for the three supported OS/ABI combinations.

#if defined(OS_LINUX) | defined(OS_MAC)

// System V AMD64 ABI: first 6 integer args in registers, the rest on the caller's stack
// just above the return address (hence the STACKSIZE offset after our own frame).
//#define STACKSIZE 96
#define STACKSIZE 64
#define ARG1  %rdi
#define ARG2  %rsi
#define ARG3  %rdx
#define ARG4  %rcx
#define ARG5  %r8
#define ARG6  %r9
#define ARG7  STACKSIZE +  8(%rsp)
#define ARG8  STACKSIZE + 16(%rsp)
#define ARG9  STACKSIZE + 24(%rsp)
#define ARG10 STACKSIZE + 32(%rsp)
#define ARG11 STACKSIZE + 40(%rsp)
#define ARG12 STACKSIZE + 48(%rsp)
#define ARG13 STACKSIZE + 56(%rsp)
#define ARG14 STACKSIZE + 64(%rsp)
#define ARG15 STACKSIZE + 72(%rsp)
#define ARG16 STACKSIZE + 80(%rsp)
#define ARG17 STACKSIZE + 88(%rsp)
#define ARG18 STACKSIZE + 96(%rsp)
// Save the SysV callee-saved GP registers (rbx, rbp, r12-r15); vector registers are
// caller-saved on SysV, so none are spilled.  vzeroupper avoids AVX->SSE transition
// penalties when mixing with SSE code in the caller.
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	addq	$STACKSIZE, %rsp;

#if defined(OS_LINUX)

// ELF symbol conventions: no leading underscore, explicit .type/.size annotations.
#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.type NAME, @function; \
NAME:
#define FUN_START(NAME) \
	.type NAME, @function; \
NAME:
#define FUN_END(NAME) \
	.size	NAME, .-NAME
#define CALL(NAME) \
	call NAME
// Zero the 8 accumulator registers ymm0-ymm7.
#define ZERO_ACC \
	vxorps	%ymm0, %ymm0, %ymm0; \
	vmovaps	%ymm0, %ymm1; \
	vmovaps	%ymm0, %ymm2; \
	vmovaps	%ymm0, %ymm3; \
	vmovaps	%ymm0, %ymm4; \
	vmovaps	%ymm0, %ymm5; \
	vmovaps	%ymm0, %ymm6; \
	vmovaps	%ymm0, %ymm7
//#define NEG_ACC \
//	vmovapd	.LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#else // defined(OS_MAC)

// Mach-O symbol conventions: leading underscore, no .type/.size directives.
#define GLOB_FUN_START(NAME) \
	.globl _ ## NAME; \
_ ## NAME:
#define FUN_START(NAME) \
_ ## NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	callq _ ## NAME
#define ZERO_ACC \
	vxorps	%ymm0, %ymm0, %ymm0; \
	vmovaps	%ymm0, %ymm1; \
	vmovaps	%ymm0, %ymm2; \
	vmovaps	%ymm0, %ymm3; \
	vmovaps	%ymm0, %ymm4; \
	vmovaps	%ymm0, %ymm5; \
	vmovaps	%ymm0, %ymm6; \
	vmovaps	%ymm0, %ymm7
//#define NEG_ACC \
//	vmovapd	LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#endif

#elif defined(OS_WINDOWS)

// Microsoft x64 ABI: first 4 args in rcx/rdx/r8/r9; stack args start above the
// 32-byte shadow space (hence the +40 offset for ARG5).  rdi, rsi and xmm6-xmm15
// are callee-saved on Windows and so must be spilled in the prologue.
#define STACKSIZE 256
#define ARG1  %rcx
#define ARG2  %rdx
#define ARG3  %r8
#define ARG4  %r9
#define ARG5  STACKSIZE + 40(%rsp)
#define ARG6  STACKSIZE + 48(%rsp)
#define ARG7  STACKSIZE + 56(%rsp)
#define ARG8  STACKSIZE + 64(%rsp)
#define ARG9  STACKSIZE + 72(%rsp)
#define ARG10 STACKSIZE + 80(%rsp)
#define ARG11 STACKSIZE + 88(%rsp)
#define ARG12 STACKSIZE + 96(%rsp)
#define ARG13 STACKSIZE + 104(%rsp)
#define ARG14 STACKSIZE + 112(%rsp)
#define ARG15 STACKSIZE + 120(%rsp)
#define ARG16 STACKSIZE + 128(%rsp)
#define ARG17 STACKSIZE + 136(%rsp)
#define ARG18 STACKSIZE + 144(%rsp)
#define PROLOGUE \
	subq	$STACKSIZE, %rsp; \
	movq	%rbx,   (%rsp); \
	movq	%rbp,  8(%rsp); \
	movq	%r12, 16(%rsp); \
	movq	%r13, 24(%rsp); \
	movq	%r14, 32(%rsp); \
	movq	%r15, 40(%rsp); \
	movq	%rdi, 48(%rsp); \
	movq	%rsi, 56(%rsp); \
	vmovups	%xmm6,   64(%rsp); \
	vmovups	%xmm7,   80(%rsp); \
	vmovups	%xmm8,   96(%rsp); \
	vmovups	%xmm9,  112(%rsp); \
	vmovups	%xmm10, 128(%rsp); \
	vmovups	%xmm11, 144(%rsp); \
	vmovups	%xmm12, 160(%rsp); \
	vmovups	%xmm13, 176(%rsp); \
	vmovups	%xmm14, 192(%rsp); \
	vmovups	%xmm15, 208(%rsp); \
	vzeroupper;
#define EPILOGUE \
	vzeroupper; \
	movq	  (%rsp), %rbx; \
	movq	 8(%rsp), %rbp; \
	movq	16(%rsp), %r12; \
	movq	24(%rsp), %r13; \
	movq	32(%rsp), %r14; \
	movq	40(%rsp), %r15; \
	movq	48(%rsp), %rdi; \
	movq	56(%rsp), %rsi; \
	vmovups	 64(%rsp), %xmm6; \
	vmovups	 80(%rsp), %xmm7; \
	vmovups	 96(%rsp), %xmm8; \
	vmovups	112(%rsp), %xmm9; \
	vmovups	128(%rsp), %xmm10; \
	vmovups	144(%rsp), %xmm11; \
	vmovups	160(%rsp), %xmm12; \
	vmovups	176(%rsp), %xmm13; \
	vmovups	192(%rsp), %xmm14; \
	vmovups	208(%rsp), %xmm15; \
	addq	$STACKSIZE, %rsp;

// COFF symbol conventions (.def/.scl/.type/.endef).
#define GLOB_FUN_START(NAME) \
	.globl NAME; \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_START(NAME) \
	.def NAME; .scl 2; .type 32; .endef; \
NAME:
#define FUN_END(NAME)
#define CALL(NAME) \
	call NAME
#define ZERO_ACC \
	vxorps	%ymm0, %ymm0, %ymm0; \
	vmovaps	%ymm0, %ymm1; \
	vmovaps	%ymm0, %ymm2; \
	vmovaps	%ymm0, %ymm3; \
	vmovaps	%ymm0, %ymm4; \
	vmovaps	%ymm0, %ymm5; \
	vmovaps	%ymm0, %ymm6; \
	vmovaps	%ymm0, %ymm7
//#define NEG_ACC \
//	vmovapd	.LC11(%rip), %ymm15; \
//	vxorpd	%ymm15, %ymm0, %ymm0; \
//	vxorpd	%ymm15, %ymm1, %ymm1; \
//	vxorpd	%ymm15, %ymm2, %ymm2; \
//	vxorpd	%ymm15, %ymm3, %ymm3

#else

#error wrong OS

#endif



#if defined(OS_LINUX) | defined(OS_WINDOWS)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif



// common inner routine with file scope
//
// 8x8 single-precision GEMM accumulation kernel (NT variant: C += A * B^T),
// operating on 8-wide column panels (lib8 storage format).
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- B
// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)   // i.e. 32 bytes per k iteration (8 floats)
// r12  <- B+4*k*sizeof(double)
// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_kernel_gemm_add_nt_8x8_lib8)
#endif


// broadcast scheme: one column of A per ymm load, each element of the B column
// broadcast into a full ymm register and fused-multiply-accumulated into one
// accumulator per output column.
#if 1

	cmpl	$0, %r10d
	jle		2f // return

	// preload

	cmpl	$3, %r10d
	jle		4f // consider clean-up loop

	// main loop: 4 k-iterations per pass
	.p2align 3
1: // main loop

	// unroll 0
	vmovaps			0(%r11), %ymm13 // A
	vbroadcastss	0(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm0
	vbroadcastss	4(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm1
	vbroadcastss	8(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm2
	vbroadcastss	12(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm3
	vbroadcastss	16(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm4
	vbroadcastss	20(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm5
	vbroadcastss	24(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm6
	vbroadcastss	28(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm7
	subl	$4, %r10d

	// unroll 1
	vmovaps			32(%r11), %ymm13 // A
	vbroadcastss	32(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm0
	vbroadcastss	36(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm1
	vbroadcastss	40(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm2
	vbroadcastss	44(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm3
	vbroadcastss	48(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm4
	vbroadcastss	52(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm5
	vbroadcastss	56(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm6
	vbroadcastss	60(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm7
	addq	$128, %r11 // A advanced here; following unrolls use negative offsets

	// unroll 2
	vmovaps			-64(%r11), %ymm13 // A
	vbroadcastss	64(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm0
	vbroadcastss	68(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm1
	vbroadcastss	72(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm2
	vbroadcastss	76(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm3
	vbroadcastss	80(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm4
	vbroadcastss	84(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm5
	vbroadcastss	88(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm6
	vbroadcastss	92(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm7
	addq	$128, %r12 // B advanced here; following unroll uses negative offsets

	// unroll 3
	vmovaps			-32(%r11), %ymm13 // A
	vbroadcastss	-32(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm0
	vbroadcastss	-28(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm1
	vbroadcastss	-24(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm2
	vbroadcastss	-20(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm3
	vbroadcastss	-16(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm4
	vbroadcastss	-12(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm5
	vbroadcastss	-8(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm6
	vbroadcastss	-4(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm7

	cmpl	$4, %r10d
	jg		1b // main loop


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: 1 k-iteration per pass
3: // clean up loop

	// unroll 0
	vmovaps			0(%r11), %ymm13 // a
	vbroadcastss	0(%r12), %ymm12 // b
	vfmadd231ps		%ymm13, %ymm12, %ymm0
	vbroadcastss	4(%r12), %ymm12 // b
	vfmadd231ps		%ymm13, %ymm12, %ymm1
	subl	$1, %r10d
	vbroadcastss	8(%r12), %ymm12 // b
	vfmadd231ps		%ymm13, %ymm12, %ymm2
	addq	$32, %r11
	vbroadcastss	12(%r12), %ymm12 // b
	vfmadd231ps		%ymm13, %ymm12, %ymm3
	vbroadcastss	16(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm4
	vbroadcastss	20(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm5
	vbroadcastss	24(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm6
	vbroadcastss	28(%r12), %ymm12 // B
	vfmadd231ps		%ymm13, %ymm12, %ymm7
	addq	$32, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // return

// shuffle scheme (alternative, currently disabled implementation: B columns are
// loaded 4-wide with vbroadcastf128 and rotated across the accumulators with
// vshufps, which is why the accumulators hold shuffled diagonals).
#else

	cmpl	$0, %r10d
	jle		2f // return

	// preload
	vbroadcastf128	0(%r12), %ymm14 // B
	vmovaps			0(%r11), %ymm12 // A
	vbroadcastf128	16(%r12), %ymm15 // B
	vmovaps			32(%r11), %ymm13 // A

	cmpl	$4, %r10d
	jle		0f // consider clean-up loop

	// main loop
	.p2align 3
1: // main loop

	// unroll 0
	vfmadd231ps		%ymm12, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01

	vfmadd231ps		%ymm12, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10

	vfmadd231ps		%ymm12, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01

	vfmadd231ps		%ymm12, %ymm14, %ymm3
	vbroadcastf128	32(%r12), %ymm14 // B

	vfmadd231ps		%ymm12, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm7
	vbroadcastf128	48(%r12), %ymm15 // B
	vmovaps			64(%r11), %ymm12 // A


	// unroll 1
	vfmadd231ps		%ymm13, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm3
	vbroadcastf128	64(%r12), %ymm14 // B

	vfmadd231ps		%ymm13, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm7
	vbroadcastf128	80(%r12), %ymm15 // B
	vmovaps			96(%r11), %ymm13 // A


	// unroll 2
	vfmadd231ps		%ymm12, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm3
	vbroadcastf128	96(%r12), %ymm14 // B

	vfmadd231ps		%ymm12, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm7
	vbroadcastf128	112(%r12), %ymm15 // B
	vmovaps			128(%r11), %ymm12 // A

	subl	$4, %r10d
	addq	$128, %r11
	addq	$128, %r12

	// unroll 3
	vfmadd231ps		%ymm13, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm3
	vbroadcastf128	0(%r12), %ymm14 // B

	vfmadd231ps		%ymm13, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm7
	vbroadcastf128	16(%r12), %ymm15 // B
	vmovaps			32(%r11), %ymm13 // A

	cmpl	$4, %r10d
	jg		1b // main loop


0: // consider clean4-up

	cmpl	$3, %r10d
	jle		4f // clean1

	// last full pass of 4 k-iterations (preload already consumed; no reload at the end)

	// unroll 0
	vfmadd231ps		%ymm12, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01

	vfmadd231ps		%ymm12, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14 // 01 00 11 10

	vfmadd231ps		%ymm12, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14 // 10 11 00 01

	vfmadd231ps		%ymm12, %ymm14, %ymm3
	vbroadcastf128	32(%r12), %ymm14 // B

	vfmadd231ps		%ymm12, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm7
	vbroadcastf128	48(%r12), %ymm15 // B
	vmovaps			64(%r11), %ymm12 // A


	// unroll 1
	vfmadd231ps		%ymm13, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm3
	vbroadcastf128	64(%r12), %ymm14 // B

	vfmadd231ps		%ymm13, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm7
	vbroadcastf128	80(%r12), %ymm15 // B
	vmovaps			96(%r11), %ymm13 // A


	// unroll 2
	vfmadd231ps		%ymm12, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm12, %ymm14, %ymm3
	vbroadcastf128	96(%r12), %ymm14 // B

	vfmadd231ps		%ymm12, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm12, %ymm15, %ymm7
	vbroadcastf128	112(%r12), %ymm15 // B
//	vmovaps			128(%r11), %ymm12 // A

	subl	$4, %r10d
	addq	$128, %r11
	addq	$128, %r12

	// unroll 3
	vfmadd231ps		%ymm13, %ymm14, %ymm0
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm1
	vshufps			$0x4e, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm2
	vshufps			$0xb1, %ymm14, %ymm14, %ymm14

	vfmadd231ps		%ymm13, %ymm14, %ymm3
//	vbroadcastf128	0(%r12), %ymm14 // B

	vfmadd231ps		%ymm13, %ymm15, %ymm4
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm5
	vshufps			$0x4e, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm6
	vshufps			$0xb1, %ymm15, %ymm15, %ymm15

	vfmadd231ps		%ymm13, %ymm15, %ymm7
//	vbroadcastf128	16(%r12), %ymm15 // B
//	vmovaps			32(%r11), %ymm13 // A


//	cmpl	$4, %r10d
	jmp		2f // return


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop
3: // clean up loop

	// unroll 0
	vbroadcastf128	0(%r12), %ymm14 // B
	vmovaps			0(%r11), %ymm12 // A
	vfmadd231ps		%ymm12, %ymm14, %ymm0

	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm1

	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm2

	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm3

	vbroadcastf128	16(%r12), %ymm14 // B
	vfmadd231ps		%ymm12, %ymm14, %ymm4

	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm5

	vshufps			$0x4e, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm6

	subl	$1, %r10d
	addq	$32, %r11
	addq	$32, %r12

	vshufps			$0xb1, %ymm14, %ymm14, %ymm14
	vfmadd231ps		%ymm12, %ymm14, %ymm7

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // return

#endif

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nt_8x8_lib8)
#endif





// common inner routine with file scope
//
// 8x8 single-precision GEMM kernel (NT variant, subtracting: C -= A * B^T).
// Identical structure to the ADD kernel above, with vfnmadd231ps instead of
// vfmadd231ps.
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- B
// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- B+4*k*sizeof(double)
// ymm0 <- [d00 d11 d22 d33 d40 d51 d62 d73]
// ymm1 <- [d01 d10 d23 d32 d41 d50 d63 d72]
// ymm2 <- [d03 d12 d21 d30 d43 d52 d61 d70]
// ymm3 <- [d02 d13 d20 d31 d42 d53 d60 d71]
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_kernel_gemm_sub_nt_8x8_lib8)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	// preload

	cmpl	$3, %r10d
	jle		4f // consider clean-up loop

	// main loop: 4 k-iterations per pass
	.p2align 3
1: // main loop

	// unroll 0
	vmovaps			0(%r11), %ymm13 // A
	vbroadcastss	0(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm0
	vbroadcastss	4(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm1
	vbroadcastss	8(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm2
	vbroadcastss	12(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm3
	vbroadcastss	16(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm4
	vbroadcastss	20(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm5
	vbroadcastss	24(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm6
	vbroadcastss	28(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm7
	subl	$4, %r10d

	// unroll 1
	vmovaps			32(%r11), %ymm13 // A
	vbroadcastss	32(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm0
	vbroadcastss	36(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm1
	vbroadcastss	40(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm2
	vbroadcastss	44(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm3
	vbroadcastss	48(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm4
	vbroadcastss	52(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm5
	vbroadcastss	56(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm6
	vbroadcastss	60(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm7
	addq	$128, %r11 // A advanced here; following unrolls use negative offsets

	// unroll 2
	vmovaps			-64(%r11), %ymm13 // A
	vbroadcastss	64(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm0
	vbroadcastss	68(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm1
	vbroadcastss	72(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm2
	vbroadcastss	76(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm3
	vbroadcastss	80(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm4
	vbroadcastss	84(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm5
	vbroadcastss	88(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm6
	vbroadcastss	92(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm7
	addq	$128, %r12 // B advanced here; following unroll uses negative offsets

	// unroll 3
	vmovaps			-32(%r11), %ymm13 // A
	vbroadcastss	-32(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm0
	vbroadcastss	-28(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm1
	vbroadcastss	-24(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm2
	vbroadcastss	-20(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm3
	vbroadcastss	-16(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm4
	vbroadcastss	-12(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm5
	vbroadcastss	-8(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm6
	vbroadcastss	-4(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm7

	cmpl	$4, %r10d
	jg		1b // main loop


4: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

	// clean-up loop: 1 k-iteration per pass
3: // clean up loop

	// unroll 0
	vmovaps			0(%r11), %ymm13 // a
	vbroadcastss	0(%r12), %ymm12 // b
	vfnmadd231ps	%ymm13, %ymm12, %ymm0
	vbroadcastss	4(%r12), %ymm12 // b
	vfnmadd231ps	%ymm13, %ymm12, %ymm1
	subl	$1, %r10d
	vbroadcastss	8(%r12), %ymm12 // b
	vfnmadd231ps	%ymm13, %ymm12, %ymm2
	addq	$32, %r11
	vbroadcastss	12(%r12), %ymm12 // b
	vfnmadd231ps	%ymm13, %ymm12, %ymm3
	vbroadcastss	16(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm4
	vbroadcastss	20(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm5
	vbroadcastss	24(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm6
	vbroadcastss	28(%r12), %ymm12 // B
	vfnmadd231ps	%ymm13, %ymm12, %ymm7
	addq	$32, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_sub_nt_8x8_lib8)
#endif





// common inner routine with file scope
//
// 8x8 single-precision GEMM kernel (NN variant: C += A * B), with B stored in
// 8-wide panels of stride sdb.
//
// input arguments:
// r10d <- k
// r11  <- A
// r12  <- B
// r13  <- 4*sdb*sizeof(double)   // B panel stride in bytes (comment style inherited
//                                // from the double kernels; likely 8*sdb*sizeof(float)
//                                // numerically -- TODO confirm against callers)
// r14  <= dirty
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10d <- 0
// r11  <- A+4*k*sizeof(double)
// r12  <- B+(k/4)*sdb*sizeof(double)+(k%4)
// r13  <- 4*sdb*sizeof(double)
// r14  <= dirty
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm12 <- dirty
// ymm13 <- dirty
// ymm14 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_kernel_gemm_add_nn_8x8_lib8)
#endif

	cmpl	$0, %r10d
	jle		2f // return

	cmpl	$8, %r10d
	jl		0f // consider clean-up loop

	// main loop: 8 k-iterations per pass, i.e. one full B panel row-block;
	// B element (l, j) of the current panel sits at offset 4*l + 32*j.
	.p2align 3
1: // main loop

//	prefetcht0	0(%r12, %r13, 1) // software prefetch
//	prefetcht0	64(%r12, %r13, 1) // software prefetch
//	prefetcht0	128(%r12, %r13, 1) // software prefetch
//	prefetcht0	192(%r12, %r13, 1) // software prefetch

	// unroll 0
	vmovaps			0(%r11), %ymm12 // A[0]
	vbroadcastss	0(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	32(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	64(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	96(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	128(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	160(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	192(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	224(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	// unroll 1
	vmovaps			32(%r11), %ymm12 // A[0]
	vbroadcastss	4(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	36(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	68(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	100(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	132(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	164(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	196(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	228(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	// unroll 2
	vmovaps			64(%r11), %ymm12 // A[0]
	vbroadcastss	8(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	40(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	72(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	104(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	136(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	168(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	200(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	232(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	// unroll 3
	vmovaps			96(%r11), %ymm12 // A[0]
	vbroadcastss	12(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	44(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	76(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	108(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	140(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	172(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	204(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	236(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	// unroll 4
	vmovaps			128(%r11), %ymm12 // A[0]
	vbroadcastss	16(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	48(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	80(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	112(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	144(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	176(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	208(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	240(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	// unroll 5
	vmovaps			160(%r11), %ymm12 // A[0]
	vbroadcastss	20(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	52(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	84(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	116(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	148(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	180(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	212(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	244(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7
	subl	$8, %r10d

	// unroll 6
	vmovaps			192(%r11), %ymm12 // A[0]
	vbroadcastss	24(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	56(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	88(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	120(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	152(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	184(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	216(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	248(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7
	addq	$256, %r11 // A advanced here; unroll 7 uses a negative offset

	// unroll 7
	vmovaps			-32(%r11), %ymm12 // A[0]
	vbroadcastss	28(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	60(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	92(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	124(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	156(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	188(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	220(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	252(%r12), %ymm13 // B[7]
	// BUG FIX: advance B to the next panel: r12 += r13 (panel stride).
	// The original had the operands swapped (addq %r12, %r13), which corrupted
	// the stride register r13 and left r12 pointing at the first panel forever,
	// also breaking the documented r13 invariant on exit.
	addq	%r13, %r12
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	cmpl	$7, %r10d
	jg		1b // main loop


0: // consider clean1-up loop

	cmpl	$0, %r10d
	jle		2f // return

3: // clean1-up loop: 1 k-iteration per pass, stepping one float within the panel

	// unroll 0
	vmovaps			0(%r11), %ymm12 // A[0]
	vbroadcastss	0(%r12), %ymm13 // B[0]
	vfmadd231ps		%ymm12, %ymm13, %ymm0
	vbroadcastss	32(%r12), %ymm13 // B[1]
	vfmadd231ps		%ymm12, %ymm13, %ymm1
	vbroadcastss	64(%r12), %ymm13 // B[2]
	vfmadd231ps		%ymm12, %ymm13, %ymm2
	vbroadcastss	96(%r12), %ymm13 // B[3]
	vfmadd231ps		%ymm12, %ymm13, %ymm3
	vbroadcastss	128(%r12), %ymm13 // B[4]
	vfmadd231ps		%ymm12, %ymm13, %ymm4
	vbroadcastss	160(%r12), %ymm13 // B[5]
	vfmadd231ps		%ymm12, %ymm13, %ymm5
	vbroadcastss	192(%r12), %ymm13 // B[6]
	vfmadd231ps		%ymm12, %ymm13, %ymm6
	vbroadcastss	224(%r12), %ymm13 // B[7]
	vfmadd231ps		%ymm12, %ymm13, %ymm7

	subl	$1, %r10d
	addq	$32, %r11
	addq	$4, %r12

	cmpl	$0, %r10d
	jg		3b // clean up loop


2: // return

#if MACRO_LEVEL>=2
	.endm
#else
	ret

FUN_END(inner_kernel_gemm_add_nn_8x8_lib8) 1198#endif 1199 1200 1201 1202 1203 1204// common inner routine with file scope 1205// 1206// edge for B unaligned 1207// 1208// input arguments: 1209// r10 <- k 1210// r11 <- A 1211// r12 <- B 1212// r13 <- bs*sdb*sizeof(double) 1213// r14 <- offB 1214// ymm0 <- [] 1215// ymm1 <- [] 1216// ymm2 <- [] 1217// ymm3 <- [] 1218// ymm8 <- dirty 1219// ymm12 <- dirty 1220// ymm15 <- dirty 1221 1222// 1223// output arguments: 1224// r10 <- k-(4-offB) 1225// r11 <- A+(4-offB)*bs*sizeof(double) 1226// r12 <- B-offB+bs*sdb*sizeof(double) 1227// r13 <- bs*sdb*sizeof(double) 1228// r14 <- offB 1229// ymm0 <- [] 1230// ymm1 <- [] 1231// ymm2 <- [] 1232// ymm3 <- [] 1233// ymm8 <- dirty 1234// ymm12 <- dirty 1235// ymm15 <- dirty 1236 1237 1238#if MACRO_LEVEL>=1 1239 .macro INNER_EDGE_GEMM_ADD_NN_8X8_LIB8 1240#else 1241 .p2align 4,,15 1242 FUN_START(inner_edge_gemm_add_nn_8x8_lib8) 1243#endif 1244 1245 cmpl $0, %r14d // offset==0 1246 jle 2f // end 1247 1248 cmpl $0, %r10d // k==0 1249 jle 2f // end 1250 1251 movl $8, %ebx 1252 subl %r14d, %ebx // 8-offsetB 1253 cmpl %r10d, %ebx 1254// jle 0f 1255// movl %r10d, %ebx // kend=min(k,8-offsetB) 1256//0: 1257 cmovgl %r10d, %ebx // kend=min(k,8-offsetB) 1258 1259 movl %r14d, %eax 1260 sall $2, %eax // offsetB*sizeof(float) 1261 addq %rax, %r12 // B+offsetB*sizeof(float) 1262 1263 // unroll 0 1264 vmovaps 0(%r11), %ymm12 // A[0] 1265 vbroadcastss 0(%r12), %ymm13 // B[0] 1266 vfmadd231ps %ymm12, %ymm13, %ymm0 1267 vbroadcastss 32(%r12), %ymm13 // B[1] 1268 vfmadd231ps %ymm12, %ymm13, %ymm1 1269 vbroadcastss 64(%r12), %ymm13 // B[2] 1270 vfmadd231ps %ymm12, %ymm13, %ymm2 1271 vbroadcastss 96(%r12), %ymm13 // B[3] 1272 vfmadd231ps %ymm12, %ymm13, %ymm3 1273 vbroadcastss 128(%r12), %ymm13 // B[4] 1274 vfmadd231ps %ymm12, %ymm13, %ymm4 1275 vbroadcastss 160(%r12), %ymm13 // B[5] 1276 vfmadd231ps %ymm12, %ymm13, %ymm5 1277 vbroadcastss 192(%r12), %ymm13 // B[6] 1278 vfmadd231ps %ymm12, %ymm13, %ymm6 
1279 vbroadcastss 224(%r12), %ymm13 // B[7] 1280 vfmadd231ps %ymm12, %ymm13, %ymm7 1281 1282 subl $1, %r10d // k-1 1283 subl $1, %ebx // kend-1 1284 addq $32, %r11 // A+1*bs*sizeof(float) 1285 addq $4, %r12 // B+1*sizeof(float) 1286 1287 cmpl $0, %ebx 1288 jg 1b 1289 1290 cmpl $0, %r10d 1291 jle 2f // end 1292 1293 addq %r13, %r12 1294 subq $32, %r12 // B+bs*(sdb-1)*sizeof(float) 1295 12962: 1297 1298#if MACRO_LEVEL>=1 1299 .endm 1300#else 1301 ret 1302 1303 FUN_END(inner_edge_gemm_add_nn_8x8_lib8) 1304#endif 1305 1306 1307 1308 1309 1310// common inner routine with file scope 1311// 1312// strsm 1313// right 1314// lower 1315// transposed 1316// not-unit 1317// 1318// input arguments: 1319// r10 <- D 1320// r11 <- inv_diag_D 1321// r12d <- kn 1322// ymm0 <- [] 1323// ymm1 <- [] 1324// ymm2 <- [] 1325// ymm3 <- [] 1326// ymm4 <- [] 1327// ymm5 <- [] 1328// ymm6 <- [] 1329// ymm7 <- [] 1330// ymm12 <- dirty 1331// ymm13 <- dirty 1332// ymm14 <- dirty 1333// ymm15 <- dirty 1334// 1335// output arguments: 1336// r10 <- D 1337// r11 <- inv_diag_D 1338// r12d <- kn 1339// ymm0 <- [] 1340// ymm1 <- [] 1341// ymm2 <- [] 1342// ymm3 <- [] 1343// ymm4 <- [] 1344// ymm5 <- [] 1345// ymm6 <- [] 1346// ymm7 <- [] 1347// ymm12 <- dirty 1348// ymm13 <- dirty 1349// ymm14 <- dirty 1350// ymm15 <- dirty 1351 1352#if MACRO_LEVEL>=1 1353 .macro INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8 1354#else 1355 .p2align 4,,15 1356 FUN_START(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 1357#endif 1358 1359 vbroadcastss 0(%r11), %ymm13 1360 vmulps %ymm0, %ymm13, %ymm0 1361 vbroadcastss 4(%r10), %ymm13 1362 vfnmadd231ps %ymm0, %ymm13, %ymm1 1363 vbroadcastss 8(%r10), %ymm13 1364 vfnmadd231ps %ymm0, %ymm13, %ymm2 1365 vbroadcastss 12(%r10), %ymm13 1366 vfnmadd231ps %ymm0, %ymm13, %ymm3 1367 vbroadcastss 16(%r10), %ymm13 1368 vfnmadd231ps %ymm0, %ymm13, %ymm4 1369 vbroadcastss 20(%r10), %ymm13 1370 vfnmadd231ps %ymm0, %ymm13, %ymm5 1371 vbroadcastss 24(%r10), %ymm13 1372 vfnmadd231ps %ymm0, %ymm13, %ymm6 1373 
vbroadcastss 28(%r10), %ymm13 1374 vfnmadd231ps %ymm0, %ymm13, %ymm7 1375 1376 vbroadcastss 4(%r11), %ymm13 1377 vmulps %ymm1, %ymm13, %ymm1 1378 vbroadcastss 40(%r10), %ymm13 1379 vfnmadd231ps %ymm1, %ymm13, %ymm2 1380 vbroadcastss 44(%r10), %ymm13 1381 vfnmadd231ps %ymm1, %ymm13, %ymm3 1382 vbroadcastss 48(%r10), %ymm13 1383 vfnmadd231ps %ymm1, %ymm13, %ymm4 1384 vbroadcastss 52(%r10), %ymm13 1385 vfnmadd231ps %ymm1, %ymm13, %ymm5 1386 vbroadcastss 56(%r10), %ymm13 1387 vfnmadd231ps %ymm1, %ymm13, %ymm6 1388 vbroadcastss 60(%r10), %ymm13 1389 vfnmadd231ps %ymm1, %ymm13, %ymm7 1390 1391 vbroadcastss 8(%r11), %ymm13 1392 vmulps %ymm2, %ymm13, %ymm2 1393 vbroadcastss 76(%r10), %ymm13 1394 vfnmadd231ps %ymm2, %ymm13, %ymm3 1395 vbroadcastss 80(%r10), %ymm13 1396 vfnmadd231ps %ymm2, %ymm13, %ymm4 1397 vbroadcastss 84(%r10), %ymm13 1398 vfnmadd231ps %ymm2, %ymm13, %ymm5 1399 vbroadcastss 88(%r10), %ymm13 1400 vfnmadd231ps %ymm2, %ymm13, %ymm6 1401 vbroadcastss 92(%r10), %ymm13 1402 vfnmadd231ps %ymm2, %ymm13, %ymm7 1403 1404 vbroadcastss 12(%r11), %ymm13 1405 vmulps %ymm3, %ymm13, %ymm3 1406 vbroadcastss 112(%r10), %ymm13 1407 vfnmadd231ps %ymm3, %ymm13, %ymm4 1408 vbroadcastss 116(%r10), %ymm13 1409 vfnmadd231ps %ymm3, %ymm13, %ymm5 1410 vbroadcastss 120(%r10), %ymm13 1411 vfnmadd231ps %ymm3, %ymm13, %ymm6 1412 vbroadcastss 124(%r10), %ymm13 1413 vfnmadd231ps %ymm3, %ymm13, %ymm7 1414 1415 vbroadcastss 16(%r11), %ymm13 1416 vmulps %ymm4, %ymm13, %ymm4 1417 cmpl $6, %r12d 1418 jl 0f // ret 1419 vbroadcastss 148(%r10), %ymm13 1420 vfnmadd231ps %ymm4, %ymm13, %ymm5 1421 vbroadcastss 152(%r10), %ymm13 1422 vfnmadd231ps %ymm4, %ymm13, %ymm6 1423 vbroadcastss 156(%r10), %ymm13 1424 vfnmadd231ps %ymm4, %ymm13, %ymm7 1425 1426 vbroadcastss 20(%r11), %ymm13 1427 vmulps %ymm5, %ymm13, %ymm5 1428 cmpl $7, %r12d 1429 jl 0f // ret 1430 vbroadcastss 184(%r10), %ymm13 1431 vfnmadd231ps %ymm5, %ymm13, %ymm6 1432 vbroadcastss 188(%r10), %ymm13 1433 vfnmadd231ps %ymm5, %ymm13, %ymm7 
1434 1435 vbroadcastss 24(%r11), %ymm13 1436 vmulps %ymm6, %ymm13, %ymm6 1437 cmpl $8, %r12d 1438 jl 0f // ret 1439 vbroadcastss 220(%r10), %ymm13 1440 vfnmadd231ps %ymm6, %ymm13, %ymm7 1441 1442 vbroadcastss 28(%r11), %ymm13 1443 vmulps %ymm7, %ymm13, %ymm7 1444 14450: 1446 1447#if MACRO_LEVEL>=1 1448 .endm 1449#else 1450 ret 1451 1452 FUN_END(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 1453#endif 1454 1455 1456 1457 1458 1459// common inner routine with file scope 1460// 1461// cholesky factorization 1462// 1463// input arguments: 1464// r10 <- inv_diag_E 1465// r11d <- kn 1466// ymm0 <- [] 1467// ymm1 <- [] 1468// ymm2 <- [] 1469// ymm3 <- [] 1470// ymm4 <- [] 1471// ymm5 <- [] 1472// ymm6 <- [] 1473// ymm7 <- [] 1474// ymm12 <- dirty 1475// ymm13 <- dirty 1476// ymm14 <- dirty 1477// ymm15 <- dirty 1478// 1479// output arguments: 1480// r10 <- inv_diag_E 1481// r11d <- kn 1482// ymm0 <- [] 1483// ymm1 <- [] 1484// ymm2 <- [] 1485// ymm3 <- [] 1486// ymm4 <- [] 1487// ymm5 <- [] 1488// ymm6 <- [] 1489// ymm7 <- [] 1490// ymm12 <- dirty 1491// ymm13 <- dirty 1492// ymm14 <- dirty 1493// ymm15 <- dirty 1494 1495#if MACRO_LEVEL>=1 1496 .macro INNER_EDGE_POTRF_8X8_VS_LIB8 1497#else 1498 .p2align 4,,15 1499 FUN_START(inner_edge_potrf_8x8_vs_lib8) 1500#endif 1501 1502 vxorps %ymm15, %ymm15, %ymm15 // 0.0 1503#if defined(OS_LINUX) | defined(OS_WINDOWS) 1504 vmovss .LC03(%rip), %xmm14 // 1.0 1505#elif defined(OS_MAC) 1506 vmovss LC03(%rip), %xmm14 // 1.0 1507#endif 1508 1509 vmovss %xmm0, %xmm0, %xmm13 1510 vucomiss %xmm15, %xmm13 // d_00 > 0.0 ? 
1511 jbe 1f 1512 vsqrtss %xmm13, %xmm13, %xmm13 1513 vdivss %xmm13, %xmm14, %xmm13 15142: 1515 vmovss %xmm13, 0(%r10) 1516 vbroadcastss %xmm13, %ymm13 1517// vpermilps $0x00, %xmm13, %xmm13 1518// vinsertf128 $0x1, %xmm13, %ymm13, %ymm13 1519 vmulps %ymm0, %ymm13, %ymm0 1520 vperm2f128 $0x00, %ymm0, %ymm0, %ymm11 1521 vpermilps $0x55, %ymm11, %ymm13 1522 vfnmadd231ps %ymm0, %ymm13, %ymm1 1523 vpermilps $0xaa, %ymm11, %ymm13 1524 vfnmadd231ps %ymm0, %ymm13, %ymm2 1525 vpermilps $0xff, %ymm11, %ymm13 1526 vfnmadd231ps %ymm0, %ymm13, %ymm3 1527 vperm2f128 $0x11, %ymm0, %ymm0, %ymm11 1528 vpermilps $0x00, %ymm11, %ymm13 1529 vfnmadd231ps %ymm0, %ymm13, %ymm4 1530 vpermilps $0x55, %ymm11, %ymm13 1531 vfnmadd231ps %ymm0, %ymm13, %ymm5 1532 vpermilps $0xaa, %ymm11, %ymm13 1533 vfnmadd231ps %ymm0, %ymm13, %ymm6 1534 vpermilps $0xff, %ymm11, %ymm13 1535 vfnmadd231ps %ymm0, %ymm13, %ymm7 1536 1537 1538 vpermilps $0x55, %xmm1, %xmm13 1539 vucomiss %xmm15, %xmm13 // d_11 > 0.0 ? 1540 jbe 3f 1541 vsqrtss %xmm13, %xmm13, %xmm13 1542 vdivss %xmm13, %xmm14, %xmm13 15434: 1544 vmovss %xmm13, 4(%r10) 1545 vbroadcastss %xmm13, %ymm13 1546 vmulps %ymm1, %ymm13, %ymm1 1547 vperm2f128 $0x00, %ymm1, %ymm1, %ymm11 1548 vpermilps $0xaa, %ymm11, %ymm13 1549 vfnmadd231ps %ymm1, %ymm13, %ymm2 1550 vpermilps $0xff, %ymm11, %ymm13 1551 vfnmadd231ps %ymm1, %ymm13, %ymm3 1552 vperm2f128 $0x11, %ymm1, %ymm1, %ymm11 1553 vpermilps $0x00, %ymm11, %ymm13 1554 vfnmadd231ps %ymm1, %ymm13, %ymm4 1555 vpermilps $0x55, %ymm11, %ymm13 1556 vfnmadd231ps %ymm1, %ymm13, %ymm5 1557 vpermilps $0xaa, %ymm11, %ymm13 1558 vfnmadd231ps %ymm1, %ymm13, %ymm6 1559 vpermilps $0xff, %ymm11, %ymm13 1560 vfnmadd231ps %ymm1, %ymm13, %ymm7 1561 1562 1563 vpermilps $0xaa, %xmm2, %xmm13 1564 vucomiss %xmm15, %xmm13 // d_22 > 0.0 ? 
1565 jbe 5f 1566 vsqrtss %xmm13, %xmm13, %xmm13 1567 vdivss %xmm13, %xmm14, %xmm13 15686: 1569 vmovss %xmm13, 8(%r10) 1570 vbroadcastss %xmm13, %ymm13 1571 vmulps %ymm2, %ymm13, %ymm2 1572 vperm2f128 $0x00, %ymm2, %ymm2, %ymm11 1573 vpermilps $0xff, %ymm11, %ymm13 1574 vfnmadd231ps %ymm2, %ymm13, %ymm3 1575 vperm2f128 $0x11, %ymm2, %ymm2, %ymm11 1576 vpermilps $0x00, %ymm11, %ymm13 1577 vfnmadd231ps %ymm2, %ymm13, %ymm4 1578 vpermilps $0x55, %ymm11, %ymm13 1579 vfnmadd231ps %ymm2, %ymm13, %ymm5 1580 vpermilps $0xaa, %ymm11, %ymm13 1581 vfnmadd231ps %ymm2, %ymm13, %ymm6 1582 vpermilps $0xff, %ymm11, %ymm13 1583 vfnmadd231ps %ymm2, %ymm13, %ymm7 1584 1585 1586 vpermilps $0xff, %xmm3, %xmm13 1587 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ? 1588 jbe 7f 1589 vsqrtss %xmm13, %xmm13, %xmm13 1590 vdivss %xmm13, %xmm14, %xmm13 15918: 1592 vmovss %xmm13, 12(%r10) 1593 vbroadcastss %xmm13, %ymm13 1594 vmulps %ymm3, %ymm13, %ymm3 1595 vperm2f128 $0x11, %ymm3, %ymm3, %ymm11 1596 vpermilps $0x00, %ymm11, %ymm13 1597 vfnmadd231ps %ymm3, %ymm13, %ymm4 1598 vpermilps $0x55, %ymm11, %ymm13 1599 vfnmadd231ps %ymm3, %ymm13, %ymm5 1600 vpermilps $0xaa, %ymm11, %ymm13 1601 vfnmadd231ps %ymm3, %ymm13, %ymm6 1602 vpermilps $0xff, %ymm11, %ymm13 1603 vfnmadd231ps %ymm3, %ymm13, %ymm7 1604 1605 1606 vextractf128 $0x1, %ymm4, %xmm13 1607// vpermilps $0x00, %xmm13, %xmm13 1608 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ? 
1609 jbe 9f 1610 vsqrtss %xmm13, %xmm13, %xmm13 1611 vdivss %xmm13, %xmm14, %xmm13 161210: 1613 vmovss %xmm13, 16(%r10) 1614 vbroadcastss %xmm13, %ymm13 1615 vmulps %ymm4, %ymm13, %ymm4 1616 cmpl $6, %r11d 1617 jl 0f // ret 1618 vperm2f128 $0x11, %ymm4, %ymm4, %ymm11 1619 vpermilps $0x55, %ymm11, %ymm13 1620 vfnmadd231ps %ymm4, %ymm13, %ymm5 1621 vpermilps $0xaa, %ymm11, %ymm13 1622 vfnmadd231ps %ymm4, %ymm13, %ymm6 1623 vpermilps $0xff, %ymm11, %ymm13 1624 vfnmadd231ps %ymm4, %ymm13, %ymm7 1625 1626 1627 vextractf128 $0x1, %ymm5, %xmm13 1628 vpermilps $0x55, %xmm13, %xmm13 1629 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ? 1630 jbe 11f 1631 vsqrtss %xmm13, %xmm13, %xmm13 1632 vdivss %xmm13, %xmm14, %xmm13 163312: 1634 vmovss %xmm13, 20(%r10) 1635 vbroadcastss %xmm13, %ymm13 1636 vmulps %ymm5, %ymm13, %ymm5 1637 cmpl $7, %r11d 1638 jl 0f // ret 1639 vperm2f128 $0x11, %ymm5, %ymm5, %ymm11 1640 vpermilps $0xaa, %ymm11, %ymm13 1641 vfnmadd231ps %ymm5, %ymm13, %ymm6 1642 vpermilps $0xff, %ymm11, %ymm13 1643 vfnmadd231ps %ymm5, %ymm13, %ymm7 1644 1645 1646 vextractf128 $0x1, %ymm6, %xmm13 1647 vpermilps $0xaa, %xmm13, %xmm13 1648 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ? 1649 jbe 13f 1650 vsqrtss %xmm13, %xmm13, %xmm13 1651 vdivss %xmm13, %xmm14, %xmm13 165214: 1653 vmovss %xmm13, 24(%r10) 1654 vbroadcastss %xmm13, %ymm13 1655 vmulps %ymm6, %ymm13, %ymm6 1656 cmpl $8, %r11d 1657 jl 0f // ret 1658 vperm2f128 $0x11, %ymm6, %ymm6, %ymm11 1659 vpermilps $0xff, %ymm11, %ymm13 1660 vfnmadd231ps %ymm6, %ymm13, %ymm7 1661 1662 1663 vextractf128 $0x1, %ymm7, %xmm13 1664 vpermilps $0xff, %xmm13, %xmm13 1665 vucomiss %xmm15, %xmm13 // d_33 > 0.0 ? 
1666 jbe 15f 1667 vsqrtss %xmm13, %xmm13, %xmm13 1668 vdivss %xmm13, %xmm14, %xmm13 166916: 1670 vmovss %xmm13, 28(%r10) 1671 vbroadcastss %xmm13, %ymm13 1672 vmulps %ymm7, %ymm13, %ymm7 1673 1674 1675 jmp 0f 1676 1677 16781: 1679 vxorps %ymm13, %ymm13, %ymm13 1680 jmp 2b 1681 16823: 1683 vxorpd %ymm13, %ymm13, %ymm13 1684 jmp 4b 1685 16865: 1687 vxorpd %ymm13, %ymm13, %ymm13 1688 jmp 6b 1689 16907: 1691 vxorpd %ymm13, %ymm13, %ymm13 1692 jmp 8b 1693 16949: 1695 vxorpd %ymm13, %ymm13, %ymm13 1696 jmp 10b 1697 169811: 1699 vxorpd %ymm13, %ymm13, %ymm13 1700 jmp 12b 1701 170213: 1703 vxorpd %ymm13, %ymm13, %ymm13 1704 jmp 14b 1705 170615: 1707 vxorpd %ymm13, %ymm13, %ymm13 1708 jmp 16b 1709 17100: 1711 1712#if MACRO_LEVEL>=1 1713 .endm 1714#else 1715 ret 1716 1717 FUN_END(inner_edge_potrf_8x8_vs_lib8) 1718#endif 1719 1720 1721 1722 1723 1724// common inner routine with file scope 1725// 1726// scale for generic alpha and beta 1727// 1728// input arguments: 1729// r10 <- alpha 1730// r11 <- beta 1731// r12 <- C 1732// ymm0 <- [] 1733// ymm1 <- [] 1734// ymm2 <- [] 1735// ymm3 <- [] 1736// ymm4 <- [] 1737// ymm5 <- [] 1738// ymm6 <- [] 1739// ymm7 <- [] 1740// ymm8 <- dirty 1741// ymm9 <- dirty 1742// ymm10 <- dirty 1743// ymm11 <- dirty 1744// ymm15 <- dirty 1745// 1746// output arguments: 1747// r10 <- alpha 1748// r11 <- beta 1749// r12 <- C 1750// ymm0 <- [] 1751// ymm1 <- [] 1752// ymm2 <- [] 1753// ymm3 <- [] 1754// ymm4 <- [] 1755// ymm5 <- [] 1756// ymm6 <- [] 1757// ymm7 <- [] 1758// ymm8 <- dirty 1759// ymm9 <- dirty 1760// ymm10 <- dirty 1761// ymm11 <- dirty 1762// ymm15 <- dirty 1763 1764#if MACRO_LEVEL>=1 1765 .macro INNER_SCALE_AB_8X8_LIB8 1766#else 1767 .p2align 4,,15 1768 FUN_START(inner_scale_ab_8x8_lib8) 1769#endif 1770 1771 // alpha 1772 vbroadcastss 0(%r10), %ymm11 1773 1774 vmulps %ymm0, %ymm11, %ymm0 1775 vmulps %ymm1, %ymm11, %ymm1 1776 vmulps %ymm2, %ymm11, %ymm2 1777 vmulps %ymm3, %ymm11, %ymm3 1778 vmulps %ymm4, %ymm11, %ymm4 1779 vmulps 
%ymm5, %ymm11, %ymm5 1780 vmulps %ymm6, %ymm11, %ymm6 1781 vmulps %ymm7, %ymm11, %ymm7 1782 1783 // beta 1784 vbroadcastss 0(%r11), %ymm14 1785 1786 vxorps %ymm15, %ymm15, %ymm15 // 0.0 1787 1788 vucomiss %xmm15, %xmm14 // beta==0.0 ? 1789 je 0f // end 1790 1791 vmovaps 0(%r12), %ymm15 1792 vfmadd231ps %ymm15, %ymm14, %ymm0 1793 vmovaps 32(%r12), %ymm15 1794 vfmadd231ps %ymm15, %ymm14, %ymm1 1795 vmovaps 64(%r12), %ymm15 1796 vfmadd231ps %ymm15, %ymm14, %ymm2 1797 vmovaps 96(%r12), %ymm15 1798 vfmadd231ps %ymm15, %ymm14, %ymm3 1799 vmovaps 128(%r12), %ymm15 1800 vfmadd231ps %ymm15, %ymm14, %ymm4 1801 vmovaps 160(%r12), %ymm15 1802 vfmadd231ps %ymm15, %ymm14, %ymm5 1803 vmovaps 192(%r12), %ymm15 1804 vfmadd231ps %ymm15, %ymm14, %ymm6 1805 vmovaps 224(%r12), %ymm15 1806 vfmadd231ps %ymm15, %ymm14, %ymm7 1807 18080: 1809 1810#if MACRO_LEVEL>=1 1811 .endm 1812#else 1813 ret 1814 1815 FUN_END(inner_scale_ab_8x8_lib8) 1816#endif 1817 1818 1819 1820 1821 1822// common inner routine with file scope 1823// 1824// scale scale for generic alpha and beta 1825// 1826// input arguments: 1827// r10 <- alpha 1828// r11 <- beta 1829// r12 <- offset 1830// r13 <- C 1831// r14 <- 4*sdc*sizeof(double) 1832// ymm0 <- [] 1833// ymm1 <- [] 1834// ymm2 <- [] 1835// ymm3 <- [] 1836// ymm4 <- [] 1837// ymm5 <- [] 1838// ymm6 <- [] 1839// ymm7 <- [] 1840// ymm8 <- dirty 1841// ymm9 <- dirty 1842// ymm10 <- dirty 1843// ymm11 <- dirty 1844// ymm15 <- dirty 1845// 1846// output arguments: 1847// r10 <- alpha 1848// r11 <- beta 1849// r12 <- offset 1850// r13 <- C 1851// r14 <- 4*sdc*sizeof(double) 1852// r15 <- n0 // col index: start from (inc) 1853// ymm0 <- [] 1854// ymm1 <- [] 1855// ymm2 <- [] 1856// ymm3 <- [] 1857// ymm4 <- [] 1858// ymm5 <- [] 1859// ymm6 <- [] 1860// ymm7 <- [] 1861// ymm8 <- dirty 1862// ymm9 <- dirty 1863// ymm10 <- dirty 1864// ymm11 <- dirty 1865// ymm15 <- dirty 1866 1867#if MACRO_LEVEL>=1 1868 .macro INNER_SCALE_AB_8X8_GEN_LIB8 1869#else 1870 .p2align 4,,15 1871 
FUN_START(inner_scale_ab_8x8_gen_lib8) 1872#endif 1873 1874 // alpha 1875 vbroadcastss 0(%r10), %ymm11 1876 1877 vmulps %ymm0, %ymm11, %ymm0 1878 vmulps %ymm1, %ymm11, %ymm1 1879 vmulps %ymm2, %ymm11, %ymm2 1880 vmulps %ymm3, %ymm11, %ymm3 1881 1882 vmulps %ymm4, %ymm11, %ymm4 1883 vmulps %ymm5, %ymm11, %ymm5 1884 vmulps %ymm6, %ymm11, %ymm6 1885 vmulps %ymm7, %ymm11, %ymm7 1886 1887 // beta 1888 vbroadcastss 0(%r11), %ymm15 1889 1890 vxorps %ymm14, %ymm14, %ymm14 // 0.0 1891 1892 vucomiss %xmm15, %xmm14 // beta==0.0 ? 1893 je 3f // end 1894 1895 cmpl $0, %r12d 1896 jg 0f 1897 1898 // offset==0 1899 1900 vmovaps 0(%r13), %ymm12 1901 vfmadd231ps %ymm12, %ymm15, %ymm0 1902 vmovaps 32(%r13), %ymm12 1903 vfmadd231ps %ymm12, %ymm15, %ymm1 1904 vmovaps 64(%r13), %ymm12 1905 vfmadd231ps %ymm12, %ymm15, %ymm2 1906 vmovaps 96(%r13), %ymm12 1907 vfmadd231ps %ymm12, %ymm15, %ymm3 1908 vmovaps 128(%r13), %ymm12 1909 vfmadd231ps %ymm12, %ymm15, %ymm4 1910 vmovaps 160(%r13), %ymm12 1911 vfmadd231ps %ymm12, %ymm15, %ymm5 1912 vmovaps 192(%r13), %ymm12 1913 vfmadd231ps %ymm12, %ymm15, %ymm6 1914 vmovaps 224(%r13), %ymm12 1915 vfmadd231ps %ymm12, %ymm15, %ymm7 1916 1917 jmp 7f 1918 19190: 1920 1921 // offset > 0 1922 // 1 2 3 4 5 6 7 1923 1924 movq %r13, %r15 // C0 1925 addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double) 1926 1927 cmpl $4, %r10d 1928 jl 1f 1929 jg 2f 1930 1931 // offset==4 1932 // TODO 1933 jmp 7f 1934 19351: 1936 // 1 2 3 1937 1938 cmpl $2, %r10d 1939 jl 3f 1940 jg 4f 1941 1942 // offset==2 1943 // TODO 1944 jmp 7f 1945 19463: 1947 // offset==1 1948 // TODO 1949 jmp 7f 1950 19514: 1952 // offset==3 1953 // TODO 1954 jmp 7f 1955 19562: 1957 // 5 6 7 1958 1959 cmpl $6, %r10d 1960 jl 5f 1961 jg 6f 1962 1963 // offset==6 1964 // TODO 1965 jmp 7f 1966 19675: 1968 // offset==5 1969 // TODO 1970 jmp 7f 1971 19726: 1973 // offset==7 1974 // TODO 1975 jmp 7f 1976 1977 // end 19787: 1979 1980 1981#if MACRO_LEVEL>=1 1982 .endm 1983#else 1984 ret 1985 1986 
	FUN_END(inner_scale_ab_8x8_gen_lib8)
#endif





// common inner routine with file scope
//
// scale for alpha=1.0 and beta=1.0: accumulator += C
// (despite the original "blend" wording, no lane blend of the accumulator
// registers is performed here; compare inner_blend_scale_11_8x8_lib8)
//
// input arguments:
// r10 <- C
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty
//
// output arguments:
// r10 <- C
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
// ymm15 <- dirty

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_11_8X8_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_scale_11_8x8_lib8)
#endif

	// ymmN += column N of the 8x8 C tile (panel-major, 8 floats = 32 B per column)
	vmovaps		0(%r10), %ymm15
	vaddps		%ymm0, %ymm15, %ymm0
	vmovaps		32(%r10), %ymm15
	vaddps		%ymm1, %ymm15, %ymm1
	vmovaps		64(%r10), %ymm15
	vaddps		%ymm2, %ymm15, %ymm2
	vmovaps		96(%r10), %ymm15
	vaddps		%ymm3, %ymm15, %ymm3
	vmovaps		128(%r10), %ymm15
	vaddps		%ymm4, %ymm15, %ymm4
	vmovaps		160(%r10), %ymm15
	vaddps		%ymm5, %ymm15, %ymm5
	vmovaps		192(%r10), %ymm15
	vaddps		%ymm6, %ymm15, %ymm6
	vmovaps		224(%r10), %ymm15
	vaddps		%ymm7, %ymm15, %ymm7

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_scale_11_8x8_lib8)
#endif





// common inner routine with file scope
//
// scale for alpha=1.0 and beta=1.0, generalized (column-offset) variant
//
// input arguments:
// r10 <- offset
// r11 <- C
// r12 <- 4*sdc*sizeof(double) // NOTE(review): numerically = 8*sdc*sizeof(float); wording kept from the double-precision kernels
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
// ymm8 <- dirty
// ymm9 <- dirty
// ymm10 <- dirty
// ymm11 <- dirty
2085// ymm15 <- dirty 2086// 2087// output arguments: 2088// r10 <- offset 2089// r11 <- C 2090// r12 <- 4*sdc*sizeof(double) 2091// ymm0 <- [] 2092// ymm1 <- [] 2093// ymm2 <- [] 2094// ymm3 <- [] 2095// ymm4 <- [] 2096// ymm5 <- [] 2097// ymm6 <- [] 2098// ymm7 <- [] 2099// ymm8 <- dirty 2100// ymm9 <- dirty 2101// ymm10 <- dirty 2102// ymm11 <- dirty 2103// ymm15 <- dirty 2104 2105#if MACRO_LEVEL>=1 2106 .macro INNER_SCALE_11_8X8_GEN_LIB8 2107#else 2108 .p2align 4,,15 2109 FUN_START(inner_scale_11_8x8_gen_lib8) 2110#endif 2111 2112 cmpl $0, %r10d 2113 jg 0f 2114 2115 // offset==0 2116 2117 vmovaps 0(%r11), %ymm12 2118 vaddps %ymm0, %ymm12, %ymm0 2119 vmovaps 32(%r11), %ymm12 2120 vaddps %ymm1, %ymm12, %ymm1 2121 vmovaps 64(%r11), %ymm12 2122 vaddps %ymm2, %ymm12, %ymm2 2123 vmovaps 96(%r11), %ymm12 2124 vaddps %ymm3, %ymm12, %ymm3 2125 vmovaps 128(%r11), %ymm12 2126 vaddps %ymm4, %ymm12, %ymm4 2127 vmovaps 160(%r11), %ymm12 2128 vaddps %ymm5, %ymm12, %ymm5 2129 vmovaps 192(%r11), %ymm12 2130 vaddps %ymm6, %ymm12, %ymm6 2131 vmovaps 224(%r11), %ymm12 2132 vaddps %ymm7, %ymm12, %ymm7 2133 2134 jmp 7f 2135 21360: 2137 2138 // offset > 0 2139 // 1 2 3 4 5 6 7 2140 2141 movq %r13, %r15 // C0 2142 addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double) 2143 2144 cmpl $4, %r10d 2145 jl 1f 2146 jg 2f 2147 2148 // offset==4 2149 // TODO 2150 jmp 7f 2151 21521: 2153 // 1 2 3 2154 2155 cmpl $2, %r10d 2156 jl 3f 2157 jg 4f 2158 2159 // offset==2 2160 // TODO 2161 jmp 7f 2162 21633: 2164 // offset==1 2165 // TODO 2166 jmp 7f 2167 21684: 2169 // offset==3 2170 // TODO 2171 jmp 7f 2172 21732: 2174 // 5 6 7 2175 2176 cmpl $6, %r10d 2177 jl 5f 2178 jg 6f 2179 2180 // offset==6 2181 // TODO 2182 jmp 7f 2183 21845: 2185 // offset==5 2186 // TODO 2187 jmp 7f 2188 21896: 2190 // offset==7 2191 // TODO 2192 jmp 7f 2193 2194 // end 21957: 2196 2197 2198#if MACRO_LEVEL>=1 2199 .endm 2200#else 2201 ret 2202 2203 FUN_END(inner_scale_11_8x8_gen_lib8) 2204#endif 2205 2206 2207 2208 2209 2210// 
common inner routine with file scope 2211// 2212// blend for generic alpha and beta 2213// 2214// input arguments: 2215// r10 <- alpha 2216// r11 <- beta 2217// r12 <- C 2218// ymm0 <- [] 2219// ymm1 <- [] 2220// ymm2 <- [] 2221// ymm3 <- [] 2222// ymm4 <- [] 2223// ymm5 <- [] 2224// ymm6 <- [] 2225// ymm7 <- [] 2226// ymm8 <- dirty 2227// ymm9 <- dirty 2228// ymm10 <- dirty 2229// ymm11 <- dirty 2230// ymm15 <- dirty 2231// 2232// output arguments: 2233// r10 <- alpha 2234// r11 <- beta 2235// r12 <- C 2236// ymm0 <- [] 2237// ymm1 <- [] 2238// ymm2 <- [] 2239// ymm3 <- [] 2240// ymm4 <- [] 2241// ymm5 <- [] 2242// ymm6 <- [] 2243// ymm7 <- [] 2244// ymm8 <- dirty 2245// ymm9 <- dirty 2246// ymm10 <- dirty 2247// ymm11 <- dirty 2248// ymm15 <- dirty 2249 2250#if MACRO_LEVEL>=1 2251 .macro INNER_BLEND_SCALE_AB_8X8_LIB8 2252#else 2253 .p2align 4,,15 2254 FUN_START(inner_blend_scale_ab_8x8_lib8) 2255#endif 2256 2257 // alpha 2258 vbroadcastss 0(%r10), %ymm11 2259 2260 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010 2261 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101 2262 vblendps $0xaa, %ymm3, %ymm2, %ymm14 2263 vblendps $0x55, %ymm3, %ymm2, %ymm15 2264 2265 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100 2266 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011 2267 vblendps $0xcc, %ymm14, %ymm13, %ymm1 2268 vblendps $0x33, %ymm14, %ymm13, %ymm3 2269 2270 vmulps %ymm0, %ymm11, %ymm0 2271 vmulps %ymm1, %ymm11, %ymm1 2272 vmulps %ymm2, %ymm11, %ymm2 2273 vmulps %ymm3, %ymm11, %ymm3 2274 2275 vblendps $0xaa, %ymm5, %ymm4, %ymm12 2276 vblendps $0x55, %ymm5, %ymm4, %ymm13 2277 vblendps $0xaa, %ymm7, %ymm6, %ymm14 2278 vblendps $0x55, %ymm7, %ymm6, %ymm15 2279 2280 vblendps $0xcc, %ymm15, %ymm12, %ymm4 2281 vblendps $0x33, %ymm15, %ymm12, %ymm6 2282 vblendps $0xcc, %ymm14, %ymm13, %ymm5 2283 vblendps $0x33, %ymm14, %ymm13, %ymm7 2284 2285 vmulps %ymm4, %ymm11, %ymm4 2286 vmulps %ymm5, %ymm11, %ymm5 2287 vmulps %ymm6, %ymm11, %ymm6 2288 vmulps %ymm7, %ymm11, %ymm7 
2289 2290 // beta 2291 vbroadcastss 0(%r11), %ymm14 2292 2293 vxorps %ymm15, %ymm15, %ymm15 // 0.0 2294 2295 vucomiss %xmm15, %xmm14 // beta==0.0 ? 2296 je 0f // end 2297 2298 vmovaps 0(%r12), %ymm15 2299 vfmadd231ps %ymm15, %ymm14, %ymm0 2300 vmovaps 32(%r12), %ymm15 2301 vfmadd231ps %ymm15, %ymm14, %ymm1 2302 vmovaps 64(%r12), %ymm15 2303 vfmadd231ps %ymm15, %ymm14, %ymm2 2304 vmovaps 96(%r12), %ymm15 2305 vfmadd231ps %ymm15, %ymm14, %ymm3 2306 vmovaps 128(%r12), %ymm15 2307 vfmadd231ps %ymm15, %ymm14, %ymm4 2308 vmovaps 160(%r12), %ymm15 2309 vfmadd231ps %ymm15, %ymm14, %ymm5 2310 vmovaps 192(%r12), %ymm15 2311 vfmadd231ps %ymm15, %ymm14, %ymm6 2312 vmovaps 224(%r12), %ymm15 2313 vfmadd231ps %ymm15, %ymm14, %ymm7 2314 23150: 2316 2317#if MACRO_LEVEL>=1 2318 .endm 2319#else 2320 ret 2321 2322 FUN_END(inner_blend_scale_ab_8x8_lib8) 2323#endif 2324 2325 2326 2327 2328 2329// common inner routine with file scope 2330// 2331// blend scale for generic alpha and beta 2332// 2333// input arguments: 2334// r10 <- alpha 2335// r11 <- beta 2336// r12 <- offset 2337// r13 <- C 2338// r14 <- 4*sdc*sizeof(double) 2339// ymm0 <- [] 2340// ymm1 <- [] 2341// ymm2 <- [] 2342// ymm3 <- [] 2343// ymm4 <- [] 2344// ymm5 <- [] 2345// ymm6 <- [] 2346// ymm7 <- [] 2347// ymm8 <- dirty 2348// ymm9 <- dirty 2349// ymm10 <- dirty 2350// ymm11 <- dirty 2351// ymm15 <- dirty 2352// 2353// output arguments: 2354// r10 <- alpha 2355// r11 <- beta 2356// r12 <- offset 2357// r13 <- C 2358// r14 <- 4*sdc*sizeof(double) 2359// ymm0 <- [] 2360// ymm1 <- [] 2361// ymm2 <- [] 2362// ymm3 <- [] 2363// ymm4 <- [] 2364// ymm5 <- [] 2365// ymm6 <- [] 2366// ymm7 <- [] 2367// ymm8 <- dirty 2368// ymm9 <- dirty 2369// ymm10 <- dirty 2370// ymm11 <- dirty 2371// ymm15 <- dirty 2372 2373#if MACRO_LEVEL>=1 2374 .macro INNER_BLEND_SCALE_AB_8X8_GEN_LIB8 2375#else 2376 .p2align 4,,15 2377 FUN_START(inner_blend_scale_ab_8x8_gen_lib8) 2378#endif 2379 2380 // alpha 2381 vbroadcastss 0(%r10), %ymm11 2382 2383 
vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010 2384 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101 2385 vblendps $0xaa, %ymm3, %ymm2, %ymm14 2386 vblendps $0x55, %ymm3, %ymm2, %ymm15 2387 2388 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100 2389 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011 2390 vblendps $0xcc, %ymm14, %ymm13, %ymm1 2391 vblendps $0x33, %ymm14, %ymm13, %ymm3 2392 2393 vmulps %ymm0, %ymm11, %ymm0 2394 vmulps %ymm1, %ymm11, %ymm1 2395 vmulps %ymm2, %ymm11, %ymm2 2396 vmulps %ymm3, %ymm11, %ymm3 2397 2398 vblendps $0xaa, %ymm5, %ymm4, %ymm12 2399 vblendps $0x55, %ymm5, %ymm4, %ymm13 2400 vblendps $0xaa, %ymm7, %ymm6, %ymm14 2401 vblendps $0x55, %ymm7, %ymm6, %ymm15 2402 2403 vblendps $0xcc, %ymm15, %ymm12, %ymm4 2404 vblendps $0x33, %ymm15, %ymm12, %ymm6 2405 vblendps $0xcc, %ymm14, %ymm13, %ymm5 2406 vblendps $0x33, %ymm14, %ymm13, %ymm7 2407 2408 vmulps %ymm4, %ymm11, %ymm4 2409 vmulps %ymm5, %ymm11, %ymm5 2410 vmulps %ymm6, %ymm11, %ymm6 2411 vmulps %ymm7, %ymm11, %ymm7 2412 2413 // beta 2414 vbroadcastss 0(%r11), %ymm15 2415 2416 vxorps %ymm14, %ymm14, %ymm14 // 0.0 2417 2418 vucomiss %xmm15, %xmm14 // beta==0.0 ? 
2419 je 3f // end 2420 2421 cmpl $0, %r12d 2422 jg 0f 2423 2424 // offset==0 2425 2426 vmovaps 0(%r13), %ymm12 2427 vfmadd231ps %ymm12, %ymm15, %ymm0 2428 vmovaps 32(%r13), %ymm12 2429 vfmadd231ps %ymm12, %ymm15, %ymm1 2430 vmovaps 64(%r13), %ymm12 2431 vfmadd231ps %ymm12, %ymm15, %ymm2 2432 vmovaps 96(%r13), %ymm12 2433 vfmadd231ps %ymm12, %ymm15, %ymm3 2434 vmovaps 128(%r13), %ymm12 2435 vfmadd231ps %ymm12, %ymm15, %ymm4 2436 vmovaps 160(%r13), %ymm12 2437 vfmadd231ps %ymm12, %ymm15, %ymm5 2438 vmovaps 192(%r13), %ymm12 2439 vfmadd231ps %ymm12, %ymm15, %ymm6 2440 vmovaps 224(%r13), %ymm12 2441 vfmadd231ps %ymm12, %ymm15, %ymm7 2442 2443 jmp 7f 2444 24450: 2446 2447 // offset > 0 2448 // 1 2 3 4 5 6 7 2449 2450 movq %r13, %r15 // C0 2451 addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double) 2452 2453 cmpl $4, %r12d 2454 jl 1f 2455 jg 2f 2456 2457 // offset==4 2458 // TODO 2459 jmp 7f 2460 24611: 2462 // 1 2 3 2463 2464 cmpl $2, %r12d 2465 jl 3f 2466 jg 4f 2467 2468 // offset==2 2469 // TODO 2470 jmp 7f 2471 24723: 2473 // offset==1 2474 // TODO 2475 jmp 7f 2476 24774: 2478 // offset==3 2479 // TODO 2480 jmp 7f 2481 24822: 2483 // 5 6 7 2484 2485 cmpl $6, %r12d 2486 jl 5f 2487 jg 6f 2488 2489 // offset==6 2490 // TODO 2491 jmp 7f 2492 24935: 2494 // offset==5 2495 // TODO 2496 jmp 7f 2497 24986: 2499 // offset==7 2500 // TODO 2501 jmp 7f 2502 2503 // end 25047: 2505 2506 2507#if MACRO_LEVEL>=1 2508 .endm 2509#else 2510 ret 2511 2512 FUN_END(inner_blend_scale_ab_8x8_gen_lib8) 2513#endif 2514 2515 2516 2517 2518 2519// common inner routine with file scope 2520// 2521// blend for generic alpha=1.0 and beta=1.0 2522// 2523// input arguments: 2524// r10 <- C 2525// ymm0 <- [] 2526// ymm1 <- [] 2527// ymm2 <- [] 2528// ymm3 <- [] 2529// ymm4 <- [] 2530// ymm5 <- [] 2531// ymm6 <- [] 2532// ymm7 <- [] 2533// ymm8 <- dirty 2534// ymm9 <- dirty 2535// ymm10 <- dirty 2536// ymm11 <- dirty 2537// ymm15 <- dirty 2538// 2539// output arguments: 2540// r10 <- C 2541// ymm0 <- [] 
2542// ymm1 <- [] 2543// ymm2 <- [] 2544// ymm3 <- [] 2545// ymm4 <- [] 2546// ymm5 <- [] 2547// ymm6 <- [] 2548// ymm7 <- [] 2549// ymm8 <- dirty 2550// ymm9 <- dirty 2551// ymm10 <- dirty 2552// ymm11 <- dirty 2553// ymm15 <- dirty 2554 2555#if MACRO_LEVEL>=1 2556 .macro INNER_BLEND_SCALE_11_8X8_LIB8 2557#else 2558 .p2align 4,,15 2559 FUN_START(inner_blend_scale_11_8x8_lib8) 2560#endif 2561 2562 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010 2563 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101 2564 vblendps $0xaa, %ymm3, %ymm2, %ymm14 2565 vblendps $0x55, %ymm3, %ymm2, %ymm15 2566 2567 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100 2568 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011 2569 vblendps $0xcc, %ymm14, %ymm13, %ymm1 2570 vblendps $0x33, %ymm14, %ymm13, %ymm3 2571 2572 vblendps $0xaa, %ymm5, %ymm4, %ymm12 2573 vblendps $0x55, %ymm5, %ymm4, %ymm13 2574 vblendps $0xaa, %ymm7, %ymm6, %ymm14 2575 vblendps $0x55, %ymm7, %ymm6, %ymm15 2576 2577 vblendps $0xcc, %ymm15, %ymm12, %ymm4 2578 vblendps $0x33, %ymm15, %ymm12, %ymm6 2579 vblendps $0xcc, %ymm14, %ymm13, %ymm5 2580 vblendps $0x33, %ymm14, %ymm13, %ymm7 2581 2582 vmovaps 0(%r10), %ymm15 2583 vaddps %ymm0, %ymm15, %ymm0 2584 vmovaps 32(%r10), %ymm15 2585 vaddps %ymm1, %ymm15, %ymm1 2586 vmovaps 64(%r10), %ymm15 2587 vaddps %ymm2, %ymm15, %ymm2 2588 vmovaps 96(%r10), %ymm15 2589 vaddps %ymm3, %ymm15, %ymm3 2590 vmovaps 128(%r10), %ymm15 2591 vaddps %ymm4, %ymm15, %ymm4 2592 vmovaps 160(%r10), %ymm15 2593 vaddps %ymm5, %ymm15, %ymm5 2594 vmovaps 192(%r10), %ymm15 2595 vaddps %ymm6, %ymm15, %ymm6 2596 vmovaps 224(%r10), %ymm15 2597 vaddps %ymm7, %ymm15, %ymm7 2598 2599#if MACRO_LEVEL>=1 2600 .endm 2601#else 2602 ret 2603 2604 FUN_END(inner_blend_scale_11_8x8_lib8) 2605#endif 2606 2607 2608 2609 2610 2611// common inner routine with file scope 2612// 2613// blend scale for generic alpha and beta 2614// 2615// input arguments: 2616// r10 <- offset 2617// r11 <- C 2618// r12 <- 4*sdc*sizeof(double) 
2619// ymm0 <- [] 2620// ymm1 <- [] 2621// ymm2 <- [] 2622// ymm3 <- [] 2623// ymm4 <- [] 2624// ymm5 <- [] 2625// ymm6 <- [] 2626// ymm7 <- [] 2627// ymm8 <- dirty 2628// ymm9 <- dirty 2629// ymm10 <- dirty 2630// ymm11 <- dirty 2631// ymm15 <- dirty 2632// 2633// output arguments: 2634// r10 <- offset 2635// r11 <- C 2636// r12 <- 4*sdc*sizeof(double) 2637// ymm0 <- [] 2638// ymm1 <- [] 2639// ymm2 <- [] 2640// ymm3 <- [] 2641// ymm4 <- [] 2642// ymm5 <- [] 2643// ymm6 <- [] 2644// ymm7 <- [] 2645// ymm8 <- dirty 2646// ymm9 <- dirty 2647// ymm10 <- dirty 2648// ymm11 <- dirty 2649// ymm15 <- dirty 2650 2651#if MACRO_LEVEL>=1 2652 .macro INNER_BLEND_SCALE_11_8X8_GEN_LIB8 2653#else 2654 .p2align 4,,15 2655 FUN_START(inner_blend_scale_11_8x8_gen_lib8) 2656#endif 2657 2658 vblendps $0xaa, %ymm1, %ymm0, %ymm12 // 1010 1010 2659 vblendps $0x55, %ymm1, %ymm0, %ymm13 // 0101 0101 2660 vblendps $0xaa, %ymm3, %ymm2, %ymm14 2661 vblendps $0x55, %ymm3, %ymm2, %ymm15 2662 2663 vblendps $0xcc, %ymm15, %ymm12, %ymm0 // 1100 1100 2664 vblendps $0x33, %ymm15, %ymm12, %ymm2 // 0011 0011 2665 vblendps $0xcc, %ymm14, %ymm13, %ymm1 2666 vblendps $0x33, %ymm14, %ymm13, %ymm3 2667 2668 vblendps $0xaa, %ymm5, %ymm4, %ymm12 2669 vblendps $0x55, %ymm5, %ymm4, %ymm13 2670 vblendps $0xaa, %ymm7, %ymm6, %ymm14 2671 vblendps $0x55, %ymm7, %ymm6, %ymm15 2672 2673 vblendps $0xcc, %ymm15, %ymm12, %ymm4 2674 vblendps $0x33, %ymm15, %ymm12, %ymm6 2675 vblendps $0xcc, %ymm14, %ymm13, %ymm5 2676 vblendps $0x33, %ymm14, %ymm13, %ymm7 2677 2678 cmpl $0, %r10d 2679 jg 0f 2680 2681 // offset==0 2682 2683 vmovaps 0(%r11), %ymm12 2684 vaddps %ymm0, %ymm12, %ymm0 2685 vmovaps 32(%r11), %ymm12 2686 vaddps %ymm1, %ymm12, %ymm1 2687 vmovaps 64(%r11), %ymm12 2688 vaddps %ymm2, %ymm12, %ymm2 2689 vmovaps 96(%r11), %ymm12 2690 vaddps %ymm3, %ymm12, %ymm3 2691 vmovaps 128(%r11), %ymm12 2692 vaddps %ymm4, %ymm12, %ymm4 2693 vmovaps 160(%r11), %ymm12 2694 vaddps %ymm5, %ymm12, %ymm5 2695 vmovaps 192(%r11), %ymm12 
2696 vaddps %ymm6, %ymm12, %ymm6 2697 vmovaps 224(%r11), %ymm12 2698 vaddps %ymm7, %ymm12, %ymm7 2699 2700 jmp 7f 2701 27020: 2703 2704 // offset > 0 2705 // 1 2 3 4 5 6 7 2706 2707 movq %r13, %r15 // C0 2708 addq %r14, %r15 // C1 <- C0 + 4*sdc*sizeof(double) 2709 2710 cmpl $4, %r10d 2711 jl 1f 2712 jg 2f 2713 2714 // offset==4 2715 // TODO 2716 jmp 7f 2717 27181: 2719 // 1 2 3 2720 2721 cmpl $2, %r10d 2722 jl 3f 2723 jg 4f 2724 2725 // offset==2 2726 // TODO 2727 jmp 7f 2728 27293: 2730 // offset==1 2731 // TODO 2732 jmp 7f 2733 27344: 2735 // offset==3 2736 // TODO 2737 jmp 7f 2738 27392: 2740 // 5 6 7 2741 2742 cmpl $6, %r10d 2743 jl 5f 2744 jg 6f 2745 2746 // offset==6 2747 // TODO 2748 jmp 7f 2749 27505: 2751 // offset==5 2752 // TODO 2753 jmp 7f 2754 27556: 2756 // offset==7 2757 // TODO 2758 jmp 7f 2759 2760 // end 27617: 2762 2763 2764#if MACRO_LEVEL>=1 2765 .endm 2766#else 2767 ret 2768 2769 FUN_END(inner_blend_scale_11_8x8_gen_lib8) 2770#endif 2771 2772 2773 2774 2775 2776// common inner routine with file scope 2777// 2778// store n 2779// 2780// input arguments: 2781// r10 <- D 2782// ymm0 <- [] 2783// ymm1 <- [] 2784// ymm2 <- [] 2785// ymm3 <- [] 2786// 2787// output arguments: 2788// r10 <- D 2789// ymm0 <- [] 2790// ymm1 <- [] 2791// ymm2 <- [] 2792// ymm3 <- [] 2793 2794#if MACRO_LEVEL>=1 2795 .macro INNER_STORE_8X8_LIB8 2796#else 2797 .p2align 4,,15 2798 FUN_START(inner_store_8x8_lib8) 2799#endif 2800 2801 vmovaps %ymm0, 0(%r10) 2802 vmovaps %ymm1, 32(%r10) 2803 vmovaps %ymm2, 64(%r10) 2804 vmovaps %ymm3, 96(%r10) 2805 vmovaps %ymm4, 128(%r10) 2806 vmovaps %ymm5, 160(%r10) 2807 vmovaps %ymm6, 192(%r10) 2808 vmovaps %ymm7, 224(%r10) 2809 2810#if MACRO_LEVEL>=1 2811 .endm 2812#else 2813 ret 2814 2815 FUN_END(inner_store_8x8_lib8) 2816#endif 2817 2818 2819 2820 2821 2822// common inner routine with file scope 2823// 2824// store n vs 2825// 2826// input arguments: 2827// r10 <- D 2828// r11 <- km 2829// r12 <- kn 2830// ymm0 <- [] 2831// ymm1 <- [] 
2832// ymm2 <- [] 2833// ymm3 <- [] 2834// ymm4 <- [] 2835// ymm5 <- [] 2836// ymm6 <- [] 2837// ymm7 <- [] 2838// 2839// output arguments: 2840// r10 <- D 2841// r11 <- km 2842// r12 <- kn 2843// ymm0 <- [] 2844// ymm1 <- [] 2845// ymm2 <- [] 2846// ymm3 <- [] 2847// ymm4 <- [] 2848// ymm5 <- [] 2849// ymm6 <- [] 2850// ymm7 <- [] 2851 2852#if MACRO_LEVEL>=1 2853 .macro INNER_STORE_8X8_VS_LIB8 2854#else 2855 .p2align 4,,15 2856 FUN_START(inner_store_8x8_vs_lib8) 2857#endif 2858 2859 // compute mask for rows 2860 vcvtsi2ss %r11d, %xmm15, %xmm15 2861#if defined(OS_LINUX) | defined(OS_WINDOWS) 2862 vmovups .LC00(%rip), %ymm12 2863#elif defined(OS_MAC) 2864 vmovups LC00(%rip), %ymm12 2865#endif 2866 vshufps $0x00, %xmm15, %xmm15, %xmm15 2867 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15 2868 vsubps %ymm15, %ymm12, %ymm15 2869 2870 vmaskmovps %ymm0, %ymm15, 0(%r10) 2871 vmaskmovps %ymm1, %ymm15, 32(%r10) 2872 vmaskmovps %ymm2, %ymm15, 64(%r10) 2873 vmaskmovps %ymm3, %ymm15, 96(%r10) 2874 vmaskmovps %ymm4, %ymm15, 128(%r10) 2875 cmpl $6, %r12d 2876 jl 0f // end 2877 vmaskmovps %ymm5, %ymm15, 160(%r10) 2878 cmpl $7, %r12d 2879 jl 0f // end 2880 vmaskmovps %ymm6, %ymm15, 192(%r10) 2881 je 0f // end 2882 vmaskmovps %ymm7, %ymm15, 224(%r10) 2883 28840: 2885 2886#if MACRO_LEVEL>=1 2887 .endm 2888#else 2889 ret 2890 2891 FUN_END(inner_store_8x8_vs_lib8) 2892#endif 2893 2894 2895 2896 2897 2898// common inner routine with file scope 2899// 2900// store n generalized 2901// 2902// input arguments: 2903// r10 <- offset 2904// r11 <- D 2905// r12 <- 4*sdd*sizeof(double) 2906// r13 <- m0 // row index: start from (inc) 2907// r14 <- m1 // row index: up to (exc) 2908// r15 <- n0 // col index: start from (inc) 2909// rax <- n1 // col index: up to (exc) 2910// rbx <- dirty 2911// ymm0 <- [] 2912// ymm1 <- [] 2913// ymm2 <- [] 2914// ymm3 <- [] 2915// ymm4 <- [] 2916// ymm5 <- [] 2917// ymm6 <- [] 2918// ymm7 <- [] 2919// 2920// output arguments: 2921// r10 <- offset 2922// r11 <- D 2923// 
r12 <- 4*sdd*sizeof(double) 2924// r13 <- m0 // row index: start from (inc) 2925// r14 <- m1 // row index: up to (exc) 2926// r15 <- n1-n0 2927// rax <- n1-n0 2928// rbx <- dirty 2929// ymm0 <- [] 2930// ymm1 <- [] 2931// ymm2 <- [] 2932// ymm3 <- [] 2933// ymm4 <- [] 2934// ymm5 <- [] 2935// ymm6 <- [] 2936// ymm7 <- [] 2937 2938#if MACRO_LEVEL>=1 2939 .macro INNER_STORE_8X8_GEN_LIB8 2940#else 2941 .p2align 4,,15 2942 FUN_START(inner_store_8x8_gen_lib8) 2943#endif 2944 2945 // compute mask for rows 2946 vcvtsi2ss %r13d, %xmm14, %xmm14 2947 vcvtsi2ss %r14d, %xmm15, %xmm15 2948#if defined(OS_LINUX) | defined(OS_WINDOWS) 2949 vmovups .LC00(%rip), %ymm12 2950#elif defined(OS_MAC) 2951 vmovups LC00(%rip), %ymm12 2952#endif 2953 vshufps $0x00, %xmm14, %xmm14, %xmm14 2954 vshufps $0x00, %xmm15, %xmm15, %xmm15 2955 vinsertf128 $0x1, %xmm14, %ymm14, %ymm14 2956 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15 2957 vsubps %ymm12, %ymm14, %ymm14 2958 vsubps %ymm15, %ymm12, %ymm15 2959 vandps %ymm14, %ymm15, %ymm15 2960 2961 // shift D and sol for cols 2962 cmpl $0, %r15d 2963 jle 0f 2964 2965 vmovaps %ymm1, %ymm0 2966 vmovaps %ymm2, %ymm1 2967 vmovaps %ymm3, %ymm2 2968 vmovaps %ymm4, %ymm3 2969 vmovaps %ymm5, %ymm4 2970 vmovaps %ymm6, %ymm5 2971 vmovaps %ymm7, %ymm6 2972 addq $32, %r11 2973 2974 cmpl $1, %r15d 2975 jle 0f 2976 2977 vmovaps %ymm1, %ymm0 2978 vmovaps %ymm2, %ymm1 2979 vmovaps %ymm3, %ymm2 2980 vmovaps %ymm4, %ymm3 2981 vmovaps %ymm5, %ymm4 2982 vmovaps %ymm6, %ymm5 2983 addq $32, %r11 2984 2985 cmpl $2, %r15d 2986 jle 0f 2987 2988 vmovaps %ymm1, %ymm0 2989 vmovaps %ymm3, %ymm2 2990 vmovaps %ymm4, %ymm3 2991 vmovaps %ymm5, %ymm4 2992 addq $32, %r11 2993 29940: 2995 2996 // compute number of cols 2997 cmpl $8, %eax 2998 jle 0f 2999 movl $8, %eax 30000: 3001 subl %r15d, %eax 3002 movl %eax, %r15d 3003 3004 cmpl $0, %r10d 3005 jg 0f 3006 3007 // offset==0 3008 vmaskmovps %ymm0, %ymm15, 0(%r11) 3009 vmaskmovps %ymm1, %ymm15, 32(%r11) 3010 vmaskmovps %ymm2, %ymm15, 64(%r11) 
3011 vmaskmovps %ymm3, %ymm15, 96(%r11) 3012 vmaskmovps %ymm4, %ymm15, 128(%r11) 3013 cmpl $6, %r15d 3014 jl 7f // end 3015 vmaskmovps %ymm5, %ymm15, 160(%r11) 3016 cmpl $7, %r15d 3017 jl 7f // end 3018 vmaskmovps %ymm6, %ymm15, 192(%r11) 3019 je 7f // end 3020 vmaskmovps %ymm7, %ymm15, 224(%r11) 3021 // 3022 jmp 7f 3023 30240: 3025 // offset > 0 3026 // 1 2 3 4 5 6 7 3027 3028 movq %r11, %rbx // D0 3029 addq %r12, %rbx // D1 <- D0 + 4*sdd*sizeof(double) 3030 3031 cmpl $4, %r10d 3032 jl 1f 3033 jg 2f 3034 3035 // offset==4 3036 // TODO 3037 jmp 7f 3038 30391: 3040 // 1 2 3 3041 3042 cmpl $2, %r10d 3043 jl 3f 3044 jg 4f 3045 3046 // offset==2 3047 // TODO 3048 jmp 7f 3049 30503: 3051 // offset==1 3052 // TODO 3053 jmp 7f 3054 30554: 3056 // offset==3 3057 // TODO 3058 jmp 7f 3059 30602: 3061 // 5 6 7 3062 3063 cmpl $6, %r10d 3064 jl 5f 3065 jg 6f 3066 3067 // offset==6 3068 // TODO 3069 jmp 7f 3070 30715: 3072 // offset==5 3073 // TODO 3074 jmp 7f 3075 30766: 3077 // offset==7 3078 // TODO 3079 jmp 7f 3080 3081 // end 30827: 3083 3084#if MACRO_LEVEL>=1 3085 .endm 3086#else 3087 ret 3088 3089 FUN_END(inner_store_8x8_gen_lib8) 3090#endif 3091 3092 3093 3094 3095 3096// common inner routine with file scope 3097// 3098// store n 3099// 3100// input arguments: 3101// r10 <- D 3102// ymm0 <- [] 3103// ymm1 <- [] 3104// ymm2 <- [] 3105// ymm3 <- [] 3106// 3107// output arguments: 3108// r10 <- D 3109// ymm0 <- [] 3110// ymm1 <- [] 3111// ymm2 <- [] 3112// ymm3 <- [] 3113 3114#if MACRO_LEVEL>=1 3115 .macro INNER_STORE_L_8X8_LIB8 3116#else 3117 .p2align 4,,15 3118 FUN_START(inner_store_l_8x8_lib8) 3119#endif 3120 3121 vmovaps %ymm0, 0(%r10) 3122 vmovaps 32(%r10), %ymm14 3123 vblendps $0x01, %ymm14, %ymm1, %ymm1 3124 vmovaps %ymm1, 32(%r10) 3125 vmovaps 64(%r10), %ymm14 3126 vblendps $0x03, %ymm14, %ymm2, %ymm2 3127 vmovaps %ymm2, 64(%r10) 3128 vmovaps 96(%r10), %ymm14 3129 vblendps $0x07, %ymm14, %ymm3, %ymm3 3130 vmovaps %ymm3, 96(%r10) 3131 vmovaps 128(%r10), %ymm14 3132 
vblendps $0x0f, %ymm14, %ymm4, %ymm4 3133 vmovaps %ymm4, 128(%r10) 3134 vmovaps 160(%r10), %ymm14 3135 vblendps $0x1f, %ymm14, %ymm5, %ymm5 3136 vmovaps %ymm5, 160(%r10) 3137 vmovaps 192(%r10), %ymm14 3138 vblendps $0x3f, %ymm14, %ymm6, %ymm6 3139 vmovaps %ymm6, 192(%r10) 3140 vmovaps 224(%r10), %ymm14 3141 vblendps $0x7f, %ymm14, %ymm7, %ymm7 3142 vmovaps %ymm7, 224(%r10) 3143 3144#if MACRO_LEVEL>=1 3145 .endm 3146#else 3147 ret 3148 3149 FUN_END(inner_store_l_8x8_lib8) 3150#endif 3151 3152 3153 3154 3155 3156// common inner routine with file scope 3157// 3158// store lower vs 3159// 3160// input arguments: 3161// r10 <- D 3162// r11 <- km 3163// r12 <- kn 3164// ymm0 <- [] 3165// ymm1 <- [] 3166// ymm2 <- [] 3167// ymm3 <- [] 3168// ymm4 <- [] 3169// ymm5 <- [] 3170// ymm6 <- [] 3171// ymm7 <- [] 3172// 3173// output arguments: 3174// r10 <- D 3175// r11 <- km 3176// r12 <- kn 3177// ymm0 <- [] 3178// ymm1 <- [] 3179// ymm2 <- [] 3180// ymm3 <- [] 3181// ymm4 <- [] 3182// ymm5 <- [] 3183// ymm6 <- [] 3184// ymm7 <- [] 3185 3186#if MACRO_LEVEL>=1 3187 .macro INNER_STORE_L_8X8_VS_LIB8 3188#else 3189 .p2align 4,,15 3190 FUN_START(inner_store_l_8x8_vs_lib8) 3191#endif 3192 3193 // compute mask for rows 3194 vcvtsi2ss %r11d, %xmm15, %xmm15 3195#if defined(OS_LINUX) | defined(OS_WINDOWS) 3196 vmovups .LC00(%rip), %ymm12 3197#elif defined(OS_MAC) 3198 vmovups LC00(%rip), %ymm12 3199#endif 3200 vshufps $0x00, %xmm15, %xmm15, %xmm15 3201 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15 3202 vsubps %ymm15, %ymm12, %ymm15 3203 3204 // offset==0 3205 vmaskmovps %ymm0, %ymm15, 0(%r10) 3206 vmovaps 32(%r10), %ymm12 3207 vblendps $0x01, %ymm12, %ymm1, %ymm1 3208 vmaskmovps %ymm1, %ymm15, 32(%r10) 3209 vmovaps 64(%r10), %ymm12 3210 vblendps $0x03, %ymm12, %ymm2, %ymm2 3211 vmaskmovps %ymm2, %ymm15, 64(%r10) 3212 vmovaps 96(%r10), %ymm12 3213 vblendps $0x07, %ymm12, %ymm3, %ymm3 3214 vmaskmovps %ymm3, %ymm15, 96(%r10) 3215 vmovaps 128(%r10), %ymm12 3216 vblendps $0x0f, %ymm12, %ymm4, 
%ymm4
	vmaskmovps	%ymm4, %ymm15, 128(%r10)
	cmpl		$6, %r12d
	jl			0f // end
	vmovaps		160(%r10), %ymm12
	vblendps	$0x1f, %ymm12, %ymm5, %ymm5
	vmaskmovps	%ymm5, %ymm15, 160(%r10)
	cmpl		$7, %r12d
	jl			0f // end
	vmovaps		192(%r10), %ymm12
	vblendps	$0x3f, %ymm12, %ymm6, %ymm6
	vmaskmovps	%ymm6, %ymm15, 192(%r10)
	je			0f // end
	vmovaps		224(%r10), %ymm12
	vblendps	$0x7f, %ymm12, %ymm7, %ymm7
	vmaskmovps	%ymm7, %ymm15, 224(%r10)

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	// NOTE(fix): was FUN_END(inner_store_8x8_vs_lib8), i.e. the .size
	// directive was emitted for the wrong (already closed) symbol and
	// this routine was left without size information
	FUN_END(inner_store_l_8x8_vs_lib8)
#endif




// common inner routine with file scope
//
// store lower generalized
//
// input arguments:
// r10  <- offset
// r11  <- D
// r12  <- 8*sdd*sizeof(float)
// r13  <- m0 // row index: start from (inc)
// r14  <- m1 // row index: up to (exc)
// r15  <- n0 // col index: start from (inc)
// rax  <- n1 // col index: up to (exc)
// rbx  <- dirty
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []
//
// output arguments:
// r10  <- offset
// r11  <- D
// r12  <- 8*sdd*sizeof(float)
// r13  <- m0 // row index: start from (inc)
// r14  <- m1 // row index: up to (exc)
// r15  <- n1-n0
// rax  <- n1-n0
// rbx  <- dirty
// ymm0 <- []
// ymm1 <- []
// ymm2 <- []
// ymm3 <- []
// ymm4 <- []
// ymm5 <- []
// ymm6 <- []
// ymm7 <- []

#if MACRO_LEVEL>=1
	.macro INNER_STORE_L_8X8_GEN_LIB8
#else
	.p2align 4,,15
	FUN_START(inner_store_l_8x8_gen_lib8)
#endif

	// compute mask for rows
	vcvtsi2ss	%r13d, %xmm14, %xmm14
	vcvtsi2ss	%r14d, %xmm15, %xmm15
#if defined(OS_LINUX) | defined(OS_WINDOWS)
	vmovups		.LC00(%rip), %ymm12
#elif defined(OS_MAC)
	vmovups		LC00(%rip), %ymm12
#endif
	vshufps	$0x00, %xmm14,
%xmm14, %xmm14 3303 vshufps $0x00, %xmm15, %xmm15, %xmm15 3304 vinsertf128 $0x1, %xmm14, %ymm14, %ymm14 3305 vinsertf128 $0x1, %xmm15, %ymm15, %ymm15 3306 vsubps %ymm12, %ymm14, %ymm14 3307 vsubps %ymm15, %ymm12, %ymm15 3308 vandps %ymm14, %ymm15, %ymm15 3309 3310 // shift D and sol for cols 3311 cmpl $0, %r15d 3312 jle 0f 3313 3314 vmovaps %ymm1, %ymm0 3315 vmovaps %ymm2, %ymm1 3316 vmovaps %ymm3, %ymm2 3317 vmovaps %ymm4, %ymm3 3318 vmovaps %ymm5, %ymm4 3319 vmovaps %ymm6, %ymm5 3320 vmovaps %ymm7, %ymm6 3321 addq $32, %r11 3322 3323 cmpl $1, %r15d 3324 jle 0f 3325 3326 vmovaps %ymm1, %ymm0 3327 vmovaps %ymm2, %ymm1 3328 vmovaps %ymm3, %ymm2 3329 vmovaps %ymm4, %ymm3 3330 vmovaps %ymm5, %ymm4 3331 vmovaps %ymm6, %ymm5 3332 addq $32, %r11 3333 3334 cmpl $2, %r15d 3335 jle 0f 3336 3337 vmovaps %ymm1, %ymm0 3338 vmovaps %ymm3, %ymm2 3339 vmovaps %ymm4, %ymm3 3340 vmovaps %ymm5, %ymm4 3341 addq $32, %r11 3342 33430: 3344 3345 // compute number of cols 3346 cmpl $8, %eax 3347 jle 0f 3348 movl $8, %eax 33490: 3350 subl %r15d, %eax 3351 movl %eax, %r15d 3352 3353 cmpl $0, %r10d 3354 jg 0f 3355 3356 // offset==0 3357 vmaskmovps %ymm0, %ymm15, 0(%r11) 3358 vmovaps 32(%r11), %ymm12 3359 vblendps $0x01, %ymm12, %ymm1, %ymm1 3360 vmaskmovps %ymm1, %ymm15, 32(%r11) 3361 vmovaps 64(%r11), %ymm12 3362 vblendps $0x03, %ymm12, %ymm2, %ymm2 3363 vmaskmovps %ymm2, %ymm15, 64(%r11) 3364 vmovaps 96(%r11), %ymm12 3365 vblendps $0x07, %ymm12, %ymm3, %ymm3 3366 vmaskmovps %ymm3, %ymm15, 96(%r11) 3367 vmovaps 128(%r11), %ymm12 3368 vblendps $0x0f, %ymm12, %ymm4, %ymm4 3369 vmaskmovps %ymm4, %ymm15, 128(%r11) 3370 cmpl $6, %r15d 3371 jl 7f // end 3372 vmovaps 160(%r11), %ymm12 3373 vblendps $0x1f, %ymm12, %ymm5, %ymm5 3374 vmaskmovps %ymm5, %ymm15, 160(%r11) 3375 cmpl $7, %r15d 3376 jl 7f // end 3377 vmovaps 192(%r11), %ymm12 3378 vblendps $0x3f, %ymm12, %ymm6, %ymm6 3379 vmaskmovps %ymm6, %ymm15, 192(%r11) 3380 je 7f // end 3381 vmovaps 224(%r11), %ymm12 3382 vblendps $0x7f, %ymm12, 
%ymm7, %ymm7
	vmaskmovps	%ymm7, %ymm15, 224(%r11)
	//
	jmp		7f

0:
	// offset > 0
	// 1 2 3 4 5 6 7

	movq	%r11, %rbx // D0
	addq	%r12, %rbx // D1 <- D0 + 8*sdd*sizeof(float)

	cmpl	$4, %r10d
	jl		1f
	jg		2f

	// offset==4
	// TODO
	jmp		7f

1:
	// 1 2 3

	cmpl	$2, %r10d
	jl		3f
	jg		4f

	// offset==2
	// TODO
	jmp		7f

3:
	// offset==1
	// TODO
	jmp		7f

4:
	// offset==3
	// TODO
	jmp		7f

2:
	// 5 6 7

	cmpl	$6, %r10d
	jl		5f
	jg		6f

	// offset==6
	// TODO
	jmp		7f

5:
	// offset==5
	// TODO
	jmp		7f

6:
	// offset==7
	// TODO
	jmp		7f

	// end
7:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	// NOTE(fix): was FUN_END(inner_store_8x8_gen_lib8), i.e. the .size
	// directive was emitted for the wrong (already closed) symbol and
	// this routine was left without size information
	FUN_END(inner_store_l_8x8_gen_lib8)
#endif




//                               1      2              3        4        5             6        7
// void kernel_sgemm_nt_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemm_nt_8x8_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
#else
	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
#endif


	// call inner scale

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB8
#else
	CALL(inner_scale_ab_8x8_lib8)
#endif


	// store n

	movq	ARG7, %r10 // D

#if
MACRO_LEVEL>=1
	INNER_STORE_8X8_LIB8
#else
	CALL(inner_store_8x8_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemm_nt_8x8_lib8)




//                                  1      2              3        4        5             6        7         8       9
// void kernel_sgemm_nt_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemm_nt_8x8_vs_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
	vmovaps	%ymm0, %ymm1
	vmovaps	%ymm0, %ymm2
	vmovaps	%ymm0, %ymm3
	vmovaps	%ymm0, %ymm4
	vmovaps	%ymm0, %ymm5
	vmovaps	%ymm0, %ymm6
	vmovaps	%ymm0, %ymm7


	// call inner dgemm kernel nt

	movq	ARG1, %r10 // k
	movq	ARG3, %r11 // A
	movq	ARG4, %r12 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8
#else
	CALL(inner_kernel_gemm_add_nt_8x8_lib8)
#endif


	// call inner scale

	movq	ARG2, %r10 // alpha
	movq	ARG5, %r11 // beta
	movq	ARG6, %r12 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_8X8_LIB8
#else
	CALL(inner_scale_ab_8x8_lib8)
#endif


	// store n vs

	movq	ARG7, %r10 // D
	movq	ARG8, %r11 // km
	movq	ARG9, %r12 // kn

#if MACRO_LEVEL>=1
	INNER_STORE_8X8_VS_LIB8
#else
	CALL(inner_store_8x8_vs_lib8)
#endif


	EPILOGUE

	ret

	FUN_END(kernel_sgemm_nt_8x8_vs_lib8)




//                                   rdi    rsi            rdx      rcx      r8            r9           rsp+8     rsp+16   rsp+24       rsp+32    rsp+40   rsp+48  rsp+56  rsp+64  rsp+72
// void kernel_sgemm_nt_8x8_gen_lib8(int k, float *alpha, float *A, float *B, float *beta, int offsetC, float *C, int sdc, int offsetD, float *D, int sdd, int m0, int m1, int n0, int n1);

	.p2align 4,,15
	GLOB_FUN_START(kernel_sgemm_nt_8x8_gen_lib8)

	PROLOGUE

	// zero accumulation registers

	vxorpd	%ymm0, %ymm0, %ymm0
3606 vmovaps %ymm0, %ymm1 3607 vmovaps %ymm0, %ymm2 3608 vmovaps %ymm0, %ymm3 3609 vmovaps %ymm0, %ymm4 3610 vmovaps %ymm0, %ymm5 3611 vmovaps %ymm0, %ymm6 3612 vmovaps %ymm0, %ymm7 3613 3614 3615 // call inner dgemm kernel nt 3616 3617 movq ARG1, %r10 // k 3618 movq ARG3, %r11 // A 3619 movq ARG4, %r12 // B 3620 3621#if MACRO_LEVEL>=2 3622 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 3623#else 3624 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 3625#endif 3626 3627 3628 // call inner blend scale 3629 3630 movq ARG2, %r10 // alpha 3631 movq ARG5, %r11 // beta 3632 movq ARG6, %r12 // offsetC 3633 movq ARG7, %r13 // C 3634 movq ARG8, %r14 // sdc 3635 sall $5, %r14d // 8*sdc*sizeof(float) 3636 3637#if MACRO_LEVEL>=1 3638 INNER_SCALE_AB_8X8_GEN_LIB8 3639#else 3640 CALL(inner_scale_ab_8x8_gen_lib8) 3641#endif 3642 3643 3644 // store n gen 3645 3646 movq ARG9, %r10 // offsetD 3647 movq ARG10, %r11 // D 3648 movq ARG11, %r12 // sdd 3649 sall $5, %r12d // 8*sdb*sizeof(float) 3650 movq ARG12, %r13 // m0 3651 movq ARG13, %r14 // m1 3652 movq ARG14, %r15 // n0 3653 movq ARG15, %rax // n1 3654 3655#if MACRO_LEVEL>=1 3656 INNER_STORE_8X8_GEN_LIB8 3657#else 3658 CALL(inner_store_8x8_gen_lib8) 3659#endif 3660 3661 3662 EPILOGUE 3663 3664 ret 3665 3666 FUN_END(kernel_sgemm_nt_8x8_gen_lib8) 3667 3668 3669 3670 3671 3672// 1 2 3 4 5 6 7 8 9 3673// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D); 3674 3675 .p2align 4,,15 3676 GLOB_FUN_START(kernel_sgemm_nn_8x8_lib8) 3677 3678 PROLOGUE 3679 3680 // zero accumulation registers 3681 3682 ZERO_ACC 3683 3684 3685 // call inner dgemm kernel nn 3686 3687 movq ARG1, %r10 // k 3688 movq ARG3, %r11 // A 3689 movq ARG5, %r12 // B 3690 movq ARG6, %r13 // sdb 3691 sall $5, %r13d // 4*sdb*sizeof(double) 3692 movq ARG4, %r14 // offsetB 3693 3694#if MACRO_LEVEL>=1 3695 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8 3696#else 3697 CALL(inner_edge_gemm_add_nn_8x8_lib8) 3698#endif 3699 3700#if MACRO_LEVEL>=2 
3701 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8 3702#else 3703 CALL(inner_kernel_gemm_add_nn_8x8_lib8) 3704#endif 3705 3706 3707 // call inner blend 3708 3709 movq ARG2, %r10 // alpha 3710 movq ARG7, %r11 // beta 3711 movq ARG8, %r12 // C 3712 3713#if MACRO_LEVEL>=1 3714 INNER_SCALE_AB_8X8_LIB8 3715#else 3716 CALL(inner_scale_ab_8x8_lib8) 3717#endif 3718 3719 3720 // store n 3721 3722 movq ARG9, %r10 // D 3723 3724#if MACRO_LEVEL>=1 3725 INNER_STORE_8X8_LIB8 3726#else 3727 CALL(inner_store_8x8_lib8) 3728#endif 3729 3730 3731 EPILOGUE 3732 3733 ret 3734 3735 FUN_END(kernel_sgemm_nn_8x8_lib8) 3736 3737 3738 3739 3740 3741// 1 2 3 4 5 6 7 8 9 10 11 3742// void kernel_sgemm_nn_8x8_lib8(int k, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D, int km, int kn); 3743 3744 .p2align 4,,15 3745 GLOB_FUN_START(kernel_sgemm_nn_8x8_vs_lib8) 3746 3747 PROLOGUE 3748 3749 // zero accumulation registers 3750 3751 ZERO_ACC 3752 3753 3754 // call inner dgemm kernel nn 3755 3756 movq ARG1, %r10 // k 3757 movq ARG3, %r11 // A 3758 movq ARG5, %r12 // B 3759 movq ARG6, %r13 // sdb 3760 sall $5, %r13d // 4*sdb*sizeof(double) 3761 movq ARG4, %r14 // offsetB 3762 3763#if MACRO_LEVEL>=1 3764 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8 3765#else 3766 CALL(inner_edge_gemm_add_nn_8x8_lib8) 3767#endif 3768 3769#if MACRO_LEVEL>=2 3770 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8 3771#else 3772 CALL(inner_kernel_gemm_add_nn_8x8_lib8) 3773#endif 3774 3775 3776 // call inner blend 3777 3778 movq ARG2, %r10 // alpha 3779 movq ARG7, %r11 // beta 3780 movq ARG8, %r12 // C 3781 3782#if MACRO_LEVEL>=1 3783 INNER_SCALE_AB_8X8_LIB8 3784#else 3785 CALL(inner_scale_ab_8x8_lib8) 3786#endif 3787 3788 3789 // store n 3790 3791 movq ARG9, %r10 // D 3792 movq ARG10, %r11 // D 3793 movq ARG11, %r12 // D 3794 3795#if MACRO_LEVEL>=1 3796 INNER_STORE_8X8_VS_LIB8 3797#else 3798 CALL(inner_store_8x8_vs_lib8) 3799#endif 3800 3801 3802 EPILOGUE 3803 3804 ret 3805 3806 FUN_END(kernel_sgemm_nn_8x8_vs_lib8) 3807 3808 
3809 3810 3811 3812// rdi rsi rdx rcx r8 r9 rsp+8 rsp+16 rsp+24 rsp+32 rsp+40 rsp+48 rsp+56 rsp+64 rsp+72 rsp+80 rsp+88 3813// void kernel_sgemm_nn_8x8_gen_lib4(int k, float *alpha, float *A, int offB, float *B, int sdb, float *beta, int offC, float *C, int sdc, int offD, float *D, int sdd, int m0, int m1, int n0, int n1); 3814 3815 .p2align 4,,15 3816 GLOB_FUN_START(kernel_sgemm_nn_8x8_gen_lib8) 3817 3818 PROLOGUE 3819 3820 // zero accumulation registers 3821 3822 ZERO_ACC 3823 3824 3825 // call inner dgemm kernel nn 3826 3827 movq ARG1, %r10 // k 3828 movq ARG3, %r11 // A 3829 movq ARG5, %r12 // B 3830 movq ARG6, %r13 // sdb 3831 sall $5, %r13d // 4*sdb*sizeof(double) 3832 movq ARG4, %r14 // offsetB 3833 3834#if MACRO_LEVEL>=1 3835 INNER_EDGE_GEMM_ADD_NN_8X8_LIB8 3836#else 3837 CALL(inner_edge_gemm_add_nn_8x8_lib8) 3838#endif 3839 3840#if MACRO_LEVEL>=2 3841 INNER_KERNEL_GEMM_ADD_NN_8X8_LIB8 3842#else 3843 CALL(inner_kernel_gemm_add_nn_8x8_lib8) 3844#endif 3845 3846 3847 // call inner blend scale 3848 3849 movq ARG2, %r10 // alpha 3850 movq ARG7, %r11 // beta 3851 movq ARG8, %r12 // offsetC 3852 movq ARG9, %r13 // C 3853 movq ARG10, %r14 // sdc 3854 sall $5, %r14d // 4*sdc*sizeof(double) 3855 3856#if MACRO_LEVEL>=1 3857 INNER_SCALE_AB_8X8_GEN_LIB8 3858#else 3859 CALL(inner_scale_ab_8x8_gen_lib8) 3860#endif 3861 3862 3863 // store n gen 3864 3865 movq ARG11, %r10 // offsetD 3866 movq ARG12, %r11 // D 3867 movq ARG13, %r12 // sdd 3868 sall $5, %r12d // 4*sdb*sizeof(double) 3869 movq ARG14, %r13 // m0 3870 movq ARG15, %r14 // m1 3871 movq ARG16, %r15 // n0 3872 movq ARG17, %rax // n1 3873 3874#if MACRO_LEVEL>=1 3875 INNER_STORE_8X8_GEN_LIB8 3876#else 3877 CALL(inner_store_8x8_gen_lib8) 3878#endif 3879 3880 3881 EPILOGUE 3882 3883 ret 3884 3885 FUN_END(kernel_sgemm_nn_8x8_gen_lib8) 3886 3887 3888 3889 3890 3891// rdi rsi rdx rcx r8 r9 rsp+8 3892// void kernel_ssyrk_nt_l_8x8_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D); 3893 3894 
.p2align 4,,15 3895 GLOB_FUN_START(kernel_ssyrk_nt_l_8x8_lib8) 3896 3897 PROLOGUE 3898 3899 // zero accumulation registers 3900 3901 vxorpd %ymm0, %ymm0, %ymm0 3902 vmovaps %ymm0, %ymm1 3903 vmovaps %ymm0, %ymm2 3904 vmovaps %ymm0, %ymm3 3905 vmovaps %ymm0, %ymm4 3906 vmovaps %ymm0, %ymm5 3907 vmovaps %ymm0, %ymm6 3908 vmovaps %ymm0, %ymm7 3909 3910 3911 // call inner dgemm kernel nt 3912 3913 movq ARG1, %r10 // k 3914 movq ARG3, %r11 // A 3915 movq ARG4, %r12 // B 3916 3917#if MACRO_LEVEL>=2 3918 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 3919#else 3920 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 3921#endif 3922 3923 3924 // call inner scale 3925 3926 movq ARG2, %r10 // alpha 3927 movq ARG5, %r11 // beta 3928 movq ARG6, %r12 // C 3929 3930#if MACRO_LEVEL>=1 3931 INNER_SCALE_AB_8X8_LIB8 3932#else 3933 CALL(inner_scale_ab_8x8_lib8) 3934#endif 3935 3936 3937 // store n 3938 3939 movq ARG7, %r10 // D 3940 3941#if MACRO_LEVEL>=1 3942 INNER_STORE_L_8X8_LIB8 3943#else 3944 CALL(inner_store_l_8x8_lib8) 3945#endif 3946 3947 3948 EPILOGUE 3949 3950 ret 3951 3952 FUN_END(kernel_ssyrk_nt_l_8x8_lib8) 3953 3954 3955 3956 3957 3958// 1 2 3 4 5 6 7 8 9 3959// void kernel_ssyrk_nt_l_8x8_vs_lib8(int k, float *alpha, float *A, float *B, float *beta, float *C, float *D, int km, int kn); 3960 3961 .p2align 4,,15 3962 GLOB_FUN_START(kernel_ssyrk_nt_l_8x8_vs_lib8) 3963 3964 PROLOGUE 3965 3966 // zero accumulation registers 3967 3968 vxorpd %ymm0, %ymm0, %ymm0 3969 vmovaps %ymm0, %ymm1 3970 vmovaps %ymm0, %ymm2 3971 vmovaps %ymm0, %ymm3 3972 vmovaps %ymm0, %ymm4 3973 vmovaps %ymm0, %ymm5 3974 vmovaps %ymm0, %ymm6 3975 vmovaps %ymm0, %ymm7 3976 3977 3978 // call inner dgemm kernel nt 3979 3980 movq ARG1, %r10 // k 3981 movq ARG3, %r11 // A 3982 movq ARG4, %r12 // B 3983 3984#if MACRO_LEVEL>=2 3985 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 3986#else 3987 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 3988#endif 3989 3990 3991 // call inner scale 3992 3993 movq ARG2, %r10 // alpha 3994 movq ARG5, %r11 // beta 3995 
movq ARG6, %r12 // C 3996 3997#if MACRO_LEVEL>=1 3998 INNER_SCALE_AB_8X8_LIB8 3999#else 4000 CALL(inner_scale_ab_8x8_lib8) 4001#endif 4002 4003 4004 // store n 4005 4006 movq ARG7, %r10 // D 4007 movq ARG8, %r11 // km 4008 movq ARG9, %r12 // kn 4009 4010#if MACRO_LEVEL>=1 4011 INNER_STORE_L_8X8_VS_LIB8 4012#else 4013 CALL(inner_store_l_8x8_vs_lib8) 4014#endif 4015 4016 4017 EPILOGUE 4018 4019 ret 4020 4021 FUN_END(kernel_ssyrk_nt_l_8x8_vs_lib8) 4022 4023 4024 4025 4026 4027// edi rsi rdx ecx r8 r9 rsp+8 4028// void kernel_strsm_nt_rl_inv_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E); 4029 4030 .p2align 4,,15 4031 GLOB_FUN_START(kernel_strsm_nt_rl_inv_8x8_lib8) 4032 4033 PROLOGUE 4034 4035 // zero accumulation registers 4036 4037 vxorpd %ymm0, %ymm0, %ymm0 4038 vmovaps %ymm0, %ymm1 4039 vmovaps %ymm0, %ymm2 4040 vmovaps %ymm0, %ymm3 4041 vmovaps %ymm0, %ymm4 4042 vmovaps %ymm0, %ymm5 4043 vmovaps %ymm0, %ymm6 4044 vmovaps %ymm0, %ymm7 4045 4046 4047 // call inner dgemm kernel nt 4048 4049 movq ARG1, %r10 4050 movq ARG2, %r11 4051 movq ARG3, %r12 4052 4053#if MACRO_LEVEL>=2 4054 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4055#else 4056 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4057#endif 4058 4059 4060 // call inner blender_loader nn 4061 4062 movq ARG4, %r10 4063 4064#if MACRO_LEVEL>=1 4065 INNER_SCALE_11_8X8_LIB8 4066#else 4067 CALL(inner_scale_11_8x8_lib8) 4068#endif 4069 4070 4071 // solve 4072 4073 movq ARG6, %r10 // E 4074 movq ARG7, %r11 // inv_diag_E 4075 movq $8, %r12 // n1 4076 4077#if MACRO_LEVEL>=1 4078 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8 4079#else 4080 CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 4081#endif 4082 4083 4084 // store 4085 4086 movq ARG5, %r10 // D 4087 4088#if MACRO_LEVEL>=1 4089 INNER_STORE_8X8_LIB8 4090#else 4091 CALL(inner_store_8x8_lib8) 4092#endif 4093 4094 4095 EPILOGUE 4096 4097 ret 4098 4099 FUN_END(kernel_strsm_nt_rl_inv_8x8_lib8) 4100 4101 4102 4103 4104 4105// edi rsi rdx ecx r8 r9 rsp+8 rsp+16 rsp+24 
4106// void kernel_strsm_nt_rl_inv_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 4107 4108 .p2align 4,,15 4109 GLOB_FUN_START(kernel_strsm_nt_rl_inv_8x8_vs_lib8) 4110 4111 PROLOGUE 4112 4113 // zero accumulation registers 4114 4115 vxorpd %ymm0, %ymm0, %ymm0 4116 vmovaps %ymm0, %ymm1 4117 vmovaps %ymm0, %ymm2 4118 vmovaps %ymm0, %ymm3 4119 vmovaps %ymm0, %ymm4 4120 vmovaps %ymm0, %ymm5 4121 vmovaps %ymm0, %ymm6 4122 vmovaps %ymm0, %ymm7 4123 4124 4125 // call inner dgemm kernel nt 4126 4127 movq ARG1, %r10 4128 movq ARG2, %r11 4129 movq ARG3, %r12 4130 4131#if MACRO_LEVEL>=2 4132 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4133#else 4134 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4135#endif 4136 4137 4138 // call inner blender_loader nn // TODO scale gen 4139 4140 movq ARG4, %r10 // C 4141 4142#if MACRO_LEVEL>=1 4143 INNER_SCALE_11_8X8_LIB8 4144#else 4145 CALL(inner_scale_11_8x8_lib8) 4146#endif 4147 4148 4149 // solve 4150 4151 movq ARG6, %r10 // E 4152 movq ARG7, %r11 // inv_diag_E 4153 movq ARG9, %r12 // kn 4154 4155#if MACRO_LEVEL>=1 4156 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8 4157#else 4158 CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 4159#endif 4160 4161 4162 // store 4163 4164 movq ARG5, %r10 // D 4165 movq ARG8, %r11 // m1 4166 movq ARG9, %r12 // n1 4167 4168#if MACRO_LEVEL>=1 4169 INNER_STORE_8X8_VS_LIB8 4170#else 4171 CALL(inner_store_8x8_vs_lib8) 4172#endif 4173 4174 4175 EPILOGUE 4176 4177 ret 4178 4179 FUN_END(kernel_strsm_nt_rl_inv_8x8_vs_lib8) 4180 4181 4182 4183 4184 4185// 1 2 3 4 5 6 7 8 9 10 4186// void kernel_sgemm_strsm_nt_rl_inv_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E); 4187 4188 .p2align 4,,15 4189 GLOB_FUN_START(kernel_sgemm_strsm_nt_rl_inv_8x8_lib8) 4190 4191 PROLOGUE 4192 4193 // zero accumulation registers 4194 4195 vxorpd %ymm0, %ymm0, %ymm0 4196 vmovaps %ymm0, %ymm1 4197 vmovaps %ymm0, %ymm2 4198 vmovaps %ymm0, %ymm3 4199 
vmovaps %ymm0, %ymm4 4200 vmovaps %ymm0, %ymm5 4201 vmovaps %ymm0, %ymm6 4202 vmovaps %ymm0, %ymm7 4203 4204 4205 // call inner dgemm kernel nt add 4206 4207 movq ARG1, %r10 // kp 4208 movq ARG2, %r11 // Ap 4209 movq ARG3, %r12 // Bp 4210 4211#if MACRO_LEVEL>=2 4212 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 4213#else 4214 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 4215#endif 4216 4217 4218 // call inner dgemm kernel nt sub 4219 4220 movq ARG4, %r10 // km 4221 movq ARG5, %r11 // Am 4222 movq ARG6, %r12 // Bm 4223 4224#if MACRO_LEVEL>=2 4225 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4226#else 4227 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4228#endif 4229 4230 4231 // call inner blender_loader nn 4232 4233 movq ARG7, %r10 // C 4234 4235#if MACRO_LEVEL>=1 4236 INNER_SCALE_11_8X8_LIB8 4237#else 4238 CALL(inner_scale_11_8x8_lib8) 4239#endif 4240 4241 4242 // solve 4243 4244 movq ARG9, %r10 // E 4245 movq ARG10, %r11 // inv_diag_E 4246 movq $8, %r12 // n1 4247 4248#if MACRO_LEVEL>=1 4249 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8 4250#else 4251 CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 4252#endif 4253 4254 4255 // store 4256 4257 movq ARG8, %r10 // D 4258 4259#if MACRO_LEVEL>=1 4260 INNER_STORE_8X8_LIB8 4261#else 4262 CALL(inner_store_8x8_lib8) 4263#endif 4264 4265 4266 EPILOGUE 4267 4268 ret 4269 4270 FUN_END(kernel_sgemm_strsm_nt_rl_inv_8x8_lib8) 4271 4272 4273 4274 4275 4276// 1 2 3 4 5 6 7 8 9 10 11 12 4277// void kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *E, float *inv_diag_E, int km, int kn); 4278 4279 .p2align 4,,15 4280 GLOB_FUN_START(kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8) 4281 4282 PROLOGUE 4283 4284 // zero accumulation registers 4285 4286 vxorpd %ymm0, %ymm0, %ymm0 4287 vmovaps %ymm0, %ymm1 4288 vmovaps %ymm0, %ymm2 4289 vmovaps %ymm0, %ymm3 4290 vmovaps %ymm0, %ymm4 4291 vmovaps %ymm0, %ymm5 4292 vmovaps %ymm0, %ymm6 4293 vmovaps %ymm0, %ymm7 4294 4295 4296 // call inner dgemm kernel nt add 4297 4298 
movq ARG1, %r10 // kp 4299 movq ARG2, %r11 // Ap 4300 movq ARG3, %r12 // Bp 4301 4302#if MACRO_LEVEL>=2 4303 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 4304#else 4305 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 4306#endif 4307 4308 4309 // call inner dgemm kernel nt sub 4310 4311 movq ARG4, %r10 // km 4312 movq ARG5, %r11 // Am 4313 movq ARG6, %r12 // Bm 4314 4315#if MACRO_LEVEL>=2 4316 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4317#else 4318 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4319#endif 4320 4321 4322 // call inner blender_loader nn 4323 4324 movq ARG7, %r10 // C 4325 4326#if MACRO_LEVEL>=1 4327 INNER_SCALE_11_8X8_LIB8 4328#else 4329 CALL(inner_scale_11_8x8_lib8) 4330#endif 4331 4332 4333 // solve 4334 4335 movq ARG9, %r10 // E 4336 movq ARG10, %r11 // inv_diag_E 4337 movq ARG12, %r12 // kn 4338 4339#if MACRO_LEVEL>=1 4340 INNER_EDGE_TRSM_RLT_INV_8X8_VS_LIB8 4341#else 4342 CALL(inner_edge_trsm_rlt_inv_8x8_vs_lib8) 4343#endif 4344 4345 4346 // store 4347 4348 movq ARG8, %r10 // D 4349 movq ARG11, %r11 // km 4350 movq ARG12, %r12 // kn 4351 4352#if MACRO_LEVEL>=1 4353 INNER_STORE_8X8_VS_LIB8 4354#else 4355 CALL(inner_store_8x8_vs_lib8) 4356#endif 4357 4358 4359 EPILOGUE 4360 4361 ret 4362 4363 FUN_END(kernel_sgemm_strsm_nt_rl_inv_8x8_vs_lib8) 4364 4365 4366 4367 4368 4369// edi rsi rdx rcx r8 r9 4370// void kernel_spotrf_nt_l_8x8_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D); 4371 4372 .p2align 4,,15 4373 GLOB_FUN_START(kernel_spotrf_nt_l_8x8_lib8) 4374 4375 PROLOGUE 4376 4377 // zero accumulation registers 4378 4379 vxorpd %ymm0, %ymm0, %ymm0 4380 vmovaps %ymm0, %ymm1 4381 vmovaps %ymm0, %ymm2 4382 vmovaps %ymm0, %ymm3 4383 vmovaps %ymm0, %ymm4 4384 vmovaps %ymm0, %ymm5 4385 vmovaps %ymm0, %ymm6 4386 vmovaps %ymm0, %ymm7 4387 4388 4389 // call inner dgemm kernel nt 4390 4391 movq ARG1, %r10 4392 movq ARG2, %r11 4393 movq ARG3, %r12 4394 4395#if MACRO_LEVEL>=2 4396 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4397#else 4398 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 
4399#endif 4400 4401 4402 // call inner blender_loader nn 4403 4404 movq ARG4, %r10 // C 4405 4406#if MACRO_LEVEL>=1 4407 INNER_SCALE_11_8X8_LIB8 4408#else 4409 CALL(inner_scale_11_8x8_lib8) 4410#endif 4411 4412 4413 // factorization 4414 4415 movq ARG6, %r10 // inv_diag_D 4416 movl $8, %r11d // n1 4417 4418#if MACRO_LEVEL>=1 4419 INNER_EDGE_POTRF_8X8_VS_LIB8 4420#else 4421 CALL(inner_edge_potrf_8x8_vs_lib8) 4422#endif 4423 4424 4425 // store 4426 4427 movq ARG5, %r10 // D 4428 4429#if MACRO_LEVEL>=1 4430 INNER_STORE_L_8X8_LIB8 4431#else 4432 CALL(inner_store_l_8x8_lib8) 4433#endif 4434 4435 4436 EPILOGUE 4437 4438 ret 4439 4440 FUN_END(kernel_spotrf_nt_l_8x8_lib8) 4441 4442 4443 4444 4445 4446// edi rsi rdx rcx r8 r9 rsp+8 rsp+16 4447// void kernel_spotrf_nt_l_8x8_vs_lib8(int k, float *A, float *B, float *C, float *D, float *inv_diag_D, int km, int kn); 4448 4449 .p2align 4,,15 4450 GLOB_FUN_START(kernel_spotrf_nt_l_8x8_vs_lib8) 4451 4452 PROLOGUE 4453 4454 // zero accumulation registers 4455 4456 vxorpd %ymm0, %ymm0, %ymm0 4457 vmovaps %ymm0, %ymm1 4458 vmovaps %ymm0, %ymm2 4459 vmovaps %ymm0, %ymm3 4460 vmovaps %ymm0, %ymm4 4461 vmovaps %ymm0, %ymm5 4462 vmovaps %ymm0, %ymm6 4463 vmovaps %ymm0, %ymm7 4464 4465 4466 // call inner dgemm kernel nt 4467 4468 movq ARG1, %r10 4469 movq ARG2, %r11 4470 movq ARG3, %r12 4471 4472#if MACRO_LEVEL>=2 4473 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4474#else 4475 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4476#endif 4477 4478 4479 // call inner blender_loader nn 4480 4481 movq ARG4, %r10 // C 4482 4483#if MACRO_LEVEL>=1 4484 INNER_SCALE_11_8X8_LIB8 4485#else 4486 CALL(inner_scale_11_8x8_lib8) 4487#endif 4488 4489 4490 // factorization 4491 4492 movq ARG6, %r10 // inv_diag_D 4493 movq ARG8, %r11 // kn 4494 4495#if MACRO_LEVEL>=1 4496 INNER_EDGE_POTRF_8X8_VS_LIB8 4497#else 4498 CALL(inner_edge_potrf_8x8_vs_lib8) 4499#endif 4500 4501 4502 // store 4503 4504 movq ARG5, %r10 // D 4505 movq ARG7, %r11 // m1 4506 movq ARG8, %r12 // n1 4507 
4508#if MACRO_LEVEL>=1 4509 INNER_STORE_L_8X8_VS_LIB8 4510#else 4511 CALL(inner_store_l_8x8_vs_lib8) 4512#endif 4513 4514 4515 EPILOGUE 4516 4517 ret 4518 4519 FUN_END(kernel_spotrf_nt_l_8x8_vs_lib8) 4520 4521 4522 4523 4524 4525// 1 2 3 4 5 6 7 8 9 4526// void kernel_ssyrk_spotrf_nt_l_8x8_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D); 4527 4528 .p2align 4,,15 4529 GLOB_FUN_START(kernel_ssyrk_spotrf_nt_l_8x8_lib8) 4530 4531 PROLOGUE 4532 4533 // zero accumulation registers 4534 4535 ZERO_ACC 4536 4537 4538 // call inner dgemm kernel nt add 4539 4540 movq ARG1, %r10 // kp 4541 movq ARG2, %r11 // Ap 4542 movq ARG3, %r12 // Bp 4543 4544#if MACRO_LEVEL>=2 4545 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 4546#else 4547 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 4548#endif 4549 4550 4551 // call inner dgemm kernel nt sub 4552 4553 movq ARG4, %r10 // km 4554 movq ARG5, %r11 // Am 4555 movq ARG6, %r12 // Bm 4556 4557#if MACRO_LEVEL>=2 4558 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4559#else 4560 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4561#endif 4562 4563 4564 // call inner blender_loader nn 4565 4566 movq ARG7, %r10 // C 4567 4568#if MACRO_LEVEL>=1 4569 INNER_SCALE_11_8X8_LIB8 4570#else 4571 CALL(inner_scale_11_8x8_lib8) 4572#endif 4573 4574 4575 // factorization 4576 4577 movq ARG9, %r10 // inv_diag_D 4578 movl $8, %r11d 4579 4580#if MACRO_LEVEL>=1 4581 INNER_EDGE_POTRF_8X8_VS_LIB8 4582#else 4583 CALL(inner_edge_potrf_8x8_vs_lib8) 4584#endif 4585 4586 4587 // store 4588 4589 movq ARG8, %r10 // D 4590 4591#if MACRO_LEVEL>=1 4592 INNER_STORE_L_8X8_LIB8 4593#else 4594 CALL(inner_store_l_8x8_lib8) 4595#endif 4596 4597 4598 EPILOGUE 4599 4600 ret 4601 4602 FUN_END(kernel_ssyrk_spotrf_nt_l_8x8_lib8) 4603 4604 4605 4606 4607 4608// 1 2 3 4 5 6 7 8 9 10 11 4609// void kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8(int kp, float *Ap, float *Bp, int km, float *Am, float *Bm, float *C, float *D, float *inv_diag_D, int km, int kn); 4610 4611 .p2align 4,,15 
4612 GLOB_FUN_START(kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8) 4613 4614 PROLOGUE 4615 4616 // zero accumulation registers 4617 4618 vxorpd %ymm0, %ymm0, %ymm0 4619 vmovaps %ymm0, %ymm1 4620 vmovaps %ymm0, %ymm2 4621 vmovaps %ymm0, %ymm3 4622 vmovaps %ymm0, %ymm4 4623 vmovaps %ymm0, %ymm5 4624 vmovaps %ymm0, %ymm6 4625 vmovaps %ymm0, %ymm7 4626 4627 4628 // call inner dgemm kernel nt add 4629 4630 movq ARG1, %r10 // kp 4631 movq ARG2, %r11 // Ap 4632 movq ARG3, %r12 // Bp 4633 4634#if MACRO_LEVEL>=2 4635 INNER_KERNEL_GEMM_ADD_NT_8X8_LIB8 4636#else 4637 CALL(inner_kernel_gemm_add_nt_8x8_lib8) 4638#endif 4639 4640 4641 // call inner dgemm kernel nt sub 4642 4643 movq ARG4, %r10 // km 4644 movq ARG5, %r11 // Am 4645 movq ARG6, %r12 // Bm 4646 4647#if MACRO_LEVEL>=2 4648 INNER_KERNEL_GEMM_SUB_NT_8X8_LIB8 4649#else 4650 CALL(inner_kernel_gemm_sub_nt_8x8_lib8) 4651#endif 4652 4653 4654 // call inner blender_loader nn 4655 4656 movq ARG7, %r10 // C 4657 4658#if MACRO_LEVEL>=1 4659 INNER_SCALE_11_8X8_LIB8 4660#else 4661 CALL(inner_scale_11_8x8_lib8) 4662#endif 4663 4664 4665 // factorization 4666 4667 movq ARG9, %r10 // inv_diag_D 4668 movq ARG11, %r11 // kn 4669 4670#if MACRO_LEVEL>=1 4671 INNER_EDGE_POTRF_8X8_VS_LIB8 4672#else 4673 CALL(inner_edge_potrf_8x8_vs_lib8) 4674#endif 4675 4676 4677 // store 4678 4679 movq ARG8, %r10 // D 4680 movq ARG10, %r11 // km 4681 movq ARG11, %r12 // kn 4682 4683#if MACRO_LEVEL>=1 4684 INNER_STORE_L_8X8_VS_LIB8 4685#else 4686 CALL(inner_store_l_8x8_vs_lib8) 4687#endif 4688 4689 4690 EPILOGUE 4691 4692 ret 4693 4694 FUN_END(kernel_ssyrk_spotrf_nt_l_8x8_vs_lib8) 4695 4696 4697 4698 4699 4700#if defined(BLAS_API) 4701 4702#include "kernel_sgemm_8x8_lib.S" 4703 4704#endif 4705 4706 4707 4708 4709 4710 // read-only data 4711#if defined(OS_LINUX) 4712 .section .rodata.cst32,"aM",@progbits,32 4713#elif defined(OS_MAC) 4714 .section __TEXT,__const 4715#elif defined(OS_WINDOWS) 4716 .section .rdata,"dr" 4717#endif 4718 4719#if defined(OS_LINUX) | 
defined(OS_WINDOWS) 4720 .align 32 4721.LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 } 4722#elif defined(OS_MAC) 4723 .align 5 4724LC00: // { 7.5 6.5 5.5 4.5 3.5 2.5 1.5 0.5 } 4725#endif 4726 .long 1056964608 4727 .long 1069547520 4728 .long 1075838976 4729 .long 1080033280 4730 .long 1083179008 4731 .long 1085276160 4732 .long 1087373312 4733 .long 1089470464 4734 4735#if defined(OS_LINUX) | defined(OS_WINDOWS) 4736 .align 32 4737.LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 } 4738#elif defined(OS_MAC) 4739 .align 5 4740LC01: // { 15.5 14.5 13.5 12.5 11.5 10.5 9.5 8.5 } 4741#endif 4742 .long 1091043328 4743 .long 1092091904 4744 .long 1093140480 4745 .long 1094189056 4746 .long 1095237632 4747 .long 1096286208 4748 .long 1097334784 4749 .long 1098383360 4750 4751#if defined(OS_LINUX) | defined(OS_WINDOWS) 4752 .align 32 4753.LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 } 4754#elif defined(OS_MAC) 4755 .align 5 4756LC02: // { 23.5 22.5 21.5 20.5 19.5 18.5 17.5 16.5 } 4757#endif 4758 .long 1099169792 4759 .long 1099694080 4760 .long 1100218368 4761 .long 1100742656 4762 .long 1101266944 4763 .long 1101791232 4764 .long 1102315520 4765 .long 1102839808 4766 4767#if defined(OS_LINUX) | defined(OS_WINDOWS) 4768 .align 32 4769.LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 } 4770#elif defined(OS_MAC) 4771 .align 5 4772LC03: // { 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 } 4773#endif 4774 .long 1065353216 4775 .long 1065353216 4776 .long 1065353216 4777 .long 1065353216 4778 .long 1065353216 4779 .long 1065353216 4780 .long 1065353216 4781 .long 1065353216 4782 4783#if defined(OS_LINUX) | defined(OS_WINDOWS) 4784 .align 32 4785.LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 } 4786#elif defined(OS_MAC) 4787 .align 5 4788LC09: // { -1.0 -1.0 1.0 1.0 1.0 1.0 1.0 1.0 } 4789#endif 4790 .long 1065353216 4791 .long 1065353216 4792 .long 1065353216 4793 .long 1065353216 4794 .long 1065353216 4795 .long 1065353216 4796 .long 3212836864 4797 .long 3212836864 4798 4799 4800 4801#if 
defined(OS_LINUX) 4802 .section .note.GNU-stack,"",@progbits 4803#elif defined(OS_MAC) 4804 .subsections_via_symbols 4805#endif 4806 4807