1/************************************************************************************************** 2* * 3* This file is part of BLASFEO. * 4* * 5* BLASFEO -- BLAS For Embedded Optimization. * 6* Copyright (C) 2019 by Gianluca Frison. * 7* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl. * 8* All rights reserved. * 9* * 10* The 2-Clause BSD License * 11* * 12* Redistribution and use in source and binary forms, with or without * 13* modification, are permitted provided that the following conditions are met: * 14* * 15* 1. Redistributions of source code must retain the above copyright notice, this * 16* list of conditions and the following disclaimer. * 17* 2. Redistributions in binary form must reproduce the above copyright notice, * 18* this list of conditions and the following disclaimer in the documentation * 19* and/or other materials provided with the distribution. * 20* * 21* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND * 22* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED * 23* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE * 24* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR * 25* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES * 26* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; * 27* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND * 28* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT * 29* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * 30* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/

// Single-precision 4x4 micro-kernels for ARMv7-A with NEON, operating on the
// BLASFEO "lib4" packed format (panels of 4 rows, column-major within panel).
//
// Internal calling convention of the inner_* subroutines (NOT the AAPCS):
//   r4..r9  : integer arguments (k, matrix pointers, strides, offsets)
//   q4..q7  : 4x4 single-precision accumulator, one q register per column
//   lr      : return address (subroutines return with "mov pc, lr")
// The exported kernel_* functions save/restore all callee-saved registers
// via PROLOGUE/EPILOGUE and read stack arguments relative to fp.

// // prologue
// stmdb sp!, {r4 - r10, fp, lr} // save GP registers
// add fp, sp, #36 // fp to old sp position
// fstmfdd sp!, {d8-d15} // save FP registers
#define PROLOGUE \
	stmdb	sp!, {r4 - r10, fp, lr}; \
	add	fp, sp, #36; \
	fstmfdd	sp!, {d8-d15};
// // epilogue
// fldmfdd sp!, {d8-d15} // load FP registers
// ldmia sp!, {r4 - r10, fp, pc} // load GP registers and return
#define EPILOGUE \
	fldmfdd	sp!, {d8-d15}; \
	ldmia	sp!, {r4 - r10, fp, pc};



#if defined(OS_LINUX)
	.text
#elif defined(OS_MAC)
	.section	__TEXT,__text,regular,pure_instructions
#endif



// subroutine
//
// D <- D + A * B^T, with A and B packed in lib4 format
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
//
// output arguments:

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_gemm_add_nt_4x4_lib4, %function
inner_kernel_gemm_add_nt_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_gemm_add_nt_4x4_lib4:
#endif
#endif

	// early return
	cmp		r4, #0
	ble		2f // return

	// prefetch
	pld		[r5, #0]
	pld		[r6, #0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #32]
	pld		[r6, #32]
#endif
	pld		[r5, #64]
	pld		[r6, #64]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
	pld		[r6, #96]
#endif
	pld		[r5, #64] // NOTE(review): duplicate of the prefetch above — kept as in original
#else // cortex a15
	// preload first 4x1 panel of A and B (software-pipelined loop below)
	vld1.64		{d0, d1}, [r5:128]! // A
	vld1.64		{d4, d5}, [r6:128]! // B
#endif

	cmp		r4, #4
	ble		0f // consider clean up loop

	// main loop (4 columns of A/B per iteration)
1:

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r6:128]! // B
	vld1.64		{d16, d17}, [r5:128]! // A

	vld1.64		{d2, d3}, [r6:128]! // B
	vld1.64		{d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5}, [r6:128]! // B
	vld1.64		{d20, d21}, [r5:128]! // A

	vld1.64		{d6, d7}, [r6:128]! // B
	vld1.64		{d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
	pld		[r6, #64]
	vmla.f32	q5, q8, d0[1]
	pld		[r5, #64]
	vmla.f32	q6, q8, d1[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r6, #96]
#endif
	vmla.f32	q7, q8, d1[1]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
#endif

	// unroll 1
	vmla.f32	q4, q9, d2[0]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d3[0]
	vmla.f32	q7, q9, d3[1]

	// unroll 2
	vmla.f32	q4, q10, d4[0]
	vmla.f32	q5, q10, d4[1]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d5[1]

	// unroll 3
	vmla.f32	q4, q11, d6[0]
	vmla.f32	q5, q11, d6[1]
	vmla.f32	q6, q11, d7[0]
	vmla.f32	q7, q11, d7[1]

	sub		r4, r4, #4

#else // cortex a15

	// prefetch
	pld		[r5, #64]
	pld		[r6, #64]

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 1
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 3
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	sub		r4, r4, #4

#endif

	cmp		r4, #4
	bgt		1b

0:

	cmp		r4, #3
	ble		4f

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r6:128]! // B
	vld1.64		{d16, d17}, [r5:128]! // A

	vld1.64		{d2, d3}, [r6:128]! // B
	vld1.64		{d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5}, [r6:128]! // B
	vld1.64		{d20, d21}, [r5:128]! // A

	vld1.64		{d6, d7}, [r6:128]! // B
	vld1.64		{d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
//	pld		[r5, #64]
	vmla.f32	q5, q8, d0[1]
//	pld		[r6, #64]
	vmla.f32	q6, q8, d1[0]
	vmla.f32	q7, q8, d1[1]

	// unroll 1
	vmla.f32	q4, q9, d2[0]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d3[0]
	vmla.f32	q7, q9, d3[1]

	// unroll 2
	vmla.f32	q4, q10, d4[0]
	vmla.f32	q5, q10, d4[1]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d5[1]

	// unroll 3
	vmla.f32	q4, q11, d6[0]
	vmla.f32	q5, q11, d6[1]
	vmla.f32	q6, q11, d7[0]
	vmla.f32	q7, q11, d7[1]

	sub		r4, r4, #4

#else // cortex a15

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 1
	vmla.f32	q4, q1, d6[0]
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d4[1]
	vld1.64		{d6, d7}, [r6:128]! // B
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	// unroll 3 (last iteration: do not preload past the end)
	vmla.f32	q4, q1, d6[0]
//	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q5, q1, d6[1]
//	vld1.64		{d4, d5}, [r6:128]! // B
	vmla.f32	q6, q1, d7[0]
	vmla.f32	q7, q1, d7[1]

	sub		r4, r4, #4

#endif

	b		2f // return

4: // consider clean1-up loop

	cmp		r4, #0
	ble		2f // return

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

#else // cortex a15
	// the a15 path pre-loaded one 4x1 panel of A and B: step the pointers back
	sub		r5, r5, #16
	sub		r6, r6, #16
#endif

3: // clean1-up loop (one column per iteration)

	// unroll 0
	vld1.64		{d0, d1}, [r5:128]! // A
	vld1.64		{d4, d5}, [r6:128]! // B

	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

	sub		r4, r4, #1
	cmp		r4, #0
	bgt		3b

2: // return


#if MACRO_LEVEL>=2
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_kernel_gemm_add_nt_4x4_lib4, .-inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif





// subroutine
//
// D <- D + A * B, with A packed in lib4 format and B with panel stride
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
// r7   <- 4*sdb*sizeof(float)
//
// output arguments:

#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_kernel_gemm_add_nn_4x4_lib4, %function
inner_kernel_gemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_kernel_gemm_add_nn_4x4_lib4:
#endif
#endif

	// early return
	cmp		r4, #0
	ble		2f // return

	// prefetch
	pld		[r5, #0]
	pld		[r6, #0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	add		r8, r7, r7 // 2 panels ahead in B
	pld		[r5, #32]
	add		r9, r7, #32
	pld		[r6, #32]
#endif
	pld		[r5, #64]
	pld		[r6, r7]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
	pld		[r6, r9]
	add		r9, r9, r7
#endif
	pld		[r5, #64] // NOTE(review): duplicate of the prefetch above — kept as in original
#else // cortex a15
	// preload
	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		d4, [r6, #0]  // B[0,1]
	vldr		d5, [r6, #16] // B[4,5]
	vldr		d6, [r6, #32] // B[8,9]
	vldr		d7, [r6, #48] // B[12,13]
#endif

	cmp		r4, #4
	ble		0f // consider clean up loop

	// main loop
1:

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	// prefetch

	vld1.64		{d0, d1, d2, d3}, [r6:128]! // B
	vld1.64		{d16, d17, d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5, d6, d7}, [r6:128]! // B
	vld1.64		{d20, d21, d22, d23}, [r5:128]! // A

	sub		r6, r6, #64

	// unroll 0
	vmla.f32	q4, q8, d0[0]
	pld		[r6, r8] // NOTE(review): on A7 r8 is not set up here (still offsetB) — prefetch only, harmless; confirm intent
	vmla.f32	q5, q8, d2[0]
	pld		[r5, #64]
	vmla.f32	q6, q8, d4[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r6, r9]
#endif
	vmla.f32	q7, q8, d6[0]
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9)
	pld		[r5, #96]
#endif

	// unroll 1
	vmla.f32	q4, q9, d0[1]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d4[1]
	vmla.f32	q7, q9, d6[1]

	// unroll 2
	vmla.f32	q4, q10, d1[0]
	vmla.f32	q5, q10, d3[0]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d7[0]

	// unroll 3
	vmla.f32	q4, q11, d1[1]
	vmla.f32	q5, q11, d3[1]
	vmla.f32	q6, q11, d5[1]
	vmla.f32	q7, q11, d7[1]

	add		r6, r6, r7 // next panel of B
	sub		r4, r4, #4

#else // cortex a15

	// prefetch
	pld		[r5, #64]
	pld		[r6, r7]

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 1
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #8]  // B[2,3]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #24] // B[6,7]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #40] // B[10,11]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #56] // B[14,15]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	add		r6, r6, r7 // next panel of B
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 3
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #0]  // B[0,1]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #16] // B[4,5]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #32] // B[8,9]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #48] // B[12,13]

	sub		r4, r4, #4

#endif

	cmp		r4, #4
	bgt		1b

0:

	cmp		r4, #3
	ble		4f

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1, d2, d3}, [r6:128]! // B
	vld1.64		{d16, d17, d18, d19}, [r5:128]! // A

	vld1.64		{d4, d5, d6, d7}, [r6:128]! // B
	vld1.64		{d20, d21, d22, d23}, [r5:128]! // A

	// prefetch

	// unroll 0
	vmla.f32	q4, q8, d0[0]
//	pld		[r5, #64]
	vmla.f32	q5, q8, d2[0]
//	pld		[r6, #64]
	vmla.f32	q6, q8, d4[0]
	vmla.f32	q7, q8, d6[0]

	// unroll 1
	vmla.f32	q4, q9, d0[1]
	vmla.f32	q5, q9, d2[1]
	vmla.f32	q6, q9, d4[1]
	vmla.f32	q7, q9, d6[1]

	// unroll 2
	vmla.f32	q4, q10, d1[0]
	vmla.f32	q5, q10, d3[0]
	vmla.f32	q6, q10, d5[0]
	vmla.f32	q7, q10, d7[0]

	// unroll 3
	vmla.f32	q4, q11, d1[1]
	vmla.f32	q5, q11, d3[1]
	vmla.f32	q6, q11, d5[1]
	vmla.f32	q7, q11, d7[1]

	add		r6, r6, r7
	sub		r4, r4, #4
	sub		r6, r6, #64

#else // cortex a15

	// unroll 0
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 1
	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
	vldr		d4, [r6, #8]  // B[2,3]
	vmla.f32	q5, q1, d5[1]
	vldr		d5, [r6, #24] // B[6,7]
	vmla.f32	q6, q1, d6[1]
	vldr		d6, [r6, #40] // B[10,11]
	vmla.f32	q7, q1, d7[1]
	vldr		d7, [r6, #56] // B[14,15]

	// unroll 2
	vmla.f32	q4, q0, d4[0]
	vld1.64		{d2, d3}, [r5:128]! // A
	vmla.f32	q5, q0, d5[0]
	add		r6, r6, r7
	vmla.f32	q6, q0, d6[0]
	vmla.f32	q7, q0, d7[0]

	// unroll 3 (last iteration: do not preload past the end)
//	vld1.64		{d0, d1}, [r5:128]! // A
	vmla.f32	q4, q1, d4[1]
//	vldr		d4, [r6, #0]  // B[0,1]
	vmla.f32	q5, q1, d5[1]
//	vldr		d5, [r6, #16] // B[4,5]
	vmla.f32	q6, q1, d6[1]
//	vldr		d6, [r6, #32] // B[8,9]
	vmla.f32	q7, q1, d7[1]
//	vldr		d7, [r6, #48] // B[12,13]

	sub		r4, r4, #4

#endif

	b		2f // return

4: // consider clean1-up loop

	cmp		r4, #0
	ble		2f // return

#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

#else // cortex a15
	// the a15 path pre-loaded one 4x1 panel of A: step the pointer back
	sub		r5, r5, #16
#endif

3: // clean1-up loop (one column per iteration)

	// unroll 0
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]   // B[0]
	vldr		s9, [r6, #16]  // B[4]
	vldr		s10, [r6, #32] // B[8]
	vldr		s11, [r6, #48] // B[12]
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

#else // cortex a15

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vmla.f32	q4, q0, d4[0]
	vldr		s8, [r6, #16] // B[4]
	vmla.f32	q5, q0, d4[0]
	vldr		s8, [r6, #32] // B[8]
	vmla.f32	q6, q0, d4[0]
	vldr		s8, [r6, #48] // B[12]
	vmla.f32	q7, q0, d4[0]

#endif

	sub		r4, r4, #1
	add		r6, r6, #4
	cmp		r4, #0
	bgt		3b

2: // return


#if MACRO_LEVEL>=2
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_kernel_gemm_add_nn_4x4_lib4, .-inner_kernel_gemm_add_nn_4x4_lib4
#endif
#endif





// subroutine
//
// edge for B unaligned with respect to the panel boundary
//
// input arguments:
// r4   <- k
// r5   <- A
// r6   <- B
// r7   <- bs*sdb*sizeof(float)
// r8   <- offsetB

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_GEMM_ADD_NN_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_gemm_add_nn_4x4_lib4, %function
inner_edge_gemm_add_nn_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_gemm_add_nn_4x4_lib4:
#endif
#endif

	cmp		r8, #0
	ble		2f // return

	cmp		r4, #0
	ble		2f // return


	rsb		r9, r8, #4 // 4-offsetB
	cmp		r9, r4
//	ble		0f
//	mov		r9, r4 // kend=min(k,4-offsetB)
//0:
	movgt	r9, r4 // kend=min(k,4-offsetB)

//	lsl		r10, r8, #2 // offsetB*sizeof(float)
	add		r6, r6, r8, LSL #2 // B + offsetB*sizeof(float)

1:
#if defined(TARGET_ARMV7A_ARM_CORTEX_A9) | defined(TARGET_ARMV7A_ARM_CORTEX_A7)

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]   // B[0]
	vldr		s9, [r6, #16]  // B[4]
	vldr		s10, [r6, #32] // B[8]
	vldr		s11, [r6, #48] // B[12]
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q0, d4[1]
	vmla.f32	q6, q0, d5[0]
	vmla.f32	q7, q0, d5[1]

#else

	vld1.64		{d0, d1}, [r5:128]! // A
	vldr		s8, [r6, #0]  // B[0]
	vmla.f32	q4, q0, d4[0]
	vldr		s8, [r6, #16] // B[4]
	vmla.f32	q5, q0, d4[0]
	vldr		s8, [r6, #32] // B[8]
	vmla.f32	q6, q0, d4[0]
	vldr		s8, [r6, #48] // B[12]
	vmla.f32	q7, q0, d4[0]

#endif

	sub		r9, r9, #1
	sub		r4, r4, #1
	add		r6, r6, #4

	cmp		r9, #0
	bgt		1b

	cmp		r4, #0
	ble		2f // return

	// move B to the start of the next panel
	add		r6, r6, r7
	sub		r6, r6, #16

2: // return

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_gemm_add_nn_4x4_lib4, .-inner_edge_gemm_add_nn_4x4_lib4
#endif
#endif





// subroutine
//
// cholesky factorization
//
// input arguments:
// r4   <- inv_diag_D
//
// output arguments:
// r4   <- inv_diag_D

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_POTRF_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0

	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_potrf_4x4_lib4, %function
inner_edge_potrf_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_potrf_4x4_lib4:
#endif
#endif

	fconsts		s1, #112 // 1.0
#if MACRO_LEVEL>=1
	flds		s0, \lc_zero // 0.0
#else
	flds		s0, 99b // 0.0
#endif

#if 0 // scalar

	// first column
	fcmpes		s16, s0
	fmstat
	ble		1f
	fsqrts		s16, s16
	fdivs		s2, s1, s16
	fsts		s2, [r4, #0]
2:
	fmuls		s17, s17, s2
	fmuls		s18, s18, s2
	fmuls		s19, s19, s2

	// second column
	fnmacs		s21, s17, s17
	fnmacs		s22, s17, s18
	fnmacs		s23, s17, s19
	fcmpes		s21, s0
	fmstat
	ble		3f
	fsqrts		s21, s21
	fdivs		s2, s1, s21
	fsts		s2, [r4, #4]
4:
	fmuls		s22, s22, s2
	fmuls		s23, s23, s2

	// third column
	fnmacs		s26, s18, s18
	fnmacs		s27, s18, s19
	fnmacs		s26, s22, s22
	fnmacs		s27, s22, s23
	fcmpes		s26, s0 // FIX: compare third pivot s26 (was s16)
	fmstat
	ble		5f
	fsqrts		s26, s26
	fdivs		s2, s1, s26
	fsts		s2, [r4, #8]
6:
	fmuls		s27, s27, s2

	// fourth column
	fnmacs		s31, s19, s19
	fnmacs		s31, s23, s23
	fnmacs		s31, s27, s27
	fcmpes		s31, s0
	fmstat
	ble		7f
	fsqrts		s31, s31
	fdivs		s2, s1, s31
	fsts		s2, [r4, #12]

#else // vector

	// first column
	fcmpes		s16, s0
	fmstat
	ble		1f
	fsqrts		s2, s16
	fdivs		s2, s1, s2
	fsts		s2, [r4, #0]
2:
	vmul.f32	q4, q4, d1[0]

	// second column
	vmls.f32	q5, q4, d8[1]
	fcmpes		s21, s0
	fmstat
	ble		3f
	fsqrts		s2, s21
	fdivs		s2, s1, s2
	fsts		s2, [r4, #4]
4:
	vmul.f32	q5, q5, d1[0]

	// third column
	vmls.f32	q6, q4, d9[0]
	vmls.f32	q6, q5, d11[0]
	fcmpes		s26, s0 // FIX: compare third pivot s26 (was s16)
	fmstat
	ble		5f
	fsqrts		s2, s26
	fdivs		s2, s1, s2
	fsts		s2, [r4, #8]
6:
	vmul.f32	q6, q6, d1[0]

	// fourth column
	vmls.f32	q7, q4, d9[1]
	vmls.f32	q7, q5, d11[1]
	vmls.f32	q7, q6, d13[1]
	fcmpes		s31, s0
	fmstat
	ble		7f
	fsqrts		s31, s31
	fdivs		s2, s1, s31
	fsts		s2, [r4, #12]

#endif

	b		0f

1: // non-positive first pivot: zero the column
#if MACRO_LEVEL>=1
	flds		s16, \lc_zero // 0.0
#else
	flds		s16, 99b // 0.0
#endif
	b		2b

3: // non-positive second pivot: zero the column
#if MACRO_LEVEL>=1
	flds		s21, \lc_zero // 0.0
#else
	flds		s21, 99b // 0.0
#endif
	b		4b

5: // non-positive third pivot: zero the column
#if MACRO_LEVEL>=1
	flds		s26, \lc_zero // 0.0
#else
	flds		s26, 99b // 0.0
#endif
	b		6b

7: // non-positive fourth pivot: zero the column
#if MACRO_LEVEL>=1
	flds		s31, \lc_zero // 0.0
#else
	flds		s31, 99b // 0.0
#endif

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_potrf_4x4_lib4, .-inner_edge_potrf_4x4_lib4
#endif
#endif


// subroutine
//
// triangular substitution:
// side = right
// uplo = lower
// tran = transposed
// requires explicit inverse of diagonal
//
// input arguments:
// r4   <- E
// r5   <- inv_diag_E
//
// output arguments:
// r4   <- E
// r5   <- inv_diag_E

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSM_RLT_INV_4X4_LIB4
#else
	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_edge_trsm_rlt_inv_4x4_lib4, %function
inner_edge_trsm_rlt_inv_4x4_lib4:
#elif defined(OS_MAC)
_inner_edge_trsm_rlt_inv_4x4_lib4: // FIX: leading underscore so Mach-O "bl _inner_..." resolves
#endif
#endif

	// first column
	vldr.32		d0, [r5, #0]  // E_inv[0]
	vmul.f32	q4, q4, d0[0];

	// second column
	vldr.32		d0, [r4, #4]  // E[1+4*0]
	vmls.f32	q5, q4, d0[0];
	vldr.32		d0, [r5, #4]  // E_inv[1]
	vmul.f32	q5, q5, d0[0];

	// third column
	vldr.32		d0, [r4, #8]  // E[2+4*0]
	vmls.f32	q6, q4, d0[0];
	vldr.32		d0, [r4, #24] // E[2+4*1]
	vmls.f32	q6, q5, d0[0];
	vldr.32		d0, [r5, #8]  // E_inv[2]
	vmul.f32	q6, q6, d0[0];

	// fourth column
	vldr.32		d0, [r4, #12] // E[3+4*0]
	vmls.f32	q7, q4, d0[0];
	vldr.32		d0, [r4, #28] // E[3+4*1]
	vmls.f32	q7, q5, d0[0];
	vldr.32		d0, [r4, #44] // E[3+4*2]
	vmls.f32	q7, q6, d0[0];
	vldr.32		d0, [r5, #12] // E_inv[3]
	vmul.f32	q7, q7, d0[0];

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_edge_trsm_rlt_inv_4x4_lib4, .-inner_edge_trsm_rlt_inv_4x4_lib4
#endif
#endif





// subroutine
//
// D <- alpha*D + beta*C (beta==0.0 skips the load of C)
//
// input arguments:
// r4   <- alpha
// r5   <- beta
// r6   <- C
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_AB_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_ab_4x4_lib4, %function
inner_scale_ab_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_ab_4x4_lib4:
#endif
#endif

	flds		s8, [r4, #0] // alpha
	flds		s9, [r5, #0] // beta
#if MACRO_LEVEL>=1 // FIX: guard matched the .macro guard (>=1), was >=2
	flds		s10, \lc_zero // 0.0
#else
	flds		s10, 99b // 0.0
#endif

	fcmpes		s9, s10
	vmul.f32	q4, q4, d4[0]
	vmul.f32	q5, q5, d4[0]
	vmul.f32	q6, q6, d4[0]
	vmul.f32	q7, q7, d4[0]
	fmstat

	beq		0f // end

	vld1.64		{d0, d1, d2, d3}, [r6:128]!
	vmla.f32	q4, q0, d4[1]
	vmla.f32	q5, q1, d4[1]
	vld1.64		{d0, d1, d2, d3}, [r6:128]!
	vmla.f32	q6, q0, d4[1]
	vmla.f32	q7, q1, d4[1]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_ab_4x4_lib4, .-inner_scale_ab_4x4_lib4
#endif
#endif





// subroutine
//
// D <- -D + beta*C (beta==0.0 skips the load of C)
//
// input arguments:
// r4   <- beta
// r5   <- C
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M1B_4X4_LIB4 lc_zero
#else
	.align 3
99: // 0
	.word 0
	.word 0
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_m1b_4x4_lib4, %function
inner_scale_m1b_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_m1b_4x4_lib4:
#endif
#endif

	flds		s8, [r4, #0] // beta
#if MACRO_LEVEL>=1 // FIX: guard matched the .macro guard (>=1), was >=2
	flds		s9, \lc_zero // 0.0
#else
	flds		s9, 99b // 0.0
#endif

	fcmpes		s8, s9
	vneg.f32	q4, q4
	vneg.f32	q5, q5
	vneg.f32	q6, q6
	vneg.f32	q7, q7
	fmstat

	beq		0f // end

	vld1.64		{d0, d1, d2, d3}, [r5:128]!
	vmla.f32	q4, q0, d4[0]
	vmla.f32	q5, q1, d4[0]
	vld1.64		{d0, d1, d2, d3}, [r5:128]!
	vmla.f32	q6, q0, d4[0]
	vmla.f32	q7, q1, d4[0]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_m1b_4x4_lib4, .-inner_scale_m1b_4x4_lib4
#endif
#endif





// subroutine
//
// D <- C - D
//
// input arguments:
// r4   <- C
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M11_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_scale_m11_4x4_lib4, %function
inner_scale_m11_4x4_lib4:
#elif defined(OS_MAC)
_inner_scale_m11_4x4_lib4: // FIX: label matched callers' "bl _inner_scale_m11_4x4_lib4" (was _inner_scale_11_...)
#endif
#endif

	vld1.64		{d0, d1, d2, d3}, [r4:128]!
	vsub.f32	q4, q0, q4
	vsub.f32	q5, q1, q5
	vld1.64		{d0, d1, d2, d3}, [r4:128]!
	vsub.f32	q6, q0, q6
	vsub.f32	q7, q1, q7

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_scale_m11_4x4_lib4, .-inner_scale_m11_4x4_lib4
#endif
#endif





// subroutine
//
// store the full 4x4 accumulator
//
// input arguments:
// r4   <- D
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X4_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x4_lib4, %function
inner_store_4x4_lib4:
#elif defined(OS_MAC)
_inner_store_4x4_lib4:
#endif
#endif

	vst1.64		{d8, d9, d10, d11}, [r4:128]!
	vst1.64		{d12, d13, d14, d15}, [r4:128]!

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_4x4_lib4, .-inner_store_4x4_lib4
#endif
#endif





// subroutine
//
// store only the lower-triangular part of the 4x4 accumulator
//
// input arguments:
// r4   <- D
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_STORE_4X4_L_LIB4
#else
//	.p2align 4,,15
#if defined(OS_LINUX)
	.type inner_store_4x4_l_lib4, %function
inner_store_4x4_l_lib4:
#elif defined(OS_MAC)
_inner_store_4x4_l_lib4:
#endif
#endif

	// first column
	vstr.64		d8, [r4, #0]
	vstr.64		d9, [r4, #8]
	// second column
	vstr.32		s21, [r4, #20]
	vstr.64		d11, [r4, #24]
	// third column
	vstr.64		d13, [r4, #40]
	// fourth column
	vstr.32		s31, [r4, #60] // FIX: 32-bit store of S register (was vstr.64)

#if MACRO_LEVEL>=1
	.endm
#else
	mov		pc, lr // return

#if defined(OS_LINUX)
	.size	inner_store_4x4_l_lib4, .-inner_store_4x4_l_lib4
#endif
#endif





	.align 3
99: // 0
	.word 0
	.word 0




// r0        r1            r2        r3        sp+0        sp+4      sp+8
// void kernel_sgemm_nt_4x4_lib4(int kmax, float *alpha, float *A, float *B, float *beta, float *C, float *D)

//	.p2align 4,,15
#if defined(OS_LINUX)
	.global	kernel_sgemm_nt_4x4_lib4
	.type	kernel_sgemm_nt_4x4_lib4, %function
kernel_sgemm_nt_4x4_lib4:
#elif defined(OS_MAC)
	.global	kernel_sgemm_nt_4x4_lib4
_kernel_sgemm_nt_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr		d8, 99b
	vldr		d9, 99b
	vmov		q5, q4
	vmov		q6, q4
	vmov		q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r2 // A
	mov		r6, r3 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for generic alpha and beta
	mov		r4, r1 // alpha
	ldr		r5, [fp, #0] // beta
	ldr		r6, [fp, #4] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl	inner_scale_ab_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_scale_ab_4x4_lib4
#endif
#endif



	// store n
	ldr		r4, [fp, #8] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_sgemm_nt_4x4_lib4, .-kernel_sgemm_nt_4x4_lib4
#endif





	.align 3
99: // 0
	.word 0
	.word 0




// r0        r1            r2        r3           sp+0      sp+4     sp+8        sp+12     sp+16
// void kernel_sgemm_nn_4x4_lib4(int kmax, float *alpha, float *A, int offsetB, float *B, int sdb, float *beta, float *C, float *D)

//	.p2align 4,,15
#if defined(OS_LINUX)
	.global	kernel_sgemm_nn_4x4_lib4
	.type	kernel_sgemm_nn_4x4_lib4, %function
kernel_sgemm_nn_4x4_lib4:
#elif defined(OS_MAC)
	.global	kernel_sgemm_nn_4x4_lib4
_kernel_sgemm_nn_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr		d8, 99b
	vldr		d9, 99b
	vmov		q5, q4
	vmov		q6, q4
	vmov		q7, q4



	// call inner kernel sgemm nn
	mov		r4, r0 // kmax
	mov		r5, r2 // A
	ldr		r6, [fp, #0] // B
	ldr		r7, [fp, #4] // sdb
	lsl		r7, r7, #4 // 4*sizeof(float)*sdb
	mov		r8, r3 // offsetB

#if MACRO_LEVEL>=1
	INNER_EDGE_GEMM_ADD_NN_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_edge_gemm_add_nn_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_edge_gemm_add_nn_4x4_lib4
#endif
#endif

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nn_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nn_4x4_lib4
#endif
#endif



	// call inner blend for generic alpha and beta
	mov		r4, r1 // alpha
	ldr		r5, [fp, #8]  // beta
	ldr		r6, [fp, #12] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl	inner_scale_ab_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_scale_ab_4x4_lib4
#endif
#endif



	// store n
	ldr		r4, [fp, #16] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_sgemm_nn_4x4_lib4, .-kernel_sgemm_nn_4x4_lib4
#endif





	.align 3
99: // { 0 }
	.word 0
	.word 0




// r0        r1        r2        r3           sp+0      sp+4      sp+8      sp+12
// void kernel_strsm_nt_rl_inv_4x4_lib4(int kmax, float *A, float *B, float *beta, float *C, float *D, float *E, float *inv_diag_E);

//	.p2align 4,,15
#if defined(OS_LINUX)
	.globl	kernel_strsm_nt_rl_inv_4x4_lib4
	.type	kernel_strsm_nt_rl_inv_4x4_lib4, %function
kernel_strsm_nt_rl_inv_4x4_lib4:
#elif defined(OS_MAC)
	.globl	_kernel_strsm_nt_rl_inv_4x4_lib4
_kernel_strsm_nt_rl_inv_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr		d8, 99b
	vldr		d9, 99b
	vmov		q5, q4
	vmov		q6, q4
	vmov		q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r1 // A
	mov		r6, r2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for alpha=-1.0 and generic beta
	mov		r4, r3 // beta
	ldr		r5, [fp, #0] // C

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB4 99f // FIX: pass the lc_zero literal-pool label (was missing)
#else
#if defined(OS_LINUX)
	bl	inner_scale_m1b_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_scale_m1b_4x4_lib4
#endif
#endif



	// solution
	ldr		r4, [fp, #8]  // E
	ldr		r5, [fp, #12] // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_edge_trsm_rlt_inv_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_edge_trsm_rlt_inv_4x4_lib4
#endif
#endif



	// store
	ldr		r4, [fp, #4] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_store_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_store_4x4_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_strsm_nt_rl_inv_4x4_lib4, .-kernel_strsm_nt_rl_inv_4x4_lib4
#endif





	.align 3
99: // 0
	.word 0
	.word 0




// r0        r1        r2        r3        sp+0      sp+4
// void kernel_spotrf_nt_l_4x4_lib4(int kmax, float *A, float *B, float *C, float *D, float *inv_diag_D);

//	.p2align 4,,15
#if defined(OS_LINUX)
	.globl	kernel_spotrf_nt_l_4x4_lib4
	.type	kernel_spotrf_nt_l_4x4_lib4, %function
kernel_spotrf_nt_l_4x4_lib4:
#elif defined(OS_MAC)
	.globl	_kernel_spotrf_nt_l_4x4_lib4
_kernel_spotrf_nt_l_4x4_lib4:
#endif

	PROLOGUE



	// zero accumulation registers
	vldr		d8, 99b
	vldr		d9, 99b
	vmov		q5, q4
	vmov		q6, q4
	vmov		q7, q4



	// call inner kernel sgemm nt
	mov		r4, r0 // kmax
	mov		r5, r1 // A
	mov		r6, r2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_kernel_gemm_add_nt_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_kernel_gemm_add_nt_4x4_lib4
#endif
#endif



	// call inner blend for alpha=-1.0 and beta=1.0
	mov		r4, r3 // C

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_scale_m11_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_scale_m11_4x4_lib4
#endif
#endif



	// factorization
	ldr		r4, [fp, #4] // inv_diag_D

#if MACRO_LEVEL>=1
	INNER_EDGE_POTRF_4X4_LIB4 99f
#else
#if defined(OS_LINUX)
	bl	inner_edge_potrf_4x4_lib4
#elif defined(OS_MAC)
	bl	_inner_edge_potrf_4x4_lib4
#endif
#endif



	// store l
	ldr		r4, [fp, #0] // D

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_L_LIB4
#else
#if defined(OS_LINUX)
	bl	inner_store_4x4_l_lib4
#elif defined(OS_MAC)
	bl	_inner_store_4x4_l_lib4
#endif
#endif



	EPILOGUE

#if defined(OS_LINUX)
	.size	kernel_spotrf_nt_l_4x4_lib4, .-kernel_spotrf_nt_l_4x4_lib4
#endif





	.align 3
99: // { 0 }
	.word 0
	.word 0




#if defined(BLAS_API)

#include "kernel_sgemm_4x4_lib.S"

#endif