/**************************************************************************************************
*                                                                                                 *
* This file is part of BLASFEO.                                                                   *
*                                                                                                 *
* BLASFEO -- BLAS For Embedded Optimization.                                                      *
* Copyright (C) 2019 by Gianluca Frison.                                                          *
* Developed at IMTEK (University of Freiburg) under the supervision of Moritz Diehl.              *
* All rights reserved.                                                                            *
*                                                                                                 *
* The 2-Clause BSD License                                                                        *
*                                                                                                 *
* Redistribution and use in source and binary forms, with or without                              *
* modification, are permitted provided that the following conditions are met:                     *
*                                                                                                 *
* 1. Redistributions of source code must retain the above copyright notice, this                  *
*    list of conditions and the following disclaimer.                                             *
* 2. Redistributions in binary form must reproduce the above copyright notice,                    *
*    this list of conditions and the following disclaimer in the documentation                    *
*    and/or other materials provided with the distribution.                                       *
*                                                                                                 *
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND                 *
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED                   *
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE                          *
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR                 *
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES                  *
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;                    *
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND                     *
* ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT                      *
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS                   *
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.                                    *
*                                                                                                 *
* Author: Gianluca Frison, gianluca.frison (at) imtek.uni-freiburg.de                             *
*                                                                                                 *
**************************************************************************************************/



// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
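//
// C reference of the computation (a documentation sketch added here, not part
// of the original source; ldb counts doubles, while x11 is used by the code
// as the byte stride between consecutive rows of B):
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[jj+kk*ldb];
//
// A is packed panel-major with panel width 4 (lib4); B is accessed by rows
// (nt); the 4x4 accumulator is held column-wise in v0-v7.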
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nt_4x4_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x11, x11
	add x13, x12, x11
	add x14, x12, x12
	add x15, x13, x12
	add x16, x13, x13
	add x17, x14, x13

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldp q24, q25, [x10, #(0*8)]
	add x10, x10, x11
	ldp q26, q27, [x10, #(0*8)]
	add x10, x10, x11
	ldp q28, q29, [x10, #(0*8)]
	add x10, x10, x11
	ldp q30, q31, [x10, #(0*8)]
	add x10, x10, x11
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
	fmov d12, d8
	fmov d13, d8
	fmov d14, d8
	fmov d15, d8
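
	// the loop below is unrolled 4x and software pipelined: unrolls 0 and 2
	// accumulate into v0-v7, unrolls 1 and 3 into the temporary accumulators
	// v8-v15, which the reduce step after the loop sums into v0-v7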
	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	prfm PLDL1KEEP, [x9, #256]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	prfm PLDL1KEEP, [x9, #320]
//	prfm PLDL1KEEP, [x9, #192]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
//	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x14]
	fmla v6.2d, v16.2d, v25.2d[1]
	fmla v7.2d, v17.2d, v25.2d[1]

	// unroll 1
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v8.2d, v18.2d, v26.2d[0]
	fmla v9.2d, v19.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x15]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]
//	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x16]
	fmla v12.2d, v18.2d, v27.2d[0]
	fmla v13.2d, v19.2d, v27.2d[0]
//	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x10, x17]
	fmla v14.2d, v18.2d, v27.2d[1]
	fmla v15.2d, v19.2d, v27.2d[1]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
	sub w8, w8, #4
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4
	fmla v6.2d, v16.2d, v29.2d[1]
	fmla v7.2d, v17.2d, v29.2d[1]

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v30.2d[0]
	fmla v9.2d, v19.2d, v30.2d[0]
	ldp q24, q25, [x10, #(0*8)]
	fmla v10.2d, v18.2d, v30.2d[1]
	add x10, x10, x11
	fmla v11.2d, v19.2d, v30.2d[1]
	ldp q26, q27, [x10, #(0*8)]
	fmla v12.2d, v18.2d, v31.2d[0]
	add x10, x10, x11
	fmla v13.2d, v19.2d, v31.2d[0]
	ldp q28, q29, [x10, #(0*8)]
	fmla v14.2d, v18.2d, v31.2d[1]
	add x10, x10, x11
	fmla v15.2d, v19.2d, v31.2d[1]
	ldp q30, q31, [x10, #(0*8)]
	add x10, x10, x11

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d
	fadd v4.2d, v4.2d, v12.2d
	fadd v5.2d, v5.2d, v13.2d
	fadd v6.2d, v6.2d, v14.2d
	fadd v7.2d, v7.2d, v15.2d

//	sub x9, x9, #32
//	sub x10, x10, #32
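
	// at most 4 iterations left; the operands of the next iteration are
	// already preloaded, so exactly 4 leftovers are consumed in one final
	// unrolled pass, fewer than 4 in the clean-up loop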
0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
//	prfm PLDL1KEEP, [x9, #192]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
//	prfm PLDL1KEEP, [x10, #128]
	fmla v6.2d, v16.2d, v25.2d[1]
	fmla v7.2d, v17.2d, v25.2d[1]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #192]
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	sub w8, w8, #4
	fmla v6.2d, v18.2d, v27.2d[1]
	fmla v7.2d, v19.2d, v27.2d[1]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
//	cmp w8, #4
	fmla v6.2d, v16.2d, v29.2d[1]
	fmla v7.2d, v17.2d, v29.2d[1]

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v30.2d[0]
	fmla v1.2d, v19.2d, v30.2d[0]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v30.2d[1]
//	add x10, x10, x11
	fmla v3.2d, v19.2d, v30.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]
	fmla v4.2d, v18.2d, v31.2d[0]
//	add x10, x10, x11
	fmla v5.2d, v19.2d, v31.2d[0]
//	ldp q28, q29, [x10, #(0*8+2*32)]
	fmla v6.2d, v18.2d, v31.2d[1]
//	add x10, x10, x11
	fmla v7.2d, v19.2d, v31.2d[1]
//	ldp q30, q31, [x10, #(0*8+3*32)]
//	add x10, x10, x11

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ld1 {v28.2d, v29.2d}, [x10]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v29.2d[0]
	fmla v5.2d, v25.2d, v29.2d[0]
	cmp w8, #0
	fmla v6.2d, v24.2d, v29.2d[1]
	fmla v7.2d, v25.2d, v29.2d[1]

	bgt 3b

2: // return



#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)



	// early return
	cmp w8, #0
	ble 2f // return

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ld1 {v24.2d, v25.2d}, [x10]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ld1 {v26.2d, v27.2d}, [x10]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ld1 {v28.2d, v29.2d}, [x10]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ld1 {v30.2d, v31.2d}, [x10]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
	fmla v6.2d, v16.2d, v25.2d[1]
	fmla v7.2d, v17.2d, v25.2d[1]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	fmla v6.2d, v18.2d, v27.2d[1]
	fmla v7.2d, v19.2d, v27.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	fmla v6.2d, v20.2d, v29.2d[1]
	fmla v7.2d, v21.2d, v29.2d[1]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]
	fmla v4.2d, v22.2d, v31.2d[0]
	fmla v5.2d, v23.2d, v31.2d[0]
	fmla v6.2d, v22.2d, v31.2d[1]
	fmla v7.2d, v23.2d, v31.2d[1]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ld1 {v24.2d, v25.2d}, [x10]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ld1 {v26.2d, v27.2d}, [x10]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ld1 {v28.2d, v29.2d}, [x10]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ld1 {v30.2d, v31.2d}, [x10]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
	fmla v6.2d, v16.2d, v25.2d[1]
	fmla v7.2d, v17.2d, v25.2d[1]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	fmla v6.2d, v18.2d, v27.2d[1]
	fmla v7.2d, v19.2d, v27.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	fmla v6.2d, v20.2d, v29.2d[1]
	fmla v7.2d, v21.2d, v29.2d[1]

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]
	fmla v4.2d, v22.2d, v31.2d[0]
	fmla v5.2d, v23.2d, v31.2d[0]
	fmla v6.2d, v22.2d, v31.2d[1]
	fmla v7.2d, v23.2d, v31.2d[1]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ld1 {v28.2d, v29.2d}, [x10]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v29.2d[0]
	fmla v5.2d, v25.2d, v29.2d[0]
	cmp w8, #0
	fmla v6.2d, v24.2d, v29.2d[1]
	fmla v7.2d, v25.2d, v29.2d[1]

	bgt 3b

2: // return



#endif // cortex a53



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nt_4x4_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
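//
// C reference (a documentation sketch, same conventions as the 4x4 kernel
// above): the 4x3 variant updates 3 columns only,
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<3; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[jj+kk*ldb];
//
// each row of B is loaded as q24 (2 doubles) plus d25 (1 double); the
// accumulator is held in v0-v5.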
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nt_4x3_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x11, x11
	add x13, x12, x11
	add x14, x12, x12
	add x15, x13, x12
	add x16, x13, x13
	add x17, x14, x13

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldr q24, [x10, #(0*8)]
	ldr d25, [x10, #(2*8)]
	add x10, x10, x11
	ldr q26, [x10, #(0*8)]
	ldr d27, [x10, #(2*8)]
	add x10, x10, x11
	ldr q28, [x10, #(0*8)]
	ldr d29, [x10, #(2*8)]
	add x10, x10, x11
	ldr q30, [x10, #(0*8)]
	ldr d31, [x10, #(2*8)]
	add x10, x10, x11
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
	fmov d12, d8
	fmov d13, d8

	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	prfm PLDL1KEEP, [x9, #256]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	prfm PLDL1KEEP, [x9, #320]
//	prfm PLDL1KEEP, [x9, #192]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
//	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x14]

	// unroll 1
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v8.2d, v18.2d, v26.2d[0]
	fmla v9.2d, v19.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x15]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]
//	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x16]
	fmla v12.2d, v18.2d, v27.2d[0]
	fmla v13.2d, v19.2d, v27.2d[0]
//	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x10, x17]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
	sub w8, w8, #4
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v30.2d[0]
	fmla v9.2d, v19.2d, v30.2d[0]
	ldr q24, [x10, #(0*8)]
	ldr d25, [x10, #(2*8)]
	fmla v10.2d, v18.2d, v30.2d[1]
	add x10, x10, x11
	fmla v11.2d, v19.2d, v30.2d[1]
	ldr q26, [x10, #(0*8)]
	ldr d27, [x10, #(2*8)]
	fmla v12.2d, v18.2d, v31.2d[0]
	add x10, x10, x11
	fmla v13.2d, v19.2d, v31.2d[0]
	ldr q28, [x10, #(0*8)]
	ldr d29, [x10, #(2*8)]
	add x10, x10, x11
	ldr q30, [x10, #(0*8)]
	ldr d31, [x10, #(2*8)]
	add x10, x10, x11

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d
	fadd v4.2d, v4.2d, v12.2d
	fadd v5.2d, v5.2d, v13.2d

0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
//	prfm PLDL1KEEP, [x9, #192]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]
//	prfm PLDL1KEEP, [x10, #128]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #192]
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	sub w8, w8, #4

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
//	cmp w8, #4

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v30.2d[0]
	fmla v1.2d, v19.2d, v30.2d[0]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v30.2d[1]
//	add x10, x10, x11
	fmla v3.2d, v19.2d, v30.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]
	fmla v4.2d, v18.2d, v31.2d[0]
//	add x10, x10, x11
	fmla v5.2d, v19.2d, v31.2d[0]
//	ldp q28, q29, [x10, #(0*8+2*32)]
//	add x10, x10, x11
//	ldp q30, q31, [x10, #(0*8+3*32)]
//	add x10, x10, x11

	b 2f // return
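
	// fewer than 4 iterations left: rewind B by 4*ldb to undo the preload
	// advance, then process one iteration at a time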
4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr q28, [x10, #(0*8)]
	ldr d29, [x10, #(2*8)]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v29.2d[0]
	fmla v5.2d, v25.2d, v29.2d[0]
	cmp w8, #0

	bgt 3b

2: // return



#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)



	// early return
	cmp w8, #0
	ble 2f // return

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr q24, [x10, #0]
	ldr d25, [x10, #16]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr q26, [x10, #0]
	ldr d27, [x10, #16]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr q28, [x10, #0]
	ldr d29, [x10, #16]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr q30, [x10, #0]
	ldr d31, [x10, #16]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]
	fmla v4.2d, v22.2d, v31.2d[0]
	fmla v5.2d, v23.2d, v31.2d[0]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr q24, [x10, #0]
	ldr d25, [x10, #16]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr q26, [x10, #0]
	ldr d27, [x10, #16]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr q28, [x10, #0]
	ldr d29, [x10, #16]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr q30, [x10, #0]
	ldr d31, [x10, #16]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	fmla v4.2d, v16.2d, v25.2d[0]
	fmla v5.2d, v17.2d, v25.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v27.2d[0]
	fmla v5.2d, v19.2d, v27.2d[0]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]
	fmla v4.2d, v22.2d, v31.2d[0]
	fmla v5.2d, v23.2d, v31.2d[0]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr q28, [x10, #0]
	ldr d29, [x10, #16]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v29.2d[0]
	fmla v5.2d, v25.2d, v29.2d[0]
	cmp w8, #0

	bgt 3b

2: // return



#endif // cortex a53



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nt_4x3_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
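//
// C reference (a documentation sketch, same conventions as above): 2 columns,
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<2; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[jj+kk*ldb];
//
// each row of B is a single q load of 2 doubles; the accumulator is v0-v3.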
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nt_4x2_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x11, x11
	add x13, x12, x11
	add x14, x12, x12
	add x15, x13, x12
	add x16, x13, x13
	add x17, x14, x13

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldr q24, [x10, #(0*8)]
	add x10, x10, x11
	ldr q26, [x10, #(0*8)]
	add x10, x10, x11
	ldr q28, [x10, #(0*8)]
	add x10, x10, x11
	ldr q30, [x10, #(0*8)]
	add x10, x10, x11
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
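
	// unrolls 0 and 2 of the loop below accumulate into v0-v3, unrolls 1
	// and 3 into the temporary accumulators v8-v11, reduced after the loop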
	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	prfm PLDL1KEEP, [x9, #256]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
	prfm PLDL1KEEP, [x9, #320]
//	prfm PLDL1KEEP, [x9, #192]
//	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x14]

	// unroll 1
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v8.2d, v18.2d, v26.2d[0]
	fmla v9.2d, v19.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x15]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]
//	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x16]
//	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x10, x17]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
	sub w8, w8, #4
	cmp w8, #4

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v30.2d[0]
	fmla v9.2d, v19.2d, v30.2d[0]
	ldr q24, [x10, #(0*8)]
	fmla v10.2d, v18.2d, v30.2d[1]
	add x10, x10, x11
	fmla v11.2d, v19.2d, v30.2d[1]
	ldr q26, [x10, #(0*8)]
	add x10, x10, x11
	ldr q28, [x10, #(0*8)]
	add x10, x10, x11
	ldr q30, [x10, #(0*8)]
	add x10, x10, x11

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d

0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]
//	prfm PLDL1KEEP, [x9, #192]
//	prfm PLDL1KEEP, [x10, #128]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #192]
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	sub w8, w8, #4

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v28.2d[1]
	fmla v3.2d, v17.2d, v28.2d[1]
//	cmp w8, #4

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v30.2d[0]
	fmla v1.2d, v19.2d, v30.2d[0]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v30.2d[1]
//	add x10, x10, x11
	fmla v3.2d, v19.2d, v30.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]
//	add x10, x10, x11
//	ldp q28, q29, [x10, #(0*8+2*32)]
//	add x10, x10, x11
//	ldp q30, q31, [x10, #(0*8+3*32)]
//	add x10, x10, x11

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr q28, [x10, #(0*8)]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)



	// early return
	cmp w8, #0
	ble 2f // return

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr q24, [x10, #0]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr q26, [x10, #0]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr q28, [x10, #0]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr q30, [x10, #0]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr q24, [x10, #0]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr q26, [x10, #0]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr q28, [x10, #0]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr q30, [x10, #0]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v24.2d[1]
	fmla v3.2d, v17.2d, v24.2d[1]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	fmla v2.2d, v20.2d, v28.2d[1]
	fmla v3.2d, v21.2d, v28.2d[1]

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]
	fmla v2.2d, v22.2d, v30.2d[1]
	fmla v3.2d, v23.2d, v30.2d[1]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr q28, [x10, #0]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	fmla v2.2d, v24.2d, v28.2d[1]
	fmla v3.2d, v25.2d, v28.2d[1]
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#endif // cortex a53



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nt_4x2_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
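//
// C reference (a documentation sketch, same conventions as above): 1 column,
//
//	for(kk=0; kk<k; kk++)
//		for(ii=0; ii<4; ii++)
//			acc[ii] += A[ii+4*kk] * B[kk*ldb];
//
// each row of B is a single d load; the accumulator is v0-v1.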
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nt_4x1_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x11, x11
	add x13, x12, x11
	add x14, x12, x12
	add x15, x13, x12
	add x16, x13, x13
	add x17, x14, x13

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldr d24, [x10, #(0*8)]
	add x10, x10, x11
	ldr d26, [x10, #(0*8)]
	add x10, x10, x11
	ldr d28, [x10, #(0*8)]
	add x10, x10, x11
	ldr d30, [x10, #(0*8)]
	add x10, x10, x11
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8

	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	prfm PLDL1KEEP, [x9, #256]
//	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #320]
//	prfm PLDL1KEEP, [x9, #192]
//	prfm PLDL1KEEP, [x10]
	prfm PLDL1KEEP, [x10, x14]

	// unroll 1
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v8.2d, v18.2d, v26.2d[0]
	fmla v9.2d, v19.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x10, x11]
	prfm PLDL1KEEP, [x10, x15]
//	prfm PLDL1KEEP, [x10, x12]
	prfm PLDL1KEEP, [x10, x16]
//	prfm PLDL1KEEP, [x10, x13]
	prfm PLDL1KEEP, [x10, x17]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
	sub w8, w8, #4
	cmp w8, #4

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v30.2d[0]
	fmla v9.2d, v19.2d, v30.2d[0]
	ldr d24, [x10, #(0*8)]
	add x10, x10, x11
	ldr d26, [x10, #(0*8)]
	add x10, x10, x11
	ldr d28, [x10, #(0*8)]
	add x10, x10, x11
	ldr d30, [x10, #(0*8)]
	add x10, x10, x11

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d

0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
//	prfm PLDL1KEEP, [x9, #192]
//	prfm PLDL1KEEP, [x10, #128]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #192]
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	ldp q16, q17, [x9, #(0*8+2*32)]
	sub w8, w8, #4

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v28.2d[0]
	fmla v1.2d, v17.2d, v28.2d[0]
	add x9, x9, #128
//	cmp w8, #4

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v30.2d[0]
	fmla v1.2d, v19.2d, v30.2d[0]
//	ldp q24, q25, [x10, #(0*8+0*32)]
//	add x10, x10, x11
//	ldp q26, q27, [x10, #(0*8+1*32)]
//	add x10, x10, x11
//	ldp q28, q29, [x10, #(0*8+2*32)]
//	add x10, x10, x11
//	ldp q30, q31, [x10, #(0*8+3*32)]
//	add x10, x10, x11

	b 2f // return
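
	// fewer than 4 iterations left: rewind B by 4*ldb to undo the preload
	// advance before the one-at-a-time clean-up loop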
4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11
	sub x10, x10, x11

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr d28, [x10, #(0*8)]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#elif defined(TARGET_ARMV8A_ARM_CORTEX_A53)



	// early return
	cmp w8, #0
	ble 2f // return

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr d24, [x10, #0]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr d26, [x10, #0]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr d28, [x10, #0]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr d30, [x10, #0]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ld1 {v16.2d, v17.2d}, [x9], #32
	ldr d24, [x10, #0]
	add x10, x10, x11
	ld1 {v18.2d, v19.2d}, [x9], #32
	ldr d26, [x10, #0]
	add x10, x10, x11
	ld1 {v20.2d, v21.2d}, [x9], #32
	ldr d28, [x10, #0]
	add x10, x10, x11
	ld1 {v22.2d, v23.2d}, [x9], #32
	ldr d30, [x10, #0]
	add x10, x10, x11

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v26.2d[0]
	fmla v1.2d, v19.2d, v26.2d[0]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v28.2d[0]
	fmla v1.2d, v21.2d, v28.2d[0]

	// unroll 3
	fmla v0.2d, v22.2d, v30.2d[0]
	fmla v1.2d, v23.2d, v30.2d[0]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ld1 {v24.2d, v25.2d}, [x9], #32
	ldr d28, [x10, #0]
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x10, x10, x11
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#endif // cortex a53



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nt_4x1_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
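//
// C reference of the computation (a documentation sketch added here, not part
// of the original source; ldb counts doubles): B is now traversed down its
// columns (nn),
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<4; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[kk+jj*ldb];
//
// x10, x12, x13 and x14 hold pointers into the four columns of B; the 4x4
// accumulator is held column-wise in v0-v7.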
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nn_4x4_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11
	add x13, x12, x11
	add x14, x13, x11

	// prefetch
	prfm PLDL1KEEP, [x10, #0]
	prfm PLDL1KEEP, [x12, #0]
	prfm PLDL1KEEP, [x13, #0]
	prfm PLDL1KEEP, [x14, #0]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q30, q31, [x14], #32
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
//	prfm PLDL1KEEP, [x10, #0]
//	prfm PLDL1KEEP, [x12, #0]
//	prfm PLDL1KEEP, [x13, #0]
//	prfm PLDL1KEEP, [x14, #0]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
	fmov d12, d8
	fmov d13, d8
	fmov d14, d8
	fmov d15, d8

//	add x12, x11, #64
//	add x12, x11, x11
//	add x13, x12, #64
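
	// x10, x12, x13 and x14 walk the four columns of B; unrolls 0 and 2
	// accumulate into v0-v7, unrolls 1 and 3 into the temporary accumulators
	// v8-v15, reduced after the loop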
	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #192]
	prfm PLDL1KEEP, [x9, #320]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
	prfm PLDL1KEEP, [x10, #32]
	fmla v6.2d, v16.2d, v30.2d[0]
	fmla v7.2d, v17.2d, v30.2d[0]

	// unroll 1
	prfm PLDL1KEEP, [x12, #32]
	fmla v8.2d, v18.2d, v24.2d[1]
	fmla v9.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]
	prfm PLDL1KEEP, [x13, #32]
	fmla v12.2d, v18.2d, v28.2d[1]
	fmla v13.2d, v19.2d, v28.2d[1]
	prfm PLDL1KEEP, [x14, #32]
	fmla v14.2d, v18.2d, v30.2d[1]
	fmla v15.2d, v19.2d, v30.2d[1]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	sub w8, w8, #4
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4
	fmla v6.2d, v16.2d, v31.2d[0]
	fmla v7.2d, v17.2d, v31.2d[0]

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v25.2d[1]
	fmla v9.2d, v19.2d, v25.2d[1]
	ldp q24, q25, [x10], #32
	fmla v10.2d, v18.2d, v27.2d[1]
	fmla v11.2d, v19.2d, v27.2d[1]
	ldp q26, q27, [x12], #32
	fmla v12.2d, v18.2d, v29.2d[1]
	fmla v13.2d, v19.2d, v29.2d[1]
	ldp q28, q29, [x13], #32
	fmla v14.2d, v18.2d, v31.2d[1]
	fmla v15.2d, v19.2d, v31.2d[1]
	ldp q30, q31, [x14], #32

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d
	fadd v4.2d, v4.2d, v12.2d
	fadd v5.2d, v5.2d, v13.2d
	fadd v6.2d, v6.2d, v14.2d
	fadd v7.2d, v7.2d, v15.2d

//	sub x9, x9, #32
//	sub x10, x10, #32

0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #320]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
//	prfm PLDL1KEEP, [x10, #256]
	fmla v6.2d, v16.2d, v30.2d[0]
	fmla v7.2d, v17.2d, v30.2d[0]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #320]
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
//	add x10, x10, x11
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	sub w8, w8, #4
	fmla v6.2d, v18.2d, v30.2d[1]
	fmla v7.2d, v19.2d, v30.2d[1]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4
	fmla v6.2d, v16.2d, v31.2d[0]
	fmla v7.2d, v17.2d, v31.2d[0]

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v25.2d[1]
	fmla v1.2d, v19.2d, v25.2d[1]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v27.2d[1]
	fmla v3.2d, v19.2d, v27.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]
	fmla v4.2d, v18.2d, v29.2d[1]
	fmla v5.2d, v19.2d, v29.2d[1]
//	ldp q28, q29, [x10, #(0*8+2*32)]
	fmla v6.2d, v18.2d, v31.2d[1]
	fmla v7.2d, v19.2d, v31.2d[1]
//	ldp q30, q31, [x10, #(0*8+3*32)]

	b 2f // return
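
	// fewer than 4 iterations left: rewind the four B column pointers by the
	// 32-byte preload advance, then process one iteration at a time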
4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, #32
	sub x12, x12, #32
	sub x13, x13, #32
	sub x14, x14, #32

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	ldr d30, [x13], #8
	ldr d31, [x14], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v30.2d[0]
	fmla v5.2d, v25.2d, v30.2d[0]
	cmp w8, #0
	fmla v6.2d, v24.2d, v31.2d[0]
	fmla v7.2d, v25.2d, v31.2d[0]

	bgt 3b

2: // return



#else // cortex a53



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11
	add x13, x12, x11
	add x14, x13, x11

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q30, q31, [x14], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
	fmla v6.2d, v16.2d, v30.2d[0]
	fmla v7.2d, v17.2d, v30.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	fmla v6.2d, v18.2d, v30.2d[1]
	fmla v7.2d, v19.2d, v30.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	fmla v6.2d, v20.2d, v31.2d[0]
	fmla v7.2d, v21.2d, v31.2d[0]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]
	fmla v4.2d, v22.2d, v29.2d[1]
	fmla v5.2d, v23.2d, v29.2d[1]
	fmla v6.2d, v22.2d, v31.2d[1]
	fmla v7.2d, v23.2d, v31.2d[1]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q30, q31, [x14], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
	fmla v6.2d, v16.2d, v30.2d[0]
	fmla v7.2d, v17.2d, v30.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	fmla v6.2d, v18.2d, v30.2d[1]
	fmla v7.2d, v19.2d, v30.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	fmla v6.2d, v20.2d, v31.2d[0]
	fmla v7.2d, v21.2d, v31.2d[0]
//	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]
	fmla v4.2d, v22.2d, v29.2d[1]
	fmla v5.2d, v23.2d, v29.2d[1]
	fmla v6.2d, v22.2d, v31.2d[1]
	fmla v7.2d, v23.2d, v31.2d[1]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	ldr d30, [x13], #8
	ldr d31, [x14], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v30.2d[0]
	fmla v5.2d, v25.2d, v30.2d[0]
	cmp w8, #0
	fmla v6.2d, v24.2d, v31.2d[0]
	fmla v7.2d, v25.2d, v31.2d[0]

	bgt 3b

2: // return



#endif



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nn_4x4_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
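//
// C reference (a documentation sketch, same conventions as the nn 4x4 kernel
// above): 3 columns of B, walked through x10, x12 and x13,
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<3; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[kk+jj*ldb];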
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nn_4x3_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11
	add x13, x12, x11

	// prefetch
	prfm PLDL1KEEP, [x10, #0]
	prfm PLDL1KEEP, [x12, #0]
	prfm PLDL1KEEP, [x13, #0]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
//	prfm PLDL1KEEP, [x10, #0]
//	prfm PLDL1KEEP, [x12, #0]
//	prfm PLDL1KEEP, [x13, #0]
//	prfm PLDL1KEEP, [x14, #0]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
	fmov d12, d8
	fmov d13, d8

	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #192]
	prfm PLDL1KEEP, [x9, #320]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
	prfm PLDL1KEEP, [x10, #32]

	// unroll 1
	prfm PLDL1KEEP, [x12, #32]
	fmla v8.2d, v18.2d, v24.2d[1]
	fmla v9.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]
	prfm PLDL1KEEP, [x13, #32]
	fmla v12.2d, v18.2d, v28.2d[1]
	fmla v13.2d, v19.2d, v28.2d[1]
//	prfm PLDL1KEEP, [x14, #32]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	sub w8, w8, #4
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v25.2d[1]
	fmla v9.2d, v19.2d, v25.2d[1]
	ldp q24, q25, [x10], #32
	fmla v10.2d, v18.2d, v27.2d[1]
	fmla v11.2d, v19.2d, v27.2d[1]
	ldp q26, q27, [x12], #32
	fmla v12.2d, v18.2d, v29.2d[1]
	fmla v13.2d, v19.2d, v29.2d[1]
	ldp q28, q29, [x13], #32

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d
	fadd v4.2d, v4.2d, v12.2d
	fadd v5.2d, v5.2d, v13.2d
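
	// at most 4 iterations left; exactly 4 are consumed in one final
	// unrolled pass on the preloaded operands, fewer than 4 in the clean-up
	// loop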
0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #320]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]
//	prfm PLDL1KEEP, [x10, #256]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #320]
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
//	add x10, x10, x11
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	sub w8, w8, #4

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	fmla v4.2d, v16.2d, v29.2d[0]
	fmla v5.2d, v17.2d, v29.2d[0]
	cmp w8, #4

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v25.2d[1]
	fmla v1.2d, v19.2d, v25.2d[1]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v27.2d[1]
	fmla v3.2d, v19.2d, v27.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]
	fmla v4.2d, v18.2d, v29.2d[1]
	fmla v5.2d, v19.2d, v29.2d[1]
//	ldp q28, q29, [x10, #(0*8+2*32)]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, #32
	sub x12, x12, #32
	sub x13, x13, #32

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	ldr d30, [x13], #8
//	ldr d31, [x14], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v30.2d[0]
	fmla v5.2d, v25.2d, v30.2d[0]
	cmp w8, #0

	bgt 3b

2: // return



#else // cortex a53



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11
	add x13, x12, x11

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]
	fmla v4.2d, v22.2d, v29.2d[1]
	fmla v5.2d, v23.2d, v29.2d[1]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q28, q29, [x13], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
	fmla v4.2d, v16.2d, v28.2d[0]
	fmla v5.2d, v17.2d, v28.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	fmla v4.2d, v18.2d, v28.2d[1]
	fmla v5.2d, v19.2d, v28.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
	fmla v4.2d, v20.2d, v29.2d[0]
	fmla v5.2d, v21.2d, v29.2d[0]
//	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]
	fmla v4.2d, v22.2d, v29.2d[1]
	fmla v5.2d, v23.2d, v29.2d[1]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	ldr d30, [x13], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	fmla v4.2d, v24.2d, v30.2d[0]
	fmla v5.2d, v25.2d, v30.2d[0]
	cmp w8, #0

	bgt 3b

2: // return



#endif



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nn_4x3_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
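//
// C reference (a documentation sketch, same conventions as above): 2 columns
// of B, walked through x10 and x12,
//
//	for(kk=0; kk<k; kk++)
//		for(jj=0; jj<2; jj++)
//			for(ii=0; ii<4; ii++)
//				acc[ii+4*jj] += A[ii+4*kk] * B[kk+jj*ldb];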
#if MACRO_LEVEL>=2
	.macro INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
#else
	.align 4
	FUN_START(inner_kernel_gemm_add_nn_4x2_lib4c)
#endif



#if defined(TARGET_ARMV8A_ARM_CORTEX_A57)



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11

	// prefetch
	prfm PLDL1KEEP, [x10, #0]
	prfm PLDL1KEEP, [x12, #0]
	prfm PLDL1KEEP, [x9, #0]
	prfm PLDL1KEEP, [x9, #64]

	// preload
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q16, q17, [x9, #(0*8+0*32)]

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch
//	prfm PLDL1KEEP, [x10, #0]
//	prfm PLDL1KEEP, [x12, #0]
//	prfm PLDL1KEEP, [x13, #0]
//	prfm PLDL1KEEP, [x14, #0]
	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #192]

	// zero tmp acc
	fmov d8, xzr
	fmov d9, d8
	fmov d10, d8
	fmov d11, d8
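
	// unrolls 0 and 2 of the loop below accumulate into v0-v3, unrolls 1
	// and 3 into the temporary accumulators v8-v11, reduced after the loop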
	// main loop
1:

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #128]
	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #192]
	prfm PLDL1KEEP, [x9, #320]
	prfm PLDL1KEEP, [x10, #32]

	// unroll 1
	prfm PLDL1KEEP, [x12, #32]
	fmla v8.2d, v18.2d, v24.2d[1]
	fmla v9.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v10.2d, v18.2d, v26.2d[1]
	fmla v11.2d, v19.2d, v26.2d[1]

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	sub w8, w8, #4
	cmp w8, #4

	// unroll 3
	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v8.2d, v18.2d, v25.2d[1]
	fmla v9.2d, v19.2d, v25.2d[1]
	ldp q24, q25, [x10], #32
	fmla v10.2d, v18.2d, v27.2d[1]
	fmla v11.2d, v19.2d, v27.2d[1]
	ldp q26, q27, [x12], #32

	bgt 1b


	// reduce
	fadd v0.2d, v0.2d, v8.2d
	fadd v1.2d, v1.2d, v9.2d
	fadd v2.2d, v2.2d, v10.2d
	fadd v3.2d, v3.2d, v11.2d

0:

	cmp w8, #3
	ble 4f

	// unroll 0
	ldp q18, q19, [x9, #(0*8+1*32)]
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
//	prfm PLDL1KEEP, [x9, #256]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]
//	prfm PLDL1KEEP, [x9, #320]
//	prfm PLDL1KEEP, [x10, #256]

	// unroll 1
//	prfm PLDL1KEEP, [x10, #320]
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	ldp q16, q17, [x9, #(0*8+2*32)]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
//	add x10, x10, x11
	sub w8, w8, #4

	// unroll 2
	ldp q18, q19, [x9, #(0*8+3*32)]
	fmla v0.2d, v16.2d, v25.2d[0]
	fmla v1.2d, v17.2d, v25.2d[0]
	add x9, x9, #128
	fmla v2.2d, v16.2d, v27.2d[0]
	fmla v3.2d, v17.2d, v27.2d[0]
	cmp w8, #4

	// unroll 3
//	ldp q16, q17, [x9, #(0*8+0*32)]
	fmla v0.2d, v18.2d, v25.2d[1]
	fmla v1.2d, v19.2d, v25.2d[1]
//	ldp q24, q25, [x10, #(0*8+0*32)]
	fmla v2.2d, v18.2d, v27.2d[1]
	fmla v3.2d, v19.2d, v27.2d[1]
//	ldp q26, q27, [x10, #(0*8+1*32)]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

	sub x10, x10, #32
	sub x12, x12, #32

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#else // cortex a53



	// early return
	cmp w8, #0
	ble 2f // return

	add x12, x10, x11

	// prefetch

	// preload

	cmp w8, #4
	ble 0f // consider clean up loop

	// prefetch

	// zero tmp acc

	// main loop
1:

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]

	bgt 1b


	// reduce

0:

	cmp w8, #3
	ble 4f

	// load 0 & 1 & 2 & 3
	ldp q24, q25, [x10], #32
	ldp q26, q27, [x12], #32
	ldp q16, q17, [x9], #32
	ldp q18, q19, [x9], #32
	ldp q20, q21, [x9], #32
	ldp q22, q23, [x9], #32

	// unroll 0
	fmla v0.2d, v16.2d, v24.2d[0]
	fmla v1.2d, v17.2d, v24.2d[0]
	fmla v2.2d, v16.2d, v26.2d[0]
	fmla v3.2d, v17.2d, v26.2d[0]

	// unroll 1
	fmla v0.2d, v18.2d, v24.2d[1]
	fmla v1.2d, v19.2d, v24.2d[1]
	fmla v2.2d, v18.2d, v26.2d[1]
	fmla v3.2d, v19.2d, v26.2d[1]
	sub w8, w8, #4

	// unroll 2
	fmla v0.2d, v20.2d, v25.2d[0]
	fmla v1.2d, v21.2d, v25.2d[0]
	fmla v2.2d, v20.2d, v27.2d[0]
	fmla v3.2d, v21.2d, v27.2d[0]
//	cmp w8, #4

	// unroll 3
	fmla v0.2d, v22.2d, v25.2d[1]
	fmla v1.2d, v23.2d, v25.2d[1]
	fmla v2.2d, v22.2d, v27.2d[1]
	fmla v3.2d, v23.2d, v27.2d[1]

	b 2f // return

4: // consider clean1-up loop

	cmp w8, #0
	ble 2f // return

3: // clean1-up loop

	// unroll 0
	ldp q24, q25, [x9, #0]
	ldr d28, [x10], #8
	ldr d29, [x12], #8
	fmla v0.2d, v24.2d, v28.2d[0]
	fmla v1.2d, v25.2d, v28.2d[0]
	add x9, x9, #32
	fmla v2.2d, v24.2d, v29.2d[0]
	fmla v3.2d, v25.2d, v29.2d[0]
	sub w8, w8, #1
	cmp w8, #0

	bgt 3b

2: // return



#endif



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nn_4x2_lib4c)
#endif





// subroutine
//
// input arguments:
// w8 <- k
// x9 <- A
// x10 <- B
// x11 <- ldb
//
// output arguments:
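//
// C reference (a documentation sketch, same conventions as above): a single
// column of B, walked through x10,
//
//	for(kk=0; kk<k; kk++)
//		for(ii=0; ii<4; ii++)
//			acc[ii] += A[ii+4*kk] * B[kk];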
v16.2d, v24.2d[0] 2631 fmla v1.2d, v17.2d, v24.2d[0] 2632 fmla v2.2d, v16.2d, v26.2d[0] 2633 fmla v3.2d, v17.2d, v26.2d[0] 2634 2635 // unroll 1 2636 fmla v0.2d, v18.2d, v24.2d[1] 2637 fmla v1.2d, v19.2d, v24.2d[1] 2638 fmla v2.2d, v18.2d, v26.2d[1] 2639 fmla v3.2d, v19.2d, v26.2d[1] 2640 sub w8, w8, #4 2641 2642 // unroll 2 2643 fmla v0.2d, v20.2d, v25.2d[0] 2644 fmla v1.2d, v21.2d, v25.2d[0] 2645 fmla v2.2d, v20.2d, v27.2d[0] 2646 fmla v3.2d, v21.2d, v27.2d[0] 2647 cmp w8, #4 2648 2649 // unroll 3 2650 fmla v0.2d, v22.2d, v25.2d[1] 2651 fmla v1.2d, v23.2d, v25.2d[1] 2652 fmla v2.2d, v22.2d, v27.2d[1] 2653 fmla v3.2d, v23.2d, v27.2d[1] 2654 2655 bgt 1b 2656 2657 2658 // reduce 2659 26600: 2661 2662 cmp w8, #3 2663 ble 4f 2664 2665 // load 0 & 1 & 2 & 3 2666 ldp q24, q25, [x10], #32 2667 ldp q26, q27, [x12], #32 2668 ldp q16, q17, [x9], #32 2669 ldp q18, q19, [x9], #32 2670 ldp q20, q21, [x9], #32 2671 ldp q22, q23, [x9], #32 2672 2673 // unroll 0 2674 fmla v0.2d, v16.2d, v24.2d[0] 2675 fmla v1.2d, v17.2d, v24.2d[0] 2676 fmla v2.2d, v16.2d, v26.2d[0] 2677 fmla v3.2d, v17.2d, v26.2d[0] 2678 2679 // unroll 1 2680 fmla v0.2d, v18.2d, v24.2d[1] 2681 fmla v1.2d, v19.2d, v24.2d[1] 2682 fmla v2.2d, v18.2d, v26.2d[1] 2683 fmla v3.2d, v19.2d, v26.2d[1] 2684 sub w8, w8, #4 2685 2686 // unroll 2 2687 fmla v0.2d, v20.2d, v25.2d[0] 2688 fmla v1.2d, v21.2d, v25.2d[0] 2689 fmla v2.2d, v20.2d, v27.2d[0] 2690 fmla v3.2d, v21.2d, v27.2d[0] 2691// cmp w8, #4 2692 2693 // unroll 3 2694 fmla v0.2d, v22.2d, v25.2d[1] 2695 fmla v1.2d, v23.2d, v25.2d[1] 2696 fmla v2.2d, v22.2d, v27.2d[1] 2697 fmla v3.2d, v23.2d, v27.2d[1] 2698 2699 b 2f // return 2700 27014: // consider clean1-up loop 2702 2703 cmp w8, #0 2704 ble 2f // return 2705 27063: // clean1-up loop 2707 2708 // unroll 0 2709 ldp q24, q25, [x9, #0] 2710 ldr d28, [x10], #8 2711 ldr d29, [x12], #8 2712 fmla v0.2d, v24.2d, v28.2d[0] 2713 fmla v1.2d, v25.2d, v28.2d[0] 2714 add x9, x9, #32 2715 fmla v2.2d, v24.2d, v29.2d[0] 2716 fmla v3.2d, v25.2d, v29.2d[0] 2717 sub w8, w8, #1 2718 cmp w8, #0 2719 2720 bgt 3b 2721 27222: // return 2723 2724 2725 2726#endif 2727 2728 2729 2730#if MACRO_LEVEL>=2 2731 .endm 2732#else 2733 ret 2734 2735 FUN_END(inner_kernel_gemm_add_nn_4x2_lib4c) 2736#endif 2737 2738 2739 2740 2741 2742// subroutine 2743// 2744// input arguments: 2745// w8 <- k 2746// x9 <- A 2747// x10 <- B 2748// x11 <- ldb 2749// 2750// output arguments: 2751 2752#if MACRO_LEVEL>=2 2753 .macro INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C 2754#else 2755 .align 4 2756 FUN_START(inner_kernel_gemm_add_nn_4x1_lib4c) 2757#endif 2758 2759 2760 2761#if defined(TARGET_ARMV8A_ARM_CORTEX_A57) 2762 2763 2764 2765 // early return 2766 cmp w8, #0 2767 ble 2f // return 2768 2769 // prefetch 2770 prfm PLDL1KEEP, [x10, #0] 2771 prfm PLDL1KEEP, [x9, #0] 2772 prfm PLDL1KEEP, [x9, #64] 2773 2774 // preload 2775 ldp q24, q25, [x10], #32 2776 ldp q16, q17, [x9, #(0*8+0*32)] 2777 2778 cmp w8, #4 2779 ble 0f // consider clean up loop 2780 2781 // prefetch 2782 prfm PLDL1KEEP, [x9, #128] 2783 prfm PLDL1KEEP, [x9, #192] 2784 2785 // zero tmp acc 2786 fmov d8, xzr 2787 fmov d9, d8 2788 2789 // main loop 27901: 2791 2792 // unroll 0 2793 ldp q18, q19, [x9, #(0*8+1*32)] 2794 fmla v0.2d, v16.2d, v24.2d[0] 2795 fmla v1.2d, v17.2d, v24.2d[0] 2796// prfm PLDL1KEEP, [x9, #128] 2797 prfm PLDL1KEEP, [x9, #256] 2798// prfm PLDL1KEEP, [x9, #192] 2799 prfm PLDL1KEEP, [x9, #320] 2800 prfm PLDL1KEEP, [x10, #32] 2801 2802 // unroll 1 2803 fmla v8.2d, v18.2d, v24.2d[1] 2804 fmla v9.2d, v19.2d, v24.2d[1] 2805 ldp 
q16, q17, [x9, #(0*8+2*32)] 2806 2807 // unroll 2 2808 ldp q18, q19, [x9, #(0*8+3*32)] 2809 fmla v0.2d, v16.2d, v25.2d[0] 2810 fmla v1.2d, v17.2d, v25.2d[0] 2811 add x9, x9, #128 2812 sub w8, w8, #4 2813 cmp w8, #4 2814 2815 // unroll 3 2816 ldp q16, q17, [x9, #(0*8+0*32)] 2817 fmla v8.2d, v18.2d, v25.2d[1] 2818 fmla v9.2d, v19.2d, v25.2d[1] 2819 ldp q24, q25, [x10], #32 2820 2821 bgt 1b 2822 2823 2824 // reduce 2825 fadd v0.2d, v0.2d, v8.2d 2826 fadd v1.2d, v1.2d, v9.2d 2827 28280: 2829 2830 cmp w8, #3 2831 ble 4f 2832 2833 // unroll 0 2834 ldp q18, q19, [x9, #(0*8+1*32)] 2835 fmla v0.2d, v16.2d, v24.2d[0] 2836 fmla v1.2d, v17.2d, v24.2d[0] 2837// prfm PLDL1KEEP, [x9, #256] 2838// prfm PLDL1KEEP, [x9, #320] 2839// prfm PLDL1KEEP, [x10, #256] 2840 2841 // unroll 1 2842 fmla v0.2d, v18.2d, v24.2d[1] 2843 fmla v1.2d, v19.2d, v24.2d[1] 2844 ldp q16, q17, [x9, #(0*8+2*32)] 2845// add x10, x10, x11 2846 sub w8, w8, #4 2847 2848 // unroll 2 2849 ldp q18, q19, [x9, #(0*8+3*32)] 2850 fmla v0.2d, v16.2d, v25.2d[0] 2851 fmla v1.2d, v17.2d, v25.2d[0] 2852 add x9, x9, #128 2853 cmp w8, #4 2854 2855 // unroll 3 2856// ldp q16, q17, [x9, #(0*8+0*32)] 2857 fmla v0.2d, v18.2d, v25.2d[1] 2858 fmla v1.2d, v19.2d, v25.2d[1] 2859// ldp q24, q25, [x10, #(0*8+0*32)] 2860 2861 b 2f // return 2862 28634: // consider clean1-up loop 2864 2865 cmp w8, #0 2866 ble 2f // return 2867 2868 sub x10, x10, #32 2869 28703: // clean1-up loop 2871 2872 // unroll 0 2873 ldp q24, q25, [x9, #0] 2874 ldr d28, [x10], #8 2875 fmla v0.2d, v24.2d, v28.2d[0] 2876 fmla v1.2d, v25.2d, v28.2d[0] 2877 add x9, x9, #32 2878 sub w8, w8, #1 2879 cmp w8, #0 2880 2881 bgt 3b 2882 28832: // return 2884 2885 2886 2887#else // cortex a53 2888 2889 2890 2891 // early return 2892 cmp w8, #0 2893 ble 2f // return 2894 2895 // prefetch 2896 2897 // preload 2898 2899 cmp w8, #4 2900 ble 0f // consider clean up loop 2901 2902 // prefetch 2903 2904 // zero tmp acc 2905 2906 // main loop 29071: 2908 2909 // load 0 & 1 & 2 & 3 2910 ldp q24, q25, [x10], #32 2911 ldp q16, q17, [x9], #32 2912 ldp q18, q19, [x9], #32 2913 ldp q20, q21, [x9], #32 2914 ldp q22, q23, [x9], #32 2915 2916 // unroll 0 2917 fmla v0.2d, v16.2d, v24.2d[0] 2918 fmla v1.2d, v17.2d, v24.2d[0] 2919 2920 // unroll 1 2921 fmla v0.2d, v18.2d, v24.2d[1] 2922 fmla v1.2d, v19.2d, v24.2d[1] 2923 sub w8, w8, #4 2924 2925 // unroll 2 2926 fmla v0.2d, v20.2d, v25.2d[0] 2927 fmla v1.2d, v21.2d, v25.2d[0] 2928 cmp w8, #4 2929 2930 // unroll 3 2931 fmla v0.2d, v22.2d, v25.2d[1] 2932 fmla v1.2d, v23.2d, v25.2d[1] 2933 2934 bgt 1b 2935 2936 2937 // reduce 2938 29390: 2940 2941 cmp w8, #3 2942 ble 4f 2943 2944 // load 0 & 1 & 2 & 3 2945 ldp q24, q25, [x10], #32 2946 ldp q16, q17, [x9], #32 2947 ldp q18, q19, [x9], #32 2948 ldp q20, q21, [x9], #32 2949 ldp q22, q23, [x9], #32 2950 2951 // unroll 0 2952 fmla v0.2d, v16.2d, v24.2d[0] 2953 fmla v1.2d, v17.2d, v24.2d[0] 2954 2955 // unroll 1 2956 fmla v0.2d, v18.2d, v24.2d[1] 2957 fmla v1.2d, v19.2d, v24.2d[1] 2958 sub w8, w8, #4 2959 2960 // unroll 2 2961 fmla v0.2d, v20.2d, v25.2d[0] 2962 fmla v1.2d, v21.2d, v25.2d[0] 2963// cmp w8, #4 2964 2965 // unroll 3 2966 fmla v0.2d, v22.2d, v25.2d[1] 2967 fmla v1.2d, v23.2d, v25.2d[1] 2968 2969 b 2f // return 2970 29714: // consider clean1-up loop 2972 2973 cmp w8, #0 2974 ble 2f // return 2975 29763: // clean1-up loop 2977 2978 // unroll 0 2979 ldp q24, q25, [x9, #0] 2980 ldr d28, [x10], #8 2981 fmla v0.2d, v24.2d, v28.2d[0] 2982 fmla v1.2d, v25.2d, v28.2d[0] 2983 add x9, x9, #32 2984 sub w8, w8, #1 2985 cmp w8, #0 
	bgt		3b

2: // return



#endif



#if MACRO_LEVEL>=2
	.endm
#else
	ret

	FUN_END(inner_kernel_gemm_add_nn_4x1_lib4c)
#endif





// subroutine
//
// triangular substitution:
// side = left
// uplo = lower
// tran = not-transposed
// unit diagonal
//
// input arguments:
// x8 <- E
// x9 <- lde
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
#else
	.align 4
	FUN_START(inner_edge_trsm_lln_one_4x4_lib)
#endif

	ldp		q24, q25, [x8, #0] // E[0+4*0]
	add		x8, x8, x9
	ins		v24.d[0], xzr
	fmls	v0.2d, v24.2d, v0.2d[0]
	fmls	v1.2d, v25.2d, v0.2d[0]
	fmls	v2.2d, v24.2d, v2.2d[0]
	fmls	v3.2d, v25.2d, v2.2d[0]
	fmls	v4.2d, v24.2d, v4.2d[0]
	fmls	v5.2d, v25.2d, v4.2d[0]
	fmls	v6.2d, v24.2d, v6.2d[0]
	fmls	v7.2d, v25.2d, v6.2d[0]

	ldr		q25, [x8, #16] // E[2+4*1]
	add		x8, x8, x9
	fmls	v1.2d, v25.2d, v0.2d[1]
	fmls	v3.2d, v25.2d, v2.2d[1]
	fmls	v5.2d, v25.2d, v4.2d[1]
	fmls	v7.2d, v25.2d, v6.2d[1]

	ldr		q25, [x8, #16] // E[2+4*2]
//	add		x8, x8, x9
	ins		v25.d[0], xzr
	fmls	v1.2d, v25.2d, v1.2d[0]
	fmls	v3.2d, v25.2d, v3.2d[0]
	fmls	v5.2d, v25.2d, v5.2d[0]
	fmls	v7.2d, v25.2d, v7.2d[0]

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsm_lln_one_4x4_lib)
#endif





// subroutine
//
// triangular substitution:
// side = right
// uplo = lower
// tran = transposed
// requires explicit inverse of diagonal
//
// input arguments:
// x8 <- E
// x9 <- lde
// x10 <- inv_diag_E
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_EDGE_TRSM_RLT_INV_4X4_LIB
#else
	.align 4
	FUN_START(inner_edge_trsm_rlt_inv_4x4_lib)
#endif

	ldr		d16, [x10, #0] // E_inv[0]
	fmul	v0.2d, v0.2d, v16.2d[0]
	fmul	v1.2d, v1.2d, v16.2d[0]
	ldr		d16, [x8, #8] // E[1+4*0]
	fmls	v2.2d, v0.2d, v16.2d[0]
	fmls	v3.2d, v1.2d, v16.2d[0]
	ldr		d16, [x8, #16] // E[2+4*0]
	fmls	v4.2d, v0.2d, v16.2d[0]
	fmls	v5.2d, v1.2d, v16.2d[0]
	ldr		d16, [x8, #24] // E[3+4*0]
	fmls	v6.2d, v0.2d, v16.2d[0]
	fmls	v7.2d, v1.2d, v16.2d[0]
	add		x8, x8, x9

	ldr		d16, [x10, #8] // E_inv[1]
	fmul	v2.2d, v2.2d, v16.2d[0]
	fmul	v3.2d, v3.2d, v16.2d[0]
	ldr		d16, [x8, #16] // E[2+4*1]
	fmls	v4.2d, v2.2d, v16.2d[0]
	fmls	v5.2d, v3.2d, v16.2d[0]
	ldr		d16, [x8, #24] // E[3+4*1]
	fmls	v6.2d, v2.2d, v16.2d[0]
	fmls	v7.2d, v3.2d, v16.2d[0]
	add		x8, x8, x9

	ldr		d16, [x10, #16] // E_inv[2]
	fmul	v4.2d, v4.2d, v16.2d[0]
	fmul	v5.2d, v5.2d, v16.2d[0]
	ldr		d16, [x8, #24] // E[3+4*2]
	fmls	v6.2d, v4.2d, v16.2d[0]
	fmls	v7.2d, v5.2d, v16.2d[0]
//	add		x8, x8, x9

	ldr		d16, [x10, #24] // E_inv[3]
	fmul	v6.2d, v6.2d, v16.2d[0]
	fmul	v7.2d, v7.2d, v16.2d[0]
//	add		x8, x8, x9

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_edge_trsm_rlt_inv_4x4_lib)
#endif





// subroutine
//
// triangular substitution:
// side = right
// uplo = lower
// tran = transposed
// requires explicit inverse of diagonal
//
// input arguments:
// x8 <- E
// w9 <- lde
// x10 <- inv_diag_E
// w11 <- n1
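//
// note: n1 is the number of columns of the 4x4 block actually computed; the
// routine solves the first n1 columns only and returns as soon as they are done
//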
3155// output arguments: 3156 3157#if MACRO_LEVEL>=1 3158 .macro INNER_EDGE_TRSM_RLT_INV_4X4_VS_LIB 3159#else 3160 .align 4 3161 FUN_START(inner_edge_trsm_rlt_inv_4x4_vs_lib) 3162#endif 3163 3164 // first column 3165 ldr d16, [x10, #0] // E_inv[0] 3166 fmul v0.2d, v0.2d, v16.2d[0] 3167 fmul v1.2d, v1.2d, v16.2d[0] 3168 cmp w11, #2 3169 blt 0f // return 3170 3171 // second column 3172 ldr d16, [x8, #8] // E[1+4*0] 3173 fmls v2.2d, v0.2d, v16.2d[0] 3174 fmls v3.2d, v1.2d, v16.2d[0] 3175 ldr d16, [x10, #8] // E_inv[1] 3176 fmul v2.2d, v2.2d, v16.2d[0] 3177 fmul v3.2d, v3.2d, v16.2d[0] 3178 cmp w11, #3 3179 blt 0f // return 3180 3181 // third column 3182 add x12, x8, x9 3183 ldr d16, [x8, #16] // E[2+4*0] 3184 fmls v4.2d, v0.2d, v16.2d[0] 3185 fmls v5.2d, v1.2d, v16.2d[0] 3186 ldr d16, [x12, #16] // E[2+4*1] 3187 fmls v4.2d, v2.2d, v16.2d[0] 3188 fmls v5.2d, v3.2d, v16.2d[0] 3189 ldr d16, [x10, #16] // E_inv[2] 3190 fmul v4.2d, v4.2d, v16.2d[0] 3191 fmul v5.2d, v5.2d, v16.2d[0] 3192 cmp w11, #4 3193 blt 0f // return 3194 3195 // forth column 3196 add x13, x12, x9 3197 ldr d16, [x8, #24] // E[3+4*0] 3198 fmls v6.2d, v0.2d, v16.2d[0] 3199 fmls v7.2d, v1.2d, v16.2d[0] 3200 ldr d16, [x12, #24] // E[3+4*1] 3201 fmls v6.2d, v2.2d, v16.2d[0] 3202 fmls v7.2d, v3.2d, v16.2d[0] 3203 ldr d16, [x13, #24] // E[3+4*2] 3204 fmls v6.2d, v4.2d, v16.2d[0] 3205 fmls v7.2d, v5.2d, v16.2d[0] 3206 ldr d16, [x10, #24] // E_inv[3] 3207 fmul v6.2d, v6.2d, v16.2d[0] 3208 fmul v7.2d, v7.2d, v16.2d[0] 3209 32100: 3211#if MACRO_LEVEL>=1 3212 .endm 3213#else 3214 ret 3215 3216 FUN_END(inner_edge_trsm_rlt_inv_4x4_vs_lib) 3217#endif 3218 3219 3220 3221 3222 3223// subroutine 3224// 3225// input arguments: 3226// 3227// output arguments: 3228 3229#if MACRO_LEVEL>=1 3230 .macro INNER_TRAN_4X4_LIB 3231#else 3232 .align 4 3233 FUN_START(inner_tran_4x4_lib) 3234#endif 3235 3236 trn1 v24.2d, v0.2d, v2.2d 3237 trn2 v2.2d, v0.2d, v2.2d 3238 trn1 v25.2d, v5.2d, v7.2d 3239 trn2 v7.2d, v5.2d, v7.2d 3240 trn1 v26.2d, v1.2d, v3.2d 3241 trn2 v27.2d, v1.2d, v3.2d 3242 trn1 v1.2d, v4.2d, v6.2d 3243 trn2 v3.2d, v4.2d, v6.2d 3244 mov v0.16b, v24.16b 3245 mov v5.16b, v25.16b 3246 mov v4.16b, v26.16b 3247 mov v6.16b, v27.16b 3248 3249#if MACRO_LEVEL>=1 3250 .endm 3251#else 3252 ret 3253 3254 FUN_END(inner_tran_4x4_lib) 3255#endif 3256 3257 3258 3259 3260 3261// subroutine 3262// 3263// input arguments: 3264// x8 <- alpha 3265// x9 <- beta 3266// x10 <- C 3267// x11 <- ldc*sizeof(double) 3268// 3269// output arguments: 3270 3271#if MACRO_LEVEL>=1 3272 .macro INNER_SCALE_AB_4X4_LIB 3273#else 3274 .align 4 3275 FUN_START(inner_scale_ab_4x4_lib) 3276#endif 3277 3278 ld1 {v28.2d}, [x8] 3279 3280 ld1 {v29.2d}, [x9] 3281 3282 fmul v0.2d, v0.2d, v28.2d[0] 3283 fmul v1.2d, v1.2d, v28.2d[0] 3284 fmul v2.2d, v2.2d, v28.2d[0] 3285 fmul v3.2d, v3.2d, v28.2d[0] 3286 fmul v4.2d, v4.2d, v28.2d[0] 3287 fmul v5.2d, v5.2d, v28.2d[0] 3288 fmul v6.2d, v6.2d, v28.2d[0] 3289 fmul v7.2d, v7.2d, v28.2d[0] 3290 3291 fcmpe d29, #0 3292 beq 0f 3293 3294 ldp q24, q25, [x10, #0] 3295 add x10, x10, x11 3296 ldp q26, q27, [x10, #0] 3297 add x10, x10, x11 3298 fmla v0.2d, v24.2d, v29.2d[0] 3299 fmla v1.2d, v25.2d, v29.2d[0] 3300 fmla v2.2d, v26.2d, v29.2d[0] 3301 fmla v3.2d, v27.2d, v29.2d[0] 3302 3303 ldp q24, q25, [x10, #0] 3304 add x10, x10, x11 3305 ldp q26, q27, [x10, #0] 3306 add x10, x10, x11 3307 fmla v4.2d, v24.2d, v29.2d[0] 3308 fmla v5.2d, v25.2d, v29.2d[0] 3309 fmla v6.2d, v26.2d, v29.2d[0] 3310 fmla v7.2d, v27.2d, v29.2d[0] 3311 33120: 3313 3314#if MACRO_LEVEL>=1 
3315 .endm 3316#else 3317 ret 3318 3319 FUN_END(inner_scale_ab_4x4_lib) 3320#endif 3321 3322 3323 3324 3325 3326// subroutine 3327// 3328// input arguments: 3329// x8 <- alpha 3330// x9 <- beta 3331// x10 <- C 3332// x11 <- ldc*sizeof(double) 3333// x12 <- km 3334// x13 <- kn 3335// 3336// output arguments: 3337 3338#if MACRO_LEVEL>=1 3339 .macro INNER_SCALE_AB_4X4_VS_LIB 3340#else 3341 .align 4 3342 FUN_START(inner_scale_ab_4x4_vs_lib) 3343#endif 3344 3345 ld1 {v28.2d}, [x8] 3346 3347 ld1 {v29.2d}, [x9] 3348 3349 fmul v0.2d, v0.2d, v28.2d[0] 3350 fmul v1.2d, v1.2d, v28.2d[0] 3351 fmul v2.2d, v2.2d, v28.2d[0] 3352 fmul v3.2d, v3.2d, v28.2d[0] 3353 fmul v4.2d, v4.2d, v28.2d[0] 3354 fmul v5.2d, v5.2d, v28.2d[0] 3355 fmul v6.2d, v6.2d, v28.2d[0] 3356 fmul v7.2d, v7.2d, v28.2d[0] 3357 3358 fcmpe d29, #0 3359 beq 0f 3360 3361 cmp w12, #4 3362 blt 1f 3363 3364 ldp q24, q25, [x10, #0] 3365 add x10, x10, x11 3366 fmla v0.2d, v24.2d, v29.2d[0] 3367 fmla v1.2d, v25.2d, v29.2d[0] 3368 3369 cmp w13, #1 3370 ble 0f 3371 3372 ldp q24, q25, [x10, #0] 3373 add x10, x10, x11 3374 fmla v2.2d, v24.2d, v29.2d[0] 3375 fmla v3.2d, v25.2d, v29.2d[0] 3376 3377 cmp w13, #2 3378 ble 0f 3379 3380 ldp q24, q25, [x10, #0] 3381 add x10, x10, x11 3382 fmla v4.2d, v24.2d, v29.2d[0] 3383 fmla v5.2d, v25.2d, v29.2d[0] 3384 3385 cmp w13, #3 3386 ble 0f 3387 3388 ldp q24, q25, [x10, #0] 3389 add x10, x10, x11 3390 fmla v6.2d, v24.2d, v29.2d[0] 3391 fmla v7.2d, v25.2d, v29.2d[0] 3392 3393 b 0f 3394 33951: 3396 cmp w12, #3 3397 blt 2f 3398 3399 ldr q24, [x10, #0] 3400 ldr d25, [x10, #16] 3401 add x10, x10, x11 3402 fmla v0.2d, v24.2d, v29.2d[0] 3403 fmla v1.2d, v25.2d, v29.2d[0] 3404 3405 cmp w13, #1 3406 ble 0f 3407 3408 ldr q24, [x10, #0] 3409 ldr d25, [x10, #16] 3410 add x10, x10, x11 3411 fmla v2.2d, v24.2d, v29.2d[0] 3412 fmla v3.2d, v25.2d, v29.2d[0] 3413 3414 cmp w13, #2 3415 ble 0f 3416 3417 ldr q24, [x10, #0] 3418 ldr d25, [x10, #16] 3419 add x10, x10, x11 3420 fmla v4.2d, v24.2d, v29.2d[0] 3421 fmla v5.2d, v25.2d, v29.2d[0] 3422 3423 cmp w13, #3 3424 ble 0f 3425 3426 ldr q24, [x10, #0] 3427 ldr d25, [x10, #16] 3428 add x10, x10, x11 3429 fmla v6.2d, v24.2d, v29.2d[0] 3430 fmla v7.2d, v25.2d, v29.2d[0] 3431 3432 b 0f 3433 34342: 3435 cmp w12, #2 3436 blt 3f 3437 3438 ldr q24, [x10, #0] 3439 add x10, x10, x11 3440 fmla v0.2d, v24.2d, v29.2d[0] 3441 3442 cmp w13, #1 3443 ble 0f 3444 3445 ldr q24, [x10, #0] 3446 add x10, x10, x11 3447 fmla v2.2d, v24.2d, v29.2d[0] 3448 3449 cmp w13, #2 3450 ble 0f 3451 3452 ldr q24, [x10, #0] 3453 add x10, x10, x11 3454 fmla v4.2d, v24.2d, v29.2d[0] 3455 3456 cmp w13, #3 3457 ble 0f 3458 3459 ldr q24, [x10, #0] 3460 add x10, x10, x11 3461 fmla v6.2d, v24.2d, v29.2d[0] 3462 3463 b 0f 3464 34653: 3466 cmp w12, #1 3467 blt 0f 3468 3469 ldr d24, [x10, #0] 3470 add x10, x10, x11 3471 fmla v0.2d, v24.2d, v29.2d[0] 3472 3473 cmp w13, #1 3474 ble 0f 3475 3476 ldr d24, [x10, #0] 3477 add x10, x10, x11 3478 fmla v2.2d, v24.2d, v29.2d[0] 3479 3480 cmp w13, #2 3481 ble 0f 3482 3483 ldr d24, [x10, #0] 3484 add x10, x10, x11 3485 fmla v4.2d, v24.2d, v29.2d[0] 3486 3487 cmp w13, #3 3488 ble 0f 3489 3490 ldr d24, [x10, #0] 3491 add x10, x10, x11 3492 fmla v6.2d, v24.2d, v29.2d[0] 3493 34940: 3495 3496#if MACRO_LEVEL>=1 3497 .endm 3498#else 3499 ret 3500 3501 FUN_END(inner_scale_ab_4x4_vs_lib) 3502#endif 3503 3504 3505 3506 3507 3508// subroutine 3509// 3510// input arguments: 3511// x8 <- beta 3512// x9 <- C 3513// x10 <- ldc 3514// 3515// output arguments: 3516 3517#if MACRO_LEVEL>=1 3518 .macro 
INNER_SCALE_M1B_4X4_LIB
#else
	.align 4
	FUN_START(inner_scale_m1b_4x4_lib)
#endif

	ld1		{v29.2d}, [x8]

	fneg	v0.2d, v0.2d
	fneg	v1.2d, v1.2d
	fneg	v2.2d, v2.2d
	fneg	v3.2d, v3.2d

	fneg	v4.2d, v4.2d
	fneg	v5.2d, v5.2d
	fneg	v6.2d, v6.2d
	fneg	v7.2d, v7.2d

	fcmpe	d29, #0
	beq		0f

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	ldp		q26, q27, [x9, #0]
	add		x9, x9, x10
	fmla	v0.2d, v24.2d, v29.2d[0]
	fmla	v1.2d, v25.2d, v29.2d[0]
	fmla	v2.2d, v26.2d, v29.2d[0]
	fmla	v3.2d, v27.2d, v29.2d[0]

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	ldp		q26, q27, [x9, #0]
	add		x9, x9, x10
	fmla	v4.2d, v24.2d, v29.2d[0]
	fmla	v5.2d, v25.2d, v29.2d[0]
	fmla	v6.2d, v26.2d, v29.2d[0]
	fmla	v7.2d, v27.2d, v29.2d[0]

0:

#if MACRO_LEVEL>=1
	.endm
#else
	ret

	FUN_END(inner_scale_m1b_4x4_lib)
#endif





// subroutine
//
// input arguments:
// x8 <- beta
// x9 <- C
// x10 <- ldc*sizeof(double)
// x11 <- km
// x12 <- kn
//
// output arguments:

#if MACRO_LEVEL>=1
	.macro INNER_SCALE_M1B_4X4_VS_LIB
#else
	.align 4
	FUN_START(inner_scale_m1b_4x4_vs_lib)
#endif

	ld1		{v29.2d}, [x8]

	fneg	v0.2d, v0.2d
	fneg	v1.2d, v1.2d
	fneg	v2.2d, v2.2d
	fneg	v3.2d, v3.2d

	fneg	v4.2d, v4.2d
	fneg	v5.2d, v5.2d
	fneg	v6.2d, v6.2d
	fneg	v7.2d, v7.2d

	fcmpe	d29, #0
	beq		0f

	cmp		w11, #4
	blt		1f

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	fmla	v0.2d, v24.2d, v29.2d[0]
	fmla	v1.2d, v25.2d, v29.2d[0]

	cmp		w12, #1
	ble		0f

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	fmla	v2.2d, v24.2d, v29.2d[0]
	fmla	v3.2d, v25.2d, v29.2d[0]

	cmp		w12, #2
	ble		0f

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	fmla	v4.2d, v24.2d, v29.2d[0]
	fmla	v5.2d, v25.2d, v29.2d[0]

	cmp		w12, #3
	ble		0f

	ldp		q24, q25, [x9, #0]
	add		x9, x9, x10
	fmla	v6.2d, v24.2d, v29.2d[0]
	fmla	v7.2d, v25.2d, v29.2d[0]

	b		0f

1:
	cmp		w11, #3
	blt		2f

	ldr		q24, [x9, #0]
	ldr		d25, [x9, #16]
	add		x9, x9, x10
	fmla	v0.2d, v24.2d, v29.2d[0]
	fmla	v1.2d, v25.2d, v29.2d[0]

	cmp		w12, #1
	ble		0f

	ldr		q24, [x9, #0]
	ldr		d25, [x9, #16]
	add		x9, x9, x10
	fmla	v2.2d, v24.2d, v29.2d[0]
	fmla	v3.2d, v25.2d, v29.2d[0]

	cmp		w12, #2
	ble		0f

	ldr		q24, [x9, #0]
	ldr		d25, [x9, #16]
	add		x9, x9, x10
	fmla	v4.2d, v24.2d, v29.2d[0]
	fmla	v5.2d, v25.2d, v29.2d[0]

	cmp		w12, #3
	ble		0f

	ldr		q24, [x9, #0]
	ldr		d25, [x9, #16]
	add		x9, x9, x10
	fmla	v6.2d, v24.2d, v29.2d[0]
	fmla	v7.2d, v25.2d, v29.2d[0]

	b		0f

2:
	cmp		w11, #2
	blt		3f

	ldr		q24, [x9, #0]
	add		x9, x9, x10
	fmla	v0.2d, v24.2d, v29.2d[0]

	cmp		w12, #1
	ble		0f

	ldr		q24, [x9, #0]
	add		x9, x9, x10
	fmla	v2.2d, v24.2d, v29.2d[0]

	cmp		w12, #2
	ble		0f

	ldr		q24, [x9, #0]
	add		x9, x9, x10
	fmla	v4.2d, v24.2d, v29.2d[0]

	cmp		w12, #3
	ble		0f

	ldr		q24, [x9, #0]
	add		x9, x9, x10
	fmla	v6.2d, v24.2d, v29.2d[0]

	b		0f

3:
	cmp		w11, #1
	blt		0f

	ldr		d24, [x9, #0]
	add		x9, x9, x10
	fmla
v0.2d, v24.2d, v29.2d[0] 3715 3716 cmp w12, #1 3717 ble 0f 3718 3719 ldr d24, [x9, #0] 3720 add x9, x9, x10 3721 fmla v2.2d, v24.2d, v29.2d[0] 3722 3723 cmp w12, #2 3724 ble 0f 3725 3726 ldr d24, [x9, #0] 3727 add x9, x9, x10 3728 fmla v4.2d, v24.2d, v29.2d[0] 3729 3730 cmp w12, #3 3731 ble 0f 3732 3733 ldr d24, [x9, #0] 3734 add x9, x9, x10 3735 fmla v6.2d, v24.2d, v29.2d[0] 3736 37370: 3738 3739#if MACRO_LEVEL>=1 3740 .endm 3741#else 3742 ret 3743 3744 FUN_END(inner_scale_m1b_4x4_vs_lib) 3745#endif 3746 3747 3748 3749 3750 3751// subroutine 3752// 3753// input arguments: 3754// x8 <- C 3755// x9 <- ldc*sizeof(double) 3756// 3757// output arguments: 3758 3759#if MACRO_LEVEL>=1 3760 .macro INNER_SCALE_M11_4X4_LIB 3761#else 3762 .align 4 3763 FUN_START(inner_scale_m11_4x4_lib) 3764#endif 3765 3766 ldp q24, q25, [x8, #0] 3767 add x8, x8, x9 3768 ldp q26, q27, [x8, #0] 3769 add x8, x8, x9 3770 fsub v0.2d, v24.2d, v0.2d 3771 fsub v1.2d, v25.2d, v1.2d 3772 fsub v2.2d, v26.2d, v2.2d 3773 fsub v3.2d, v27.2d, v3.2d 3774 3775 ldp q24, q25, [x8, #0] 3776 add x8, x8, x9 3777 ldp q26, q27, [x8, #0] 3778 add x8, x8, x9 3779 fsub v4.2d, v24.2d, v4.2d 3780 fsub v5.2d, v25.2d, v5.2d 3781 fsub v6.2d, v26.2d, v6.2d 3782 fsub v7.2d, v27.2d, v7.2d 3783 3784#if MACRO_LEVEL>=1 3785 .endm 3786#else 3787 ret 3788 3789 FUN_END(inner_scale_m11_4x4_lib) 3790#endif 3791 3792 3793 3794 3795 3796// subroutine 3797// 3798// input arguments: 3799// x8 <- C 3800// x9 <- ldc*sizeof(double) 3801// x10 <- km 3802// x11 <- kn 3803// 3804// output arguments: 3805 3806#if MACRO_LEVEL>=1 3807 .macro INNER_SCALE_M11_4X4_VS_LIB 3808#else 3809 .align 4 3810 FUN_START(inner_scale_m11_4x4_vs_lib) 3811#endif 3812 3813 cmp w10, #4 3814 blt 1f 3815 3816 ldp q24, q25, [x8, #0] 3817 add x8, x8, x9 3818 fsub v0.2d, v24.2d, v0.2d 3819 fsub v1.2d, v25.2d, v1.2d 3820 3821 cmp w11, #1 3822 ble 0f 3823 3824 ldp q24, q25, [x8, #0] 3825 add x8, x8, x9 3826 fsub v2.2d, v24.2d, v2.2d 3827 fsub v3.2d, v25.2d, v3.2d 3828 3829 cmp w11, #2 3830 ble 0f 3831 3832 ldp q24, q25, [x8, #0] 3833 add x8, x8, x9 3834 fsub v4.2d, v24.2d, v4.2d 3835 fsub v5.2d, v25.2d, v5.2d 3836 3837 cmp w11, #3 3838 ble 0f 3839 3840 ldp q24, q25, [x8, #0] 3841 add x8, x8, x9 3842 fsub v6.2d, v24.2d, v6.2d 3843 fsub v7.2d, v25.2d, v7.2d 3844 3845 b 0f 3846 38471: 3848 cmp w10, #3 3849 blt 2f 3850 3851 ldr q24, [x8, #0] 3852 ldr d25, [x8, #16] 3853 add x8, x8, x9 3854 fsub v0.2d, v24.2d, v0.2d 3855 fsub v1.2d, v25.2d, v1.2d 3856 3857 cmp w11, #1 3858 ble 0f 3859 3860 ldr q24, [x8, #0] 3861 ldr d25, [x8, #16] 3862 add x8, x8, x9 3863 fsub v2.2d, v24.2d, v2.2d 3864 fsub v3.2d, v25.2d, v3.2d 3865 3866 cmp w11, #2 3867 ble 0f 3868 3869 ldr q24, [x8, #0] 3870 ldr d25, [x8, #16] 3871 add x8, x8, x9 3872 fsub v4.2d, v24.2d, v4.2d 3873 fsub v5.2d, v25.2d, v5.2d 3874 3875 cmp w11, #3 3876 ble 0f 3877 3878 ldr q24, [x8, #0] 3879 ldr d25, [x8, #16] 3880 add x8, x8, x9 3881 fsub v6.2d, v24.2d, v6.2d 3882 fsub v7.2d, v25.2d, v7.2d 3883 3884 b 0f 3885 38862: 3887 cmp w10, #2 3888 blt 3f 3889 3890 ldr q24, [x8, #0] 3891 add x8, x8, x9 3892 fsub v0.2d, v24.2d, v0.2d 3893 3894 cmp w11, #1 3895 ble 0f 3896 3897 ldr q24, [x8, #0] 3898 add x8, x8, x9 3899 fsub v2.2d, v24.2d, v2.2d 3900 3901 cmp w11, #2 3902 ble 0f 3903 3904 ldr q24, [x8, #0] 3905 add x8, x8, x9 3906 fsub v4.2d, v24.2d, v4.2d 3907 3908 cmp w11, #3 3909 ble 0f 3910 3911 ldr q24, [x8, #0] 3912 add x8, x8, x9 3913 fsub v6.2d, v24.2d, v6.2d 3914 3915 b 0f 3916 39173: 3918 cmp w10, #1 3919 blt 0f 3920 3921 ldr d24, [x8, #0] 3922 add x8, 
x8, x9 3923 fsub v0.2d, v24.2d, v0.2d 3924 3925 cmp w11, #1 3926 ble 0f 3927 3928 ldr d24, [x8, #0] 3929 add x8, x8, x9 3930 fsub v2.2d, v24.2d, v2.2d 3931 3932 cmp w11, #2 3933 ble 0f 3934 3935 ldr d24, [x8, #0] 3936 add x8, x8, x9 3937 fsub v4.2d, v24.2d, v4.2d 3938 3939 cmp w11, #3 3940 ble 0f 3941 3942 ldr d24, [x8, #0] 3943 add x8, x8, x9 3944 fsub v6.2d, v24.2d, v6.2d 3945 39460: 3947 3948#if MACRO_LEVEL>=1 3949 .endm 3950#else 3951 ret 3952 3953 FUN_END(inner_scale_m11_4x4_vs_lib) 3954#endif 3955 3956 3957 3958 3959 3960// subroutine 3961// 3962// input arguments: 3963// x8 <- D 3964// x9 <- ldd*sizeof(double) 3965// 3966// output arguments: 3967 3968#if MACRO_LEVEL>=1 3969 .macro INNER_STORE_4X4_LIB 3970#else 3971 .align 4 3972 FUN_START(inner_store_4x4_lib) 3973#endif 3974 3975 stp q0, q1, [x8, #0] 3976 add x8, x8, x9 3977 stp q2, q3, [x8, #0] 3978 add x8, x8, x9 3979 stp q4, q5, [x8, #0] 3980 add x8, x8, x9 3981 stp q6, q7, [x8, #0] 3982 3983#if MACRO_LEVEL>=1 3984 .endm 3985#else 3986 ret 3987 3988 FUN_END(inner_store_4x4_lib) 3989#endif 3990 3991 3992 3993 3994 3995// subroutine 3996// 3997// input arguments: 3998// x8 <- D 3999// x9 <- ldd*sizeof(double) 4000// x10 <- km 4001// x11 <- kn 4002// 4003// output arguments: 4004 4005#if MACRO_LEVEL>=1 4006 .macro INNER_STORE_4X4_VS_LIB 4007#else 4008 .align 4 4009 FUN_START(inner_store_4x4_vs_lib) 4010#endif 4011 4012 cmp w10, #4 4013 bge 1f 4014 4015 mov x12, x8 4016 4017 ldp q24, q25, [x12, #0] 4018 add x12, x12, x9 4019 ldp q26, q27, [x12, #0] 4020 add x12, x12, x9 4021 ldp q28, q29, [x12, #0] 4022 add x12, x12, x9 4023 ldp q30, q31, [x12, #0] 4024 4025 // 4th row 4026 ins v1.d[1], v25.d[1] 4027 ins v3.d[1], v27.d[1] 4028 ins v5.d[1], v29.d[1] 4029 ins v7.d[1], v31.d[1] 4030 cmp w10, #3 4031 bge 1f 4032 // 3th row 4033 ins v1.d[0], v25.d[0] 4034 ins v3.d[0], v27.d[0] 4035 ins v5.d[0], v29.d[0] 4036 ins v7.d[0], v31.d[0] 4037 cmp w10, #2 4038 bge 1f 4039 // 2nd row 4040 ins v0.d[1], v24.d[1] 4041 ins v2.d[1], v26.d[1] 4042 ins v4.d[1], v28.d[1] 4043 ins v6.d[1], v30.d[1] 4044 cmp w10, #1 4045 bge 1f 4046 // 1st row 4047 ins v0.d[0], v24.d[0] 4048 ins v2.d[0], v26.d[0] 4049 ins v4.d[0], v28.d[0] 4050 ins v6.d[0], v30.d[0] 4051 40521: 4053 // 1st col 4054 stp q0, q1, [x8, #0] 4055 add x8, x8, x9 4056 cmp w11, #2 4057 blt 0f 4058 // 2nd col 4059 stp q2, q3, [x8, #0] 4060 add x8, x8, x9 4061 cmp w11, #3 4062 blt 0f 4063 // 3rd col 4064 stp q4, q5, [x8, #0] 4065 add x8, x8, x9 4066 cmp w11, #3 4067 beq 0f 4068 // 4th col 4069 stp q6, q7, [x8, #0] 4070 40710: 4072 4073#if MACRO_LEVEL>=1 4074 .endm 4075#else 4076 ret 4077 4078 FUN_END(inner_store_4x4_vs_lib) 4079#endif 4080 4081 4082 4083 4084 4085// subroutine 4086// 4087// input arguments: 4088// x8 <- D 4089// x9 <- ldd*sizeof(double) 4090// 4091// output arguments: 4092 4093#if MACRO_LEVEL>=1 4094 .macro INNER_STORE_L_4X4_LIB 4095#else 4096 .align 4 4097 FUN_START(inner_store_l_4x4_lib) 4098#endif 4099 4100 mov x12, x8 4101 4102 add x12, x12, x9 4103 ldr q16, [x12, #0] 4104 add x12, x12, x9 4105 add x12, x12, x9 4106 ldr q17, [x12, #16] 4107 4108 ins v2.d[0], v16.d[0] 4109 ins v7.d[0], v17.d[0] 4110 4111 stp q0, q1, [x8, #0] 4112 add x8, x8, x9 4113 stp q2, q3, [x8, #0] 4114 add x8, x8, x9 4115 str q5, [x8, #16] 4116 add x8, x8, x9 4117 str q7, [x8, #16] 4118 4119#if MACRO_LEVEL>=1 4120 .endm 4121#else 4122 ret 4123 4124 FUN_END(inner_store_l_4x4_lib) 4125#endif 4126 4127 4128 4129 4130 4131// subroutine 4132// 4133// input arguments: 4134// x8 <- D 4135// x9 <- ldd*sizeof(double) 
4136// x10 <- km 4137// x11 <- kn 4138// 4139// output arguments: 4140 4141#if MACRO_LEVEL>=1 4142 .macro INNER_STORE_L_4X4_VS_LIB 4143#else 4144 .align 4 4145 FUN_START(inner_store_l_4x4_vs_lib) 4146#endif 4147 4148 cmp w10, #4 4149 bge 1f 4150 4151 mov x12, x8 4152 4153 ldp q24, q25, [x12, #0] 4154 add x12, x12, x9 4155 ldp q26, q27, [x12, #0] 4156 add x12, x12, x9 4157 ldp q28, q29, [x12, #0] 4158 add x12, x12, x9 4159 ldp q30, q31, [x12, #0] 4160 4161 // 4th row 4162 ins v1.d[1], v25.d[1] 4163 ins v3.d[1], v27.d[1] 4164 ins v5.d[1], v29.d[1] 4165 ins v7.d[1], v31.d[1] 4166 cmp w10, #3 4167 bge 1f 4168 // 3th row 4169 ins v1.d[0], v25.d[0] 4170 ins v3.d[0], v27.d[0] 4171 ins v5.d[0], v29.d[0] 4172 ins v7.d[0], v31.d[0] 4173 cmp w10, #2 4174 bge 1f 4175 // 2nd row 4176 ins v0.d[1], v24.d[1] 4177 ins v2.d[1], v26.d[1] 4178 ins v4.d[1], v28.d[1] 4179 ins v6.d[1], v30.d[1] 4180 cmp w10, #1 4181 bge 1f 4182 // 1st row 4183 ins v0.d[0], v24.d[0] 4184 ins v2.d[0], v26.d[0] 4185 ins v4.d[0], v28.d[0] 4186 ins v6.d[0], v30.d[0] 4187 41881: 4189 mov x12, x8 4190 4191 add x12, x12, x9 4192 ldr q16, [x12, #0] 4193 add x12, x12, x9 4194 add x12, x12, x9 4195 ldr q17, [x12, #16] 4196 4197 ins v2.d[0], v16.d[0] 4198 ins v7.d[0], v17.d[0] 4199 4200 // 1st col 4201 stp q0, q1, [x8, #0] 4202 add x8, x8, x9 4203 cmp w11, #2 4204 blt 0f 4205 // 2nd col 4206 stp q2, q3, [x8, #0] 4207 add x8, x8, x9 4208 cmp w11, #3 4209 blt 0f 4210 // 3rd col 4211 str q5, [x8, #16] 4212 add x8, x8, x9 4213 beq 0f 4214 // 4th col 4215 str q7, [x8, #16] 4216 42170: 4218 4219#if MACRO_LEVEL>=1 4220 .endm 4221#else 4222 ret 4223 4224 FUN_END(inner_store_l_4x4_vs_lib) 4225#endif 4226 4227 4228 4229 4230 4231// subroutine 4232// 4233// input arguments: 4234// x8 <- D 4235// x9 <- ldd*sizeof(double) 4236// 4237// output arguments: 4238 4239#if MACRO_LEVEL>=1 4240 .macro INNER_STORE_U_4X4_LIB 4241#else 4242 .align 4 4243 FUN_START(inner_store_u_4x4_lib) 4244#endif 4245 4246 str d0, [x8, #0] 4247 add x8, x8, x9 4248 str q2, [x8, #0] 4249 add x8, x8, x9 4250 str q4, [x8, #0] 4251 str d5, [x8, #16] 4252 add x8, x8, x9 4253 stp q6, q7, [x8, #0] 4254 4255#if MACRO_LEVEL>=1 4256 .endm 4257#else 4258 ret 4259 4260 FUN_END(inner_store_u_4x4_lib) 4261#endif 4262 4263 4264 4265 4266 4267// subroutine 4268// 4269// input arguments: 4270// x8 <- D 4271// x9 <- ldd*sizeof(double) 4272// x10 <- km 4273// x11 <- kn 4274// 4275// output arguments: 4276 4277#if MACRO_LEVEL>=1 4278 .macro INNER_STORE_U_4X4_VS_LIB 4279#else 4280 .align 4 4281 FUN_START(inner_store_u_4x4_vs_lib) 4282#endif 4283 4284 cmp w10, #4 4285 bge 1f 4286 4287 mov x12, x8 4288 4289 ldp q24, q25, [x12, #0] 4290 add x12, x12, x9 4291 ldp q26, q27, [x12, #0] 4292 add x12, x12, x9 4293 ldp q28, q29, [x12, #0] 4294 add x12, x12, x9 4295 ldp q30, q31, [x12, #0] 4296 4297 // 4th row 4298 ins v1.d[1], v25.d[1] 4299 ins v3.d[1], v27.d[1] 4300 ins v5.d[1], v29.d[1] 4301 ins v7.d[1], v31.d[1] 4302 cmp w10, #3 4303 bge 1f 4304 // 3th row 4305 ins v1.d[0], v25.d[0] 4306 ins v3.d[0], v27.d[0] 4307 ins v5.d[0], v29.d[0] 4308 ins v7.d[0], v31.d[0] 4309 cmp w10, #2 4310 bge 1f 4311 // 2nd row 4312 ins v0.d[1], v24.d[1] 4313 ins v2.d[1], v26.d[1] 4314 ins v4.d[1], v28.d[1] 4315 ins v6.d[1], v30.d[1] 4316 cmp w10, #1 4317 bge 1f 4318 // 1st row 4319 ins v0.d[0], v24.d[0] 4320 ins v2.d[0], v26.d[0] 4321 ins v4.d[0], v28.d[0] 4322 ins v6.d[0], v30.d[0] 4323 43241: 4325 // 1st col 4326 str d0, [x8, #0] 4327 add x8, x8, x9 4328 cmp w11, #2 4329 blt 0f 4330 // 2nd col 4331 str q2, [x8, #0] 4332 add x8, 
x8, x9 4333 cmp w11, #3 4334 blt 0f 4335 // 3rd col 4336 str q4, [x8, #0] 4337 str d5, [x8, #16] 4338 add x8, x8, x9 4339 beq 0f 4340 // 4th col 4341 stp q6, q7, [x8, #0] 4342 43430: 4344 4345#if MACRO_LEVEL>=1 4346 .endm 4347#else 4348 ret 4349 4350 FUN_END(inner_store_u_4x4_vs_lib) 4351#endif 4352 4353 4354 4355 4356 4357// subroutine 4358// 4359// input arguments: 4360// x8 <- D 4361// x9 <- ldd*sizeof(double) 4362// 4363// output arguments: 4364 4365#if MACRO_LEVEL>=1 4366 .macro INNER_PREFETCH_4X4_LIB 4367#else 4368 .align 4 4369 FUN_START(inner_prefetch_4x4_lib) 4370#endif 4371 4372 prfm PLDL1KEEP, [x8, #0] 4373 add x8, x8, x9 4374 prfm PLDL1KEEP, [x8, #0] 4375 add x8, x8, x9 4376 prfm PLDL1KEEP, [x8, #0] 4377 add x8, x8, x9 4378 prfm PLDL1KEEP, [x8, #0] 4379 4380#if MACRO_LEVEL>=1 4381 .endm 4382#else 4383 ret 4384 4385 FUN_END(inner_prefetch_4x4_lib) 4386#endif 4387 4388 4389 4390 4391 4392// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 4393// void kernel_dgemm_nt_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd) 4394 4395 .align 4 4396 GLOB_FUN_START(kernel_dgemm_nt_4x4_lib44cc) 4397 4398 4399 4400 PROLOGUE 4401 4402 4403 4404 ZERO_ACC 4405 4406 4407 4408 // call inner kernel gemm nt 4409 mov w8, w0 // kmax 4410 mov x9, x2 // A 4411 mov x10, x3 // B 4412 4413#if MACRO_LEVEL>=2 4414 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 4415#else 4416 bl inner_kernel_gemm_add_nt_4x4_lib4 4417#endif 4418 4419 4420 4421 // prefetch 4422 mov x8, x7 // D 4423 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 4424 lsl w9, w9, #3 // 8*ldd 4425 4426#if MACRO_LEVEL>=1 4427 INNER_PREFETCH_4X4_LIB 4428#else 4429 bl inner_prefetch_4x4_lib 4430#endif 4431 4432 4433 4434 // call inner blend for generic alpha and beta 4435 mov x8, x1 // alpha 4436 mov x9, x4 // beta 4437 mov x10, x5 // C 4438 mov w11, w6 // ldc 4439 lsl w11, w11, #3 // 8*ldc 4440 4441#if MACRO_LEVEL>=1 4442 INNER_SCALE_AB_4X4_LIB 4443#else 4444 bl inner_scale_ab_4x4_lib 4445#endif 4446 4447 4448 4449 // store n 4450 mov x8, x7 // D 4451 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 4452 lsl w9, w9, #3 // 8*ldd 4453 4454#if MACRO_LEVEL>=1 4455 INNER_STORE_4X4_LIB 4456#else 4457 bl inner_store_4x4_lib 4458#endif 4459 4460 4461 4462 EPILOGUE 4463 4464 mov x0, #0 4465 4466 ret 4467 4468 FUN_END(kernel_dgemm_nt_4x4_lib44cc) 4469 4470 4471 4472 4473 4474// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 4475// void kernel_dgemm_nt_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1) 4476 4477 .align 4 4478 GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib44cc) 4479 4480 4481 4482 PROLOGUE 4483 4484 4485 4486 ZERO_ACC 4487 4488 4489 4490 // call inner kernel gemm nt 4491 mov w8, w0 // kmax 4492 mov x9, x2 // A 4493 mov x10, x3 // B 4494 4495#if MACRO_LEVEL>=2 4496 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 4497#else 4498 bl inner_kernel_gemm_add_nt_4x4_lib4 4499#endif 4500 4501 4502 4503 // call inner blend for generic alpha and beta 4504 mov x8, x1 // alpha 4505 mov x9, x4 // beta 4506 mov x10, x5 // C 4507 mov w11, w6 // ldc 4508 lsl w11, w11, #3 // 8*ldc 4509 ldr w12, [sp, #(STACKSIZE + 8)] // m1 4510 ldr w13, [sp, #(STACKSIZE + 16)] // n1 4511 4512#if MACRO_LEVEL>=1 4513 INNER_SCALE_AB_4X4_VS_LIB 4514#else 4515 bl inner_scale_ab_4x4_vs_lib 4516#endif 4517 4518 4519 4520 // store n 4521 mov x8, x7 // D 4522 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 4523 lsl w9, w9, #3 // 8*ldd 4524 ldr w10, [sp, #(STACKSIZE + 8)] // m1 4525 ldr w11, [sp, #(STACKSIZE + 16)] // n1 4526 4527#if 
MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_vs_lib44cc)





// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8
// void kernel_dgemm_nt_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd)

	.align 4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_lib4ccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x2 // A
	mov		x10, x3 // B
	mov		w11, w4 // ldb
	lsl		w11, w11, #3 // 8*ldb

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4c
#endif



	// prefetch
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_PREFETCH_4X4_LIB
#else
	bl inner_prefetch_4x4_lib
#endif



	// call inner blend for generic alpha and beta
	mov		x8, x1 // alpha
	mov		x9, x5 // beta
	mov		x10, x6 // C
	mov		w11, w7 // ldc
	lsl		w11, w11, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	bl inner_scale_ab_4x4_lib
#endif



	// store n
	ldr		x8, [sp, #(STACKSIZE + 0)] // D
	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_lib4ccc)





// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 sp+24
// void kernel_dgemm_nt_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_lib4ccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x2 // A
	mov		x10, x3 // B
	mov		w11, w4 // ldb
	lsl		w11, w11, #3 // 8*ldb

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #1
	bgt		100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x1_lib4c
#endif

	b		103f

100:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #2
	bgt		101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x2_lib4c
#endif

	b		103f

101:

	ldr		w12, [sp, #(STACKSIZE + 24)] // n1
	cmp		w12, #3
	bgt		102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x3_lib4c
#endif

	b		103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4c
#endif

103:



	// prefetch
	// TODO prefetch vs
//	ldr		x8, [sp, #(STACKSIZE + 0)] // D
//	ldr		w9, [sp, #(STACKSIZE + 8)] // ldd
//	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
//	INNER_PREFETCH_4X4_LIB
#else
//	bl inner_prefetch_4x4_lib
#endif



	// call inner blend for generic alpha and beta
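	// (C-like sketch of this blend step, illustrative only and not part of
	// the build: inner_scale_ab_4x4_vs_lib computes, on the 4x4 accumulator
	// acc held in v0..v7,
	//
	//     for(jj=0; jj<n1; jj++)
	//         for(ii=0; ii<m1; ii++)
	//             acc[ii][jj] = alpha[0]*acc[ii][jj] + beta[0]*C[ii+ldc*jj];
	//
	// where the C term is skipped entirely if *beta compares equal to 0.0)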
mov x8, x1 // alpha 4723 mov x9, x5 // beta 4724 mov x10, x6 // C 4725 mov w11, w7 // ldc 4726 lsl w11, w11, #3 // 8*ldc 4727 ldr w12, [sp, #(STACKSIZE + 16)] // m1 4728 ldr w13, [sp, #(STACKSIZE + 24)] // n1 4729 4730#if MACRO_LEVEL>=1 4731 INNER_SCALE_AB_4X4_VS_LIB 4732#else 4733 bl inner_scale_ab_4x4_vs_lib 4734#endif 4735 4736 4737 4738 // store n 4739 ldr x8, [sp, #(STACKSIZE + 0)] // D 4740 ldr w9, [sp, #(STACKSIZE + 8)] // ldd 4741 lsl w9, w9, #3 // 8*ldd 4742 ldr w10, [sp, #(STACKSIZE + 16)] // m1 4743 ldr w11, [sp, #(STACKSIZE + 24)] // n1 4744 4745#if MACRO_LEVEL>=1 4746 INNER_STORE_4X4_VS_LIB 4747#else 4748 bl inner_store_4x4_vs_lib 4749#endif 4750 4751 4752 4753 EPILOGUE 4754 4755 mov x0, #0 4756 4757 ret 4758 4759 FUN_END(kernel_dgemm_nt_4x4_vs_lib4ccc) 4760 4761 4762 4763 4764 4765 4766// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 4767// void kernel_dgemm_nn_4x4_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd) 4768 4769 .align 4 4770 GLOB_FUN_START(kernel_dgemm_nn_4x4_lib4ccc) 4771 4772 4773 4774 PROLOGUE 4775 4776 4777 4778 ZERO_ACC 4779 4780 4781 4782 // call inner kernel gemm nt 4783 mov w8, w0 // kmax 4784 mov x9, x2 // A 4785 mov x10, x3 // B 4786 mov w11, w4 // ldb 4787 lsl w11, w11, #3 // 8*ldb 4788 4789#if MACRO_LEVEL>=2 4790 INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C 4791#else 4792 bl inner_kernel_gemm_add_nn_4x4_lib4c 4793#endif 4794 4795 4796 4797 // prefetch 4798 ldr x8, [sp, #(STACKSIZE + 0)] // D 4799 ldr w9, [sp, #(STACKSIZE + 8)] // ldd 4800 lsl w9, w9, #3 // 8*sdd 4801 4802#if MACRO_LEVEL>=1 4803 INNER_PREFETCH_4X4_LIB 4804#else 4805 bl inner_prefetch_4x4_lib 4806#endif 4807 4808 4809 4810 // call inner blend for generic alpha and beta 4811 mov x8, x1 // alpha 4812 mov x9, x5 // beta 4813 mov x10, x6 // C 4814 mov w11, w7 // ldc 4815 lsl w11, w11, #3 // 8*ldc 4816 4817#if MACRO_LEVEL>=1 4818 INNER_SCALE_AB_4X4_LIB 4819#else 4820 bl inner_scale_ab_4x4_lib 4821#endif 4822 4823 4824 4825 // store n 4826 ldr x8, [sp, #(STACKSIZE + 0)] // D 4827 ldr w9, [sp, #(STACKSIZE + 8)] // ldd 4828 lsl w9, w9, #3 // 8*ldd 4829 4830#if MACRO_LEVEL>=1 4831 INNER_STORE_4X4_LIB 4832#else 4833 bl inner_store_4x4_lib 4834#endif 4835 4836 4837 4838 EPILOGUE 4839 4840 mov x0, #0 4841 4842 ret 4843 4844 FUN_END(kernel_dgemm_nn_4x4_lib4ccc) 4845 4846 4847 4848 4849 4850// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 sp+24 4851// void kernel_dgemm_nn_4x4_vs_lib4ccc(int kmax, double *alpha, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1) 4852 4853 .align 4 4854 GLOB_FUN_START(kernel_dgemm_nn_4x4_vs_lib4ccc) 4855 4856 4857 4858 PROLOGUE 4859 4860 4861 4862 ZERO_ACC 4863 4864 4865 4866 // call inner kernel gemm nt 4867 mov w8, w0 // kmax 4868 mov x9, x2 // A 4869 mov x10, x3 // B 4870 mov w11, w4 // ldb 4871 lsl w11, w11, #3 // 8*ldb 4872 4873 ldr w12, [sp, #(STACKSIZE + 24)] // n1 4874 cmp w12, #1 4875 bgt 100f 4876 4877#if MACRO_LEVEL>=2 4878 INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C 4879#else 4880 bl inner_kernel_gemm_add_nn_4x1_lib4c 4881#endif 4882 4883 b 103f 4884 4885100: 4886 4887 ldr w12, [sp, #(STACKSIZE + 24)] // n1 4888 cmp w12, #2 4889 bgt 101f 4890 4891#if MACRO_LEVEL>=2 4892 INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C 4893#else 4894 bl inner_kernel_gemm_add_nn_4x2_lib4c 4895#endif 4896 4897 b 103f 4898 4899101: 4900 4901 ldr w12, [sp, #(STACKSIZE + 24)] // n1 4902 cmp w12, #3 4903 bgt 102f 4904 4905#if MACRO_LEVEL>=2 4906 INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C 4907#else 4908 bl 
inner_kernel_gemm_add_nn_4x3_lib4c 4909#endif 4910 4911 b 103f 4912 4913102: 4914 4915#if MACRO_LEVEL>=2 4916 INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C 4917#else 4918 bl inner_kernel_gemm_add_nn_4x4_lib4c 4919#endif 4920 4921103: 4922 4923 4924 4925 // prefetch 4926 // TODO prefethc vs 4927// ldr x8, [sp, #(STACKSIZE + 0)] // D 4928// ldr w9, [sp, #(STACKSIZE + 8)] // ldd 4929// lsl w9, w9, #3 // 8*sdd 4930 4931#if MACRO_LEVEL>=1 4932// INNER_PREFETCH_4X4_LIB 4933#else 4934// bl inner_prefetch_4x4_lib 4935#endif 4936 4937 4938 4939 // call inner blend for generic alpha and beta 4940 mov x8, x1 // alpha 4941 mov x9, x5 // beta 4942 mov x10, x6 // C 4943 mov w11, w7 // ldc 4944 lsl w11, w11, #3 // 8*ldc 4945 ldr w12, [sp, #(STACKSIZE + 16)] // m1 4946 ldr w13, [sp, #(STACKSIZE + 24)] // n1 4947 4948#if MACRO_LEVEL>=1 4949 INNER_SCALE_AB_4X4_VS_LIB 4950#else 4951 bl inner_scale_ab_4x4_vs_lib 4952#endif 4953 4954 4955 4956 // store n 4957 ldr x8, [sp, #(STACKSIZE + 0)] // D 4958 ldr w9, [sp, #(STACKSIZE + 8)] // ldd 4959 lsl w9, w9, #3 // 8*ldd 4960 ldr w10, [sp, #(STACKSIZE + 16)] // m1 4961 ldr w11, [sp, #(STACKSIZE + 24)] // n1 4962 4963#if MACRO_LEVEL>=1 4964 INNER_STORE_4X4_VS_LIB 4965#else 4966 bl inner_store_4x4_vs_lib 4967#endif 4968 4969 4970 4971 EPILOGUE 4972 4973 mov x0, #0 4974 4975 ret 4976 4977 FUN_END(kernel_dgemm_nn_4x4_vs_lib4ccc) 4978 4979 4980 4981 4982 4983// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 4984// void kernel_dsyrk_nt_l_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd) 4985 4986 .align 4 4987 GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_lib44cc) 4988 4989 4990 4991 PROLOGUE 4992 4993 4994 4995 ZERO_ACC 4996 4997 4998 4999 // call inner kernel gemm nt 5000 mov w8, w0 // kmax 5001 mov x9, x2 // A 5002 mov x10, x3 // B 5003 5004#if MACRO_LEVEL>=2 5005 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5006#else 5007 bl inner_kernel_gemm_add_nt_4x4_lib4 5008#endif 5009 5010 5011 5012 // call inner blend for generic alpha and beta 5013 mov x8, x1 // alpha 5014 mov x9, x4 // beta 5015 mov x10, x5 // C 5016 mov w11, w6 // ldc 5017 lsl w11, w11, #3 // 8*ldc 5018 5019#if MACRO_LEVEL>=1 5020 INNER_SCALE_AB_4X4_LIB 5021#else 5022 bl inner_scale_ab_4x4_lib 5023#endif 5024 5025 5026 5027 // store n 5028 mov x8, x7 // D 5029 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 5030 lsl w9, w9, #3 // 8*ldd 5031 5032#if MACRO_LEVEL>=1 5033 INNER_STORE_L_4X4_LIB 5034#else 5035 bl inner_store_l_4x4_lib 5036#endif 5037 5038 5039 5040 EPILOGUE 5041 5042 mov x0, #0 5043 5044 ret 5045 5046 FUN_END(kernel_dsyrk_nt_l_4x4_lib44cc) 5047 5048 5049 5050 5051 5052// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 5053// void kernel_dsyrk_nt_l_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1) 5054 5055 .align 4 5056 GLOB_FUN_START(kernel_dsyrk_nt_l_4x4_vs_lib44cc) 5057 5058 5059 5060 PROLOGUE 5061 5062 5063 5064 ZERO_ACC 5065 5066 5067 5068 // call inner kernel gemm nt 5069 mov w8, w0 // kmax 5070 mov x9, x2 // A 5071 mov x10, x3 // B 5072 5073#if MACRO_LEVEL>=2 5074 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5075#else 5076 bl inner_kernel_gemm_add_nt_4x4_lib4 5077#endif 5078 5079 5080 5081 // call inner blend for generic alpha and beta 5082 mov x8, x1 // alpha 5083 mov x9, x4 // beta 5084 mov x10, x5 // C 5085 mov w11, w6 // ldc 5086 lsl w11, w11, #3 // 8*ldc 5087 ldr w12, [sp, #(STACKSIZE + 8)] // m1 5088 ldr w13, [sp, #(STACKSIZE + 16)] // n1 5089 5090#if MACRO_LEVEL>=1 5091 INNER_SCALE_AB_4X4_VS_LIB 5092#else 5093 bl 
inner_scale_ab_4x4_vs_lib 5094#endif 5095 5096 5097 5098 // store n 5099 mov x8, x7 // D 5100 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 5101 lsl w9, w9, #3 // 8*ldd 5102 ldr w10, [sp, #(STACKSIZE + 8)] // m1 5103 ldr w11, [sp, #(STACKSIZE + 16)] // n1 5104 5105#if MACRO_LEVEL>=1 5106 INNER_STORE_L_4X4_VS_LIB 5107#else 5108 bl inner_store_l_4x4_vs_lib 5109#endif 5110 5111 5112 5113 EPILOGUE 5114 5115 mov x0, #0 5116 5117 ret 5118 5119 FUN_END(kernel_dsyrk_nt_l_4x4_vs_lib44cc) 5120 5121 5122 5123 5124 5125 5126// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 5127// void kernel_dsyrk_nt_u_4x4_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd) 5128 5129 .align 4 5130 GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_lib44cc) 5131 5132 5133 5134 PROLOGUE 5135 5136 5137 5138 ZERO_ACC 5139 5140 5141 5142 // call inner kernel gemm nt 5143 mov w8, w0 // kmax 5144 mov x9, x2 // A 5145 mov x10, x3 // B 5146 5147#if MACRO_LEVEL>=2 5148 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5149#else 5150 bl inner_kernel_gemm_add_nt_4x4_lib4 5151#endif 5152 5153 5154 5155 // call inner blend for generic alpha and beta 5156 mov x8, x1 // alpha 5157 mov x9, x4 // beta 5158 mov x10, x5 // C 5159 mov w11, w6 // ldc 5160 lsl w11, w11, #3 // 8*ldc 5161 5162#if MACRO_LEVEL>=1 5163 INNER_SCALE_AB_4X4_LIB 5164#else 5165 bl inner_scale_ab_4x4_lib 5166#endif 5167 5168 5169 5170 // store n 5171 mov x8, x7 // D 5172 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 5173 lsl w9, w9, #3 // 8*ldd 5174 5175#if MACRO_LEVEL>=1 5176 INNER_STORE_U_4X4_LIB 5177#else 5178 bl inner_store_u_4x4_lib 5179#endif 5180 5181 5182 5183 EPILOGUE 5184 5185 mov x0, #0 5186 5187 ret 5188 5189 FUN_END(kernel_dsyrk_nt_u_4x4_lib44cc) 5190 5191 5192 5193 5194 5195// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 5196// void kernel_dsyrk_nt_u_4x4_vs_lib44cc(int kmax, double *alpha, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1) 5197 5198 .align 4 5199 GLOB_FUN_START(kernel_dsyrk_nt_u_4x4_vs_lib44cc) 5200 5201 5202 5203 PROLOGUE 5204 5205 5206 5207 ZERO_ACC 5208 5209 5210 5211 // call inner kernel gemm nt 5212 mov w8, w0 // kmax 5213 mov x9, x2 // A 5214 mov x10, x3 // B 5215 5216#if MACRO_LEVEL>=2 5217 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5218#else 5219 bl inner_kernel_gemm_add_nt_4x4_lib4 5220#endif 5221 5222 5223 5224 // call inner blend for generic alpha and beta 5225 mov x8, x1 // alpha 5226 mov x9, x4 // beta 5227 mov x10, x5 // C 5228 mov w11, w6 // ldc 5229 lsl w11, w11, #3 // 8*ldc 5230 ldr w12, [sp, #(STACKSIZE + 8)] // m1 5231 ldr w13, [sp, #(STACKSIZE + 16)] // n1 5232 5233#if MACRO_LEVEL>=1 5234 INNER_SCALE_AB_4X4_VS_LIB 5235#else 5236 bl inner_scale_ab_4x4_vs_lib 5237#endif 5238 5239 5240 5241 // store n 5242 mov x8, x7 // D 5243 ldr w9, [sp, #(STACKSIZE + 0)] // ldd 5244 lsl w9, w9, #3 // 8*ldd 5245 ldr w10, [sp, #(STACKSIZE + 8)] // m1 5246 ldr w11, [sp, #(STACKSIZE + 16)] // n1 5247 5248#if MACRO_LEVEL>=1 5249 INNER_STORE_U_4X4_VS_LIB 5250#else 5251 bl inner_store_u_4x4_vs_lib 5252#endif 5253 5254 5255 5256 EPILOGUE 5257 5258 mov x0, #0 5259 5260 ret 5261 5262 FUN_END(kernel_dsyrk_nt_u_4x4_vs_lib44cc) 5263 5264 5265 5266 5267 5268// w0 x1 x2 x3 w4 x5 w6 x7 sp+0 sp+8 5269// void kernel_dtrsm_nt_rl_inv_4x4_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E) 5270 5271 .align 4 5272 GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc) 5273 5274 5275 5276 PROLOGUE 5277 5278 5279 5280 ZERO_ACC 5281 5282 5283 5284 // call inner kernel gemm 
nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_LIB
#else
	bl inner_scale_m11_4x4_lib
#endif



	// solution
	mov		x8, x7 // E
	ldr		w9, [sp, #(STACKSIZE + 0)] // lde
	lsl		w9, w9, #3 // 8*lde
	ldr		x10, [sp, #(STACKSIZE + 8)] // inv_diag_E

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_LIB
#else
	bl inner_edge_trsm_rlt_inv_4x4_lib
#endif



	// store l
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_lib44ccc)





// w0 x1 x2 x3 w4 x5 w6 x7 sp+0 sp+8 sp+16 sp+24
// void kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *E, int lde, double *inv_diag_E, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_VS_LIB
#else
	bl inner_scale_m11_4x4_vs_lib
#endif



	// solution
	mov		x8, x7 // E
	ldr		w9, [sp, #(STACKSIZE + 0)] // lde
	lsl		w9, w9, #3 // 8*lde
	ldr		x10, [sp, #(STACKSIZE + 8)] // inv_diag_E
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_RLT_INV_4X4_VS_LIB
#else
	bl inner_edge_trsm_rlt_inv_4x4_vs_lib
#endif



	// store l
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 16)] // m1
	ldr		w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nt_rl_inv_4x4_vs_lib44ccc)





// w0 x1 x2 x3 x4 w5 x6 w7 sp+0
// void kernel_dtrsm_nt_rl_one_4x4_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E)

	.align 4
	GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_lib44cc4)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=1.0
	mov		x8, x3 // beta
	mov		x9, x4 // C
	mov		w10, w5 // ldc
	lsl
w10, w10, #3 // 8*ldc 5469 5470#if MACRO_LEVEL>=1 5471 INNER_SCALE_M1B_4X4_LIB 5472#else 5473 bl inner_scale_m1b_4x4_lib 5474#endif 5475 5476 5477 5478 // solution 5479 ldr x8, [sp, #(STACKSIZE + 0)] // E 5480 5481#if MACRO_LEVEL>=1 5482 INNER_EDGE_TRSM_RLT_ONE_4X4_LIB4 5483#else 5484 bl inner_edge_trsm_rlt_one_4x4_lib4 5485#endif 5486 5487 5488 5489 // store l 5490 mov x8, x6 // D 5491 mov w9, w7 // ldd 5492 lsl w9, w9, #3 // 8*ldd 5493 5494#if MACRO_LEVEL>=1 5495 INNER_STORE_4X4_LIB 5496#else 5497 bl inner_store_4x4_lib 5498#endif 5499 5500 5501 5502 EPILOGUE 5503 5504 mov x0, #0 5505 5506 ret 5507 5508 FUN_END(kernel_dtrsm_nt_rl_one_4x4_lib44cc4) 5509 5510 5511 5512 5513 5514// w0 x1 x2 x3 x4 w5 x6 w7 sp+0 sp+8 sp+16 5515// void kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4(int kmax, double *A, double *B, double *beta, double *C, int ldc, double *D, int ldd, double *E, int m1, int n1) 5516 5517 .align 4 5518 GLOB_FUN_START(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4) 5519 5520 5521 5522 PROLOGUE 5523 5524 5525 5526 ZERO_ACC 5527 5528 5529 5530 // call inner kernel gemm nt 5531 mov w8, w0 // kmax 5532 mov x9, x1 // A 5533 mov x10, x2 // B 5534 5535#if MACRO_LEVEL>=2 5536 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5537#else 5538 bl inner_kernel_gemm_add_nt_4x4_lib4 5539#endif 5540 5541 5542 5543 // call inner blend for alpha=1.0 5544 mov x8, x3 // beta 5545 mov x9, x4 // C 5546 mov w10, w5 // ldc 5547 lsl w10, w10, #3 // 8*ldc 5548 ldr w11, [sp, #(STACKSIZE + 8)] // m1 5549 ldr w12, [sp, #(STACKSIZE + 16)] // n1 5550 5551#if MACRO_LEVEL>=1 5552 INNER_SCALE_M1B_4X4_VS_LIB 5553#else 5554 bl inner_scale_m1b_4x4_vs_lib 5555#endif 5556 5557 5558 5559 // solution 5560 ldr x8, [sp, #(STACKSIZE + 0)] // E 5561 ldr w9, [sp, #(STACKSIZE + 16)] // n1 5562 5563#if MACRO_LEVEL>=1 5564 INNER_EDGE_TRSM_RLT_ONE_4X4_VS_LIB4 5565#else 5566 bl inner_edge_trsm_rlt_one_4x4_vs_lib4 5567#endif 5568 5569 5570 5571 // store l 5572 mov x8, x6 // D 5573 mov w9, w7 // ldd 5574 lsl w9, w9, #3 // 8*ldd 5575 ldr w10, [sp, #(STACKSIZE + 8)] // m1 5576 ldr w11, [sp, #(STACKSIZE + 16)] // n1 5577 5578#if MACRO_LEVEL>=1 5579 INNER_STORE_4X4_VS_LIB 5580#else 5581 bl inner_store_4x4_vs_lib 5582#endif 5583 5584 5585 5586 EPILOGUE 5587 5588 mov x0, #0 5589 5590 ret 5591 5592 FUN_END(kernel_dtrsm_nt_rl_one_4x4_vs_lib44cc4) 5593 5594 5595 5596 5597 5598// w0 x1 x2 x3 w4 x5 w6 x7 5599// void kernel_dpotrf_nt_l_4x4_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D) 5600 5601 .align 4 5602 GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_lib44cc) 5603 5604 5605 5606 PROLOGUE 5607 5608 5609 5610 ZERO_ACC 5611 5612 5613 5614 // call inner kernel gemm nt 5615 mov w8, w0 // kmax 5616 mov x9, x1 // A 5617 mov x10, x2 // B 5618 5619#if MACRO_LEVEL>=2 5620 INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4 5621#else 5622 bl inner_kernel_gemm_add_nt_4x4_lib4 5623#endif 5624 5625 5626 5627 // call inner blend for alpha=1.0 and beta=1.0 5628 mov x8, x3 // C 5629 mov w9, w4 // ldc 5630 lsl w9, w9, #3 // 8*ldc 5631 5632#if MACRO_LEVEL>=1 5633 INNER_SCALE_M11_4X4_LIB 5634#else 5635 bl inner_scale_m11_4x4_lib 5636#endif 5637 5638 5639 5640 // factorization 5641 mov x8, x7 // inv_diag_E 5642 5643#if MACRO_LEVEL>=1 5644 INNER_EDGE_POTRF_4X4_LIB4 5645#else 5646 bl inner_edge_potrf_4x4_lib4 5647#endif 5648 5649 5650 5651 // store l 5652 mov x8, x5 // D 5653 mov w9, w6 // ldd 5654 lsl w9, w9, #3 // 8*ldd 5655 5656#if MACRO_LEVEL>=1 5657 INNER_STORE_L_4X4_LIB 5658#else 5659 bl inner_store_l_4x4_lib 5660#endif 5661 5662 5663 5664 EPILOGUE 5665 5666 mov 
x0, #0

	ret

	FUN_END(kernel_dpotrf_nt_l_4x4_lib44cc)





// w0 x1 x2 x3 w4 x5 w6 x7 sp+0 sp+8
// void kernel_dpotrf_nt_l_4x4_vs_lib44cc(int kmax, double *A, double *B, double *C, int ldc, double *D, int ldd, double *inv_diag_D, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dpotrf_nt_l_4x4_vs_lib44cc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nt
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4
#endif



	// call inner blend for alpha=1.0 and beta=1.0
	mov		x8, x3 // C
	mov		w9, w4 // ldc
	lsl		w9, w9, #3 // 8*ldc
	ldr		w10, [sp, #(STACKSIZE + 0)] // m1
	ldr		w11, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M11_4X4_VS_LIB
#else
	bl inner_scale_m11_4x4_vs_lib
#endif



	// factorization
	mov		x8, x7 // inv_diag_D
	ldr		w9, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_EDGE_POTRF_4X4_VS_LIB4
#else
	bl inner_edge_potrf_4x4_vs_lib4
#endif



	// store l
	mov		x8, x5 // D
	mov		w9, w6 // ldd
	lsl		w9, w9, #3 // 8*ldd
	ldr		w10, [sp, #(STACKSIZE + 0)] // m1
	ldr		w11, [sp, #(STACKSIZE + 8)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_L_4X4_VS_LIB
#else
	bl inner_store_l_4x4_vs_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dpotrf_nt_l_4x4_vs_lib44cc)





// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16
// void kernel_dtrsm_nn_ll_one_4x4_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde)

	.align 4
	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)



	PROLOGUE



	ZERO_ACC



	// call inner kernel gemm nn
	mov		w8, w0 // kmax
	mov		x9, x1 // A
	mov		x10, x2 // B
	mov		w11, w3 // ldb
	lsl		w11, w11, #3 // 8*ldb

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x4_lib4c
#endif



	// call inner blend for alpha=1.0 and beta=1.0
	mov		x8, x4 // beta
	mov		x9, x5 // C
	mov		w10, w6 // ldc
	lsl		w10, w10, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_LIB
#else
	bl inner_scale_m1b_4x4_lib
#endif



	// solution
	ldr		x8, [sp, #(STACKSIZE + 8)] // E
	ldr		w9, [sp, #(STACKSIZE + 16)] // lde
	lsl		w9, w9, #3 // 8*lde

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
#else
	bl inner_edge_trsm_lln_one_4x4_lib
#endif



	// store l
	mov		x8, x7 // D
	ldr		w9, [sp, #(STACKSIZE + 0)] // ldd
	lsl		w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif



	EPILOGUE

	mov	x0, #0

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_lib4cccc)





// w0 x1 x2 x3 x4 x5 w6 x7 sp+0 sp+8 sp+16 sp+24 sp+32
// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)

	.align 4
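	// note: as in the other _vs kernels, the function below dispatches on n1
	// (a cmp/bgt chain to the local labels 100f-102f) so that an inner gemm
	// kernel of matching width (4x1, 4x2, 4x3 or 4x4) is used, and then goes
	// through the _vs variants of the blend, solve and store subroutines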
// w0 x1 x2 w3 x4 x5 w6 x7 sp+0 sp+8 sp+16 sp+24 sp+32
// void kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc(int kmax, double *A, double *B, int ldb, double *beta, double *C, int ldc, double *D, int ldd, double *E, int lde, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)

	PROLOGUE

	ZERO_ACC

	// call inner kernel gemm nn
	mov w8, w0 // kmax
	mov x9, x1 // A
	mov x10, x2 // B
	mov w11, w3 // ldb
	lsl w11, w11, #3 // 8*ldb

	ldr w12, [sp, #(STACKSIZE + 32)] // n1
	cmp w12, #1
	bgt 100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x1_lib4c
#endif

	b 103f

100:

	ldr w12, [sp, #(STACKSIZE + 32)] // n1
	cmp w12, #2
	bgt 101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x2_lib4c
#endif

	b 103f

101:

	ldr w12, [sp, #(STACKSIZE + 32)] // n1
	cmp w12, #3
	bgt 102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x3_lib4c
#endif

	b 103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x4_lib4c
#endif

103:


	// call inner blend for alpha=-1.0 and generic beta
	mov x8, x4 // beta
	mov x9, x5 // C
	mov w10, w6 // ldc
	lsl w10, w10, #3 // 8*ldc
	ldr w11, [sp, #(STACKSIZE + 24)] // m1
	ldr w12, [sp, #(STACKSIZE + 32)] // n1

#if MACRO_LEVEL>=1
	INNER_SCALE_M1B_4X4_VS_LIB
#else
	bl inner_scale_m1b_4x4_vs_lib
#endif


	// solution
	ldr x8, [sp, #(STACKSIZE + 8)] // E
	ldr w9, [sp, #(STACKSIZE + 16)] // lde
	lsl w9, w9, #3 // 8*lde

#if MACRO_LEVEL>=1
	INNER_EDGE_TRSM_LLN_ONE_4X4_LIB
#else
	bl inner_edge_trsm_lln_one_4x4_lib
#endif


	// store n
	mov x8, x7 // D
	ldr w9, [sp, #(STACKSIZE + 0)] // ldd
	lsl w9, w9, #3 // 8*ldd
	ldr w10, [sp, #(STACKSIZE + 24)] // m1
	ldr w11, [sp, #(STACKSIZE + 32)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif


	EPILOGUE

	mov x0, #0

	ret

	FUN_END(kernel_dtrsm_nn_ll_one_4x4_vs_lib4cccc)




// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
// void kernel_dgemm_nt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)

	.align 4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_libc4cc)

	PROLOGUE

	ZERO_ACC

	// call inner kernel gemm nt
	mov w8, w0 // kmax
	mov x9, x4 // B
	mov x10, x2 // A
	mov w11, w3 // lda
	lsl w11, w11, #3 // 8*lda

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4c
#endif


	// call inner blend for generic alpha and beta
	mov x8, x1 // alpha
	mov x9, x5 // beta
	mov x10, x6 // C
	mov w11, w7 // ldc
	lsl w11, w11, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	bl inner_scale_ab_4x4_lib
#endif


	// store n
	ldr x8, [sp, #(STACKSIZE + 0)] // D
	ldr w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif


	EPILOGUE

	mov x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_libc4cc)
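// Note on the operand swap in the kernel above: A ("c", column-major, lda)
// and B ("4", panel-major) arrive in the opposite storage formats to what
// the nt inner kernel expects, so the kernel feeds B as the panel-major
// operand and A as the column-major one, accumulating acc = B * A^T, and
// recovers D = alpha * A * B^T + beta * C by transposing the 4x4 accumulator
// (inner_tran) before scaling. A plain-C sketch (illustration only):
//
//	double acc[16];
//	int ii, jj, kk;
//	for(jj=0; jj<4; jj++) // acc = B * A^T
//		for(ii=0; ii<4; ii++)
//			{
//			acc[ii+4*jj] = 0.0;
//			for(kk=0; kk<kmax; kk++)
//				acc[ii+4*jj] += B[ii+4*kk] * A[jj+lda*kk];
//			}
//	for(jj=0; jj<4; jj++) // D = alpha*acc^T + beta*C, since (B*A^T)^T = A*B^T
//		for(ii=0; ii<4; ii++)
//			D[ii+ldd*jj] = alpha[0] * acc[jj+4*ii] + beta[0] * C[ii+ldc*jj];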
// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8 sp+16 sp+24
// void kernel_dgemm_nt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dgemm_nt_4x4_vs_libc4cc)

	PROLOGUE

	ZERO_ACC

	// call inner kernel gemm nt
	mov w8, w0 // kmax
	mov x9, x4 // B
	mov x10, x2 // A
	mov w11, w3 // lda
	lsl w11, w11, #3 // 8*lda

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #1
	bgt 100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X1_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x1_lib4c
#endif

	b 103f

100:

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #2
	bgt 101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X2_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x2_lib4c
#endif

	b 103f

101:

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #3
	bgt 102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X3_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x3_lib4c
#endif

	b 103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NT_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nt_4x4_lib4c
#endif

103:


	// call inner blend for generic alpha and beta
	mov x8, x1 // alpha
	mov x9, x5 // beta
	mov x10, x6 // C
	mov w11, w7 // ldc
	lsl w11, w11, #3 // 8*ldc
	ldr w12, [sp, #(STACKSIZE + 16)] // m1
	ldr w13, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	bl inner_scale_ab_4x4_vs_lib
#endif


	// store n
	ldr x8, [sp, #(STACKSIZE + 0)] // D
	ldr w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl w9, w9, #3 // 8*ldd
	ldr w10, [sp, #(STACKSIZE + 16)] // m1
	ldr w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif


	EPILOGUE

	mov x0, #0

	ret

	FUN_END(kernel_dgemm_nt_4x4_vs_libc4cc)




// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8
// void kernel_dgemm_tt_4x4_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd)

	.align 4
	GLOB_FUN_START(kernel_dgemm_tt_4x4_libc4cc)

	PROLOGUE

	ZERO_ACC

	// call inner kernel gemm nn
	mov w8, w0 // kmax
	mov x9, x4 // B
	mov x10, x2 // A
	mov w11, w3 // lda
	lsl w11, w11, #3 // 8*lda

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x4_lib4c
#endif


	// call inner blend for generic alpha and beta
	mov x8, x1 // alpha
	mov x9, x5 // beta
	mov x10, x6 // C
	mov w11, w7 // ldc
	lsl w11, w11, #3 // 8*ldc

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_LIB
#else
	bl inner_scale_ab_4x4_lib
#endif


	// store n
	ldr x8, [sp, #(STACKSIZE + 0)] // D
	ldr w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl w9, w9, #3 // 8*ldd

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_LIB
#else
	bl inner_store_4x4_lib
#endif


	EPILOGUE

	mov x0, #0

	ret

	FUN_END(kernel_dgemm_tt_4x4_libc4cc)
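// Note: the tt kernel above reuses the same transpose trick with the nn
// inner kernel: the accumulator holds B * A (B panel-major, A column-major
// with lda), and the final 4x4 transpose recovers
// D = alpha * A^T * B^T + beta * C, since (B*A)^T = A^T * B^T.
// A plain-C sketch (illustration only):
//
//	double acc[16];
//	int ii, jj, kk;
//	for(jj=0; jj<4; jj++) // acc = B * A
//		for(ii=0; ii<4; ii++)
//			{
//			acc[ii+4*jj] = 0.0;
//			for(kk=0; kk<kmax; kk++)
//				acc[ii+4*jj] += B[ii+4*kk] * A[kk+lda*jj];
//			}
//	for(jj=0; jj<4; jj++) // D = alpha * acc^T + beta * C
//		for(ii=0; ii<4; ii++)
//			D[ii+ldd*jj] = alpha[0] * acc[jj+4*ii] + beta[0] * C[ii+ldc*jj];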
// w0 x1 x2 w3 x4 x5 x6 w7 sp+0 sp+8 sp+16 sp+24
// void kernel_dgemm_tt_4x4_vs_libc4cc(int kmax, double *alpha, double *A, int lda, double *B, double *beta, double *C, int ldc, double *D, int ldd, int m1, int n1)

	.align 4
	GLOB_FUN_START(kernel_dgemm_tt_4x4_vs_libc4cc)

	PROLOGUE

	ZERO_ACC

	// call inner kernel gemm nn
	mov w8, w0 // kmax
	mov x9, x4 // B
	mov x10, x2 // A
	mov w11, w3 // lda
	lsl w11, w11, #3 // 8*lda

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #1
	bgt 100f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X1_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x1_lib4c
#endif

	b 103f

100:

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #2
	bgt 101f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X2_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x2_lib4c
#endif

	b 103f

101:

	ldr w12, [sp, #(STACKSIZE + 24)] // n1
	cmp w12, #3
	bgt 102f

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X3_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x3_lib4c
#endif

	b 103f

102:

#if MACRO_LEVEL>=2
	INNER_KERNEL_GEMM_ADD_NN_4X4_LIB4C
#else
	bl inner_kernel_gemm_add_nn_4x4_lib4c
#endif

103:


	// call inner blend for generic alpha and beta
	mov x8, x1 // alpha
	mov x9, x5 // beta
	mov x10, x6 // C
	mov w11, w7 // ldc
	lsl w11, w11, #3 // 8*ldc
	ldr w12, [sp, #(STACKSIZE + 16)] // m1
	ldr w13, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_TRAN_4X4_LIB
#else
	bl inner_tran_4x4_lib
#endif

#if MACRO_LEVEL>=1
	INNER_SCALE_AB_4X4_VS_LIB
#else
	bl inner_scale_ab_4x4_vs_lib
#endif


	// store n
	ldr x8, [sp, #(STACKSIZE + 0)] // D
	ldr w9, [sp, #(STACKSIZE + 8)] // ldd
	lsl w9, w9, #3 // 8*ldd
	ldr w10, [sp, #(STACKSIZE + 16)] // m1
	ldr w11, [sp, #(STACKSIZE + 24)] // n1

#if MACRO_LEVEL>=1
	INNER_STORE_4X4_VS_LIB
#else
	bl inner_store_4x4_vs_lib
#endif


	EPILOGUE

	mov x0, #0

	ret

	FUN_END(kernel_dgemm_tt_4x4_vs_libc4cc)
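// Note on the _vs ("variable size") kernels in this section: entries of the
// column-major operand beyond the n1 rows or columns that actually exist may
// not be addressable, so each kernel branches on n1 to an inner gemm that
// touches only 1, 2, 3 or 4 of them, and the _vs scale/store routines clip
// the write-back to the top-left m1 x n1 corner of the 4x4 block. In C-like
// pseudocode (the 4xN names are stand-ins for the assembly subroutines above):
//
//	if(n1==1)
//		inner_gemm_add_4x1(kmax, A, B, ld, acc); // read 1 row/column only
//	else if(n1==2)
//		inner_gemm_add_4x2(kmax, A, B, ld, acc);
//	else if(n1==3)
//		inner_gemm_add_4x3(kmax, A, B, ld, acc);
//	else
//		inner_gemm_add_4x4(kmax, A, B, ld, acc); // full 4x4 block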