/*******************************************************************************
Copyright (c) 2015, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * C-level signature of this kernel and the register each argument arrives in:
 *
 *   int CNAME(BLASLONG bm,      x0
 *             BLASLONG bn,      x1
 *             BLASLONG bk,      x2
 *             FLOAT    alpha,   s0
 *             FLOAT   *ba,      x3
 *             FLOAT   *bb,      x4
 *             FLOAT   *C,       x5
 *             BLASLONG ldc,     x6
 *             BLASLONG offset)  x7
 */

/* Incoming arguments (kept in their argument registers). */
#define origM		x0
#define origN		x1
#define origK		x2
#define origPA		x3
#define origPB		x4
#define pC		x5
#define LDC		x6
#define offset		x7

/* Loop counters and working pointers. */
#define counterL	x8
#define counterI	x9
#define counterJ	x10
#define pB		x11
#define pCRow0		x12
#define pCRow1		x13
#define pCRow2		x14
#define pA		x15
#define temp		x16
#define tempOffset	x17
/*
 * tempK deliberately lives in x19 and not x18: x18 is the AAPCS64 "platform
 * register" and is reserved on some targets (e.g. Apple and Windows), so it
 * must not be used as a scratch register.  x0-x17 are all taken above, and
 * x19 is callee-saved but already spilled/reloaded by the kernel prologue and
 * epilogue (the "stp/ldp x18, x19" pair), so no extra save is needed.
 */
#define tempK		x19

/* Four copies of alpha; the scalar (sN) and by-element (vN.s[0]) names alias. */
#define alpha0		s10
#define alphaV0		v10.s[0]
#define alpha1		s11
#define alphaV1		v11.s[0]
#define alpha2		s14
#define alphaV2		v14.s[0]
#define alpha3		s15
#define alphaV3		v15.s[0]

// General-purpose register usage
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16 temp
// 17 tempOffset
// 18 platform register - never used as scratch (see tempK above)
// 19 must save		tempK
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp

// SIMD/FP register usage.  AAPCS64 only requires the low 64 bits of v8-v15
// to be preserved by the callee; the kernel prologue additionally spills
// d16/d17, which is harmless but not required.
//v00 ALPHA -> pA00, pA01
//v01 pA02, pA03
//v02
//v03
//v04 pA10, pA11
//v05 pA12, pA13
//v06
//v07
//v08 must save pB00, pB01
//v09 must save pB02, pB03
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB10, pB11
//v13 must save pB12, pB13
//v14 must save ALPHA2
//v15 must save ALPHA3
//v16 C00, C01
//v17 C02, C03
//v18
//v19
//v20 C10, C11
//v21 C12, C13
//v22
//v23
//v24 C20, C21
//v25 C22, C23
//v26
//v27
//v28 C30, C31
//v29 C32, C33
//v30
//v31

/*******************************************************************************
* Macro definitions
*
* Naming: <OP>MxN works on a tile that consumes M floats from pA and N floats
* from pB per K step.  Accumulators live in v16/v17, v20/v21, v24/v25 and
* v28/v29; each pair is stored to one C line (pCRow0 + n * LDC bytes).
* Every load below uses post-index addressing, so pA/pB always point at the
* next unread element when a macro finishes.
*******************************************************************************/

/* Zero all eight 4x4 accumulators (an Sn write clears the rest of Vn). */
.macro INIT4x4
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
	fmov	s24, wzr
	fmov	s25, wzr
	fmov	s28, wzr
	fmov	s29, wzr
.endm

/*
 * First step of the software-pipelined 4x4 loop: loads one K step into
 * v0/v1 (A) and v8/v9 (B), initialises the accumulators with fmul (so no
 * INIT4x4 is needed), then pre-loads the following K step into v4/v5 and
 * v12/v13 for KERNEL4x4_M2 / KERNEL4x4_E.
 */
.macro KERNEL4x4_I
	ld1	{v8.2s, v9.2s}, [pB], #16
	ld1	{v0.2s, v1.2s}, [pA], #16

	fmul	v16.2s, v0.2s, v8.s[0]
	fmul	v29.2s, v1.2s, v9.s[1]

	fmul	v20.2s, v0.2s, v8.s[1]
	fmul	v25.2s, v1.2s, v9.s[0]

	fmul	v24.2s, v0.2s, v9.s[0]
	fmul	v21.2s, v1.2s, v8.s[1]

	fmul	v28.2s, v0.2s, v9.s[1]
	fmul	v17.2s, v1.2s, v8.s[0]

	ld1	{v12.2s, v13.2s}, [pB], #16
	ld1	{v4.2s, v5.2s}, [pA], #16
.endm

/*
 * Pipeline stage 1: accumulate from v0/v1 x v8/v9 while fetching the next
 * K step into v4/v5 and v12/v13; prefetches ahead in the B stream.
 */
.macro KERNEL4x4_M1
	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v29.2s, v1.2s, v9.s[1]

	ld1	{v12.2s, v13.2s}, [pB], #16	// B values for the next step

	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v25.2s, v1.2s, v9.s[0]

	ld1	{v4.2s, v5.2s}, [pA], #16	// A values for the next step

	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v21.2s, v1.2s, v8.s[1]

	prfm	PLDL1KEEP, [pB, #512]

	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm

/*
 * Pipeline stage 2: accumulate from v4/v5 x v12/v13 while fetching the next
 * K step into v0/v1 and v8/v9; prefetches ahead in the A stream.
 */
.macro KERNEL4x4_M2
	fmla	v16.2s, v4.2s, v12.s[0]
	fmla	v29.2s, v5.2s, v13.s[1]

	ld1	{v8.2s, v9.2s}, [pB], #16	// B values for the next step

	fmla	v20.2s, v4.2s, v12.s[1]
	fmla	v25.2s, v5.2s, v13.s[0]

	ld1	{v0.2s, v1.2s}, [pA], #16	// A values for the next step

	fmla	v24.2s, v4.2s, v13.s[0]
	fmla	v21.2s, v5.2s, v12.s[1]

	prfm	PLDL1KEEP, [pA, #512]

	fmla	v28.2s, v4.2s, v13.s[1]
	fmla	v17.2s, v5.2s, v12.s[0]
.endm

/* Pipeline drain: same arithmetic as KERNEL4x4_M2 but without any loads. */
.macro KERNEL4x4_E
	fmla	v16.2s, v4.2s, v12.s[0]
	fmla	v29.2s, v5.2s, v13.s[1]

	fmla	v20.2s, v4.2s, v12.s[1]
	fmla	v25.2s, v5.2s, v13.s[0]

	fmla	v24.2s, v4.2s, v13.s[0]
	fmla	v21.2s, v5.2s, v12.s[1]

	fmla	v28.2s, v4.2s, v13.s[1]
	fmla	v17.2s, v5.2s, v12.s[0]
.endm

/* One stand-alone (non-pipelined) K step of the 4x4 tile. */
.macro KERNEL4x4_SUB
	ld1	{v8.2s, v9.2s}, [pB], #16
	ld1	{v0.2s, v1.2s}, [pA], #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v29.2s, v1.2s, v9.s[1]

	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v25.2s, v1.2s, v9.s[0]

	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v21.2s, v1.2s, v8.s[1]

	fmla	v28.2s, v0.2s, v9.s[1]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm

/*
 * Scale the 4x4 accumulators by alpha and store them to four C lines
 * starting at pCRow0 (C is overwritten, not read).  On exit pCRow0 has
 * advanced by 16 bytes, pCRow2 = line 2 and pCRow1 = line 3 of this tile.
 * Clobbers v8, v9, v12, v13.
 */
.macro SAVE4x4
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16	// line 0, then step to next tile

	add	pCRow2, pCRow1, LDC		// line 2
	fmul	v12.2s, v20.2s, alphaV2
	fmul	v13.2s, v21.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]

	add	pCRow1, pCRow2, LDC		// line 3 (pCRow1 is reused)
	fmul	v8.2s, v24.2s, alphaV0
	fmul	v9.2s, v25.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow2]

	fmul	v12.2s, v28.2s, alphaV2
	fmul	v13.2s, v29.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]
.endm

/******************************************************************************/

/* Zero the four 2x4 accumulators. */
.macro INIT2x4
	fmov	s16, wzr
	fmov	s20, wzr
	fmov	s24, wzr
	fmov	s28, wzr
.endm

/* One K step of the 2x4 tile: 2 floats from pA, 4 floats from pB. */
.macro KERNEL2x4_SUB
	ld1	{v8.2s, v9.2s}, [pB], #16
	ld1	{v0.2s}, [pA], #8

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v24.2s, v0.2s, v9.s[0]
	fmla	v28.2s, v0.2s, v9.s[1]
.endm

/*
 * Scale the 2x4 accumulators by alpha and store two floats to each of four
 * C lines.  pCRow0 advances by 8 bytes; clobbers v8, v12, pCRow1, pCRow2.
 */
.macro SAVE2x4
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8		// line 0, then step to next tile

	add	pCRow2, pCRow1, LDC		// line 2
	fmul	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]

	add	pCRow1, pCRow2, LDC		// line 3 (pCRow1 is reused)
	fmul	v8.2s, v24.2s, alphaV2
	st1	{v8.2s}, [pCRow2]

	fmul	v12.2s, v28.2s, alphaV3
	st1	{v12.2s}, [pCRow1]
.endm

/******************************************************************************/

/*
 * 1x4 tile: v16 accumulates the results for C lines 0/1 (lanes 0/1) and
 * v20 those for C lines 2/3.
 */
.macro INIT1x4
	fmov	s16, wzr
	fmov	s20, wzr
.endm

/* One K step of the 1x4 tile: 1 float from pA, 4 floats from pB. */
.macro KERNEL1x4_SUB
	ldr	s0, [pA], #4
	ld1	{v8.2s, v9.2s}, [pB], #16

	fmla	v16.2s, v8.2s, v0.s[0]
	fmla	v20.2s, v9.2s, v0.s[0]
.endm

/*
 * Scale by alpha and scatter one float to each of four C lines.
 * pCRow0 advances by 4 bytes; clobbers v8, v12, pCRow1, pCRow2.
 */
.macro SAVE1x4
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.s}[0], [pCRow0], #4		// line 0, then step to next tile
	st1	{v8.s}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC		// line 2
	add	pCRow1, pCRow2, LDC		// line 3 (pCRow1 is reused)
	fmul	v12.2s, v20.2s, alphaV1
	st1	{v12.s}[0], [pCRow2]
	st1	{v12.s}[1], [pCRow1]
.endm

/******************************************************************************/

/* Zero the four 4x2 accumulators. */
.macro INIT4x2
	fmov	s16, wzr
	fmov	s17, wzr
	fmov	s20, wzr
	fmov	s21, wzr
.endm

/* One K step of the 4x2 tile: 4 floats from pA, 2 floats from pB. */
.macro KERNEL4x2_SUB
	ld1	{v8.2s}, [pB], #8
	ld1	{v0.2s, v1.2s}, [pA], #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
	fmla	v21.2s, v1.2s, v8.s[1]
.endm

/*
 * Scale by alpha and store four floats to each of two C lines.
 * pCRow0 advances by 16 bytes; clobbers v8, v9, v12, v13, pCRow1.
 */
.macro SAVE4x2
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16	// line 0, then step to next tile

	fmul	v12.2s, v20.2s, alphaV2
	fmul	v13.2s, v21.2s, alphaV3
	st1	{v12.2s, v13.2s}, [pCRow1]
.endm

/******************************************************************************/

/* Zero the two 2x2 accumulators. */
.macro INIT2x2
	fmov	s16, wzr
	fmov	s20, wzr
.endm

/* One K step of the 2x2 tile: 2 floats from pA, 2 floats from pB. */
.macro KERNEL2x2_SUB
	ld1	{v8.2s}, [pB], #8
	ld1	{v0.2s}, [pA], #8

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v20.2s, v0.2s, v8.s[1]
.endm

/*
 * Scale by alpha and store two floats to each of two C lines.
 * pCRow0 advances by 8 bytes; clobbers v8, v12, pCRow1.
 */
.macro SAVE2x2
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8		// line 0, then step to next tile

	fmul	v12.2s, v20.2s, alphaV1
	st1	{v12.2s}, [pCRow1]
.endm

/******************************************************************************/

/* 1x2 tile: v16 lane 0 / lane 1 accumulate the values for C line 0 / 1. */
.macro INIT1x2
	fmov	s16, wzr
.endm

/* One K step of the 1x2 tile: 1 float from pA, 2 floats from pB. */
.macro KERNEL1x2_SUB
	ld1	{v8.2s}, [pB], #8
	ldr	s0, [pA], #4

	fmla	v16.2s, v8.2s, v0.s[0]
.endm

/*
 * Scale by alpha and store one float to each of two C lines.
 * pCRow0 advances by 4 bytes; clobbers v8 and pCRow1.
 */
.macro SAVE1x2
	add	pCRow1, pCRow0, LDC		// line 1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.s}[0], [pCRow0], #4		// line 0, then step to next tile
	st1	{v8.s}[1], [pCRow1]
.endm

/******************************************************************************/

/* Zero the two 4x1 accumulators. */
.macro INIT4x1
	fmov	s16, wzr
	fmov	s17, wzr
.endm

/* One K step of the 4x1 tile: 4 floats from pA, 1 float from pB. */
.macro KERNEL4x1_SUB
	ldr	s8, [pB], #4
	ld1	{v0.2s, v1.2s}, [pA], #16

	fmla	v16.2s, v0.2s, v8.s[0]
	fmla	v17.2s, v1.2s, v8.s[0]
.endm

/*
 * Scale by alpha and store four floats to a single C line.
 * pCRow0 advances by 16 bytes; clobbers v8, v9.
 */
.macro SAVE4x1
	fmul	v8.2s, v16.2s, alphaV0
	fmul	v9.2s, v17.2s, alphaV1
	st1	{v8.2s, v9.2s}, [pCRow0], #16
.endm

/******************************************************************************/

/* Zero the 2x1 accumulator. */
.macro INIT2x1
	fmov	s16, wzr
.endm

/* One K step of the 2x1 tile: 2 floats from pA, 1 float from pB. */
.macro KERNEL2x1_SUB
	ldr	s8, [pB], #4
	ld1	{v0.2s}, [pA], #8

	fmla	v16.2s, v0.2s, v8.s[0]
.endm

/*
 * Scale by alpha and store two floats to a single C line.
 * pCRow0 advances by 8 bytes; clobbers v8.
 */
.macro SAVE2x1
	fmul	v8.2s, v16.2s, alphaV0
	st1	{v8.2s}, [pCRow0], #8
.endm

/******************************************************************************/

/* Zero the scalar 1x1 accumulator. */
.macro INIT1x1
	fmov	s16, wzr
.endm

/* One K step of the 1x1 tile: s16 += A[0] * B[0]. */
.macro KERNEL1x1_SUB
	ldr	s8, [pB], #4
	ldr	s0, [pA], #4

	fmadd	s16, s0, s8, s16
.endm

/*
 * Scale by alpha and store a single float.
 * pCRow0 advances by 4 bytes; clobbers s8.
 */
.macro SAVE1x1
	fmul	s8, s16, alpha0
	str	s8, [pCRow0], #4
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

/*
 * Kernel body.
 *
 * Walks the N dimension in panels of 4, then 2, then 1 (labels L4 / L2 / L1)
 * and, inside each panel, the M dimension in tiles of 4, then 2, then 1
 * (labels M4 / M2 / M1).  For every tile:
 *   1. pA/pB are positioned (skipping tempOffset K steps in the LEFT/TRANSA
 *      combinations selected by the first #if),
 *   2. tempK, the number of K steps to run for this tile, is derived from
 *      origK and tempOffset,
 *   3. the KERNELmxn_* macros accumulate, SAVEmxn writes alpha * acc to C,
 *   4. pA/pB are moved past the K steps that were not consumed and, for LEFT,
 *      tempOffset is advanced by the tile height.
 * Always returns 0 in x0.
 *
 * Note on the "tst/ands ... ; beq" pairs: the masks used never set bit 63,
 * so after the logical op N = V = 0 and "eq" is exactly the "le" condition.
 */
	PROLOGUE

.Lstrmm_kernel_begin:

	.align 5
	/* Spill area: 11 slots of 16 bytes. */
	sub	sp, sp, #(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]

	/* Replicate alpha before s0/v0 is reused for A values. */
	fmov	alpha0, s0
	fmov	alpha1, s0
	fmov	alpha2, s0
	fmov	alpha3, s0

	lsl	LDC, LDC, #2			// ldc in bytes (4 bytes per float)

#if !defined(LEFT)
	neg	tempOffset, offset
#endif

	mov	pB, origPB

	asr	counterJ, origN, #2		// J = N / 4
	cmp	counterJ, #0
	ble	.Lstrmm_kernel_L2_BEGIN

/******************************************************************************/
/* N panels of width 4                                                        */
/******************************************************************************/

.Lstrmm_kernel_L4_BEGIN:
	mov	pCRow0, pC			// pCRow0 = first C line of this panel
	add	pC, pC, LDC, lsl #2		// pC += 4 lines

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// restart at the beginning of A

.Lstrmm_kernel_L4_M4_BEGIN:

	asr	counterI, origM, #2		// I = M / 4
	cmp	counterI, #0
	ble	.Lstrmm_kernel_L4_M2_BEGIN

.Lstrmm_kernel_L4_M4_20:

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #4	// skip tempOffset steps of 4 floats
	add	pA, pA, tempOffset, lsl #4	// skip tempOffset steps of 4 floats
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#else
	add	tempK, tempOffset, #4		// same for LEFT and !LEFT (4x4 tile)
#endif

	asr	counterL, tempK, #1		// L = tempK / 2
	cmp	counterL, #2			// at least 4 K steps available?
	blt	.Lstrmm_kernel_L4_M4_32

	KERNEL4x4_I				// K step 1 (+ preload)
	KERNEL4x4_M2				// K step 2 (+ preload)

	subs	counterL, counterL, #2
	ble	.Lstrmm_kernel_L4_M4_22a
	.align 5

.Lstrmm_kernel_L4_M4_22:

	KERNEL4x4_M1
	KERNEL4x4_M2

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M4_22

.Lstrmm_kernel_L4_M4_22a:

	KERNEL4x4_M1
	KERNEL4x4_E				// drain the pipeline

	b	.Lstrmm_kernel_L4_M4_44

.Lstrmm_kernel_L4_M4_32:

	tst	counterL, #1
	beq	.Lstrmm_kernel_L4_M4_40		// no full pair of K steps

	KERNEL4x4_I
	KERNEL4x4_E

	b	.Lstrmm_kernel_L4_M4_44

.Lstrmm_kernel_L4_M4_40:

	INIT4x4

.Lstrmm_kernel_L4_M4_44:

	ands	counterL, tempK, #1		// odd K step left over?
	beq	.Lstrmm_kernel_L4_M4_100

.Lstrmm_kernel_L4_M4_46:

	KERNEL4x4_SUB

.Lstrmm_kernel_L4_M4_100:

	SAVE4x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
	sub	tempK, tempK, #4		// same for LEFT and !LEFT (4x4 tile)
	add	pA, pA, tempK, lsl #4		// skip the unused tail of A
	add	pB, pB, tempK, lsl #4		// skip the unused tail of B
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L4_M4_END:
	subs	counterI, counterI, #1
	bne	.Lstrmm_kernel_L4_M4_20

.Lstrmm_kernel_L4_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// any M remainder at all?
	beq	.Lstrmm_kernel_L4_END

	tst	counterI, #2			// remainder contains a 2-row tile?
	beq	.Lstrmm_kernel_L4_M1_BEGIN

.Lstrmm_kernel_L4_M2_20:

	INIT2x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pA, pA, tempOffset, lsl #3	// 2 floats per K step
	add	pB, origPB, tempOffset, lsl #4	// 4 floats per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #4
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L4_M2_40

.Lstrmm_kernel_L4_M2_22:

	.rept	8
	KERNEL2x4_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M2_22

.Lstrmm_kernel_L4_M2_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L4_M2_100

.Lstrmm_kernel_L4_M2_42:

	KERNEL2x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M2_42

.Lstrmm_kernel_L4_M2_100:

	SAVE2x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #4
#endif
	add	pA, pA, tempK, lsl #3
	add	pB, pB, tempK, lsl #4
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L4_M2_END:

.Lstrmm_kernel_L4_M1_BEGIN:

	tst	counterI, #1			// single trailing row?
	beq	.Lstrmm_kernel_L4_END

.Lstrmm_kernel_L4_M1_20:

	INIT1x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #4	// 4 floats per K step
	add	pA, pA, tempOffset, lsl #2	// 1 float per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #4
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L4_M1_40

.Lstrmm_kernel_L4_M1_22:

	.rept	8
	KERNEL1x4_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M1_22

.Lstrmm_kernel_L4_M1_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L4_M1_100

.Lstrmm_kernel_L4_M1_42:

	KERNEL1x4_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L4_M1_42

.Lstrmm_kernel_L4_M1_100:

	SAVE1x4

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #4
#endif
	add	pA, pA, tempK, lsl #2
	add	pB, pB, tempK, lsl #4
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif

.Lstrmm_kernel_L4_END:
	add	origPB, origPB, origK, lsl #4	// B += K * 4 floats * 4 bytes

#if !defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

	subs	counterJ, counterJ, #1		// j--
	bgt	.Lstrmm_kernel_L4_BEGIN

/******************************************************************************/
/* N panel of width 2 (taken when bit 1 of N is set)                          */
/******************************************************************************/

.Lstrmm_kernel_L2_BEGIN:			// fewer than 4 left in the N direction

	mov	counterJ, origN
	tst	counterJ, #3
	beq	.Lstrmm_kernel_L999		// N is a multiple of 4: done

	tst	counterJ, #2
	beq	.Lstrmm_kernel_L1_BEGIN

	mov	pCRow0, pC			// pCRow0 = first C line of this panel
	add	pC, pC, LDC, lsl #1		// pC += 2 lines

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// restart at the beginning of A

.Lstrmm_kernel_L2_M4_BEGIN:

	asr	counterI, origM, #2		// I = M / 4
	cmp	counterI, #0
	ble	.Lstrmm_kernel_L2_M2_BEGIN

.Lstrmm_kernel_L2_M4_20:

	INIT4x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #3	// 2 floats per K step
	add	pA, pA, tempOffset, lsl #4	// 4 floats per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L2_M4_40
	.align 5

.Lstrmm_kernel_L2_M4_22:

	.rept	8
	KERNEL4x2_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M4_22

.Lstrmm_kernel_L2_M4_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L2_M4_100

.Lstrmm_kernel_L2_M4_42:

	KERNEL4x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M4_42

.Lstrmm_kernel_L2_M4_100:

	SAVE4x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #2
#endif
	add	pA, pA, tempK, lsl #4
	add	pB, pB, tempK, lsl #3
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L2_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lstrmm_kernel_L2_M4_20

.Lstrmm_kernel_L2_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// any M remainder at all?
	beq	.Lstrmm_kernel_L2_END

	tst	counterI, #2			// remainder contains a 2-row tile?
	beq	.Lstrmm_kernel_L2_M1_BEGIN

.Lstrmm_kernel_L2_M2_20:

	INIT2x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #3	// 2 floats per K step
	add	pA, pA, tempOffset, lsl #3	// 2 floats per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#else
	add	tempK, tempOffset, #2		// same for LEFT and !LEFT (2x2 tile)
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L2_M2_40

.Lstrmm_kernel_L2_M2_22:

	.rept	8
	KERNEL2x2_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M2_22

.Lstrmm_kernel_L2_M2_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L2_M2_100

.Lstrmm_kernel_L2_M2_42:

	KERNEL2x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M2_42

.Lstrmm_kernel_L2_M2_100:

	SAVE2x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
	sub	tempK, tempK, #2		// same for LEFT and !LEFT (2x2 tile)
	add	pA, pA, tempK, lsl #3
	add	pB, pB, tempK, lsl #3
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L2_M2_END:

.Lstrmm_kernel_L2_M1_BEGIN:

	tst	counterI, #1			// single trailing row?
	beq	.Lstrmm_kernel_L2_END

.Lstrmm_kernel_L2_M1_20:

	INIT1x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #3	// 2 floats per K step
	add	pA, pA, tempOffset, lsl #2	// 1 float per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #1
#else
	add	tempK, tempOffset, #2
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L2_M1_40

.Lstrmm_kernel_L2_M1_22:

	.rept	8
	KERNEL1x2_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M1_22

.Lstrmm_kernel_L2_M1_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L2_M1_100

.Lstrmm_kernel_L2_M1_42:

	KERNEL1x2_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L2_M1_42

.Lstrmm_kernel_L2_M1_100:

	SAVE1x2

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #2
#endif
	add	pA, pA, tempK, lsl #2
	add	pB, pB, tempK, lsl #3
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif

.Lstrmm_kernel_L2_END:
#if !defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif
	add	origPB, origPB, origK, lsl #3	// B += K * 2 floats * 4 bytes

/******************************************************************************/
/* N panel of width 1 (taken when bit 0 of N is set)                          */
/******************************************************************************/

.Lstrmm_kernel_L1_BEGIN:

	mov	counterJ, origN
	tst	counterJ, #1
	beq	.Lstrmm_kernel_L999		// done

	mov	pCRow0, pC			// pCRow0 = the last C line
	add	pC, pC, LDC			// pC += 1 line

#if defined(LEFT)
	mov	tempOffset, offset
#endif

	mov	pA, origPA			// restart at the beginning of A

.Lstrmm_kernel_L1_M4_BEGIN:

	asr	counterI, origM, #2		// I = M / 4
	cmp	counterI, #0
	ble	.Lstrmm_kernel_L1_M2_BEGIN

.Lstrmm_kernel_L1_M4_20:

	INIT4x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #2	// 1 float per K step
	add	pA, pA, tempOffset, lsl #4	// 4 floats per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #4
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L1_M4_40
	.align 5

.Lstrmm_kernel_L1_M4_22:

	.rept	8
	KERNEL4x1_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M4_22

.Lstrmm_kernel_L1_M4_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L1_M4_100

.Lstrmm_kernel_L1_M4_42:

	KERNEL4x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M4_42

.Lstrmm_kernel_L1_M4_100:

	SAVE4x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #4
#else
	sub	tempK, tempK, #1
#endif
	add	pA, pA, tempK, lsl #4
	add	pB, pB, tempK, lsl #2
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #4
#endif

.Lstrmm_kernel_L1_M4_END:

	subs	counterI, counterI, #1
	bgt	.Lstrmm_kernel_L1_M4_20

.Lstrmm_kernel_L1_M2_BEGIN:

	mov	counterI, origM
	tst	counterI, #3			// any M remainder at all?
	beq	.Lstrmm_kernel_L1_END

	tst	counterI, #2			// remainder contains a 2-row tile?
	beq	.Lstrmm_kernel_L1_M1_BEGIN

.Lstrmm_kernel_L1_M2_20:

	INIT2x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #2	// 1 float per K step
	add	pA, pA, tempOffset, lsl #3	// 2 floats per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#elif defined(LEFT)
	add	tempK, tempOffset, #2
#else
	add	tempK, tempOffset, #1
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L1_M2_40

.Lstrmm_kernel_L1_M2_22:

	.rept	8
	KERNEL2x1_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M2_22

.Lstrmm_kernel_L1_M2_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L1_M2_100

.Lstrmm_kernel_L1_M2_42:

	KERNEL2x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M2_42

.Lstrmm_kernel_L1_M2_100:

	SAVE2x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #2
#else
	sub	tempK, tempK, #1
#endif
	add	pA, pA, tempK, lsl #3
	add	pB, pB, tempK, lsl #2
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #2
#endif

.Lstrmm_kernel_L1_M2_END:

.Lstrmm_kernel_L1_M1_BEGIN:

	tst	counterI, #1			// single trailing row?
	beq	.Lstrmm_kernel_L1_END

.Lstrmm_kernel_L1_M1_20:

	INIT1x1

#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	mov	pB, origPB
#else
	add	pB, origPB, tempOffset, lsl #2	// 1 float per K step
	add	pA, pA, tempOffset, lsl #2	// 1 float per K step
#endif

#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	sub	tempK, origK, tempOffset
#else
	add	tempK, tempOffset, #1		// same for LEFT and !LEFT (1x1 tile)
#endif

	asr	counterL, tempK, #3		// L = tempK / 8
	cmp	counterL, #0
	ble	.Lstrmm_kernel_L1_M1_40

.Lstrmm_kernel_L1_M1_22:

	.rept	8
	KERNEL1x1_SUB
	.endr

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M1_22

.Lstrmm_kernel_L1_M1_40:

	ands	counterL, tempK, #7		// L = tempK % 8
	beq	.Lstrmm_kernel_L1_M1_100

.Lstrmm_kernel_L1_M1_42:

	KERNEL1x1_SUB

	subs	counterL, counterL, #1
	bgt	.Lstrmm_kernel_L1_M1_42

.Lstrmm_kernel_L1_M1_100:

	SAVE1x1

/*
 * Pointer/offset fix-up for the 1x1 tile is compiled out: this is the last
 * tile of the last panel and control falls straight through to the epilogue,
 * so pA, pB and tempOffset are not read again.
 */
#if 0
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
	sub	tempK, origK, tempOffset
#if defined(LEFT)
	sub	tempK, tempK, #1
#else
	sub	tempK, tempK, #1
#endif
	lsl	temp, tempK, #2
	add	pA, pA, temp
	lsl	temp, tempK, #2
	add	pB, pB, temp
#endif
#if defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif
#endif

.Lstrmm_kernel_L1_END:

#if 0
#if !defined(LEFT)
	add	tempOffset, tempOffset, #1
#endif
#endif

.Lstrmm_kernel_L999:
	mov	x0, #0				// return value
	ldp	d8, d9, [sp, #(0 * 16)]
	ldp	d10, d11, [sp, #(1 * 16)]
	ldp	d12, d13, [sp, #(2 * 16)]
	ldp	d14, d15, [sp, #(3 * 16)]
	ldp	d16, d17, [sp, #(4 * 16)]
	ldp	x18, x19, [sp, #(5 * 16)]
	ldp	x20, x21, [sp, #(6 * 16)]
	ldp	x22, x23, [sp, #(7 * 16)]
	ldp	x24, x25, [sp, #(8 * 16)]
	ldp	x26, x27, [sp, #(9 * 16)]
	ldr	x28, [sp, #(10 * 16)]
	add	sp, sp, #(11 * 16)
	ret

	EPILOGUE
