/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/10/16 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

/* Bytes of scratch stack reserved below the register save area. */
#define STACKSIZE 256

/* Incoming arguments as they arrive in registers. */
#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_K	r2
#define	OLD_A	r3
#define OLD_ALPHA_R s0
#define OLD_ALPHA_I s1

/******************************************************
* [fp, #-128] - [fp, #-64] is set aside for saving and
* restoring floating point registers; the entry code
* stores s8 - s15 starting at [fp, #-128].
*******************************************************/

/* Frame-pointer relative spill slots for the loop state. */
#define KKK	[fp, #-240]
#define KK	[fp, #-244 ]
#define A	[fp, #-248 ]
#define LDC	[fp, #-252 ]
#define M	[fp, #-256 ]
#define N	[fp, #-260 ]
#define K	[fp, #-264 ]

/* Two adjacent words that the entry code fills with zero. */
#define FP_ZERO [fp, #-232]
#define FP_ZERO_0 [fp, #-232]
#define FP_ZERO_1 [fp, #-228]


/* Spilled copy of alpha (imaginary / real part). */
#define ALPHA_I	[fp, #-272]
#define ALPHA_R	[fp, #-280]

/* Arguments that live on the caller stack; the offsets depend on the float ABI. */
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP	r3
#define OLD_ALPHAI_SOFTFP	[fp, #4]
#define OLD_A_SOFTFP	[fp, #8 ]
#define B	[fp, #12 ]
#define C	[fp, #16 ]
#define OLD_LDC	[fp, #20 ]
#define OFFSET	[fp, #24 ]
#else
#define B	[fp, #4 ]
#define C	[fp, #8 ]
#define OLD_LDC	[fp, #12 ]
#define OFFSET	[fp, #16 ]
#endif

/* Loop counters. */
#define I	r0
#define J	r1
#define L	r2

/* Running pointers into the A and B panels. */
#define	AO	r5
#define	BO	r6

/* Output pointers for the first and second column of the current tile. */
#define	CO1	r8
#define	CO2	r9

/* Trip count of the current inner product, and base of the current B panel. */
#define K1	r7
#define BC	r12

/* Prefetch distances in bytes. */
#define A_PRE	96
#define B_PRE	96
#define C_PRE	64



/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * KMAC_R / KMAC_I fold the (imag * imag) and (imag * real) cross terms into the
 * real / imaginary accumulator; the conjugation variant decides whether each
 * term is added (fmacs) or subtracted (vmls.f32).
 */
#if !(defined(NN) || defined(NT) || defined(TN) || defined(TT)) && \
    (defined(CN) || defined(CT) || defined(NC) || defined(TC))

	#define	KMAC_R	fmacs
	#define	KMAC_I	vmls.f32

#else

	#define	KMAC_R	vmls.f32
	#define	KMAC_I	fmacs

#endif

/*
 * FMAC_R1 / FMAC_R2 / FMAC_I1 / FMAC_I2 are the four multiply-accumulates of
 * the alpha scaling done by the SAVE macros. Only R2 and I1 change with the
 * conjugation variant.
 */
#define	FMAC_R1	fmacs
#define	FMAC_I2	fmacs

#if defined(NN) || defined(NT) || defined(TN) || defined(TT) || \
    defined(CN) || defined(CT)

	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmacs

#else

	#define	FMAC_R2	fmacs
	#define	FMAC_I1	vmls.f32

#endif


/**************************************************************************************
* Shared building blocks (used only by the macros in this file)
**************************************************************************************/

/* s4 - s7 = 0.0, taken from the zero word on the stack. */
.macro CK_ZERO4

	flds		s4, FP_ZERO
	vmov.f32	s5, s4
	vmov.f32	s6, s4
	vmov.f32	s7, s4

.endm

/* s4 - s5 = 0.0, taken from the zero word on the stack. */
.macro CK_ZERO2

	flds		s4, FP_ZERO
	vmov.f32	s5, s4

.endm

/* s0 / s1 = real / imaginary part of alpha. */
.macro CK_LOAD_ALPHA

	flds		s0, ALPHA_R
	flds		s1, ALPHA_I

.endm

/*
 * Complex scaling of one accumulator pair by alpha (s0, s1):
 * (outre, outim) += alpha * (inre, inim), with the signs chosen by FMAC_*.
 */
.macro CK_SCALE outre, outim, inre, inim

	FMAC_R1 \outre , s0 , \inre
	FMAC_I1 \outim , s0 , \inim
	FMAC_R2 \outre , s1 , \inim
	FMAC_I2 \outim , s1 , \inre

.endm

/*
 * 2x2 arithmetic on already loaded operands (A in s0 - s3, B in s4 - s7).
 * mulop is fmuls to start fresh accumulators or fmacs to keep accumulating.
 */
.macro CK_MAC2x2 mulop

	\mulop	s8  , s0,  s4
	\mulop	s9  , s0,  s5
	\mulop	s10 , s2,  s4
	\mulop	s11 , s2,  s5

	KMAC_R	s8  , s1,  s5
	KMAC_I	s9  , s1,  s4
	KMAC_R	s10 , s3,  s5
	KMAC_I	s11 , s3,  s4

	\mulop	s12 , s0,  s6
	\mulop	s13 , s0,  s7
	\mulop	s14 , s2,  s6
	\mulop	s15 , s2,  s7

	KMAC_R	s12 , s1,  s7
	KMAC_I	s13 , s1,  s6
	KMAC_R	s14 , s3,  s7
	KMAC_I	s15 , s3,  s6

.endm

/* 2x2 operand load with prefetch; post-increments AO and BO by 16 bytes each. */
.macro CK_LOAD2x2_PLD

	pld	[ AO, #A_PRE ]
	vldmia.f32	AO!, { s0 - s3 }
	pld	[ BO, #B_PRE ]
	vldmia.f32	BO!, { s4 - s7 }

.endm

/* 2x2 operand load without prefetch; post-increments AO and BO by 16 bytes each. */
.macro CK_LOAD2x2

	vldmia.f32	AO!, { s0 - s3 }
	vldmia.f32	BO!, { s4 - s7 }

.endm

/* One complete 1x2 step: load 1 A value and 2 B values, multiply, advance. */
.macro CK_STEP1x2 mulop

	flds	s0 , [ AO ]
	flds	s1 , [ AO, #4 ]

	flds	s4 , [ BO ]
	flds	s5 , [ BO, #4 ]
	flds	s6 , [ BO, #8 ]
	flds	s7 , [ BO, #12 ]

	\mulop	s8  , s0,  s4
	KMAC_R	s8  , s1,  s5
	\mulop	s9  , s0,  s5
	KMAC_I	s9  , s1,  s4

	\mulop	s12 , s0,  s6
	KMAC_R	s12 , s1,  s7
	\mulop	s13 , s0,  s7
	KMAC_I	s13 , s1,  s6

	add	BO , BO, #16
	add	AO , AO, #8

.endm

/* One complete 2x1 step: load 2 A values and 1 B value, multiply, advance. */
.macro CK_STEP2x1 mulop

	flds	s0 , [ AO ]
	flds	s1 , [ AO, #4 ]
	flds	s2 , [ AO, #8 ]
	flds	s3 , [ AO, #12 ]

	flds	s4 , [ BO ]
	flds	s5 , [ BO, #4 ]

	\mulop	s8  , s0,  s4
	KMAC_R	s8  , s1,  s5
	\mulop	s9  , s0,  s5
	KMAC_I	s9  , s1,  s4

	\mulop	s10 , s2,  s4
	KMAC_R	s10 , s3,  s5
	\mulop	s11 , s2,  s5
	KMAC_I	s11 , s3,  s4

	add	BO , BO, #8
	add	AO , AO, #16

.endm

/* One complete 1x1 step: load 1 A value and 1 B value, multiply, advance. */
.macro CK_STEP1x1 mulop

	flds	s0 , [ AO ]
	flds	s1 , [ AO, #4 ]

	flds	s4 , [ BO ]
	flds	s5 , [ BO, #4 ]

	\mulop	s8  , s0,  s4
	KMAC_R	s8  , s1,  s5
	\mulop	s9  , s0,  s5
	KMAC_I	s9  , s1,  s4

	add	BO , BO, #8
	add	AO , AO, #8

.endm


/**************************************************************************************
* 2x2 tile: accumulators s8 - s15
**************************************************************************************/

/* Clear all eight accumulators. */
.macro INIT2x2

	flds		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s10, s8
	vmov.f32	s11, s8
	vmov.f32	s12, s8
	vmov.f32	s13, s8
	vmov.f32	s14, s8
	vmov.f32	s15, s8

.endm

/* First step of an unrolled run: overwrites the accumulators (no INIT needed). */
.macro KERNEL2x2_I

	CK_LOAD2x2_PLD
	CK_MAC2x2 fmuls

.endm

/* Accumulating step with prefetch. */
.macro KERNEL2x2_M1

	CK_LOAD2x2_PLD
	CK_MAC2x2 fmacs

.endm

/* Accumulating step without prefetch. */
.macro KERNEL2x2_M2

	CK_LOAD2x2
	CK_MAC2x2 fmacs

.endm

/* Last step of an unrolled run; same instructions as KERNEL2x2_M2. */
.macro KERNEL2x2_E

	KERNEL2x2_M2

.endm

/* Single step for the remainder loop; same instructions as KERNEL2x2_M2. */
.macro KERNEL2x2_SUB

	KERNEL2x2_M2

.endm

/*
 * Store alpha * accumulators to two rows of output at CO1 and CO1 + LDC,
 * then advance CO1 by two complex values. Clobbers r3, CO2, s0, s1, s4 - s7.
 */
.macro SAVE2x2

	ldr	r3  , LDC
	add	CO2 , CO1, r3

	CK_LOAD_ALPHA

	CK_ZERO4
	CK_SCALE s4 , s5 , s8  , s9
	CK_SCALE s6 , s7 , s10 , s11

	vstmia.f32	CO1, { s4 - s7 }

	CK_ZERO4
	CK_SCALE s4 , s5 , s12 , s13
	CK_SCALE s6 , s7 , s14 , s15

	vstmia.f32	CO2, { s4 - s7 }

	add	CO1, CO1, #16

.endm

/******************************************************************************/

/**************************************************************************************
* 1x2 tile: accumulators s8, s9, s12, s13
**************************************************************************************/

.macro INIT1x2

	flds		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s12, s8
	vmov.f32	s13, s8

.endm

/* First step: overwrites the accumulators. */
.macro KERNEL1x2_I

	CK_STEP1x2 fmuls

.endm

/* The four accumulating variants expand to the same instructions. */
.macro KERNEL1x2_M1

	CK_STEP1x2 fmacs

.endm

.macro KERNEL1x2_M2

	CK_STEP1x2 fmacs

.endm

.macro KERNEL1x2_E

	CK_STEP1x2 fmacs

.endm

.macro KERNEL1x2_SUB

	CK_STEP1x2 fmacs

.endm

/*
 * Store alpha * accumulators: one complex value at CO1 and one at CO1 + LDC,
 * then advance CO1 by one complex value. Clobbers r3, CO2, s0, s1, s4, s5.
 */
.macro SAVE1x2

	ldr	r3  , LDC
	add	CO2 , CO1, r3

	CK_LOAD_ALPHA

	CK_ZERO2
	CK_SCALE s4 , s5 , s8  , s9

	vstmia.f32	CO1, { s4 - s5 }

	CK_ZERO2
	CK_SCALE s4 , s5 , s12 , s13

	vstmia.f32	CO2, { s4 - s5 }

	add	CO1, CO1, #8

.endm


/******************************************************************************/

/**************************************************************************************
* 2x1 tile: accumulators s8 - s11
**************************************************************************************/

.macro INIT2x1

	flds		s8 , FP_ZERO
	vmov.f32	s9 , s8
	vmov.f32	s10, s8
	vmov.f32	s11, s8

.endm

/* First step: overwrites the accumulators. */
.macro KERNEL2x1_I

	CK_STEP2x1 fmuls

.endm

/* The four accumulating variants expand to the same instructions. */
.macro KERNEL2x1_M1

	CK_STEP2x1 fmacs

.endm

.macro KERNEL2x1_M2

	CK_STEP2x1 fmacs

.endm

.macro KERNEL2x1_E

	CK_STEP2x1 fmacs

.endm

.macro KERNEL2x1_SUB

	CK_STEP2x1 fmacs

.endm

/*
 * Store alpha * accumulators as two complex values at CO1, then advance CO1
 * by two complex values. Clobbers s0, s1, s4 - s7.
 */
.macro SAVE2x1

	CK_LOAD_ALPHA

	CK_ZERO4
	CK_SCALE s4 , s5 , s8  , s9
	CK_SCALE s6 , s7 , s10 , s11

	vstmia.f32	CO1, { s4 - s7 }

	add	CO1, CO1, #16

.endm


/******************************************************************************/

/**************************************************************************************
* 1x1 tile: accumulators s8, s9
**************************************************************************************/

.macro INIT1x1

	flds		s8 , FP_ZERO
	vmov.f32	s9 , s8

.endm

/* First step: overwrites the accumulators. */
.macro KERNEL1x1_I

	CK_STEP1x1 fmuls

.endm

/* The four accumulating variants expand to the same instructions. */
.macro KERNEL1x1_M1

	CK_STEP1x1 fmacs

.endm

.macro KERNEL1x1_M2

	CK_STEP1x1 fmacs

.endm

.macro KERNEL1x1_E

	CK_STEP1x1 fmacs

.endm

.macro KERNEL1x1_SUB

	CK_STEP1x1 fmacs

.endm

/*
 * Store alpha * accumulator as one complex value at CO1, then advance CO1
 * by one complex value. Clobbers s0, s1, s4, s5.
 */
.macro SAVE1x1

	CK_LOAD_ALPHA

	CK_ZERO2
	CK_SCALE s4 , s5 , s8  , s9

	vstmia.f32	CO1, { s4 - s5 }

	add	CO1, CO1, #8

.endm

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

/**************************************************************************************
* Kernel entry point
*
* Arguments (see the OLD_* and stack-slot definitions at the top of the file):
*   r0 = M, r1 = N, r2 = K
*   alpha : s0 / s1 with the hard-float ABI, r3 / [fp, #4] with softfp
*   A     : r3 with the hard-float ABI, [fp, #8] with softfp
*   B, C, LDC, OFFSET : on the caller stack
* Returns 0 in r0.
*
* The output is walked in 2-column panels (then one odd column) and, inside
* each panel, in 2-row tiles (then one odd row). For every tile the kernel
* accumulates K1 complex products from AO and BO and the SAVE macro writes
* alpha * accumulator to the output; the previous contents of the output are
* not read. With TRMMKERNEL defined, KK / KKK restrict the range of products
* per tile. NOTE(review): A and B are presumably pre-packed panels; confirm
* against the caller.
*
* Flag handling: asr / tst / ands never update the V flag, so a signed "le"
* test placed directly after them would depend on a stale V (at the first
* test, on whatever the caller left there). Count tests therefore use an
* explicit cmp against zero, and bit / remainder tests use beq.
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24
	sub	sp, sp, #STACKSIZE				// reserve stack

#if !defined(__ARM_PCS_VFP)
	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP			// must precede the reload of r3 below
	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
	ldr	OLD_A, OLD_A_SOFTFP
#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
	str	OLD_A, A
	vstr	OLD_ALPHA_R, ALPHA_R
	vstr	OLD_ALPHA_I, ALPHA_I

	sub	r3, fp, #128
	vstm	r3, { s8 - s15} 				// store floating point registers

	movs	r4, #0
	str	r4, FP_ZERO					// 0.0f constant used by INIT / SAVE macros
	str	r4, FP_ZERO_1

	ldr	r3, OLD_LDC
	lsl	r3, r3, #3					// ldc = ldc * 4 * 2
	str	r3, LDC

	ldr	r3, OFFSET
#ifndef LEFT
	neg	r3 , r3
#endif
	str	r3 , KK

	ldr	BC, B

	ldr	J, N
	asr	J, J, #1					// J = J / 2
	cmp	J, #0						// asr leaves V untouched; cmp defines it
	ble	_L1_BEGIN

_L2_BEGIN:

	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	lsl	r4 , r4 , #1					// LDC * 2
	add	r3 , r4, CO1
	str	r3 , C						// store C

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A
	pld	[AO , #A_PRE-64]
	pld	[AO , #A_PRE-32]



_L2_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = I / 2
	cmp	I, #0
	ble	_L2_M1_BEGIN

_L2_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = L / 8
	cmp	L , #3
	blt	_L2_M2_30					// fewer than 3 groups of 8: short paths
	.align 5

	// first group of 8 steps; KERNEL2x2_I initialises the accumulators
	KERNEL2x2_I
	KERNEL2x2_M2
	.rept	3
	KERNEL2x2_M1
	KERNEL2x2_M2
	.endr

	sub	L, L, #2					// first and last group are outside the loop

_L2_M2_22:

	.rept	4
	KERNEL2x2_M1
	KERNEL2x2_M2
	.endr

	subs	L, L, #1
	bgt	_L2_M2_22

	// last group of 8 steps
	.rept	3
	KERNEL2x2_M1
	KERNEL2x2_M2
	.endr
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_30:
	tst	L, #3
	beq	_L2_M2_40					// no full group of 8

	tst	L, #2
	beq	_L2_M2_32

	// exactly two groups: 16 steps
	KERNEL2x2_I
	KERNEL2x2_M2
	.rept	6
	KERNEL2x2_M1
	KERNEL2x2_M2
	.endr
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44

_L2_M2_32:

	tst	L, #1
	beq	_L2_M2_40

	// exactly one group: 8 steps
	KERNEL2x2_I
	KERNEL2x2_M2
	.rept	2
	KERNEL2x2_M1
	KERNEL2x2_M2
	.endr
	KERNEL2x2_M1
	KERNEL2x2_E

	b	 _L2_M2_44


_L2_M2_40:

	INIT2x2


_L2_M2_44:

	ands	L , K1, #7					// L = L % 8
	beq	_L2_M2_100

_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	_L2_M2_46

_L2_M2_100:

	SAVE2x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif


_L2_M2_END:

	subs	I, I, #1
	bne	_L2_M2_20


_L2_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// I = I % 2
	beq	_L2_END

_L2_M1_20:

	INIT1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #2					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = L / 8
	cmp	L , #0
	ble	_L2_M1_40

_L2_M1_22:

	.rept	8
	KERNEL1x2_SUB
	.endr

	subs	L, L, #1
	bgt	_L2_M1_22


_L2_M1_40:

	ands	L , K1, #7					// L = L % 8
	beq	_L2_M1_100

_L2_M1_42:

	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_42

_L2_M1_100:

	SAVE1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #1					// number of values in AO
	str	r3 , KK
#endif



_L2_END:

	mov	r3, BC
	ldr	r4, K
	lsl	r4, r4, #4					// k * 2 * 4 * 2
	add	r3, r3, r4					// B = B + K * 2 * 8
	mov	BC, r3

#if !defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in BO
	str	r3 , KK
#endif

	subs	J , #1						// j--
	bgt	_L2_BEGIN



/*********************************************************************************************/

_L1_BEGIN:

	ldr	J , N
	tst	J , #1
	beq	_L999						// no odd column left


	ldr	CO1, C						// CO1 = C
	ldr	r4 , LDC
	add	r3 , r4, CO1
	str	r3 , C						// store C

#if defined(LEFT)
	ldr	r3 , OFFSET
	str	r3 , KK
#endif

	ldr	AO, A						// AO = A

_L1_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1					// I = I / 2
	cmp	I, #0
	ble	_L1_M1_BEGIN

_L1_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = L / 8
	cmp	L , #3
	blt	_L1_M2_30					// fewer than 3 groups of 8: short paths
	.align 5

	// first group of 8 steps; KERNEL2x1_I initialises the accumulators
	KERNEL2x1_I
	KERNEL2x1_M2
	.rept	3
	KERNEL2x1_M1
	KERNEL2x1_M2
	.endr

	sub	L, L, #2					// first and last group are outside the loop

_L1_M2_22:

	.rept	4
	KERNEL2x1_M1
	KERNEL2x1_M2
	.endr

	subs	L, L, #1
	bgt	_L1_M2_22

	// last group of 8 steps
	.rept	3
	KERNEL2x1_M1
	KERNEL2x1_M2
	.endr
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_30:
	tst	L, #3
	beq	_L1_M2_40					// no full group of 8

	tst	L, #2
	beq	_L1_M2_32

	// exactly two groups: 16 steps
	KERNEL2x1_I
	KERNEL2x1_M2
	.rept	6
	KERNEL2x1_M1
	KERNEL2x1_M2
	.endr
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44

_L1_M2_32:

	tst	L, #1
	beq	_L1_M2_40

	// exactly one group: 8 steps
	KERNEL2x1_I
	KERNEL2x1_M2
	.rept	2
	KERNEL2x1_M1
	KERNEL2x1_M2
	.endr
	KERNEL2x1_M1
	KERNEL2x1_E

	b	 _L1_M2_44


_L1_M2_40:

	INIT2x1


_L1_M2_44:

	ands	L , K1, #7					// L = L % 8
	beq	_L1_M2_100

_L1_M2_46:

	KERNEL2x1_SUB

	subs	L, L, #1
	bne	_L1_M2_46

_L1_M2_100:

	SAVE2x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3 , K
	ldr	r4 , KKK
	sub	r3 , r3 , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #4					// 2 * 4 * 2 float values
	add	AO , AO , r4
#endif

#if defined(LEFT)
	ldr	r3 , KK
	add	r3 , r3 , #2					// number of values in AO
	str	r3 , KK
#endif



_L1_M2_END:

	subs	I, I, #1
	bne	_L1_M2_20


_L1_M1_BEGIN:

	ldr	I, M
	tst	I, #1						// I = I % 2
	beq	_L1_END

_L1_M1_20:

	INIT1x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3 , KK
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	BO , BO , r4
	lsls	r4 , r3 , #3					// 1 * 4 * 2 float values
	add	AO , AO , r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1					// number of values in AO
#else
	add	K1, K1, #1					// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L , K1, #3					// L = L / 8
	cmp	L , #0
	ble	_L1_M1_40

_L1_M1_22:

	.rept	8
	KERNEL1x1_SUB
	.endr

	subs	L, L, #1
	bgt	_L1_M1_22


_L1_M1_40:

	ands	L , K1, #7					// L = L % 8
	beq	_L1_M1_100

_L1_M1_42:

	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_42

_L1_M1_100:

	SAVE1x1


_L1_END:



_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s15}					// restore floating point registers

	movs	r0, #0						// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
