/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/10/16 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
**************************************************************************************/

/**************************************************************************************
* Complex single-precision micro-kernel, register-blocked 2x2, for ARM VFP.
*
* Syntax : GNU as, ARM unified syntax, run through the C preprocessor.
* ABI    : AAPCS32, hard-float (__ARM_PCS_VFP) or soft-float argument passing.
*
* Arguments (hard-float):
*   r0 = M, r1 = N, r2 = K, r3 = A, s0 = alpha_r, s1 = alpha_i,
*   stack: B, C, LDC, OFFSET
* Arguments (soft-float):
*   r0 = M, r1 = N, r2 = K, r3 = alpha_r,
*   stack: alpha_i, A, B, C, LDC, OFFSET
*
* Each element of A, B and C is two floats (real, imaginary) = 8 bytes.
* LDC is given in elements and is converted to bytes (ldc * 8) on entry.
* Result tiles are written to C by the SAVE macros without reading the
* previous contents of C.
*
* Returns 0 in r0.  r4-r9, fp and s8-s31 are saved and restored.
*
* Build-time switches: one of NN/NT/TN/TT/CN/CT/NC/TC (or none) selects the
* conjugation variant; LEFT, TRANSA and TRMMKERNEL select the triangular
* offset handling.
*
* Note on condition flags: asrs/tst/ands do not write the V flag, and the
* flags are undefined on entry to a public function.  Every branch below
* that follows one of those instructions therefore uses beq, or an explicit
* cmp before ble, so that no decision depends on a stale V flag.
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

#define	OLD_M	r0
#define	OLD_N	r1
#define	OLD_K	r2
#define	OLD_A	r3
#define OLD_ALPHA_R s0
#define OLD_ALPHA_I s1

/******************************************************
* Stack frame (fp = sp at entry - 4, see prologue):
*
* [fp, #-128] .. [fp, #-36] (96 bytes) hold s8 - s31
* while the kernel runs.
*
* The named slots below live between [fp, #-280] and
* [fp, #-232]; [fp, #-280] equals sp once the frame
* has been reserved (fp - 24 - STACKSIZE).
*******************************************************/

#define KKK	[fp, #-240]
#define KK	[fp, #-244]
#define A	[fp, #-248]
#define LDC	[fp, #-252]
#define M	[fp, #-256]
#define N	[fp, #-260]
#define K	[fp, #-264]

#define FP_ZERO   [fp, #-236]
#define FP_ZERO_0 [fp, #-236]
#define FP_ZERO_1 [fp, #-232]

#define ALPHA_I	[fp, #-272]
#define ALPHA_R	[fp, #-280]

#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR_SOFTFP	r3
#define OLD_ALPHAI_SOFTFP	[fp, #4]
#define OLD_A_SOFTFP	[fp, #8]
#define B	[fp, #12]
#define C	[fp, #16]
#define OLD_LDC	[fp, #20]
#define OFFSET	[fp, #24]
#else
#define B	[fp, #4]
#define C	[fp, #8]
#define OLD_LDC	[fp, #12]
#define OFFSET	[fp, #16]
#endif

/* Loop counters: I walks M, J walks N, L walks K. */
#define I	r0
#define J	r1
#define L	r2

/* AO / BO: running pointers into the packed A and B panels. */
#define	AO	r5
#define	BO	r6

/* CO1 / CO2: pointers to the first and second output column of the tile. */
#define	CO1	r8
#define	CO2	r9

/* K1: number of k-steps for the current tile.  BC: start of the current B panel. */
#define K1	r7
#define BC	r12

/* Prefetch distances in bytes. */
#define A_PRE	96
#define B_PRE	96
#define C_PRE	64

/*
 * FADD_R / FADD_I combine the two partial products of each accumulator pair,
 * FMAC_R1/R2 and FMAC_I1/I2 apply alpha; the instruction choice encodes the
 * sign pattern required by each conjugation variant.
 */
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)

	#define	FADD_R	fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	vnmul.f32
	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
	#define	FMAC_I2	vmls.f32

#elif defined(CN) || defined(CT)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmuls
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	vnmul.f32
	#define	FMAC_I2	fmacs

#elif defined(NC) || defined(TC)

	#define	FADD_R	fadds
	#define	FADD_I	fsubs

	#define	FMAC_R1	fmuls
	#define	FMAC_R2	vmls.f32
	#define	FMAC_I1	fmuls
	#define	FMAC_I2	fmacs

#else

	#define	FADD_R	fsubs
	#define	FADD_I	fadds

	#define	FMAC_R1	vnmul.f32
	#define	FMAC_R2	fmacs
	#define	FMAC_I1	vnmul.f32
	#define	FMAC_I2	vmls.f32

#endif

/**************************************************************************************
* Macro definitions
*
* Naming: <op>MxN works on a tile of M elements of A by N elements of B.
*   INIT       zero the accumulators used by the tile (from FP_ZERO).
*   KERNEL_I   first k-step: accumulators are written with fmuls, then the
*              operands of the next step are loaded into s4-s7 / s12-s15.
*   KERNEL_M1  k-step on s0-s3 / s8-s11, loads the next step into s4-s7 / s12-s15.
*   KERNEL_M2  k-step on s4-s7 / s12-s15, loads the next step into s0-s3 / s8-s11.
*   KERNEL_E   last k-step on s4-s7 / s12-s15, loads nothing.
*   KERNEL_SUB self-contained k-step: loads its own operands and accumulates.
*   SAVE       combine partial sums, scale by alpha, store to C, advance CO1.
* All KERNEL macros advance AO and BO past the data they load.
**************************************************************************************/

/* Zero the sixteen 2x2 accumulators s16 - s31. */
.macro INIT2x2

	flds		s16, FP_ZERO
	vmov.f32	s17, s16
	vmov.f32	s18, s16
	vmov.f32	s19, s16
	vmov.f32	s20, s16
	vmov.f32	s21, s16
	vmov.f32	s22, s16
	vmov.f32	s23, s16
	vmov.f32	s24, s16
	vmov.f32	s25, s16
	vmov.f32	s26, s16
	vmov.f32	s27, s16
	vmov.f32	s28, s16
	vmov.f32	s29, s16
	vmov.f32	s30, s16
	vmov.f32	s31, s16

.endm

/* First 2x2 k-step: initialise s16 - s31 with products, preload the next step. */
.macro KERNEL2x2_I
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	vldmia.f32	AO!, { s0 - s1 }
	vldmia.f32	BO!, { s8 - s9 }

	fmuls	s16, s0, s8
	fmuls	s24, s1, s9
	vldmia.f32	AO!, { s2 - s3 }
	fmuls	s17, s0, s9
	fmuls	s25, s1, s8

	vldmia.f32	BO!, { s10 - s11 }
	fmuls	s18, s2, s8
	fmuls	s26, s3, s9
	vldmia.f32	AO!, { s4 - s5 }
	fmuls	s19, s2, s9
	fmuls	s27, s3, s8

	vldmia.f32	BO!, { s12 - s13 }
	fmuls	s20, s0, s10
	fmuls	s28, s1, s11
	vldmia.f32	AO!, { s6 - s7 }
	fmuls	s21, s0, s11
	fmuls	s29, s1, s10

	vldmia.f32	BO!, { s14 - s15 }
	fmuls	s22, s2, s10
	fmuls	s30, s3, s11
	fmuls	s23, s2, s11
	fmuls	s31, s3, s10

.endm

/* 2x2 k-step on s0-s3 / s8-s11; loads s4-s7 / s12-s15 for the following step. */
.macro KERNEL2x2_M1

	fmacs	s16, s0, s8
	vldmia.f32	AO!, { s4 - s5 }
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	vldmia.f32	BO!, { s12 - s13 }
	fmacs	s25, s1, s8

	fmacs	s18, s2, s8
	vldmia.f32	AO!, { s6 - s7 }
	fmacs	s26, s3, s9
	fmacs	s19, s2, s9
	vldmia.f32	BO!, { s14 - s15 }
	fmacs	s27, s3, s8

	fmacs	s20, s0, s10
	fmacs	s28, s1, s11
	fmacs	s21, s0, s11
	fmacs	s29, s1, s10

	fmacs	s22, s2, s10
	fmacs	s30, s3, s11
	fmacs	s23, s2, s11
	fmacs	s31, s3, s10

.endm

/* 2x2 k-step on s4-s7 / s12-s15; loads s0-s3 / s8-s11 for the following step. */
.macro KERNEL2x2_M2
	pld	[AO, #A_PRE]

	fmacs	s16, s4, s12
	pld	[BO, #B_PRE]
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	vldmia.f32	AO!, { s0 - s1 }
	fmacs	s25, s5, s12

	fmacs	s18, s6, s12
	fmacs	s26, s7, s13
	vldmia.f32	BO!, { s8 - s9 }
	fmacs	s19, s6, s13
	fmacs	s27, s7, s12

	vldmia.f32	AO!, { s2 - s3 }
	fmacs	s20, s4, s14
	fmacs	s28, s5, s15
	vldmia.f32	BO!, { s10 - s11 }
	fmacs	s21, s4, s15
	fmacs	s29, s5, s14

	fmacs	s22, s6, s14
	fmacs	s30, s7, s15
	fmacs	s23, s6, s15
	fmacs	s31, s7, s14

.endm

/* Final pipelined 2x2 k-step on s4-s7 / s12-s15; no loads. */
.macro KERNEL2x2_E

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	fmacs	s18, s6, s12
	fmacs	s26, s7, s13
	fmacs	s19, s6, s13
	fmacs	s27, s7, s12

	fmacs	s20, s4, s14
	fmacs	s28, s5, s15
	fmacs	s21, s4, s15
	fmacs	s29, s5, s14

	fmacs	s22, s6, s14
	fmacs	s30, s7, s15
	fmacs	s23, s6, s15
	fmacs	s31, s7, s14

.endm

/* Stand-alone 2x2 k-step: loads its own operands, accumulates into s16 - s31. */
.macro KERNEL2x2_SUB

	vldmia.f32	AO!, { s0 - s1 }
	vldmia.f32	BO!, { s8 - s9 }

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	vldmia.f32	AO!, { s2 - s3 }
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	vldmia.f32	BO!, { s10 - s11 }
	fmacs	s18, s2, s8
	fmacs	s26, s3, s9
	fmacs	s19, s2, s9
	fmacs	s27, s3, s8

	fmacs	s20, s0, s10
	fmacs	s28, s1, s11
	fmacs	s21, s0, s11
	fmacs	s29, s1, s10

	fmacs	s22, s2, s10
	fmacs	s30, s3, s11
	fmacs	s23, s2, s11
	fmacs	s31, s3, s10

.endm

/*
 * Store the 2x2 tile: CO2 = CO1 + LDC, fold s24-s31 into s16-s23, scale by
 * alpha into s4-s11, write two elements to each column, advance CO1 by 16 bytes.
 * Clobbers r3, s0, s1, s4-s11.
 */
.macro SAVE2x2

	ldr	r3, LDC
	add	CO2, CO1, r3
	flds	s0, ALPHA_R
	flds	s1, ALPHA_I

	FADD_R	s16, s24, s16
	FADD_I	s17, s25, s17
	FADD_R	s18, s26, s18
	FADD_I	s19, s27, s19
	FADD_R	s20, s28, s20
	FADD_I	s21, s29, s21
	FADD_R	s22, s30, s22
	FADD_I	s23, s31, s23

	FMAC_R1	s4, s0, s16
	FMAC_I1	s5, s0, s17
	FMAC_R2	s4, s1, s17
	FMAC_I2	s5, s1, s16

	FMAC_R1	s6, s0, s18
	FMAC_I1	s7, s0, s19
	FMAC_R2	s6, s1, s19
	FMAC_I2	s7, s1, s18

	FMAC_R1	s8, s0, s20
	FMAC_I1	s9, s0, s21
	FMAC_R2	s8, s1, s21
	FMAC_I2	s9, s1, s20

	FMAC_R1	s10, s0, s22
	FMAC_I1	s11, s0, s23
	FMAC_R2	s10, s1, s23
	FMAC_I2	s11, s1, s22

	vstmia.f32	CO1, { s4 - s7 }
	vstmia.f32	CO2, { s8 - s11 }

	add	CO1, CO1, #16

.endm

/******************************************************************************/

/* Zero the eight 1x2 accumulators. */
.macro INIT1x2

	flds		s16, FP_ZERO
	vmov.f32	s17, s16
	vmov.f32	s20, s16
	vmov.f32	s21, s16
	vmov.f32	s24, s16
	vmov.f32	s25, s16
	vmov.f32	s28, s16
	vmov.f32	s29, s16

.endm

/*
 * Pipelined 1x2 variants (_I, _M1, _M2, _E).  The driver code in this file
 * only uses INIT1x2 + KERNEL1x2_SUB; these are kept for completeness.
 */
.macro KERNEL1x2_I
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s8, [BO]
	flds	s9, [BO, #4]
	flds	s10, [BO, #8]
	flds	s11, [BO, #12]

	fmuls	s16, s0, s8
	fmuls	s24, s1, s9
	fmuls	s17, s0, s9
	fmuls	s25, s1, s8

	fmuls	s20, s0, s10
	fmuls	s28, s1, s11
	fmuls	s21, s0, s11
	fmuls	s29, s1, s10

	add	BO, BO, #16
	add	AO, AO, #8

	pld	[BO, #B_PRE]

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]

	flds	s12, [BO]
	flds	s13, [BO, #4]
	flds	s14, [BO, #8]
	flds	s15, [BO, #12]

	add	BO, BO, #16
	add	AO, AO, #8
.endm

.macro KERNEL1x2_M1
	pld	[BO, #B_PRE]

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	fmacs	s20, s0, s10
	fmacs	s28, s1, s11
	fmacs	s21, s0, s11
	fmacs	s29, s1, s10

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]

	flds	s12, [BO]
	flds	s13, [BO, #4]
	flds	s14, [BO, #8]
	flds	s15, [BO, #12]

	add	BO, BO, #16
	add	AO, AO, #8
.endm

.macro KERNEL1x2_M2
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	fmacs	s20, s4, s14
	fmacs	s28, s5, s15
	fmacs	s21, s4, s15
	fmacs	s29, s5, s14

	flds	s0, [AO, #0]
	flds	s1, [AO, #4]

	flds	s8, [BO]
	flds	s9, [BO, #4]
	flds	s10, [BO, #8]
	flds	s11, [BO, #12]

	add	BO, BO, #16
	add	AO, AO, #8
.endm

.macro KERNEL1x2_E

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	fmacs	s20, s4, s14
	fmacs	s28, s5, s15
	fmacs	s21, s4, s15
	fmacs	s29, s5, s14

.endm

/* Stand-alone 1x2 k-step: one element of A against two elements of B. */
.macro KERNEL1x2_SUB

	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s8, [BO]
	flds	s9, [BO, #4]
	flds	s10, [BO, #8]
	flds	s11, [BO, #12]

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	fmacs	s20, s0, s10
	fmacs	s28, s1, s11
	fmacs	s21, s0, s11
	fmacs	s29, s1, s10

	add	BO, BO, #16
	add	AO, AO, #8

.endm

/*
 * Store the 1x2 tile: one element to each of the two columns, advance CO1
 * by 8 bytes.  Clobbers r3, s0, s1, s4, s5, s8, s9.
 */
.macro SAVE1x2

	ldr	r3, LDC
	add	CO2, CO1, r3
	flds	s0, ALPHA_R
	flds	s1, ALPHA_I

	FADD_R	s16, s24, s16
	FADD_I	s17, s25, s17
	FADD_R	s20, s28, s20
	FADD_I	s21, s29, s21

	FMAC_R1	s4, s0, s16
	FMAC_I1	s5, s0, s17
	FMAC_R2	s4, s1, s17
	FMAC_I2	s5, s1, s16

	FMAC_R1	s8, s0, s20
	FMAC_I1	s9, s0, s21
	FMAC_R2	s8, s1, s21
	FMAC_I2	s9, s1, s20

	vstmia.f32	CO1, { s4 - s5 }
	vstmia.f32	CO2, { s8 - s9 }

	add	CO1, CO1, #8

.endm

/******************************************************************************/

/* Zero the eight 2x1 accumulators. */
.macro INIT2x1

	flds		s16, FP_ZERO
	vmov.f32	s17, s16
	vmov.f32	s18, s16
	vmov.f32	s19, s16
	vmov.f32	s24, s16
	vmov.f32	s25, s16
	vmov.f32	s26, s16
	vmov.f32	s27, s16

.endm

/* First 2x1 k-step: initialise accumulators, preload s4-s7 / s12-s13. */
.macro KERNEL2x1_I
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s2, [AO, #8]
	flds	s3, [AO, #12]
	flds	s8, [BO]
	flds	s9, [BO, #4]

	fmuls	s16, s0, s8
	fmuls	s24, s1, s9
	fmuls	s17, s0, s9
	fmuls	s25, s1, s8

	fmuls	s18, s2, s8
	fmuls	s26, s3, s9
	fmuls	s19, s2, s9
	fmuls	s27, s3, s8

	add	BO, BO, #8
	add	AO, AO, #16

	pld	[BO, #B_PRE]
	pld	[AO, #A_PRE]

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]
	flds	s6, [AO, #8]
	flds	s7, [AO, #12]

	flds	s12, [BO]
	flds	s13, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #16
.endm

/* 2x1 k-step on s0-s3 / s8-s9; loads s4-s7 / s12-s13 for the following step. */
.macro KERNEL2x1_M1
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	fmacs	s18, s2, s8
	fmacs	s26, s3, s9
	fmacs	s19, s2, s9
	fmacs	s27, s3, s8

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]
	flds	s6, [AO, #8]
	flds	s7, [AO, #12]

	flds	s12, [BO]
	flds	s13, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #16
.endm

/* 2x1 k-step on s4-s7 / s12-s13; loads s0-s3 / s8-s9 for the following step. */
.macro KERNEL2x1_M2
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	fmacs	s18, s6, s12
	fmacs	s26, s7, s13
	fmacs	s19, s6, s13
	fmacs	s27, s7, s12

	flds	s0, [AO, #0]
	flds	s1, [AO, #4]
	flds	s2, [AO, #8]
	flds	s3, [AO, #12]

	flds	s8, [BO]
	flds	s9, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #16
.endm

/* Final pipelined 2x1 k-step on s4-s7 / s12-s13; no loads. */
.macro KERNEL2x1_E

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	fmacs	s18, s6, s12
	fmacs	s26, s7, s13
	fmacs	s19, s6, s13
	fmacs	s27, s7, s12

.endm

/* Stand-alone 2x1 k-step: two elements of A against one element of B. */
.macro KERNEL2x1_SUB

	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s2, [AO, #8]
	flds	s3, [AO, #12]
	flds	s8, [BO]
	flds	s9, [BO, #4]

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	fmacs	s18, s2, s8
	fmacs	s26, s3, s9
	fmacs	s19, s2, s9
	fmacs	s27, s3, s8

	add	BO, BO, #8
	add	AO, AO, #16

.endm

/*
 * Store the 2x1 tile: two elements to the single column at CO1, advance CO1
 * by 16 bytes.  Clobbers s0, s1, s4-s7.
 */
.macro SAVE2x1

	flds	s0, ALPHA_R
	flds	s1, ALPHA_I

	FADD_R	s16, s24, s16
	FADD_I	s17, s25, s17
	FADD_R	s18, s26, s18
	FADD_I	s19, s27, s19

	FMAC_R1	s4, s0, s16
	FMAC_I1	s5, s0, s17
	FMAC_R2	s4, s1, s17
	FMAC_I2	s5, s1, s16

	FMAC_R1	s6, s0, s18
	FMAC_I1	s7, s0, s19
	FMAC_R2	s6, s1, s19
	FMAC_I2	s7, s1, s18

	vstmia.f32	CO1, { s4 - s7 }

	add	CO1, CO1, #16

.endm

/******************************************************************************/

/* Zero the four 1x1 accumulators. */
.macro INIT1x1

	flds		s16, FP_ZERO
	vmov.f32	s17, s16
	vmov.f32	s24, s16
	vmov.f32	s25, s16

.endm

/*
 * Pipelined 1x1 variants (_I, _M1, _M2, _E).  The driver code in this file
 * only uses INIT1x1 + KERNEL1x1_SUB; these are kept for completeness.
 */
.macro KERNEL1x1_I
	pld	[AO, #A_PRE]
	pld	[BO, #B_PRE]
	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s8, [BO]
	flds	s9, [BO, #4]

	fmuls	s16, s0, s8
	fmuls	s24, s1, s9
	fmuls	s17, s0, s9
	fmuls	s25, s1, s8

	add	BO, BO, #8
	add	AO, AO, #8

	pld	[BO, #B_PRE]
	pld	[AO, #A_PRE]

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]

	flds	s12, [BO]
	flds	s13, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #8
.endm

.macro KERNEL1x1_M1

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	flds	s4, [AO, #0]
	flds	s5, [AO, #4]

	flds	s12, [BO]
	flds	s13, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #8
.endm

.macro KERNEL1x1_M2

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

	flds	s0, [AO, #0]
	flds	s1, [AO, #4]

	flds	s8, [BO]
	flds	s9, [BO, #4]

	add	BO, BO, #8
	add	AO, AO, #8
.endm

.macro KERNEL1x1_E

	fmacs	s16, s4, s12
	fmacs	s24, s5, s13
	fmacs	s17, s4, s13
	fmacs	s25, s5, s12

.endm

/* Stand-alone 1x1 k-step: one element of A against one element of B. */
.macro KERNEL1x1_SUB

	flds	s0, [AO]
	flds	s1, [AO, #4]
	flds	s8, [BO]
	flds	s9, [BO, #4]

	fmacs	s16, s0, s8
	fmacs	s24, s1, s9
	fmacs	s17, s0, s9
	fmacs	s25, s1, s8

	add	BO, BO, #8
	add	AO, AO, #8

.endm

/* Store the 1x1 tile at CO1 and advance CO1 by 8 bytes.  Clobbers s0, s1, s4, s5. */
.macro SAVE1x1

	flds	s0, ALPHA_R
	flds	s1, ALPHA_I

	FADD_R	s16, s24, s16
	FADD_I	s17, s25, s17

	FMAC_R1	s4, s0, s16
	FMAC_I1	s5, s0, s17
	FMAC_R2	s4, s1, s17
	FMAC_I2	s5, s1, s16

	vstmia.f32	CO1, { s4 - s5 }

	add	CO1, CO1, #8

.endm

/******************************************************************************/

/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5

	push	{r4 - r9, fp}
	add	fp, sp, #24				// fp = sp at entry - 4
	sub	sp, sp, #STACKSIZE			// reserve stack

#if !defined(__ARM_PCS_VFP)
	// soft-float: fetch alpha and A before r3 is reused below
	vmov	OLD_ALPHA_R, OLD_ALPHAR_SOFTFP
	vldr	OLD_ALPHA_I, OLD_ALPHAI_SOFTFP
	ldr	OLD_A, OLD_A_SOFTFP
#endif
	str	OLD_M, M
	str	OLD_N, N
	str	OLD_K, K
	str	OLD_A, A
	vstr	OLD_ALPHA_R, ALPHA_R
	vstr	OLD_ALPHA_I, ALPHA_I

	sub	r3, fp, #128
	vstm	r3, { s8 - s31 }			// store floating point registers

	movs	r4, #0
	str	r4, FP_ZERO				// 0.0f constant used by the INIT macros
	str	r4, FP_ZERO_1

	ldr	r3, OLD_LDC
	lsl	r3, r3, #3				// ldc = ldc * 4 * 2
	str	r3, LDC

	ldr	r3, OFFSET
#ifndef LEFT
	neg	r3, r3
#endif
	str	r3, KK

	ldr	BC, B

	ldr	J, N
	asr	J, J, #1				// J = J / 2
	cmp	J, #0					// explicit compare: defines V for ble
	ble	_L1_BEGIN

/* ---- column pairs: two columns of C per J iteration ---- */
_L2_BEGIN:

	ldr	CO1, C					// CO1 = C
	ldr	r4, LDC
	lsl	r4, r4, #1				// LDC * 2
	add	r3, r4, CO1
	str	r3, C					// store C

#if defined(LEFT)
	ldr	r3, OFFSET
	str	r3, KK
#endif

	ldr	AO, A					// AO = A
	pld	[AO, #A_PRE-64]
	pld	[AO, #A_PRE-32]

_L2_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1				// I = I / 2
	cmp	I, #0
	ble	_L2_M1_BEGIN

_L2_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3, KK
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	AO, AO, r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2				// number of values in AO
#else
	add	K1, K1, #2				// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L, K1, #3				// L = K1 / 8
	cmp	L, #3
	blt	_L2_M2_30
	.align 5

	// L >= 3: software-pipelined main loop, 8 k-steps per group
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	sub	L, L, #2				// first and last group are outside the loop

_L2_M2_22:

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	subs	L, L, #1
	bgt	_L2_M2_22

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	_L2_M2_44

_L2_M2_30:
	// L is 0, 1 or 2 here
	tst	L, #3
	beq	_L2_M2_40

	tst	L, #2
	beq	_L2_M2_32

	// L == 2: 16 k-steps
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	_L2_M2_44

_L2_M2_32:

	tst	L, #1
	beq	_L2_M2_40

	// L == 1: 8 k-steps
	KERNEL2x2_I
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_M2

	KERNEL2x2_M1
	KERNEL2x2_M2
	KERNEL2x2_M1
	KERNEL2x2_E

	b	_L2_M2_44

_L2_M2_40:

	INIT2x2						// no full group ran: start from zero

_L2_M2_44:

	ands	L, K1, #7				// L = K1 % 8
	beq	_L2_M2_100

_L2_M2_46:

	KERNEL2x2_SUB

	subs	L, L, #1
	bne	_L2_M2_46

_L2_M2_100:

	SAVE2x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3, K
	ldr	r4, KKK
	sub	r3, r3, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	AO, AO, r4
#endif

#if defined(LEFT)
	ldr	r3, KK
	add	r3, r3, #2				// number of values in AO
	str	r3, KK
#endif

_L2_M2_END:

	subs	I, I, #1
	bne	_L2_M2_20

_L2_M1_BEGIN:

	ldr	I, M
	tst	I, #1					// I = I % 2
	beq	_L2_END

_L2_M1_20:

	INIT1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3, KK
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	AO, AO, r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1				// number of values in AO
#else
	add	K1, K1, #2				// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L, K1, #3				// L = K1 / 8
	cmp	L, #0
	ble	_L2_M1_40

_L2_M1_22:

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB
	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_22

_L2_M1_40:

	ands	L, K1, #7				// L = K1 % 8
	beq	_L2_M1_100

_L2_M1_42:

	KERNEL1x2_SUB

	subs	L, L, #1
	bgt	_L2_M1_42

_L2_M1_100:

	SAVE1x2

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3, K
	ldr	r4, KKK
	sub	r3, r3, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	AO, AO, r4
#endif

#if defined(LEFT)
	ldr	r3, KK
	add	r3, r3, #1				// number of values in AO
	str	r3, KK
#endif

_L2_END:

	mov	r3, BC
	ldr	r4, K
	lsl	r4, r4, #4				// k * 2 * 4 * 2
	add	r3, r3, r4				// B = B + K * 2 * 8
	mov	BC, r3

#if !defined(LEFT)
	ldr	r3, KK
	add	r3, r3, #2				// number of values in BO
	str	r3, KK
#endif

	subs	J, #1					// j--
	bgt	_L2_BEGIN

/*********************************************************************************************/

/* ---- remaining single column when N is odd ---- */
_L1_BEGIN:

	ldr	J, N
	tst	J, #1
	beq	_L999

	ldr	CO1, C					// CO1 = C
	ldr	r4, LDC
	add	r3, r4, CO1
	str	r3, C					// store C

#if defined(LEFT)
	ldr	r3, OFFSET
	str	r3, KK
#endif

	ldr	AO, A					// AO = A

_L1_M2_BEGIN:

	ldr	I, M
	asr	I, I, #1				// I = I / 2
	cmp	I, #0
	ble	_L1_M1_BEGIN

_L1_M2_20:

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3, KK
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	AO, AO, r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #2				// number of values in AO
#else
	add	K1, K1, #1				// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L, K1, #3				// L = K1 / 8
	cmp	L, #3
	blt	_L1_M2_30
	.align 5

	// L >= 3: software-pipelined main loop, 8 k-steps per group
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	sub	L, L, #2				// first and last group are outside the loop

_L1_M2_22:

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	subs	L, L, #1
	bgt	_L1_M2_22

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	_L1_M2_44

_L1_M2_30:
	// L is 0, 1 or 2 here
	tst	L, #3
	beq	_L1_M2_40

	tst	L, #2
	beq	_L1_M2_32

	// L == 2: 16 k-steps
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	_L1_M2_44

_L1_M2_32:

	tst	L, #1
	beq	_L1_M2_40

	// L == 1: 8 k-steps
	KERNEL2x1_I
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_M2

	KERNEL2x1_M1
	KERNEL2x1_M2
	KERNEL2x1_M1
	KERNEL2x1_E

	b	_L1_M2_44

_L1_M2_40:

	INIT2x1						// no full group ran: start from zero

_L1_M2_44:

	ands	L, K1, #7				// L = K1 % 8
	beq	_L1_M2_100

_L1_M2_46:

	KERNEL2x1_SUB

	subs	L, L, #1
	bne	_L1_M2_46

_L1_M2_100:

	SAVE2x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))
	ldr	r3, K
	ldr	r4, KKK
	sub	r3, r3, r4
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #4				// 2 * 4 * 2 float values
	add	AO, AO, r4
#endif

#if defined(LEFT)
	ldr	r3, KK
	add	r3, r3, #2				// number of values in AO
	str	r3, KK
#endif

_L1_M2_END:

	subs	I, I, #1
	bne	_L1_M2_20

_L1_M1_BEGIN:

	ldr	I, M
	tst	I, #1					// I = I % 2
	beq	_L1_END

_L1_M1_20:

	INIT1x1

#if  (defined(LEFT) &&  defined(TRANSA)) || \
    (!defined(LEFT) && !defined(TRANSA))

	mov	BO, BC
#else
	mov	BO, BC
	ldr	r3, KK
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	BO, BO, r4
	lsls	r4, r3, #3				// 1 * 4 * 2 float values
	add	AO, AO, r4

#endif

#ifndef TRMMKERNEL
	ldr	K1, K
#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
	ldr	K1, K
	ldr	r3, KK
	sub	K1, K1, r3
	str	K1, KKK
#else
	ldr	K1, KK
#ifdef LEFT
	add	K1, K1, #1				// number of values in AO
#else
	add	K1, K1, #1				// number of values in BO
#endif
	str	K1, KKK
#endif

	asr	L, K1, #3				// L = K1 / 8
	cmp	L, #0
	ble	_L1_M1_40

_L1_M1_22:

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB
	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_22

_L1_M1_40:

	ands	L, K1, #7				// L = K1 % 8
	beq	_L1_M1_100

_L1_M1_42:

	KERNEL1x1_SUB

	subs	L, L, #1
	bgt	_L1_M1_42

_L1_M1_100:

	SAVE1x1

_L1_END:

_L999:

	sub	r3, fp, #128
	vldm	r3, { s8 - s31 }			// restore floating point registers

	movs	r0, #0					// set return value
	sub	sp, fp, #24
	pop	{r4 - r9, fp}
	bx	lr

	EPILOGUE
