/***************************************************************************
Copyright (c) 2013, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

/**************************************************************************************
* 2013/11/29 Saar
* 	 BLASTEST 		: OK
* 	 CTEST			: OK
* 	 TEST			: OK
*
**************************************************************************************/

/**************************************************************************************
* Overview
*
* Double-precision complex kernel written for the ARM VFP unit.
*
* For each of the N vectors of A (consecutive vectors start LDA elements apart;
* these are the columns of A when A is stored column-major) the code accumulates,
* over M elements, the complex product of the A element with the matching x
* element, then adds alpha times that sum to one element of y.  CONJ and XCONJ
* select the signs of the cross terms (see the opcode selection table below).
*
* Arguments as they are read by the code:
*   r0          M
*   r1          N
*   d0 / d1     real / imaginary part of alpha.  With __ARM_PCS_VFP defined they
*               are used as they arrive in d0 / d1; otherwise they are loaded
*               from the caller stack area.
*   r3 or stack pointer to A (r3 with __ARM_PCS_VFP, stack otherwise)
*   stack       LDA, pointer to x, INC_X, pointer to y, INC_Y
*
* Return value: r0 = 0.
*
* Frame layout (fp is set to the value sp had on entry, so the stacked
* arguments of the caller are at non-negative offsets from fp):
*   [fp, #-28]  .. [fp, #-1]     saved r4 - r9 and fp
*   [fp, #-192] .. [fp, #-129]   saved d8 - d15
*   [fp, #-228] .. [fp, #-221]   8 zero bytes, read back as the double 0.0
*   [fp, #-252]                  N
*   [fp, #-256]                  pointer to the next unprocessed vector of A
**************************************************************************************/

#define ASSEMBLER
#include "common.h"

#define STACKSIZE 256

/* Offsets of the stacked arguments relative to fp. */
#if !defined(__ARM_PCS_VFP)
#define OLD_ALPHAR	[fp, #0 ]
#define OLD_ALPHAI	[fp, #8 ]
#define OLD_A_SOFTFP	[fp, #16]
#define OLD_LDA		[fp, #20]
#define X		[fp, #24]
#define OLD_INC_X	[fp, #28]
#define Y		[fp, #32]
#define OLD_INC_Y	[fp, #36]
#else
#define OLD_LDA		[fp, #0 ]
#define X		[fp, #4 ]
#define OLD_INC_X	[fp, #8 ]
#define Y		[fp, #12 ]
#define OLD_INC_Y	[fp, #16 ]
#endif

/* Incoming register arguments; both registers are reused later (r1 as AO1, r3 as scratch). */
#define OLD_A	r3
#define OLD_N	r1

/* Register roles inside the kernel. */
#define M	r0
#define AO1	r1
#define J	r2

#define AO2	r4
#define XO	r5
#define YO	r6
#define LDA	r7
#define INC_X	r8
#define INC_Y	r9

#define I	r12

/* Local stack slots. */
#define FP_ZERO [fp, #-228]
#define FP_ZERO_0 [fp, #-228]
#define FP_ZERO_1 [fp, #-224]

#define N	[fp, #-252 ]
#define A	[fp, #-256 ]

/* Prefetch distances; not referenced by any instruction below. */
#define X_PRE	512
#define A_PRE	512
#define Y_PRE	32

/**************************************************************************************
* Macro definitions
**************************************************************************************/

/*
 * Opcode selection.  fmacd adds the product to the destination, vmls.f64
 * subtracts it.
 *   KMAC_R / KMAC_I     cross terms (a_imag * x_imag, a_imag * x_real) of the
 *                       accumulation in the KERNEL_* macros.
 *   FMAC_R1 .. FMAC_I2  the four products of alpha with the accumulated sum in
 *                       the SAVE_* macros.
 */
#if !defined(CONJ) && !defined(XCONJ)

	#define KMAC_R	vmls.f64
	#define KMAC_I	fmacd

	#define FMAC_R1	fmacd
	#define FMAC_R2	vmls.f64
	#define FMAC_I1	fmacd
	#define FMAC_I2	fmacd

#elif defined(CONJ) && !defined(XCONJ)

	#define KMAC_R	fmacd
	#define KMAC_I	vmls.f64

	#define FMAC_R1	fmacd
	#define FMAC_R2	vmls.f64
	#define FMAC_I1	fmacd
	#define FMAC_I2	fmacd

#elif !defined(CONJ) && defined(XCONJ)

	#define KMAC_R	fmacd
	#define KMAC_I	vmls.f64

	#define FMAC_R1	fmacd
	#define FMAC_R2	fmacd
	#define FMAC_I1	vmls.f64
	#define FMAC_I2	fmacd

#else

	#define KMAC_R	vmls.f64
	#define KMAC_I	fmacd

	#define FMAC_R1	fmacd
	#define FMAC_R2	fmacd
	#define FMAC_I1	vmls.f64
	#define FMAC_I2	fmacd

#endif


/*
 * F2: unit-stride x and y, two vectors of A per pass.
 * d12/d13 accumulate the sum for the vector at AO1, d14/d15 for the one at AO2.
 */

/* Clear the four accumulators d12 - d15 from the zero slot on the stack. */
.macro INIT_F2

	fldd		d12, FP_ZERO
	vmov.f64	d13, d12
	vmov.f64	d14, d12
	vmov.f64	d15, d12

.endm

/* Four consecutive KERNEL_F2X1 steps. */
.macro KERNEL_F2X4

	KERNEL_F2X1
	KERNEL_F2X1
	KERNEL_F2X1
	KERNEL_F2X1

.endm

/*
 * One step: load one x element (d2/d3) and one element from each of AO1 (d4/d5)
 * and AO2 (d8/d9), advance all three pointers by 16 bytes and accumulate both
 * complex products.
 */
.macro KERNEL_F2X1

	vldmia.f64	XO! ,  { d2 - d3 }
	vldmia.f64	AO1!,  { d4 - d5 }

	fmacd   d12 , d4 , d2
	fmacd   d13 , d4 , d3
	vldmia.f64	AO2!,  { d8 - d9 }
	KMAC_R  d12 , d5 , d3
	KMAC_I  d13 , d5 , d2

	fmacd   d14 , d8 , d2
	fmacd   d15 , d8 , d3
	KMAC_R  d14 , d9 , d3
	KMAC_I  d15 , d9 , d2

.endm

/*
 * Load two adjacent y elements (d4 - d7), add alpha (d0/d1) times the two
 * accumulated sums, store them back and advance YO by 32 bytes.
 */
.macro	SAVE_F2

	vldmia.f64	YO,  { d4 - d7 }

	FMAC_R1 d4 , d0 , d12
	FMAC_I1 d5 , d0 , d13
	FMAC_R2 d4 , d1 , d13
	FMAC_I2 d5 , d1 , d12

	FMAC_R1 d6 , d0 , d14
	FMAC_I1 d7 , d0 , d15
	FMAC_R2 d6 , d1 , d15
	FMAC_I2 d7 , d1 , d14

	vstmia.f64	YO!, { d4 - d7 }

.endm

/************************************************************************************************/

/*
 * F1: unit-stride x and y, a single vector of A.  d12/d13 hold the sum.
 */

/* Clear the accumulators d12 and d13. */
.macro INIT_F1

	fldd		d12, FP_ZERO
	vmov.f64	d13, d12

.endm

/* Four consecutive KERNEL_F1X1 steps. */
.macro KERNEL_F1X4

	KERNEL_F1X1
	KERNEL_F1X1
	KERNEL_F1X1
	KERNEL_F1X1

.endm

/* One step: load one x element and one AO1 element, advance both pointers, accumulate. */
.macro KERNEL_F1X1

	vldmia.f64	XO! ,  { d2 - d3 }
	vldmia.f64	AO1!,  { d4 - d5 }

	fmacd   d12 , d4 , d2
	fmacd   d13 , d4 , d3
	KMAC_R  d12 , d5 , d3
	KMAC_I  d13 , d5 , d2

.endm

/* Update one y element with alpha times the accumulated sum and advance YO by 16 bytes. */
.macro	SAVE_F1

	vldmia.f64	YO,  { d4 - d5 }

	FMAC_R1 d4 , d0 , d12
	FMAC_I1 d5 , d0 , d13
	FMAC_R2 d4 , d1 , d13
	FMAC_I2 d5 , d1 , d12

	vstmia.f64	YO!, { d4 - d5 }

.endm

/************************************************************************************************/

/*
 * S2: strided x and y, two vectors of A per pass.  INC_X and INC_Y are byte
 * strides at this point (they are scaled at zgemvt_kernel_S2_BEGIN).
 */

/* Clear the four accumulators d12 - d15. */
.macro INIT_S2

	fldd		d12, FP_ZERO
	vmov.f64	d13, d12
	vmov.f64	d14, d12
	vmov.f64	d15, d12

.endm

/* Four consecutive KERNEL_S2X1 steps. */
.macro KERNEL_S2X4

	KERNEL_S2X1
	KERNEL_S2X1
	KERNEL_S2X1
	KERNEL_S2X1

.endm

/* Same arithmetic as KERNEL_F2X1, but XO is advanced by INC_X instead of by writeback. */
.macro KERNEL_S2X1

	vldmia.f64	XO  ,  { d2 - d3 }
	vldmia.f64	AO1!,  { d4 - d5 }
	vldmia.f64	AO2!,  { d8 - d9 }

	fmacd   d12 , d4 , d2
	fmacd   d13 , d4 , d3
	KMAC_R  d12 , d5 , d3
	KMAC_I  d13 , d5 , d2

	fmacd   d14 , d8 , d2
	fmacd   d15 , d8 , d3
	KMAC_R  d14 , d9 , d3
	KMAC_I  d15 , d9 , d2

	add	XO, XO, INC_X

.endm

/* Update two y elements that are INC_Y bytes apart; YO ends up two strides further. */
.macro	SAVE_S2

	vldmia.f64	YO,  { d4 - d5 }

	FMAC_R1 d4 , d0 , d12
	FMAC_I1 d5 , d0 , d13
	FMAC_R2 d4 , d1 , d13
	FMAC_I2 d5 , d1 , d12

	vstmia.f64	YO,  { d4 - d5 }

	add	YO, YO, INC_Y

	vldmia.f64	YO,  { d6 - d7 }

	FMAC_R1 d6 , d0 , d14
	FMAC_I1 d7 , d0 , d15
	FMAC_R2 d6 , d1 , d15
	FMAC_I2 d7 , d1 , d14

	vstmia.f64	YO,  { d6 - d7 }

	add	YO, YO, INC_Y

.endm

/************************************************************************************************/

/*
 * S1: strided x and y, a single vector of A.
 */

/* Clear the accumulators d12 and d13. */
.macro INIT_S1

	fldd		d12, FP_ZERO
	vmov.f64	d13, d12

.endm

/* Four consecutive KERNEL_S1X1 steps. */
.macro KERNEL_S1X4

	KERNEL_S1X1
	KERNEL_S1X1
	KERNEL_S1X1
	KERNEL_S1X1

.endm

/* Same arithmetic as KERNEL_F1X1, but XO is advanced by INC_X instead of by writeback. */
.macro KERNEL_S1X1

	vldmia.f64	XO  ,  { d2 - d3 }
	vldmia.f64	AO1!,  { d4 - d5 }

	fmacd   d12 , d4 , d2
	fmacd   d13 , d4 , d3
	KMAC_R  d12 , d5 , d3
	KMAC_I  d13 , d5 , d2

	add	XO, XO, INC_X

.endm

/* Update one y element and advance YO by INC_Y bytes. */
.macro	SAVE_S1

	vldmia.f64	YO,  { d4 - d5 }

	FMAC_R1 d4 , d0 , d12
	FMAC_I1 d5 , d0 , d13
	FMAC_R2 d4 , d1 , d13
	FMAC_I2 d5 , d1 , d12

	vstmia.f64	YO,  { d4 - d5 }

	add	YO, YO, INC_Y

.endm



/**************************************************************************************
* End of macro definitions
**************************************************************************************/

	PROLOGUE

	.align 5
	push    {r4 - r9 , fp}
	add     fp, sp, #28				// fp = sp on entry: 7 registers were pushed
	sub     sp, sp, #STACKSIZE                              // reserve stack

	sub     r12, fp, #192

	// The macros above overwrite d8, d9 and d12 - d15 whether or not DOUBLE
	// is defined, so the callee-saved registers d8 - d15 are always spilled.
	vstm    r12, { d8 - d15 }                               // store floating point registers

	movs    r12, #0					// build the double 0.0 read by the INIT_* macros
	str     r12, FP_ZERO
	str     r12, FP_ZERO_1

	cmp	M, #0					// nothing to do unless M > 0 and N > 0
	ble	zgemvt_kernel_L999

	cmp	OLD_N, #0
	ble	zgemvt_kernel_L999

#if !defined(__ARM_PCS_VFP)
	vldr	d0, OLD_ALPHAR				// alpha and A come from the stack in this configuration
	vldr	d1, OLD_ALPHAI
	ldr	OLD_A, OLD_A_SOFTFP
#endif

	str	OLD_A, A				// r3 and r1 are reused below, keep A and N in the frame
	str	OLD_N, N

	ldr    INC_X , OLD_INC_X
	ldr    INC_Y , OLD_INC_Y

	cmp	INC_X, #0				// a zero increment leaves y untouched
	beq	zgemvt_kernel_L999

	cmp	INC_Y, #0
	beq	zgemvt_kernel_L999

	ldr	LDA, OLD_LDA


#if defined(DOUBLE)
	lsl	LDA, LDA, #4				// LDA * SIZE
#else
	lsl	LDA, LDA, #3				// LDA * SIZE
#endif

	cmp	INC_X, #1				// the F path requires both increments to be 1
	bne	zgemvt_kernel_S2_BEGIN

	cmp	INC_Y, #1
	bne	zgemvt_kernel_S2_BEGIN


/*
 * Unit-stride path.
 *
 * Branches that follow asrs / ands test the Z flag only: neither instruction
 * writes the V flag, and the values tested cannot be negative because of the
 * M > 0 and N > 0 checks above.
 */

zgemvt_kernel_F2_BEGIN:

	ldr	YO , Y

	ldr	J, N
	asrs	J, J, #1					// J = N / 2
	beq	zgemvt_kernel_F1_BEGIN

zgemvt_kernel_F2X4:

	ldr	AO1, A
	add	AO2, AO1, LDA
	add	r3 , AO2, LDA
	str	r3 , A					// A now points past the two vectors of this pass

	ldr	XO , X

	INIT_F2

	asrs	I, M, #2					// I = M / 4
	beq	zgemvt_kernel_F2X1


zgemvt_kernel_F2X4_10:

	KERNEL_F2X4

	subs	I, I, #1
	bne	zgemvt_kernel_F2X4_10


zgemvt_kernel_F2X1:

	ands	I, M , #3				// I = M % 4
	beq	zgemvt_kernel_F2_END

zgemvt_kernel_F2X1_10:

	KERNEL_F2X1

	subs	I, I, #1
	bne	zgemvt_kernel_F2X1_10


zgemvt_kernel_F2_END:

	SAVE_F2

	subs	J , J , #1
	bne	zgemvt_kernel_F2X4


zgemvt_kernel_F1_BEGIN:

	ldr	J, N
	ands	J, J, #1				// one vector left when N is odd
	beq	zgemvt_kernel_L999

zgemvt_kernel_F1X4:

	ldr	AO1, A

	ldr	XO , X

	INIT_F1

	asrs	I, M, #2					// I = M / 4
	beq	zgemvt_kernel_F1X1


zgemvt_kernel_F1X4_10:

	KERNEL_F1X4

	subs	I, I, #1
	bne	zgemvt_kernel_F1X4_10


zgemvt_kernel_F1X1:

	ands	I, M , #3				// I = M % 4
	beq	zgemvt_kernel_F1_END

zgemvt_kernel_F1X1_10:

	KERNEL_F1X1

	subs	I, I, #1
	bne	zgemvt_kernel_F1X1_10


zgemvt_kernel_F1_END:

	SAVE_F1

	b	zgemvt_kernel_L999



/*************************************************************************************************************/

/*
 * Strided path, taken when INC_X or INC_Y differs from 1.  Same loop structure
 * as the unit-stride path, using the S2 / S1 macros.
 */

zgemvt_kernel_S2_BEGIN:

#if defined(DOUBLE)
	lsl	INC_X, INC_X, #4				// INC_X * SIZE
	lsl	INC_Y, INC_Y, #4				// INC_Y * SIZE
#else
	lsl	INC_X, INC_X, #3				// INC_X * SIZE
	lsl	INC_Y, INC_Y, #3				// INC_Y * SIZE
#endif

	ldr	YO , Y

	ldr	J, N
	asrs	J, J, #1					// J = N / 2
	beq	zgemvt_kernel_S1_BEGIN

zgemvt_kernel_S2X4:

	ldr	AO1, A
	add	AO2, AO1, LDA
	add	r3 , AO2, LDA
	str	r3 , A					// A now points past the two vectors of this pass

	ldr	XO , X

	INIT_S2

	asrs	I, M, #2					// I = M / 4
	beq	zgemvt_kernel_S2X1


zgemvt_kernel_S2X4_10:

	KERNEL_S2X4

	subs	I, I, #1
	bne	zgemvt_kernel_S2X4_10


zgemvt_kernel_S2X1:

	ands	I, M , #3				// I = M % 4
	beq	zgemvt_kernel_S2_END

zgemvt_kernel_S2X1_10:

	KERNEL_S2X1

	subs	I, I, #1
	bne	zgemvt_kernel_S2X1_10


zgemvt_kernel_S2_END:

	SAVE_S2

	subs	J , J , #1
	bne	zgemvt_kernel_S2X4


zgemvt_kernel_S1_BEGIN:

	ldr	J, N
	ands	J, J, #1				// one vector left when N is odd
	beq	zgemvt_kernel_L999

zgemvt_kernel_S1X4:

	ldr	AO1, A

	ldr	XO , X

	INIT_S1

	asrs	I, M, #2					// I = M / 4
	beq	zgemvt_kernel_S1X1


zgemvt_kernel_S1X4_10:

	KERNEL_S1X4

	subs	I, I, #1
	bne	zgemvt_kernel_S1X4_10


zgemvt_kernel_S1X1:

	ands	I, M , #3				// I = M % 4
	beq	zgemvt_kernel_S1_END

zgemvt_kernel_S1X1_10:

	KERNEL_S1X1

	subs	I, I, #1
	bne	zgemvt_kernel_S1X1_10


zgemvt_kernel_S1_END:

	SAVE_S1



/*************************************************************************************************************/

/* Common exit: restore d8 - d15, return 0 in r0, unwind the frame. */

zgemvt_kernel_L999:

	sub     r3, fp, #192

	vldm    r3, { d8 - d15 }                                 // restore floating point registers

	mov	r0, #0		// set return value

	sub     sp, fp, #28
	pop     {r4 -r9 ,fp}
	bx	lr

	EPILOGUE
