1/******************************************************************************* 2Copyright (c) 2015, The OpenBLAS Project 3All rights reserved. 4Redistribution and use in source and binary forms, with or without 5modification, are permitted provided that the following conditions are 6met: 71. Redistributions of source code must retain the above copyright 8notice, this list of conditions and the following disclaimer. 92. Redistributions in binary form must reproduce the above copyright 10notice, this list of conditions and the following disclaimer in 11the documentation and/or other materials provided with the 12distribution. 133. Neither the name of the OpenBLAS project nor the names of 14its contributors may be used to endorse or promote products 15derived from this software without specific prior written permission. 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/*                   X0          X1          X2          s0        X3        x4       x5           x6 */
/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */

/*
 * Single-precision GEMM micro-kernel (GNU as syntax, AArch64, AAPCS64).
 *
 * Every SAVE* macro performs  C[tile] = C[tile] + alpha * acc[tile]  where
 * acc is the sum over k of A-element * B-element products, i.e. the routine
 * updates C += alpha * A * B.
 *
 * Layout, as implied by the pointer arithmetic below:
 *   - A (origPA) is read as consecutive panels 4 rows wide; one panel
 *     occupies K * 4 floats = K * 16 bytes.  The 16x4 and 8x4 tiles read
 *     four / two consecutive panels through pA_0..pA_3 / pA_0..pA_1.
 *   - B (origPB) is read as a 4-column panel (K * 16 bytes) per L4
 *     iteration, then a 2-column panel (K * 8 bytes), then a single column.
 *   - C is addressed column by column; consecutive columns are LDC bytes
 *     apart (ldc is scaled by 4 on entry).
 *   NOTE(review): this presumes the caller packed A and B accordingly -
 *   confirm against the GEMM driver / copy routines.
 *
 * Returns 0 in x0.
 *
 * Flag idiom: many branches use `tst`/`ands` followed by `ble`.  ANDS/TST
 * clear V, and none of the masks used here can set bit 63, so N == V == 0
 * and `ble` behaves exactly like `beq` ("masked bits are all zero").
 */

#define origM       x0
#define origN       x1
#define origK       x2
#define origPA      x3
#define origPB      x4
#define pC          x5
#define LDC         x6
#define temp        x7
#define counterL    x8
#define counterI    x9
#define counterJ    x10
#define pB          x11
#define pCRow0      x12
#define pCRow1      x13
#define pCRow2      x14
#define pA_0        x15
#define pA_1        x16
#define pA_2        x17
/* x18 is the AAPCS64 platform register (reserved on some OSes), so the
 * fourth A pointer lives in callee-saved x19, which the prologue saves. */
#define pA_3        x19


#define alpha0      s10
#define alphaV0     v10.s[0]
#define alpha1      s11
#define alphaV1     v11.s[0]
#define alpha2      s14
#define alphaV2     v14.s[0]
#define alpha3      s15
#define alphaV3     v15.s[0]

// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 offset -> temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA_0
// 16 pA_1
// 17 pA_2
// 18 platform register, never read or written here
// 19 must save pA_3
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp

/***************************** FOR 16x4 ***************************************/
//v00 ALPHA -> pA00_0, pA01_0, pA02_0, pA03_0
//v01 pA10_0, pA11_0, pA12_0, pA13_0
//v02 pA00_1, pA01_1, pA02_1, pA03_1
//v03 pA10_1, pA11_1, pA12_1, pA13_1
//v04 pA00_2, pA01_2, pA02_2, pA03_2
//v05 pA10_2, pA11_2, pA12_2, pA13_2
//v06 pA00_3, pA01_3, pA02_3, pA03_3
//v07 pA10_3, pA11_3, pA12_3, pA13_3
//v08 must save pB00, pB01, pB02, pB03
//v09 must save
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB10, pB11, pB12, pB13
//v13 must save
//v14 must save ALPHA2
//v15 must save ALPHA3
//v16 must save C00_0, C01_0, C02_0, C03_0
//v17 must save C10_0, C11_0, C12_0, C13_0
//v18 C20_0, C21_0, C22_0, C23_0
//v19 C30_0, C31_0, C32_0, C33_0
//v20 C00_1, C01_1, C02_1, C03_1
//v21 C10_1, C11_1, C12_1, C13_1
//v22 C20_1, C21_1, C22_1, C23_1
//v23 C30_1, C31_1, C32_1, C33_1
//v24 C00_2, C01_2, C02_2, C03_2
//v25 C10_2, C11_2, C12_2, C13_2
//v26 C20_2, C21_2, C22_2, C23_2
//v27 C30_2, C31_2, C32_2, C33_2
//v28 C00_3, C01_3, C02_3, C03_3
//v29 C10_3, C11_3, C12_3, C13_3
//v30 C20_3, C21_3, C22_3, C23_3
//v31 C30_3, C31_3, C32_3, C33_3

/***************************** EXCEPT FOR 16x4 ********************************/
//v00 ALPHA -> pA00, pA01
//v01 pA02, pA03
//v02 ppA00, ppA01
//v03 ppA02, ppA03
//v04 pA10, pA11
//v05 pA12, pA13
//v06 ppA10, ppA11
//v07 ppA12, ppA13
//v08 must save pB00, pB01
//v09 must save pB02, pB03
//v10 must save ALPHA0
//v11 must save ALPHA1
//v12 must save pB10, pB11
//v13 must save pB12, pB13
//v14 must save ALPHA2
//v15 must save ALPHA3
//v16 must save C00, C01
//v17 must save C02, C03
//v18 ppC00, ppC01
//v19 ppC02, ppC03
//v20 C10, C11
//v21 C12, C13
//v22 ppC10, ppC11
//v23 ppC12, ppC13
//v24 C20, C21
//v25 C22, C23
//v26 ppC20, ppC21
//v27 ppC22, ppC23
//v28 C30, C31
//v29 C32, C33
//v30 ppC30, ppC31
//v31 ppC32, ppC33

/*******************************************************************************
* Macro definitions
*
* Naming: <OP><M>x<N> where M = rows of A / C handled, N = columns of B / C.
*   INIT*        zero the accumulators used by that tile size
*   KERNEL*_SUB  one k-step: load A and B, accumulate, advance pA_* / pB
*   SAVE*        C += alpha * accumulators, then advance pCRow0 by M floats
* All macros clobber only the vector registers and pointers they name.
*******************************************************************************/

/* Zero accumulators v16-v31.  A scalar fmov write clears the whole vector
 * register, so copying s16/s17 propagates an all-zero vector. */
.macro INIT16x4
    fmov    s16, wzr
    fmov    s17, s16
    fmov    s18, s17
    fmov    s19, s16
    fmov    s20, s17
    fmov    s21, s16
    fmov    s22, s17
    fmov    s23, s16
    fmov    s24, s17
    fmov    s25, s16
    fmov    s26, s17
    fmov    s27, s16
    fmov    s28, s17
    fmov    s29, s16
    fmov    s30, s17
    fmov    s31, s16
.endm

/* First step of the software-pipelined 16x4 loop.  Uses fmul, so v16-v31 are
 * overwritten and need no INIT.  Consumes v0/v2/v4/v6 + v8 and preloads
 * v1/v3/v5/v7 + v12 for the following _M2 or _E step. */
.macro KERNEL16x4_I
    ld1     {v8.4s}, [pB]
    add     pB, pB, #16

    ld1     {v0.4s}, [pA_0]
    add     pA_0, pA_0, #16

    fmul    v16.4s, v0.4s, v8.s[0]
    fmul    v20.4s, v0.4s, v8.s[1]

    ld1     {v2.4s}, [pA_1]
    add     pA_1, pA_1, #16

    fmul    v24.4s, v0.4s, v8.s[2]
    fmul    v28.4s, v0.4s, v8.s[3]

    ld1     {v4.4s}, [pA_2]
    add     pA_2, pA_2, #16

    fmul    v17.4s, v2.4s, v8.s[0]
    fmul    v21.4s, v2.4s, v8.s[1]

    ld1     {v6.4s}, [pA_3]
    add     pA_3, pA_3, #16

    fmul    v25.4s, v2.4s, v8.s[2]
    fmul    v29.4s, v2.4s, v8.s[3]

    ld1     {v12.4s}, [pB]                  // for next round
    add     pB, pB, #16

    fmul    v18.4s, v4.4s, v8.s[0]
    fmul    v19.4s, v6.4s, v8.s[0]

    ld1     {v1.4s}, [pA_0]                 // for next round
    add     pA_0, pA_0, #16

    fmul    v22.4s, v4.4s, v8.s[1]
    fmul    v23.4s, v6.4s, v8.s[1]

    ld1     {v3.4s}, [pA_1]                 // for next round
    add     pA_1, pA_1, #16

    fmul    v26.4s, v4.4s, v8.s[2]
    fmul    v27.4s, v6.4s, v8.s[2]

    ld1     {v5.4s}, [pA_2]                 // for next round
    add     pA_2, pA_2, #16

    fmul    v30.4s, v4.4s, v8.s[3]
    fmul    v31.4s, v6.4s, v8.s[3]

    ld1     {v7.4s}, [pA_3]                 // for next round
    add     pA_3, pA_3, #16
.endm

/* Pipelined step on the "odd" register set: consumes v1/v3/v5/v7 + v12 and
 * preloads v0/v2/v4/v6 + v8 for the following _M1 step. */
.macro KERNEL16x4_M2
    fmla    v16.4s, v1.4s, v12.s[0]
    fmla    v17.4s, v3.4s, v12.s[0]

    ld1     {v8.4s}, [pB]                   // for next round
    add     pB, pB, #16

    fmla    v18.4s, v5.4s, v12.s[0]
    fmla    v19.4s, v7.4s, v12.s[0]

    ld1     {v0.4s}, [pA_0]                 // for next round
    add     pA_0, pA_0, #16

    fmla    v20.4s, v1.4s, v12.s[1]
    fmla    v21.4s, v3.4s, v12.s[1]

    ld1     {v2.4s}, [pA_1]                 // for next round
    add     pA_1, pA_1, #16

    fmla    v22.4s, v5.4s, v12.s[1]
    fmla    v23.4s, v7.4s, v12.s[1]

    ld1     {v4.4s}, [pA_2]                 // for next round
    add     pA_2, pA_2, #16

    fmla    v24.4s, v1.4s, v12.s[2]
    fmla    v25.4s, v3.4s, v12.s[2]

    ld1     {v6.4s}, [pA_3]                 // for next round
    add     pA_3, pA_3, #16

    fmla    v26.4s, v5.4s, v12.s[2]
    fmla    v27.4s, v7.4s, v12.s[2]

    prfm    PLDL1KEEP, [pA_2, #512]

    fmla    v28.4s, v1.4s, v12.s[3]
    fmla    v29.4s, v3.4s, v12.s[3]

    prfm    PLDL1KEEP, [pA_3, #512]

    fmla    v30.4s, v5.4s, v12.s[3]
    fmla    v31.4s, v7.4s, v12.s[3]

    prfm    PLDL1KEEP, [pB, #512]
.endm

/* Pipelined step on the "even" register set: consumes v0/v2/v4/v6 + v8 and
 * preloads v1/v3/v5/v7 + v12 for the following _M2 or _E step. */
.macro KERNEL16x4_M1
    fmla    v16.4s, v0.4s, v8.s[0]
    fmla    v17.4s, v2.4s, v8.s[0]

    ld1     {v12.4s}, [pB]                  // for next round
    add     pB, pB, #16

    fmla    v18.4s, v4.4s, v8.s[0]
    fmla    v19.4s, v6.4s, v8.s[0]

    ld1     {v1.4s}, [pA_0]                 // for next round
    add     pA_0, pA_0, #16

    fmla    v20.4s, v0.4s, v8.s[1]
    fmla    v21.4s, v2.4s, v8.s[1]

    ld1     {v3.4s}, [pA_1]                 // for next round
    add     pA_1, pA_1, #16

    fmla    v22.4s, v4.4s, v8.s[1]
    fmla    v23.4s, v6.4s, v8.s[1]

    ld1     {v5.4s}, [pA_2]                 // for next round
    add     pA_2, pA_2, #16

    fmla    v24.4s, v0.4s, v8.s[2]
    fmla    v25.4s, v2.4s, v8.s[2]

    ld1     {v7.4s}, [pA_3]                 // for next round
    add     pA_3, pA_3, #16

    fmla    v26.4s, v4.4s, v8.s[2]
    fmla    v27.4s, v6.4s, v8.s[2]

    prfm    PLDL1KEEP, [pA_0, #512]

    fmla    v28.4s, v0.4s, v8.s[3]
    fmla    v29.4s, v2.4s, v8.s[3]

    prfm    PLDL1KEEP, [pA_1, #512]

    fmla    v30.4s, v4.4s, v8.s[3]
    fmla    v31.4s, v6.4s, v8.s[3]
.endm

/* Pipeline drain: consumes the operands preloaded into v1/v3/v5/v7 + v12
 * and loads nothing further. */
.macro KERNEL16x4_E
    fmla    v16.4s, v1.4s, v12.s[0]
    fmla    v17.4s, v3.4s, v12.s[0]
    fmla    v18.4s, v5.4s, v12.s[0]
    fmla    v19.4s, v7.4s, v12.s[0]
    fmla    v20.4s, v1.4s, v12.s[1]
    fmla    v21.4s, v3.4s, v12.s[1]
    fmla    v22.4s, v5.4s, v12.s[1]
    fmla    v23.4s, v7.4s, v12.s[1]
    fmla    v24.4s, v1.4s, v12.s[2]
    fmla    v25.4s, v3.4s, v12.s[2]
    fmla    v26.4s, v5.4s, v12.s[2]
    fmla    v27.4s, v7.4s, v12.s[2]
    fmla    v28.4s, v1.4s, v12.s[3]
    fmla    v29.4s, v3.4s, v12.s[3]
    fmla    v30.4s, v5.4s, v12.s[3]
    fmla    v31.4s, v7.4s, v12.s[3]
.endm

/* Stand-alone (non-pipelined) 16x4 k-step; used for the odd trailing k. */
.macro KERNEL16x4_SUB
    ld1     {v8.4s}, [pB]
    add     pB, pB, #16

    ld1     {v0.4s}, [pA_0]
    add     pA_0, pA_0, #16

    fmla    v16.4s, v0.4s, v8.s[0]
    fmla    v20.4s, v0.4s, v8.s[1]
    fmla    v24.4s, v0.4s, v8.s[2]
    fmla    v28.4s, v0.4s, v8.s[3]

    ld1     {v2.4s}, [pA_1]
    add     pA_1, pA_1, #16

    fmla    v17.4s, v2.4s, v8.s[0]
    fmla    v21.4s, v2.4s, v8.s[1]
    fmla    v25.4s, v2.4s, v8.s[2]
    fmla    v29.4s, v2.4s, v8.s[3]

    ld1     {v4.4s}, [pA_2]
    add     pA_2, pA_2, #16

    fmla    v18.4s, v4.4s, v8.s[0]
    fmla    v22.4s, v4.4s, v8.s[1]
    fmla    v26.4s, v4.4s, v8.s[2]
    fmla    v30.4s, v4.4s, v8.s[3]

    ld1     {v6.4s}, [pA_3]
    add     pA_3, pA_3, #16

    fmla    v19.4s, v6.4s, v8.s[0]
    fmla    v23.4s, v6.4s, v8.s[1]
    fmla    v27.4s, v6.4s, v8.s[2]
    fmla    v31.4s, v6.4s, v8.s[3]
.endm

/* C[16 x 4] += alpha * acc.  Each C column is 16 contiguous floats
 * (v16-v19, v20-v23, v24-v27, v28-v31); columns are LDC bytes apart. */
.macro SAVE16x4
    mov     pCRow1, pCRow0

    ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
    fmla    v0.4s, v16.4s, alphaV0
    fmla    v1.4s, v17.4s, alphaV1
    fmla    v2.4s, v18.4s, alphaV2
    fmla    v3.4s, v19.4s, alphaV3
    st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]

    add     pCRow1, pCRow1, LDC

    ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
    fmla    v4.4s, v20.4s, alphaV0
    fmla    v5.4s, v21.4s, alphaV1
    fmla    v6.4s, v22.4s, alphaV2
    fmla    v7.4s, v23.4s, alphaV3
    st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

    add     pCRow1, pCRow1, LDC

    ld1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]
    fmla    v0.4s, v24.4s, alphaV0
    fmla    v1.4s, v25.4s, alphaV1
    fmla    v2.4s, v26.4s, alphaV2
    fmla    v3.4s, v27.4s, alphaV3
    st1     {v0.4s, v1.4s, v2.4s, v3.4s}, [pCRow1]

    add     pCRow1, pCRow1, LDC

    ld1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]
    fmla    v4.4s, v28.4s, alphaV0
    fmla    v5.4s, v29.4s, alphaV1
    fmla    v6.4s, v30.4s, alphaV2
    fmla    v7.4s, v31.4s, alphaV3
    st1     {v4.4s, v5.4s, v6.4s, v7.4s}, [pCRow1]

    add     pCRow0, pCRow0, #64             // next 16 rows of C
.endm

/******************************************************************************/

/* Zero accumulators v16-v31 (the 8x4 tile uses all of them, 2 lanes each). */
.macro INIT8x4
    fmov    s16, wzr
    fmov    s17, s16
    fmov    s18, s17
    fmov    s19, s16
    fmov    s20, s17
    fmov    s21, s16
    fmov    s22, s17
    fmov    s23, s16
    fmov    s24, s17
    fmov    s25, s16
    fmov    s26, s17
    fmov    s27, s16
    fmov    s28, s17
    fmov    s29, s16
    fmov    s30, s17
    fmov    s31, s16
.endm

/* One k-step of the 8x4 tile: rows 0-3 come from pA_0 (v0, v1), rows 4-7
 * from pA_1 (v2, v3); B values are v8.s[0..1], v9.s[0..1]. */
.macro KERNEL8x4_SUB
    ld1     {v8.2s, v9.2s}, [pB]
    add     pB, pB, #16
    ld1     {v0.2s, v1.2s}, [pA_0]
    add     pA_0, pA_0, #16

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v29.2s, v1.2s, v9.s[1]
    fmla    v20.2s, v0.2s, v8.s[1]
    fmla    v25.2s, v1.2s, v9.s[0]

    ld1     {v2.2s, v3.2s}, [pA_1]
    add     pA_1, pA_1, #16

    fmla    v24.2s, v0.2s, v9.s[0]
    fmla    v21.2s, v1.2s, v8.s[1]
    fmla    v28.2s, v0.2s, v9.s[1]
    fmla    v17.2s, v1.2s, v8.s[0]

    fmla    v18.2s, v2.2s, v8.s[0]
    fmla    v31.2s, v3.2s, v9.s[1]
    fmla    v22.2s, v2.2s, v8.s[1]
    fmla    v27.2s, v3.2s, v9.s[0]

    fmla    v26.2s, v2.2s, v9.s[0]
    fmla    v23.2s, v3.2s, v8.s[1]
    fmla    v30.2s, v2.2s, v9.s[1]
    fmla    v19.2s, v3.2s, v8.s[0]
.endm

/* C[8 x 4] += alpha * acc.  pCRow1 / pCRow2 alternate between the current
 * column and the next one; "+ #16" moves to rows 4-7 of the same column. */
.macro SAVE8x4
    mov     pCRow1, pCRow0

    ld1     {v0.2s, v1.2s}, [pCRow1]        // column 0, rows 0-3
    fmla    v0.2s, v16.2s, alphaV0
    fmla    v1.2s, v17.2s, alphaV1
    st1     {v0.2s, v1.2s}, [pCRow1]

    add     pCRow2, pCRow1, LDC
    add     pCRow1, pCRow1, #16

    ld1     {v2.2s, v3.2s}, [pCRow1]        // column 0, rows 4-7
    fmla    v2.2s, v18.2s, alphaV2
    fmla    v3.2s, v19.2s, alphaV3
    st1     {v2.2s, v3.2s}, [pCRow1]

    ld1     {v4.2s, v5.2s}, [pCRow2]        // column 1, rows 0-3
    fmla    v4.2s, v20.2s, alphaV0
    fmla    v5.2s, v21.2s, alphaV1
    st1     {v4.2s, v5.2s}, [pCRow2]

    add     pCRow1, pCRow2, LDC
    add     pCRow2, pCRow2, #16

    ld1     {v6.2s, v7.2s}, [pCRow2]        // column 1, rows 4-7
    fmla    v6.2s, v22.2s, alphaV2
    fmla    v7.2s, v23.2s, alphaV3
    st1     {v6.2s, v7.2s}, [pCRow2]

    ld1     {v0.2s, v1.2s}, [pCRow1]        // column 2, rows 0-3
    fmla    v0.2s, v24.2s, alphaV0
    fmla    v1.2s, v25.2s, alphaV1
    st1     {v0.2s, v1.2s}, [pCRow1]

    add     pCRow2, pCRow1, LDC
    add     pCRow1, pCRow1, #16

    ld1     {v2.2s, v3.2s}, [pCRow1]        // column 2, rows 4-7
    fmla    v2.2s, v26.2s, alphaV2
    fmla    v3.2s, v27.2s, alphaV3
    st1     {v2.2s, v3.2s}, [pCRow1]

    ld1     {v4.2s, v5.2s}, [pCRow2]        // column 3, rows 0-3
    fmla    v4.2s, v28.2s, alphaV0
    fmla    v5.2s, v29.2s, alphaV1
    st1     {v4.2s, v5.2s}, [pCRow2]

    add     pCRow2, pCRow2, #16

    ld1     {v6.2s, v7.2s}, [pCRow2]        // column 3, rows 4-7
    fmla    v6.2s, v30.2s, alphaV2
    fmla    v7.2s, v31.2s, alphaV3
    st1     {v6.2s, v7.2s}, [pCRow2]

    add     pCRow0, pCRow0, #32             // next 8 rows of C
.endm

/******************************************************************************/

/* Zero the eight accumulators of the 4x4 tile. */
.macro INIT4x4
    fmov    s16, wzr
    fmov    s17, s16
    fmov    s20, s17
    fmov    s21, s16
    fmov    s24, s17
    fmov    s25, s16
    fmov    s28, s17
    fmov    s29, s16
.endm

/* One k-step of the 4x4 tile (4 floats of A, 4 floats of B). */
.macro KERNEL4x4_SUB
    ld1     {v8.2s, v9.2s}, [pB]
    add     pB, pB, #16
    ld1     {v0.2s, v1.2s}, [pA_0]
    add     pA_0, pA_0, #16

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v29.2s, v1.2s, v9.s[1]

    fmla    v20.2s, v0.2s, v8.s[1]
    fmla    v25.2s, v1.2s, v9.s[0]

    fmla    v24.2s, v0.2s, v9.s[0]
    fmla    v21.2s, v1.2s, v8.s[1]

    fmla    v28.2s, v0.2s, v9.s[1]
    fmla    v17.2s, v1.2s, v8.s[0]
.endm

/* C[4 x 4] += alpha * acc, one column per ld1/st1 pair. */
.macro SAVE4x4
    ld1     {v8.2s, v9.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    fmla    v9.2s, v17.2s, alphaV1
    st1     {v8.2s, v9.2s}, [pCRow0]

    add     pCRow1, pCRow0, LDC

    ld1     {v12.2s, v13.2s}, [pCRow1]
    fmla    v12.2s, v20.2s, alphaV2
    fmla    v13.2s, v21.2s, alphaV3
    st1     {v12.2s, v13.2s}, [pCRow1]

    add     pCRow2, pCRow1, LDC

    ld1     {v8.2s, v9.2s}, [pCRow2]
    fmla    v8.2s, v24.2s, alphaV0
    fmla    v9.2s, v25.2s, alphaV1
    st1     {v8.2s, v9.2s}, [pCRow2]

    add     pCRow1, pCRow2, LDC

    ld1     {v12.2s, v13.2s}, [pCRow1]
    fmla    v12.2s, v28.2s, alphaV2
    fmla    v13.2s, v29.2s, alphaV3
    st1     {v12.2s, v13.2s}, [pCRow1]

    add     pCRow0, pCRow0, #16             // next 4 rows of C
.endm

/******************************************************************************/

/* Zero the four accumulators of the 2x4 tile. */
.macro INIT2x4
    fmov    s16, wzr
    fmov    s20, s16
    fmov    s24, s20
    fmov    s28, s16
.endm

/* One k-step of the 2x4 tile (2 floats of A, 4 floats of B). */
.macro KERNEL2x4_SUB
    ld1     {v8.2s, v9.2s}, [pB]
    add     pB, pB, #16
    ld1     {v0.2s}, [pA_0]
    add     pA_0, pA_0, #8

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v20.2s, v0.2s, v8.s[1]
    fmla    v24.2s, v0.2s, v9.s[0]
    fmla    v28.2s, v0.2s, v9.s[1]
.endm

/* C[2 x 4] += alpha * acc. */
.macro SAVE2x4
    ld1     {v8.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    st1     {v8.2s}, [pCRow0]

    add     pCRow1, pCRow0, LDC

    ld1     {v12.2s}, [pCRow1]
    fmla    v12.2s, v20.2s, alphaV1
    st1     {v12.2s}, [pCRow1]

    add     pCRow2, pCRow1, LDC

    ld1     {v8.2s}, [pCRow2]
    fmla    v8.2s, v24.2s, alphaV2
    st1     {v8.2s}, [pCRow2]

    add     pCRow1, pCRow2, LDC

    ld1     {v12.2s}, [pCRow1]
    fmla    v12.2s, v28.2s, alphaV3
    st1     {v12.2s}, [pCRow1]

    add     pCRow0, pCRow0, #8              // next 2 rows of C
.endm

/******************************************************************************/

/* Zero the two accumulators of the 1x4 tile. */
.macro INIT1x4
    fmov    s16, wzr
    fmov    s20, s16
.endm

/* One k-step of the 1x4 tile: the single A value is broadcast over the four
 * B values, so v16 holds columns 0-1 and v20 holds columns 2-3. */
.macro KERNEL1x4_SUB
    ldr     s0, [pA_0]
    add     pA_0, pA_0, #4

    ld1     {v8.2s, v9.2s}, [pB]
    add     pB, pB, #16

    fmla    v16.2s, v8.2s, v0.s[0]
    fmla    v20.2s, v9.2s, v0.s[0]
.endm

/* C[1 x 4] += alpha * acc.  The two lanes of each accumulator belong to
 * different C columns, hence the per-lane loads and stores. */
.macro SAVE1x4
    add     pCRow1, pCRow0, LDC

    ld1     {v8.s}[0], [pCRow0]
    ld1     {v8.s}[1], [pCRow1]
    fmla    v8.2s, v16.2s, alphaV0
    st1     {v8.s}[0], [pCRow0]
    st1     {v8.s}[1], [pCRow1]

    add     pCRow2, pCRow1, LDC
    add     pCRow1, pCRow2, LDC

    ld1     {v12.s}[0], [pCRow2]
    ld1     {v12.s}[1], [pCRow1]
    fmla    v12.2s, v20.2s, alphaV1
    st1     {v12.s}[0], [pCRow2]
    st1     {v12.s}[1], [pCRow1]

    add     pCRow0, pCRow0, #4              // next row of C
.endm

/******************************************************************************/

/* Zero the four accumulators of the 4x2 tile. */
.macro INIT4x2
    fmov    s16, wzr
    fmov    s17, s16
    fmov    s20, s17
    fmov    s21, s16
.endm

/* One k-step of the 4x2 tile (4 floats of A, 2 floats of B). */
.macro KERNEL4x2_SUB
    ld1     {v8.2s}, [pB]
    add     pB, pB, #8
    ld1     {v0.2s, v1.2s}, [pA_0]
    add     pA_0, pA_0, #16

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v17.2s, v1.2s, v8.s[0]
    fmla    v20.2s, v0.2s, v8.s[1]
    fmla    v21.2s, v1.2s, v8.s[1]
.endm

/* C[4 x 2] += alpha * acc. */
.macro SAVE4x2
    ld1     {v8.2s, v9.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    fmla    v9.2s, v17.2s, alphaV1
    st1     {v8.2s, v9.2s}, [pCRow0]

    add     pCRow1, pCRow0, LDC

    ld1     {v12.2s, v13.2s}, [pCRow1]
    fmla    v12.2s, v20.2s, alphaV2
    fmla    v13.2s, v21.2s, alphaV3
    st1     {v12.2s, v13.2s}, [pCRow1]

    add     pCRow0, pCRow0, #16             // next 4 rows of C
.endm

/******************************************************************************/

/* Zero the two accumulators of the 2x2 tile. */
.macro INIT2x2
    fmov    s16, wzr
    fmov    s20, s16
.endm

/* One k-step of the 2x2 tile. */
.macro KERNEL2x2_SUB
    ld1     {v8.2s}, [pB]
    add     pB, pB, #8

    ld1     {v0.2s}, [pA_0]
    add     pA_0, pA_0, #8

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v20.2s, v0.2s, v8.s[1]
.endm

/* C[2 x 2] += alpha * acc. */
.macro SAVE2x2
    ld1     {v8.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    st1     {v8.2s}, [pCRow0]

    add     pCRow1, pCRow0, LDC

    ld1     {v12.2s}, [pCRow1]
    fmla    v12.2s, v20.2s, alphaV1
    st1     {v12.2s}, [pCRow1]

    add     pCRow0, pCRow0, #8              // next 2 rows of C
.endm

/******************************************************************************/

/* Zero the accumulator of the 1x2 tile. */
.macro INIT1x2
    fmov    s16, wzr
.endm

/* One k-step of the 1x2 tile: v16 lanes hold columns 0 and 1. */
.macro KERNEL1x2_SUB
    ld1     {v8.2s}, [pB]
    add     pB, pB, #8

    ldr     s0, [pA_0]
    add     pA_0, pA_0, #4

    fmla    v16.2s, v8.2s, v0.s[0]
.endm

/* C[1 x 2] += alpha * acc (the two lanes go to two different C columns). */
.macro SAVE1x2
    add     pCRow1, pCRow0, LDC

    ld1     {v8.s}[0], [pCRow0]
    ld1     {v8.s}[1], [pCRow1]
    fmla    v8.2s, v16.2s, alphaV0
    st1     {v8.s}[0], [pCRow0]
    st1     {v8.s}[1], [pCRow1]

    add     pCRow0, pCRow0, #4              // next row of C
.endm

/******************************************************************************/

/* Zero the two accumulators of the 4x1 tile. */
.macro INIT4x1
    fmov    s16, wzr
    fmov    s17, s16
.endm

/* One k-step of the 4x1 tile (4 floats of A, 1 float of B). */
.macro KERNEL4x1_SUB
    ldr     s8, [pB]
    add     pB, pB, #4

    ld1     {v0.2s, v1.2s}, [pA_0]
    add     pA_0, pA_0, #16

    fmla    v16.2s, v0.2s, v8.s[0]
    fmla    v17.2s, v1.2s, v8.s[0]
.endm

/* C[4 x 1] += alpha * acc. */
.macro SAVE4x1
    ld1     {v8.2s, v9.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    fmla    v9.2s, v17.2s, alphaV1
    st1     {v8.2s, v9.2s}, [pCRow0]

    add     pCRow0, pCRow0, #16             // next 4 rows of C
.endm

/******************************************************************************/

/* Zero the accumulator of the 2x1 tile. */
.macro INIT2x1
    fmov    s16, wzr
.endm

/* One k-step of the 2x1 tile. */
.macro KERNEL2x1_SUB
    ldr     s8, [pB]
    add     pB, pB, #4

    ld1     {v0.2s}, [pA_0]
    add     pA_0, pA_0, #8

    fmla    v16.2s, v0.2s, v8.s[0]
.endm

/* C[2 x 1] += alpha * acc. */
.macro SAVE2x1
    ld1     {v8.2s}, [pCRow0]
    fmla    v8.2s, v16.2s, alphaV0
    st1     {v8.2s}, [pCRow0]

    add     pCRow0, pCRow0, #8              // next 2 rows of C
.endm

/******************************************************************************/

/* Zero the accumulator of the 1x1 tile. */
.macro INIT1x1
    fmov    s16, wzr
.endm

/* One k-step of the 1x1 tile (scalar multiply-add). */
.macro KERNEL1x1_SUB
    ldr     s8, [pB]
    add     pB, pB, #4

    ldr     s0, [pA_0]
    add     pA_0, pA_0, #4

    fmadd   s16, s0, s8, s16
.endm

/* C[1 x 1] += alpha * acc. */
.macro SAVE1x1
    ldr     s8, [pCRow0]
    fmadd   s8, s16, alpha0, s8
    str     s8, [pCRow0]

    add     pCRow0, pCRow0, #4              // next row of C
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

    PROLOGUE

    .align 5
    // Frame: 11 * 16 bytes.  d8-d15 and x19 are callee-saved registers this
    // routine modifies; the remaining slots are kept from the original frame
    // layout.  x18 (platform register) is intentionally neither saved,
    // restored nor used; the upper half of slot 5 is simply left unused.
    add     sp, sp, #-(11 * 16)
    stp     d8, d9, [sp, #(0 * 16)]
    stp     d10, d11, [sp, #(1 * 16)]
    stp     d12, d13, [sp, #(2 * 16)]
    stp     d14, d15, [sp, #(3 * 16)]
    stp     d16, d17, [sp, #(4 * 16)]
    str     x19, [sp, #(5 * 16)]
    stp     x20, x21, [sp, #(6 * 16)]
    stp     x22, x23, [sp, #(7 * 16)]
    stp     x24, x25, [sp, #(8 * 16)]
    stp     x26, x27, [sp, #(9 * 16)]
    str     x28, [sp, #(10 * 16)]

    // Keep four copies of alpha; s0/v0 is reused as an A operand below.
    fmov    alpha0, s0
    fmov    alpha1, s0
    fmov    alpha2, s0
    fmov    alpha3, s0

    lsl     LDC, LDC, #2                    // ldc = ldc * 4 (bytes per C column)

    mov     pB, origPB

    mov     counterJ, origN
    asr     counterJ, counterJ, #2          // J = N / 4
    cmp     counterJ, #0
    ble     .Lsgemm_kernel_L2_BEGIN

/******************************************************************************/
/* N direction: blocks of 4 columns                                           */

.Lsgemm_kernel_L4_BEGIN:
    mov     pCRow0, pC                      // pCRow0 = C
    add     pC, pC, LDC, lsl #2             // pC += 4 columns

    lsl     temp, origK, #4                 // k * 4 * 4 = bytes in one A panel
    mov     pA_0, origPA                    // pA_0 = start of A array
    add     pA_1, temp, pA_0
    add     pA_2, temp, pA_1
    add     pA_3, temp, pA_2

.Lsgemm_kernel_L4_M16_BEGIN:

    mov     counterI, origM
    asr     counterI, counterI, #4          // counterI = M / 16
    cmp     counterI, #0
    ble     .Lsgemm_kernel_L4_M8_BEGIN

.Lsgemm_kernel_L4_M16_20:

    mov     pB, origPB
    asr     counterL, origK, #1             // L = K / 2 (pairs of k-steps)
    cmp     counterL, #2                    // is there at least 4 to do?
    blt     .Lsgemm_kernel_L4_M16_32

    KERNEL16x4_I                            // do one in the K
    KERNEL16x4_M2                           // do another in the K

    subs    counterL, counterL, #2
    ble     .Lsgemm_kernel_L4_M16_22a
    .align 5

.Lsgemm_kernel_L4_M16_22:

    KERNEL16x4_M1
    KERNEL16x4_M2

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M16_22


.Lsgemm_kernel_L4_M16_22a:

    KERNEL16x4_M1
    KERNEL16x4_E

    b       .Lsgemm_kernel_L4_M16_44

.Lsgemm_kernel_L4_M16_32:                   // K / 2 is 0 or 1 here

    tst     counterL, #1
    ble     .Lsgemm_kernel_L4_M16_40        // K / 2 == 0: nothing pipelined

    KERNEL16x4_I

    KERNEL16x4_E

    b       .Lsgemm_kernel_L4_M16_44


.Lsgemm_kernel_L4_M16_40:

    INIT16x4

.Lsgemm_kernel_L4_M16_44:

    ands    counterL, origK, #1             // odd K: one trailing k-step
    ble     .Lsgemm_kernel_L4_M16_100

.Lsgemm_kernel_L4_M16_46:

    KERNEL16x4_SUB

.Lsgemm_kernel_L4_M16_100:

    SAVE16x4

.Lsgemm_kernel_L4_M16_END:
    lsl     temp, origK, #4                 // k * 4 * 4 = one A panel
    add     pA_0, pA_0, temp                // pA_0 already advanced one panel;
    add     pA_0, pA_0, temp                // skip the three panels consumed
    add     pA_0, pA_0, temp                // through pA_1..pA_3
    add     pA_1, pA_0, temp
    add     pA_2, pA_1, temp
    add     pA_3, pA_2, temp
    subs    counterI, counterI, #1
    bne     .Lsgemm_kernel_L4_M16_20

.Lsgemm_kernel_L4_M8_BEGIN:
    mov     counterI, origM
    tst     counterI, #15                   // any rows left (M % 16)?
    ble     .Lsgemm_kernel_L4_END

    tst     counterI, #8                    // is bit 3 of M set?
    ble     .Lsgemm_kernel_L4_M4_BEGIN

.Lsgemm_kernel_L4_M8_20:

    INIT8x4

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L4_M8_40

.Lsgemm_kernel_L4_M8_22:

    KERNEL8x4_SUB
    KERNEL8x4_SUB
    KERNEL8x4_SUB
    KERNEL8x4_SUB

    KERNEL8x4_SUB
    KERNEL8x4_SUB
    KERNEL8x4_SUB
    KERNEL8x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M8_22


.Lsgemm_kernel_L4_M8_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L4_M8_100

.Lsgemm_kernel_L4_M8_42:

    KERNEL8x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M8_42

.Lsgemm_kernel_L4_M8_100:

    SAVE8x4

.Lsgemm_kernel_L4_M8_END:
    lsl     temp, origK, #4                 // k * 4 * 4
    add     pA_0, pA_0, temp                // skip the panel read through pA_1

.Lsgemm_kernel_L4_M4_BEGIN:
    mov     counterI, origM
    tst     counterI, #7                    // any rows left (M % 8)?
    ble     .Lsgemm_kernel_L4_END

    tst     counterI, #4                    // is bit 2 of M set?
    ble     .Lsgemm_kernel_L4_M2_BEGIN

.Lsgemm_kernel_L4_M4_20:

    INIT4x4

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L4_M4_40

.Lsgemm_kernel_L4_M4_22:

    KERNEL4x4_SUB
    KERNEL4x4_SUB
    KERNEL4x4_SUB
    KERNEL4x4_SUB

    KERNEL4x4_SUB
    KERNEL4x4_SUB
    KERNEL4x4_SUB
    KERNEL4x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M4_22


.Lsgemm_kernel_L4_M4_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L4_M4_100

.Lsgemm_kernel_L4_M4_42:

    KERNEL4x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M4_42

.Lsgemm_kernel_L4_M4_100:

    SAVE4x4

.Lsgemm_kernel_L4_M4_END:


.Lsgemm_kernel_L4_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                    // any rows left (M % 4)?
    ble     .Lsgemm_kernel_L4_END

    tst     counterI, #2                    // is bit 1 of M set?
    ble     .Lsgemm_kernel_L4_M1_BEGIN

.Lsgemm_kernel_L4_M2_20:

    INIT2x4

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L4_M2_40

.Lsgemm_kernel_L4_M2_22:

    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB

    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB
    KERNEL2x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M2_22


.Lsgemm_kernel_L4_M2_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L4_M2_100

.Lsgemm_kernel_L4_M2_42:

    KERNEL2x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M2_42

.Lsgemm_kernel_L4_M2_100:

    SAVE2x4

.Lsgemm_kernel_L4_M2_END:


.Lsgemm_kernel_L4_M1_BEGIN:

    tst     counterI, #1                    // is M odd?
    ble     .Lsgemm_kernel_L4_END

.Lsgemm_kernel_L4_M1_20:

    INIT1x4

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L4_M1_40

.Lsgemm_kernel_L4_M1_22:
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB

    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB
    KERNEL1x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M1_22


.Lsgemm_kernel_L4_M1_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L4_M1_100

.Lsgemm_kernel_L4_M1_42:

    KERNEL1x4_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L4_M1_42

.Lsgemm_kernel_L4_M1_100:

    SAVE1x4


.Lsgemm_kernel_L4_END:

    lsl     temp, origK, #4
    add     origPB, origPB, temp            // B = B + K * 4 * 4

    subs    counterJ, counterJ, #1          // j--
    bgt     .Lsgemm_kernel_L4_BEGIN


/******************************************************************************/
/* N direction: remaining block of 2 columns                                  */

.Lsgemm_kernel_L2_BEGIN:                    // fewer than 4 columns left in N direction

    mov     counterJ, origN
    tst     counterJ, #3                    // any columns left (N % 4)?
    ble     .Lsgemm_kernel_L999

    tst     counterJ, #2                    // is bit 1 of N set?
    ble     .Lsgemm_kernel_L1_BEGIN

    mov     pCRow0, pC                      // pCRow0 = pC

    add     pC, pC, LDC, lsl #1             // pC += 2 columns

    mov     pA_0, origPA                    // pA_0 = A


.Lsgemm_kernel_L2_M4_BEGIN:

    mov     counterI, origM
    asr     counterI, counterI, #2          // counterI = M / 4
    cmp     counterI, #0
    ble     .Lsgemm_kernel_L2_M2_BEGIN

.Lsgemm_kernel_L2_M4_20:

    INIT4x2

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L2_M4_40
    .align 5

.Lsgemm_kernel_L2_M4_22:
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB

    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB
    KERNEL4x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M4_22


.Lsgemm_kernel_L2_M4_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L2_M4_100

.Lsgemm_kernel_L2_M4_42:

    KERNEL4x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M4_42

.Lsgemm_kernel_L2_M4_100:

    SAVE4x2

.Lsgemm_kernel_L2_M4_END:

    subs    counterI, counterI, #1
    bgt     .Lsgemm_kernel_L2_M4_20


.Lsgemm_kernel_L2_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                    // any rows left (M % 4)?
    ble     .Lsgemm_kernel_L2_END

    tst     counterI, #2                    // is bit 1 of M set?
    ble     .Lsgemm_kernel_L2_M1_BEGIN

.Lsgemm_kernel_L2_M2_20:

    INIT2x2

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L2_M2_40

.Lsgemm_kernel_L2_M2_22:

    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB

    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB
    KERNEL2x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M2_22


.Lsgemm_kernel_L2_M2_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L2_M2_100

.Lsgemm_kernel_L2_M2_42:

    KERNEL2x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M2_42

.Lsgemm_kernel_L2_M2_100:

    SAVE2x2

.Lsgemm_kernel_L2_M2_END:


.Lsgemm_kernel_L2_M1_BEGIN:

    tst     counterI, #1                    // is M odd?
    ble     .Lsgemm_kernel_L2_END

.Lsgemm_kernel_L2_M1_20:

    INIT1x2

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L2_M1_40

.Lsgemm_kernel_L2_M1_22:
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB

    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB
    KERNEL1x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M1_22


.Lsgemm_kernel_L2_M1_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L2_M1_100

.Lsgemm_kernel_L2_M1_42:

    KERNEL1x2_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L2_M1_42

.Lsgemm_kernel_L2_M1_100:

    SAVE1x2


.Lsgemm_kernel_L2_END:
    add     origPB, origPB, origK, lsl #3   // B = B + K * 2 * 4

/******************************************************************************/
/* N direction: last single column                                            */

.Lsgemm_kernel_L1_BEGIN:

    mov     counterJ, origN
    tst     counterJ, #1                    // is N odd?
    ble     .Lsgemm_kernel_L999             // done


    mov     pCRow0, pC                      // pCRow0 = C
    add     pC, pC, LDC                     // Update pC to point to next

    mov     pA_0, origPA                    // pA_0 = A


.Lsgemm_kernel_L1_M4_BEGIN:

    mov     counterI, origM
    asr     counterI, counterI, #2          // counterI = M / 4
    cmp     counterI, #0
    ble     .Lsgemm_kernel_L1_M2_BEGIN

.Lsgemm_kernel_L1_M4_20:

    INIT4x1

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L1_M4_40
    .align 5

.Lsgemm_kernel_L1_M4_22:
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB

    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB
    KERNEL4x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M4_22


.Lsgemm_kernel_L1_M4_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L1_M4_100

.Lsgemm_kernel_L1_M4_42:

    KERNEL4x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M4_42

.Lsgemm_kernel_L1_M4_100:

    SAVE4x1

.Lsgemm_kernel_L1_M4_END:

    subs    counterI, counterI, #1
    bgt     .Lsgemm_kernel_L1_M4_20


.Lsgemm_kernel_L1_M2_BEGIN:

    mov     counterI, origM
    tst     counterI, #3                    // any rows left (M % 4)?
    ble     .Lsgemm_kernel_L1_END

    tst     counterI, #2                    // is bit 1 of M set?
    ble     .Lsgemm_kernel_L1_M1_BEGIN

.Lsgemm_kernel_L1_M2_20:

    INIT2x1

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L1_M2_40

.Lsgemm_kernel_L1_M2_22:

    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB

    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB
    KERNEL2x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M2_22


.Lsgemm_kernel_L1_M2_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L1_M2_100

.Lsgemm_kernel_L1_M2_42:

    KERNEL2x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M2_42

.Lsgemm_kernel_L1_M2_100:

    SAVE2x1

.Lsgemm_kernel_L1_M2_END:


.Lsgemm_kernel_L1_M1_BEGIN:

    tst     counterI, #1                    // is M odd?
    ble     .Lsgemm_kernel_L1_END

.Lsgemm_kernel_L1_M1_20:

    INIT1x1

    mov     pB, origPB
    asr     counterL, origK, #3             // counterL = K / 8
    cmp     counterL, #0
    ble     .Lsgemm_kernel_L1_M1_40

.Lsgemm_kernel_L1_M1_22:
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB

    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB
    KERNEL1x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M1_22


.Lsgemm_kernel_L1_M1_40:

    ands    counterL, origK, #7             // counterL = K % 8
    ble     .Lsgemm_kernel_L1_M1_100

.Lsgemm_kernel_L1_M1_42:

    KERNEL1x1_SUB

    subs    counterL, counterL, #1
    bgt     .Lsgemm_kernel_L1_M1_42

.Lsgemm_kernel_L1_M1_100:

    SAVE1x1


.Lsgemm_kernel_L1_END:


.Lsgemm_kernel_L999:
    mov     x0, #0                          // set return value
    ldp     d8, d9, [sp, #(0 * 16)]
    ldp     d10, d11, [sp, #(1 * 16)]
    ldp     d12, d13, [sp, #(2 * 16)]
    ldp     d14, d15, [sp, #(3 * 16)]
    ldp     d16, d17, [sp, #(4 * 16)]
    ldr     x19, [sp, #(5 * 16)]            // x18 was never touched
    ldp     x20, x21, [sp, #(6 * 16)]
    ldp     x22, x23, [sp, #(7 * 16)]
    ldp     x24, x25, [sp, #(8 * 16)]
    ldp     x26, x27, [sp, #(9 * 16)]
    ldr     x28, [sp, #(10 * 16)]
    add     sp, sp, #(11*16)
    ret

    EPILOGUE
