1/******************************************************************************* 2Copyright (c) 2015, The OpenBLAS Project 3All rights reserved. 4Redistribution and use in source and binary forms, with or without 5modification, are permitted provided that the following conditions are 6met: 71. Redistributions of source code must retain the above copyright 8notice, this list of conditions and the following disclaimer. 92. Redistributions in binary form must reproduce the above copyright 10notice, this list of conditions and the following disclaimer in 11the documentation and/or other materials provided with the 12distribution. 133. Neither the name of the OpenBLAS project nor the names of 14its contributors may be used to endorse or promote products 15derived from this software without specific prior written permission. 16THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 17AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 18IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 19ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE 20LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 21DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 22SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 23CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 24OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 25USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*******************************************************************************/

#define ASSEMBLER
#include "common.h"

/*
 * DGEMM kernel (double precision, C += alpha * A*B panel product).
 * Argument registers on entry (AAPCS64):
 *          X0      X1      X2      s0        X3        x4        x5    x6
 * int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )
 * NOTE(review): the comment says s0 for alpha, but this is the double kernel —
 * alpha actually arrives in d0 (see the fmov alpha broadcasts in the prologue).
 */

/* Symbolic names for the general-purpose registers used by the kernel. */
#define origM		x0		// row count of the A panel (M)
#define origN		x1		// column count of the B panel (N)
#define origK		x2		// inner (shared) dimension K
#define origPA		x3		// base of packed A
#define origPB		x4		// base of packed B
#define pC		x5		// current C panel pointer
#define LDC		x6		// leading dimension of C (scaled to bytes below)
#define temp		x7		// scratch for address arithmetic
#define counterL	x8		// K-loop counter
#define counterI	x9		// M-loop counter
#define counterJ	x10		// N-loop counter
#define pB		x11		// walking pointer into packed B
#define pCRow0		x12		// C row pointers used by the SAVE macros
#define pCRow1		x13
#define pCRow2		x14
#define pA		x15		// walking pointer into packed A

/* Four copies of alpha so independent fmla chains avoid a shared-register
 * dependency; alphaN is the scalar view, alphaVN the lane view for fmla. */
#define alpha0		d2
#define alphaV0		v2.d[0]
#define alpha1		d3
#define alphaV1		v3.d[0]
#define alpha2		d6
#define alphaV2		v6.d[0]
#define alpha3		d7
#define alphaV3		v7.d[0]

// Integer register allocation map:
// 00 origM
// 01 origN
// 02 origK
// 03 origPA
// 04 origPB
// 05 pC
// 06 origLDC -> LDC
// 07 temp
// 08 counterL
// 09 counterI
// 10 counterJ
// 11 pB
// 12 pCRow0
// 13 pCRow1
// 14 pCRow2
// 15 pA
// 16
// 17
// 18 must save
// 19 must save
// 20 must save
// 21 must save
// 22 must save
// 23 must save
// 24 must save
// 25 must save
// 26 must save
// 27 must save
// 28 must save
// 29 frame
// 30 link
// 31 sp

// NEON register allocation map ("must save" = callee-saved low halves v8-v15
// per AAPCS64; they are preserved in the prologue):
//v00 ALPHA -> pA00, pA01
//v01 pA02, pA03
//v02 ALPHA0
//v03 ALPHA1
//v04 pA10, pA11
//v05 pA12, pA13
//v06 ALPHA2
//v07 ALPHA3
//v08 must save pB0_0, pB0_1
//v09 must save pB0_2, pB0_3
//v10 must save pB0_4, pB0_5
//v11 must save pB0_6, pB0_7
//v12 must save pB1_0, pB1_1
//v13 must save pB1_2, pB1_3
//v14 must save pB1_4, pB1_5
//v15 must save pB1_6, pB1_7
//v16 must save C00, C01
//v17 must save C02, C03
//v18 C04, C05
//v19 C06, C07
//v20 C10, C11
//v21 C12, C13
//v22 C14, C15
//v23 C16, C17
//v24 C20, C21
//v25 C22, C23
//v26 C24, C25
//v27 C26, C27
//v28 C30, C31
//v29 C32, C33
//v30 C34, C35
//v31 C36, C37
/*******************************************************************************
* Macro definitions
*******************************************************************************/

/* Zero the 16 accumulator registers (v16-v31) for a 4x8 tile.
 * Copying from an already-zeroed d-register (fmov dN, d16) instead of xzr for
 * some lanes spreads the zeroing across ports — presumably a scheduling
 * micro-optimization; TODO confirm against the target pipeline. */
.macro INIT4x8
	fmov	d16, xzr
	fmov	d17, xzr
	fmov	d18, xzr
	fmov	d19, d16
	fmov	d20, xzr
	fmov	d21, d16
	fmov	d22, d17
	fmov	d23, d18
	fmov	d24, xzr
	fmov	d25, d16
	fmov	d26, d17
	fmov	d27, d18
	fmov	d28, xzr
	fmov	d29, d16
	fmov	d30, d17
	fmov	d31, d18
.endm

/* First K iteration of the software-pipelined 4x8 kernel: loads A (4 doubles
 * into v0/v1) and B (8 doubles into v8-v11), initializes the accumulators
 * with fmul (no prior accumulate), then preloads the NEXT iteration's operands
 * into v4/v5 (A) and v12-v15 (B) for KERNEL4x8_M2. */
.macro KERNEL4x8_I
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	// accumulators = A-column x B-row outer product (first term)
	fmul	v16.2d, v0.2d, v8.d[0]
	fmul	v17.2d, v1.2d, v8.d[0]
	fmul	v18.2d, v0.2d, v8.d[1]
	fmul	v19.2d, v1.2d, v8.d[1]

	fmul	v20.2d, v0.2d, v9.d[0]
	fmul	v21.2d, v1.2d, v9.d[0]
	fmul	v22.2d, v0.2d, v9.d[1]
	fmul	v23.2d, v1.2d, v9.d[1]

	fmul	v24.2d, v0.2d, v10.d[0]
	fmul	v25.2d, v1.2d, v10.d[0]
	fmul	v26.2d, v0.2d, v10.d[1]
	fmul	v27.2d, v1.2d, v10.d[1]

	fmul	v28.2d, v0.2d, v11.d[0]
	fmul	v29.2d, v1.2d, v11.d[0]
	fmul	v30.2d, v0.2d, v11.d[1]
	fmul	v31.2d, v1.2d, v11.d[1]

	// prefetch next iteration's operands into the alternate register set
	ld1	{v12.2d, v13.2d}, [pB]
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]
	add	pA, pA, #32
	ld1	{v14.2d, v15.2d}, [pB]
	add	pB, pB, #32
.endm

/* Pipelined "odd" K step: accumulate using the v0/v1 + v8-v11 operand set
 * while loading the next step's operands into v4/v5 + v12-v15.
 * (Macro continues past this chunk boundary.) */
.macro KERNEL4x8_M1
	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]
	fmla	v19.2d, v1.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]
	fmla	v23.2d, v1.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v25.2d, v1.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]
	fmla	v27.2d, v1.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v29.2d, v1.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
	fmla	v31.2d, v1.2d, v11.d[1]

	ld1	{v12.2d, v13.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v14.2d, v15.2d}, [pB]
	add	pB, pB, #32

	prfm	PLDL1KEEP, [pA, #512]		// stream the A panel into L1
.endm

/* Pipelined "even" K step: the mirror of KERNEL4x8_M1 — accumulate from the
 * v4/v5 + v12-v15 set while reloading v0/v1 + v8-v11 for the next M1. */
.macro KERNEL4x8_M2
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v18.2d, v4.2d, v12.d[1]
	fmla	v19.2d, v5.2d, v12.d[1]

	fmla	v20.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v13.d[0]
	fmla	v22.2d, v4.2d, v13.d[1]
	fmla	v23.2d, v5.2d, v13.d[1]

	fmla	v24.2d, v4.2d, v14.d[0]
	fmla	v25.2d, v5.2d, v14.d[0]
	fmla	v26.2d, v4.2d, v14.d[1]
	fmla	v27.2d, v5.2d, v14.d[1]

	fmla	v28.2d, v4.2d, v15.d[0]
	fmla	v29.2d, v5.2d, v15.d[0]
	fmla	v30.2d, v4.2d, v15.d[1]
	fmla	v31.2d, v5.2d, v15.d[1]

	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	prfm	PLDL1KEEP, [pB, #512]		// stream the B panel into L1
.endm

/* Pipeline drain: final accumulate from the v4/v5 + v12-v15 set, with no
 * further loads (operands were preloaded by the preceding M1). */
.macro KERNEL4x8_E
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v17.2d, v5.2d, v12.d[0]
	fmla	v18.2d, v4.2d, v12.d[1]
	fmla	v19.2d, v5.2d, v12.d[1]

	fmla	v20.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v13.d[0]
	fmla	v22.2d, v4.2d, v13.d[1]
	fmla	v23.2d, v5.2d, v13.d[1]

	fmla	v24.2d, v4.2d, v14.d[0]
	fmla	v25.2d, v5.2d, v14.d[0]
	fmla	v26.2d, v4.2d, v14.d[1]
	fmla	v27.2d, v5.2d, v14.d[1]

	fmla	v28.2d, v4.2d, v15.d[0]
	fmla	v29.2d, v5.2d, v15.d[0]
	fmla	v30.2d, v4.2d, v15.d[1]
	fmla	v31.2d, v5.2d, v15.d[1]
.endm

/* Un-pipelined single K step (used for the K%2 remainder): load then
 * accumulate in one shot. */
.macro KERNEL4x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]
	fmla	v19.2d, v1.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]
	fmla	v23.2d, v1.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v25.2d, v1.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]
	fmla	v27.2d, v1.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v29.2d, v1.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
	fmla	v31.2d, v1.2d, v11.d[1]
.endm

/* Write back a 4x8 tile: for each of the 8 C rows (stride LDC bytes),
 * C_row += accumulators * alpha.  pCRow1/pCRow2 leapfrog down the rows so the
 * next row's address computes while the current row is loaded/stored.
 * Advances pCRow0 by 32 bytes (4 doubles) for the next tile in M. */
.macro SAVE4x8
	add	pCRow1, pCRow0, LDC

	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d, v11.2d}, [pCRow1]
	fmla	v10.2d, v18.2d, alphaV2
	fmla	v11.2d, v19.2d, alphaV3
	st1	{v10.2d, v11.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow2]
	fmla	v12.2d, v20.2d, alphaV0
	fmla	v13.2d, v21.2d, alphaV1
	st1	{v12.2d, v13.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v14.2d, v15.2d}, [pCRow1]
	fmla	v14.2d, v22.2d, alphaV2
	fmla	v15.2d, v23.2d, alphaV3
	st1	{v14.2d, v15.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v8.2d, v9.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	fmla	v9.2d, v25.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d, v11.2d}, [pCRow1]
	fmla	v10.2d, v26.2d, alphaV2
	fmla	v11.2d, v27.2d, alphaV3
	st1	{v10.2d, v11.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow2]
	fmla	v12.2d, v28.2d, alphaV0
	fmla	v13.2d, v29.2d, alphaV1
	st1	{v12.2d, v13.2d}, [pCRow2]

	ld1	{v14.2d, v15.2d}, [pCRow1]
	fmla	v14.2d, v30.2d, alphaV2
	fmla	v15.2d, v31.2d, alphaV3
	st1	{v14.2d, v15.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm

/******************************************************************************/

/* Zero the 8 even-numbered accumulators used by the 2x8 tile. */
.macro INIT2x8
	fmov	d16, xzr
	fmov	d18, xzr
	fmov	d20, xzr
	fmov	d22, d16
	fmov	d24, xzr
	fmov	d26, d16
	fmov	d28, xzr
	fmov	d30, d16
.endm

/* One K step of the 2x8 tile: 2 doubles of A (v0) times 8 doubles of B
 * (v8-v11), accumulated into v16..v30 (even registers). */
.macro KERNEL2x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d}, [pA]
	add	pA, pA, #16
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v18.2d, v0.2d, v8.d[1]

	fmla	v20.2d, v0.2d, v9.d[0]
	fmla	v22.2d, v0.2d, v9.d[1]

	fmla	v24.2d, v0.2d, v10.d[0]
	fmla	v26.2d, v0.2d, v10.d[1]

	fmla	v28.2d, v0.2d, v11.d[0]
	fmla	v30.2d, v0.2d, v11.d[1]
.endm

/* Write back a 2x8 tile: 8 C rows, 2 doubles each, C += alpha * acc.
 * Advances pCRow0 by 16 bytes. */
.macro SAVE2x8
	add	pCRow1, pCRow0, LDC

	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d}, [pCRow1]
	fmla	v10.2d, v18.2d, alphaV2
	st1	{v10.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow2]
	fmla	v12.2d, v20.2d, alphaV0
	st1	{v12.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v14.2d}, [pCRow1]
	fmla	v14.2d, v22.2d, alphaV2
	st1	{v14.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v8.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	st1	{v8.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v10.2d}, [pCRow1]
	fmla	v10.2d, v26.2d, alphaV2
	st1	{v10.2d}, [pCRow1]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow2]
	fmla	v12.2d, v28.2d, alphaV0
	st1	{v12.2d}, [pCRow2]

	add	pCRow2, pCRow1, LDC

	ld1	{v14.2d}, [pCRow1]
	fmla	v14.2d, v30.2d, alphaV2
	st1	{v14.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* Zero the 4 accumulators used by the 1x8 tile. */
.macro INIT1x8
	fmov	d16, xzr
	fmov	d20, xzr
	fmov	d24, xzr
	fmov	d28, xzr
.endm

/* One K step of the 1x8 tile: a single A scalar (d0) broadcast against 8 B
 * values; note the operand order is swapped (vector = B, lane = A). */
.macro KERNEL1x8_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ldr	d0, [pA]
	add	pA, pA, #8
	ld1	{v10.2d, v11.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v8.2d, v0.d[0]
	fmla	v20.2d, v9.2d, v0.d[0]
	fmla	v24.2d, v10.2d, v0.d[0]
	fmla	v28.2d, v11.2d, v0.d[0]
.endm

/* Write back a 1x8 tile: one C element in each of 8 rows, processed two rows
 * at a time via the two d-lanes of a q-register. */
.macro SAVE1x8
	add	pCRow1, pCRow0, LDC

	// rows 0-1: lane 0 <- row0, lane 1 <- row1, C += alpha * acc
	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	// rows 2-3
	ld1	{v10.d}[0], [pCRow2]
	ld1	{v10.d}[1], [pCRow1]
	fmla	v10.2d, v20.2d, alphaV1
	st1	{v10.d}[0], [pCRow2]
	st1	{v10.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	// rows 4-5
	ld1	{v12.d}[0], [pCRow2]
	ld1	{v12.d}[1], [pCRow1]
	fmla	v12.2d, v24.2d, alphaV2
	st1	{v12.d}[0], [pCRow2]
	st1	{v12.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	// rows 6-7
	ld1	{v14.d}[0], [pCRow2]
	ld1	{v14.d}[1], [pCRow1]
	fmla	v14.2d, v28.2d, alphaV3
	st1	{v14.d}[0], [pCRow2]
	st1	{v14.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* Zero the 8 accumulators used by the 4x4 tile. */
.macro INIT4x4
	fmov	d16, xzr
	fmov	d17, d16
	fmov	d20, d17
	fmov	d21, d16
	fmov	d24, d17
	fmov	d25, d16
	fmov	d28, d17
	fmov	d29, d16
.endm

/* First K iteration of the pipelined 4x4 kernel: fmul-initialize the
 * accumulators and preload the next iteration's operands (v4/v5, v12/v13).
 * The fmul pairs are interleaved (v16/v29, v20/v25, ...) — deliberate
 * scheduling to separate dependent lane reads. */
.macro KERNEL4x4_I
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmul	v16.2d, v0.2d, v8.d[0]
	fmul	v29.2d, v1.2d, v9.d[1]

	fmul	v20.2d, v0.2d, v8.d[1]
	fmul	v25.2d, v1.2d, v9.d[0]

	fmul	v24.2d, v0.2d, v9.d[0]
	fmul	v21.2d, v1.2d, v8.d[1]

	fmul	v28.2d, v0.2d, v9.d[1]
	fmul	v17.2d, v1.2d, v8.d[0]

	ld1	{v12.2d, v13.2d}, [pB]
	add	pB, pB, #32
	ld1	{v4.2d, v5.2d}, [pA]
	add	pA, pA, #32
.endm

/* Pipelined "odd" K step of the 4x4 kernel: accumulate from v0/v1 + v8/v9
 * while loading v4/v5 + v12/v13; loads are interleaved between fmla pairs.
 * (Macro continues past this chunk boundary.) */
.macro KERNEL4x4_M1
	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v29.2d, v1.2d, v9.d[1]

	ld1	{v12.2d, v13.2d}, [pB]		// For next round
	add	pB, pB, #32

	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v25.2d, v1.2d, v9.d[0]

	ld1	{v4.2d, v5.2d}, [pA]		// For next round
	add	pA, pA, #32

	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]

	prfm	PLDL1KEEP, [pA, #512]		// stream the A panel into L1

	fmla	v28.2d, v0.2d, v9.d[1]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* Pipelined "even" K step: mirror of KERNEL4x4_M1 using v4/v5 + v12/v13
 * while reloading v0/v1 + v8/v9. */
.macro KERNEL4x4_M2
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v29.2d, v5.2d, v13.d[1]

	ld1	{v8.2d, v9.2d}, [pB]		// For next round
	add	pB, pB, #32

	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v25.2d, v5.2d, v13.d[0]

	ld1	{v0.2d, v1.2d}, [pA]		// For next round
	add	pA, pA, #32

	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]

	prfm	PLDL1KEEP, [pB, #512]		// stream the B panel into L1

	fmla	v28.2d, v4.2d, v13.d[1]
	fmla	v17.2d, v5.2d, v12.d[0]
.endm

/* Pipeline drain for the 4x4 kernel: final accumulate, no loads. */
.macro KERNEL4x4_E
	fmla	v16.2d, v4.2d, v12.d[0]
	fmla	v29.2d, v5.2d, v13.d[1]

	fmla	v20.2d, v4.2d, v12.d[1]
	fmla	v25.2d, v5.2d, v13.d[0]

	fmla	v24.2d, v4.2d, v13.d[0]
	fmla	v21.2d, v5.2d, v12.d[1]

	fmla	v28.2d, v4.2d, v13.d[1]
	fmla	v17.2d, v5.2d, v12.d[0]
.endm

/* Un-pipelined single K step of the 4x4 tile (K remainder). */
.macro KERNEL4x4_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v29.2d, v1.2d, v9.d[1]

	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v25.2d, v1.2d, v9.d[0]

	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v21.2d, v1.2d, v8.d[1]

	fmla	v28.2d, v0.2d, v9.d[1]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* Write back a 4x4 tile: 4 C rows of 4 doubles, C += alpha * acc; advances
 * pCRow0 by 32 bytes. */
.macro SAVE4x4
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV2
	fmla	v13.2d, v21.2d, alphaV3
	st1	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow2, pCRow1, LDC

	ld1	{v8.2d, v9.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV0
	fmla	v9.2d, v25.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow2]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v28.2d, alphaV2
	fmla	v13.2d, v29.2d, alphaV3
	st1	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm

/******************************************************************************/

/* Zero the 4 accumulators used by the 2x4 tile. */
.macro INIT2x4
	fmov	d16, xzr
	fmov	d20, d16
	fmov	d24, d20
	fmov	d28, d16
.endm

/* One K step of the 2x4 tile: 2 doubles of A against 4 B lanes. */
.macro KERNEL2x4_SUB
	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32
	ld1	{v0.2d}, [pA]
	add	pA, pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v24.2d, v0.2d, v9.d[0]
	fmla	v28.2d, v0.2d, v9.d[1]
.endm

/* Write back a 2x4 tile: 4 rows of 2 doubles; advances pCRow0 by 16 bytes. */
.macro SAVE2x4
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.2d}, [pCRow1]

	add	pCRow2, pCRow1, LDC

	ld1	{v8.2d}, [pCRow2]
	fmla	v8.2d, v24.2d, alphaV2
	st1	{v8.2d}, [pCRow2]

	add	pCRow1, pCRow2, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v28.2d, alphaV3
	st1	{v12.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* Zero the 2 accumulators used by the 1x4 tile. */
.macro INIT1x4
	fmov	d16, xzr
	fmov	d20, d16
.endm

/* One K step of the 1x4 tile: one A scalar against 4 B values
 * (vector operand = B, broadcast lane = A). */
.macro KERNEL1x4_SUB
	ldr	d0, [pA]
	add	pA, pA, #8

	ld1	{v8.2d, v9.2d}, [pB]
	add	pB, pB, #32

	fmla	v16.2d, v8.2d, v0.d[0]
	fmla	v20.2d, v9.2d, v0.d[0]
.endm

/* Write back a 1x4 tile: one element in each of 4 rows, two rows per
 * q-register lane pair; advances pCRow0 by 8 bytes. */
.macro SAVE1x4
	add	pCRow1, pCRow0, LDC

	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow2, pCRow1, LDC
	add	pCRow1, pCRow2, LDC

	ld1	{v12.d}[0], [pCRow2]
	ld1	{v12.d}[1], [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.d}[0], [pCRow2]
	st1	{v12.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* Zero the 4 accumulators used by the 4x2 tile. */
.macro INIT4x2
	fmov	d16, xzr
	fmov	d17, d16
	fmov	d20, d17
	fmov	d21, d16
.endm

/* One K step of the 4x2 tile: 4 doubles of A against 2 B lanes. */
.macro KERNEL4x2_SUB
	ld1	{v8.2d}, [pB]
	add	pB, pB, #16
	ld1	{v0.2d, v1.2d}, [pA]
	add	pA, pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
	fmla	v21.2d, v1.2d, v8.d[1]
.endm

/* Write back a 4x2 tile: 2 rows of 4 doubles; advances pCRow0 by 32 bytes. */
.macro SAVE4x2
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow1, pCRow0, LDC

	ld1	{v12.2d, v13.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV2
	fmla	v13.2d, v21.2d, alphaV3
	st1	{v12.2d, v13.2d}, [pCRow1]

	add	pCRow0, pCRow0, #32
.endm

/******************************************************************************/

/* Zero the 2 accumulators used by the 2x2 tile. */
.macro INIT2x2
	fmov	d16, xzr
	fmov	d20, d16
.endm

/* One K step of the 2x2 tile. */
.macro KERNEL2x2_SUB
	ld1	{v8.2d}, [pB]
	add	pB, pB, #16

	ld1	{v0.2d}, [pA]
	add	pA, pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v20.2d, v0.2d, v8.d[1]
.endm

/* Write back a 2x2 tile: 2 rows of 2 doubles; advances pCRow0 by 16 bytes. */
.macro SAVE2x2
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow1 , pCRow0, LDC

	ld1	{v12.2d}, [pCRow1]
	fmla	v12.2d, v20.2d, alphaV1
	st1	{v12.2d}, [pCRow1]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* Zero the single accumulator used by the 1x2 tile. */
.macro INIT1x2
	fmov	d16, xzr
.endm

/* One K step of the 1x2 tile: one A scalar against 2 B values. */
.macro KERNEL1x2_SUB
	ld1	{v8.2d} , [pB]
	add	pB , pB, #16

	ldr	d0 , [pA]
	add	pA, pA, #8

	fmla	v16.2d, v8.2d, v0.d[0]
.endm

/* Write back a 1x2 tile: one element in each of 2 rows via the two d-lanes;
 * advances pCRow0 by 8 bytes. */
.macro SAVE1x2
	add	pCRow1 , pCRow0, LDC

	ld1	{v8.d}[0], [pCRow0]
	ld1	{v8.d}[1], [pCRow1]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.d}[0], [pCRow0]
	st1	{v8.d}[1], [pCRow1]

	add	pCRow0, pCRow0, #8
.endm

/******************************************************************************/

/* Zero the 2 accumulators used by the 4x1 tile. */
.macro INIT4x1
	fmov	d16, xzr
	fmov	d17, d16
.endm

/* One K step of the 4x1 tile: 4 doubles of A against one B scalar. */
.macro KERNEL4x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ld1	{v0.2d, v1.2d}, [pA]
	add	pA , pA, #32

	fmla	v16.2d, v0.2d, v8.d[0]
	fmla	v17.2d, v1.2d, v8.d[0]
.endm

/* Write back a 4x1 tile: one row of 4 doubles; advances pCRow0 by 32 bytes. */
.macro SAVE4x1
	ld1	{v8.2d, v9.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	fmla	v9.2d, v17.2d, alphaV1
	st1	{v8.2d, v9.2d}, [pCRow0]

	add	pCRow0, pCRow0, #32
.endm

/******************************************************************************/

/* Zero the accumulator used by the 2x1 tile. */
.macro INIT2x1
	fmov	d16, xzr
.endm

/* One K step of the 2x1 tile: 2 doubles of A against one B scalar. */
.macro KERNEL2x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ld1	{v0.2d}, [pA]
	add	pA , pA, #16

	fmla	v16.2d, v0.2d, v8.d[0]
.endm

/* Write back a 2x1 tile: one row of 2 doubles; advances pCRow0 by 16 bytes. */
.macro SAVE2x1
	ld1	{v8.2d}, [pCRow0]
	fmla	v8.2d, v16.2d, alphaV0
	st1	{v8.2d}, [pCRow0]

	add	pCRow0, pCRow0, #16
.endm

/******************************************************************************/

/* Zero the scalar accumulator for the 1x1 case. */
.macro INIT1x1
	fmov	d16, xzr
.endm

/* One K step of the 1x1 case: plain scalar fused multiply-add. */
.macro KERNEL1x1_SUB
	ldr	d8, [pB]
	add	pB , pB, #8

	ldr	d0, [pA]
	add	pA , pA, #8

	fmadd	d16, d0, d8, d16
.endm

/* Write back the 1x1 result: C[0] += alpha * acc; advances pCRow0 by 8. */
.macro SAVE1x1
	ldr	d8, [pCRow0]
	fmadd	d8, d16, alpha0, d8
	str	d8, [pCRow0]

	add	pCRow0, pCRow0, #8
.endm

/*******************************************************************************
* End of macro definitions
*******************************************************************************/

	PROLOGUE

	.align 5
	// Reserve 176 bytes (11 * 16) and spill callee-saved registers:
	// d8-d15 per AAPCS64, plus x19-x28.  NOTE(review): d16/d17 and x18 are
	// also saved although AAPCS64 does not require it (d16+ are volatile,
	// x18 is the platform register) — harmless extra stores; confirm intent.
	add	sp, sp, #-(11 * 16)
	stp	d8, d9, [sp, #(0 * 16)]
	stp	d10, d11, [sp, #(1 * 16)]
	stp	d12, d13, [sp, #(2 * 16)]
	stp	d14, d15, [sp, #(3 * 16)]
	stp	d16, d17, [sp, #(4 * 16)]
	stp	x18, x19, [sp, #(5 * 16)]
	stp	x20, x21, [sp, #(6 * 16)]
	stp	x22, x23, [sp, #(7 * 16)]
	stp	x24, x25, [sp, #(8 * 16)]
	stp	x26, x27, [sp, #(9 * 16)]
	str	x28, [sp, #(10 * 16)]

	// Replicate alpha (argument in d0) into the four alpha registers so the
	// SAVE macros can use independent fmla source registers.
	fmov	alpha0, d0
	fmov	alpha1, d0
	fmov	alpha2, d0
	fmov	alpha3, d0

	lsl	LDC, LDC, #3			// ldc = ldc * 8 (doubles -> bytes)

	mov	pB, origPB

	// Outer loop over N in blocks of 8 columns.
	mov	counterJ, origN
	asr
counterJ, counterJ, #3 // J = J / 8 940 cmp counterJ, #0 941 ble .Ldgemm_kernel_L4_BEGIN 942 943/******************************************************************************/ 944 945.Ldgemm_kernel_L8_BEGIN: 946 947 mov pCRow0, pC // pCRow0 = C 948 add pC, pC, LDC, lsl #3 949 950 mov pA, origPA // pA = start of A array 951 952.Ldgemm_kernel_L8_M4_BEGIN: 953 954 mov counterI, origM 955 asr counterI, counterI, #2 // counterI = counterI / 4 956 cmp counterI, #0 957 ble .Ldgemm_kernel_L8_M2_BEGIN 958 959.Ldgemm_kernel_L8_M4_20: 960 961 mov pB, origPB 962 963 asr counterL , origK, #1 // L = K / 2 964 cmp counterL , #2 // is there at least 4 to do? 965 blt .Ldgemm_kernel_L8_M4_32 966 967 KERNEL4x8_I // do one in the K 968 KERNEL4x8_M2 // do another in the K 969 970 subs counterL, counterL, #2 971 ble .Ldgemm_kernel_L8_M4_22a 972 .align 5 973 974.Ldgemm_kernel_L8_M4_22: 975 976 KERNEL4x8_M1 977 KERNEL4x8_M2 978 979 subs counterL, counterL, #1 980 bgt .Ldgemm_kernel_L8_M4_22 981 982 983.Ldgemm_kernel_L8_M4_22a: 984 985 KERNEL4x8_M1 986 KERNEL4x8_E 987 988 b .Ldgemm_kernel_L8_M4_44 989 990.Ldgemm_kernel_L8_M4_32: 991 992 tst counterL, #1 993 ble .Ldgemm_kernel_L8_M4_40 994 995 KERNEL4x8_I 996 997 KERNEL4x8_E 998 999 b .Ldgemm_kernel_L8_M4_44 1000 1001 1002.Ldgemm_kernel_L8_M4_40: 1003 1004 INIT4x8 1005 1006.Ldgemm_kernel_L8_M4_44: 1007 1008 ands counterL , origK, #1 1009 ble .Ldgemm_kernel_L8_M4_100 1010 1011.Ldgemm_kernel_L8_M4_46: 1012 1013 KERNEL4x8_SUB 1014 1015.Ldgemm_kernel_L8_M4_100: 1016 1017 SAVE4x8 1018 1019.Ldgemm_kernel_L8_M4_END: 1020 subs counterI, counterI, #1 1021 bne .Ldgemm_kernel_L8_M4_20 1022 1023.Ldgemm_kernel_L8_M2_BEGIN: 1024 1025 mov counterI, origM 1026 tst counterI , #3 1027 ble .Ldgemm_kernel_L8_END 1028 1029 tst counterI, #2 // counterI = counterI / 2 1030 ble .Ldgemm_kernel_L8_M1_BEGIN 1031 1032.Ldgemm_kernel_L8_M2_20: 1033 1034 INIT2x8 1035 1036 mov pB, origPB 1037 1038 asr counterL , origK, #3 // counterL = counterL / 8 1039 cmp counterL , #0 
1040 ble .Ldgemm_kernel_L8_M2_40 1041 1042.Ldgemm_kernel_L8_M2_22: 1043 1044 KERNEL2x8_SUB 1045 KERNEL2x8_SUB 1046 KERNEL2x8_SUB 1047 KERNEL2x8_SUB 1048 1049 KERNEL2x8_SUB 1050 KERNEL2x8_SUB 1051 KERNEL2x8_SUB 1052 KERNEL2x8_SUB 1053 1054 subs counterL, counterL, #1 1055 bgt .Ldgemm_kernel_L8_M2_22 1056 1057 1058.Ldgemm_kernel_L8_M2_40: 1059 1060 ands counterL , origK, #7 // counterL = counterL % 8 1061 ble .Ldgemm_kernel_L8_M2_100 1062 1063.Ldgemm_kernel_L8_M2_42: 1064 1065 KERNEL2x8_SUB 1066 1067 subs counterL, counterL, #1 1068 bgt .Ldgemm_kernel_L8_M2_42 1069 1070.Ldgemm_kernel_L8_M2_100: 1071 1072 SAVE2x8 1073 1074.Ldgemm_kernel_L8_M2_END: 1075 1076 1077.Ldgemm_kernel_L8_M1_BEGIN: 1078 1079 tst counterI, #1 // counterI = counterI % 2 1080 ble .Ldgemm_kernel_L8_END 1081 1082.Ldgemm_kernel_L8_M1_20: 1083 1084 INIT1x8 1085 1086 mov pB, origPB 1087 1088 asr counterL , origK, #3 // counterL = counterL / 8 1089 cmp counterL , #0 1090 ble .Ldgemm_kernel_L8_M1_40 1091 1092.Ldgemm_kernel_L8_M1_22: 1093 KERNEL1x8_SUB 1094 KERNEL1x8_SUB 1095 KERNEL1x8_SUB 1096 KERNEL1x8_SUB 1097 1098 KERNEL1x8_SUB 1099 KERNEL1x8_SUB 1100 KERNEL1x8_SUB 1101 KERNEL1x8_SUB 1102 1103 subs counterL, counterL, #1 1104 bgt .Ldgemm_kernel_L8_M1_22 1105 1106 1107.Ldgemm_kernel_L8_M1_40: 1108 1109 ands counterL , origK, #7 // counterL = counterL % 8 1110 ble .Ldgemm_kernel_L8_M1_100 1111 1112.Ldgemm_kernel_L8_M1_42: 1113 1114 KERNEL1x8_SUB 1115 1116 subs counterL, counterL, #1 1117 bgt .Ldgemm_kernel_L8_M1_42 1118 1119.Ldgemm_kernel_L8_M1_100: 1120 1121 SAVE1x8 1122 1123.Ldgemm_kernel_L8_END: 1124 1125 lsl temp, origK, #6 1126 add origPB, origPB, temp // B = B + K * 8 * 8 1127 1128 subs counterJ, counterJ , #1 // j-- 1129 bgt .Ldgemm_kernel_L8_BEGIN 1130 1131 1132/******************************************************************************/ 1133 1134.Ldgemm_kernel_L4_BEGIN: 1135 1136 mov counterJ , origN 1137 tst counterJ , #7 1138 ble .Ldgemm_kernel_L999 1139 1140 tst counterJ , #4 1141 ble 
.Ldgemm_kernel_L2_BEGIN 1142 1143 mov pCRow0, pC // pCRow0 = C 1144 add pC, pC, LDC, lsl #2 1145 1146 mov pA, origPA // pA = start of A array 1147 1148.Ldgemm_kernel_L4_M4_BEGIN: 1149 1150 mov counterI, origM 1151 asr counterI, counterI, #2 // counterI = counterI / 4 1152 cmp counterI, #0 1153 ble .Ldgemm_kernel_L4_M2_BEGIN 1154 1155.Ldgemm_kernel_L4_M4_20: 1156 1157 mov pB, origPB 1158 1159 asr counterL , origK, #1 // L = K / 2 1160 cmp counterL , #2 // is there at least 4 to do? 1161 blt .Ldgemm_kernel_L4_M4_32 1162 1163 KERNEL4x4_I // do one in the K 1164 KERNEL4x4_M2 // do another in the K 1165 1166 subs counterL, counterL, #2 1167 ble .Ldgemm_kernel_L4_M4_22a 1168 .align 5 1169 1170.Ldgemm_kernel_L4_M4_22: 1171 1172 KERNEL4x4_M1 1173 KERNEL4x4_M2 1174 1175 subs counterL, counterL, #1 1176 bgt .Ldgemm_kernel_L4_M4_22 1177 1178 1179.Ldgemm_kernel_L4_M4_22a: 1180 1181 KERNEL4x4_M1 1182 KERNEL4x4_E 1183 1184 b .Ldgemm_kernel_L4_M4_44 1185 1186.Ldgemm_kernel_L4_M4_32: 1187 1188 tst counterL, #1 1189 ble .Ldgemm_kernel_L4_M4_40 1190 1191 KERNEL4x4_I 1192 1193 KERNEL4x4_E 1194 1195 b .Ldgemm_kernel_L4_M4_44 1196 1197 1198.Ldgemm_kernel_L4_M4_40: 1199 1200 INIT4x4 1201 1202.Ldgemm_kernel_L4_M4_44: 1203 1204 ands counterL , origK, #1 1205 ble .Ldgemm_kernel_L4_M4_100 1206 1207.Ldgemm_kernel_L4_M4_46: 1208 1209 KERNEL4x4_SUB 1210 1211.Ldgemm_kernel_L4_M4_100: 1212 1213 SAVE4x4 1214 1215.Ldgemm_kernel_L4_M4_END: 1216 subs counterI, counterI, #1 1217 bne .Ldgemm_kernel_L4_M4_20 1218 1219.Ldgemm_kernel_L4_M2_BEGIN: 1220 1221 mov counterI, origM 1222 tst counterI , #3 1223 ble .Ldgemm_kernel_L4_END 1224 1225 tst counterI, #2 // counterI = counterI / 2 1226 ble .Ldgemm_kernel_L4_M1_BEGIN 1227 1228.Ldgemm_kernel_L4_M2_20: 1229 1230 INIT2x4 1231 1232 mov pB, origPB 1233 1234 asr counterL , origK, #3 // counterL = counterL / 8 1235 cmp counterL , #0 1236 ble .Ldgemm_kernel_L4_M2_40 1237 1238.Ldgemm_kernel_L4_M2_22: 1239 1240 KERNEL2x4_SUB 1241 KERNEL2x4_SUB 1242 KERNEL2x4_SUB 
1243 KERNEL2x4_SUB 1244 1245 KERNEL2x4_SUB 1246 KERNEL2x4_SUB 1247 KERNEL2x4_SUB 1248 KERNEL2x4_SUB 1249 1250 subs counterL, counterL, #1 1251 bgt .Ldgemm_kernel_L4_M2_22 1252 1253 1254.Ldgemm_kernel_L4_M2_40: 1255 1256 ands counterL , origK, #7 // counterL = counterL % 8 1257 ble .Ldgemm_kernel_L4_M2_100 1258 1259.Ldgemm_kernel_L4_M2_42: 1260 1261 KERNEL2x4_SUB 1262 1263 subs counterL, counterL, #1 1264 bgt .Ldgemm_kernel_L4_M2_42 1265 1266.Ldgemm_kernel_L4_M2_100: 1267 1268 SAVE2x4 1269 1270.Ldgemm_kernel_L4_M2_END: 1271 1272 1273.Ldgemm_kernel_L4_M1_BEGIN: 1274 1275 tst counterI, #1 // counterI = counterI % 2 1276 ble .Ldgemm_kernel_L4_END 1277 1278.Ldgemm_kernel_L4_M1_20: 1279 1280 INIT1x4 1281 1282 mov pB, origPB 1283 1284 asr counterL , origK, #3 // counterL = counterL / 8 1285 cmp counterL , #0 1286 ble .Ldgemm_kernel_L4_M1_40 1287 1288.Ldgemm_kernel_L4_M1_22: 1289 KERNEL1x4_SUB 1290 KERNEL1x4_SUB 1291 KERNEL1x4_SUB 1292 KERNEL1x4_SUB 1293 1294 KERNEL1x4_SUB 1295 KERNEL1x4_SUB 1296 KERNEL1x4_SUB 1297 KERNEL1x4_SUB 1298 1299 subs counterL, counterL, #1 1300 bgt .Ldgemm_kernel_L4_M1_22 1301 1302 1303.Ldgemm_kernel_L4_M1_40: 1304 1305 ands counterL , origK, #7 // counterL = counterL % 8 1306 ble .Ldgemm_kernel_L4_M1_100 1307 1308.Ldgemm_kernel_L4_M1_42: 1309 1310 KERNEL1x4_SUB 1311 1312 subs counterL, counterL, #1 1313 bgt .Ldgemm_kernel_L4_M1_42 1314 1315.Ldgemm_kernel_L4_M1_100: 1316 1317 SAVE1x4 1318 1319.Ldgemm_kernel_L4_END: 1320 1321 lsl temp, origK, #5 1322 add origPB, origPB, temp // B = B + K * 4 * 8 1323 1324/******************************************************************************/ 1325 1326.Ldgemm_kernel_L2_BEGIN: // less than 2 left in N direction 1327 1328 mov counterJ , origN 1329 tst counterJ , #3 1330 ble .Ldgemm_kernel_L999 // error, N was less than 4? 
1331 1332 tst counterJ , #2 1333 ble .Ldgemm_kernel_L1_BEGIN 1334 1335 mov pCRow0, pC // pCRow0 = pC 1336 1337 add pC,pC,LDC, lsl #1 1338 1339 mov pA, origPA // pA = A 1340 1341 1342.Ldgemm_kernel_L2_M4_BEGIN: 1343 1344 mov counterI, origM 1345 asr counterI, counterI, #2 // counterI = counterI / 4 1346 cmp counterI,#0 1347 ble .Ldgemm_kernel_L2_M2_BEGIN 1348 1349.Ldgemm_kernel_L2_M4_20: 1350 1351 INIT4x2 1352 1353 mov pB, origPB 1354 1355 asr counterL , origK, #3 // counterL = counterL / 8 1356 cmp counterL,#0 1357 ble .Ldgemm_kernel_L2_M4_40 1358 .align 5 1359 1360.Ldgemm_kernel_L2_M4_22: 1361 KERNEL4x2_SUB 1362 KERNEL4x2_SUB 1363 KERNEL4x2_SUB 1364 KERNEL4x2_SUB 1365 1366 KERNEL4x2_SUB 1367 KERNEL4x2_SUB 1368 KERNEL4x2_SUB 1369 KERNEL4x2_SUB 1370 1371 subs counterL, counterL, #1 1372 bgt .Ldgemm_kernel_L2_M4_22 1373 1374 1375.Ldgemm_kernel_L2_M4_40: 1376 1377 ands counterL , origK, #7 // counterL = counterL % 8 1378 ble .Ldgemm_kernel_L2_M4_100 1379 1380.Ldgemm_kernel_L2_M4_42: 1381 1382 KERNEL4x2_SUB 1383 1384 subs counterL, counterL, #1 1385 bgt .Ldgemm_kernel_L2_M4_42 1386 1387.Ldgemm_kernel_L2_M4_100: 1388 1389 SAVE4x2 1390 1391.Ldgemm_kernel_L2_M4_END: 1392 1393 subs counterI, counterI, #1 1394 bgt .Ldgemm_kernel_L2_M4_20 1395 1396 1397.Ldgemm_kernel_L2_M2_BEGIN: 1398 1399 mov counterI, origM 1400 tst counterI , #3 1401 ble .Ldgemm_kernel_L2_END 1402 1403 tst counterI, #2 // counterI = counterI / 2 1404 ble .Ldgemm_kernel_L2_M1_BEGIN 1405 1406.Ldgemm_kernel_L2_M2_20: 1407 1408 INIT2x2 1409 1410 mov pB, origPB 1411 1412 asr counterL , origK, #3 // counterL = counterL / 8 1413 cmp counterL,#0 1414 ble .Ldgemm_kernel_L2_M2_40 1415 1416.Ldgemm_kernel_L2_M2_22: 1417 1418 KERNEL2x2_SUB 1419 KERNEL2x2_SUB 1420 KERNEL2x2_SUB 1421 KERNEL2x2_SUB 1422 1423 KERNEL2x2_SUB 1424 KERNEL2x2_SUB 1425 KERNEL2x2_SUB 1426 KERNEL2x2_SUB 1427 1428 subs counterL, counterL, #1 1429 bgt .Ldgemm_kernel_L2_M2_22 1430 1431 1432.Ldgemm_kernel_L2_M2_40: 1433 1434 ands counterL , origK, #7 
// counterL = counterL % 8 1435 ble .Ldgemm_kernel_L2_M2_100 1436 1437.Ldgemm_kernel_L2_M2_42: 1438 1439 KERNEL2x2_SUB 1440 1441 subs counterL, counterL, #1 1442 bgt .Ldgemm_kernel_L2_M2_42 1443 1444.Ldgemm_kernel_L2_M2_100: 1445 1446 SAVE2x2 1447 1448.Ldgemm_kernel_L2_M2_END: 1449 1450 1451.Ldgemm_kernel_L2_M1_BEGIN: 1452 1453 tst counterI, #1 // counterI = counterI % 2 1454 ble .Ldgemm_kernel_L2_END 1455 1456.Ldgemm_kernel_L2_M1_20: 1457 1458 INIT1x2 1459 1460 mov pB, origPB 1461 1462 asr counterL , origK, #3 // counterL = counterL / 8 1463 cmp counterL, #0 1464 ble .Ldgemm_kernel_L2_M1_40 1465 1466.Ldgemm_kernel_L2_M1_22: 1467 KERNEL1x2_SUB 1468 KERNEL1x2_SUB 1469 KERNEL1x2_SUB 1470 KERNEL1x2_SUB 1471 1472 KERNEL1x2_SUB 1473 KERNEL1x2_SUB 1474 KERNEL1x2_SUB 1475 KERNEL1x2_SUB 1476 1477 subs counterL, counterL, #1 1478 bgt .Ldgemm_kernel_L2_M1_22 1479 1480 1481.Ldgemm_kernel_L2_M1_40: 1482 1483 ands counterL , origK, #7 // counterL = counterL % 8 1484 ble .Ldgemm_kernel_L2_M1_100 1485 1486.Ldgemm_kernel_L2_M1_42: 1487 1488 KERNEL1x2_SUB 1489 1490 subs counterL, counterL, #1 1491 bgt .Ldgemm_kernel_L2_M1_42 1492 1493.Ldgemm_kernel_L2_M1_100: 1494 1495 SAVE1x2 1496 1497.Ldgemm_kernel_L2_END: 1498 add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 1499 1500/******************************************************************************/ 1501 1502.Ldgemm_kernel_L1_BEGIN: 1503 1504 mov counterJ , origN 1505 tst counterJ , #1 1506 ble .Ldgemm_kernel_L999 // done 1507 1508 1509 mov pCRow0, pC // pCRow0 = C 1510 add pC , pC , LDC // Update pC to point to next 1511 1512 mov pA, origPA // pA = A 1513 1514.Ldgemm_kernel_L1_M4_BEGIN: 1515 1516 mov counterI, origM 1517 asr counterI, counterI, #2 // counterI = counterI / 4 1518 cmp counterI, #0 1519 ble .Ldgemm_kernel_L1_M2_BEGIN 1520 1521.Ldgemm_kernel_L1_M4_20: 1522 1523 INIT4x1 1524 1525 mov pB, origPB 1526 asr counterL , origK, #3 // counterL = counterL / 8 1527 cmp counterL , #0 1528 ble .Ldgemm_kernel_L1_M4_40 1529 
.align 5 1530 1531.Ldgemm_kernel_L1_M4_22: 1532 KERNEL4x1_SUB 1533 KERNEL4x1_SUB 1534 KERNEL4x1_SUB 1535 KERNEL4x1_SUB 1536 1537 KERNEL4x1_SUB 1538 KERNEL4x1_SUB 1539 KERNEL4x1_SUB 1540 KERNEL4x1_SUB 1541 1542 subs counterL, counterL, #1 1543 bgt .Ldgemm_kernel_L1_M4_22 1544 1545 1546.Ldgemm_kernel_L1_M4_40: 1547 1548 ands counterL , origK, #7 // counterL = counterL % 8 1549 ble .Ldgemm_kernel_L1_M4_100 1550 1551.Ldgemm_kernel_L1_M4_42: 1552 1553 KERNEL4x1_SUB 1554 1555 subs counterL, counterL, #1 1556 bgt .Ldgemm_kernel_L1_M4_42 1557 1558.Ldgemm_kernel_L1_M4_100: 1559 1560 SAVE4x1 1561 1562.Ldgemm_kernel_L1_M4_END: 1563 1564 subs counterI, counterI, #1 1565 bgt .Ldgemm_kernel_L1_M4_20 1566 1567 1568.Ldgemm_kernel_L1_M2_BEGIN: 1569 1570 mov counterI, origM 1571 tst counterI , #3 1572 ble .Ldgemm_kernel_L1_END 1573 1574 tst counterI, #2 // counterI = counterI / 2 1575 ble .Ldgemm_kernel_L1_M1_BEGIN 1576 1577.Ldgemm_kernel_L1_M2_20: 1578 1579 INIT2x1 1580 1581 mov pB, origPB 1582 1583 asr counterL , origK, #3 // counterL = counterL / 8 1584 cmp counterL , #0 1585 ble .Ldgemm_kernel_L1_M2_40 1586 1587.Ldgemm_kernel_L1_M2_22: 1588 1589 KERNEL2x1_SUB 1590 KERNEL2x1_SUB 1591 KERNEL2x1_SUB 1592 KERNEL2x1_SUB 1593 1594 KERNEL2x1_SUB 1595 KERNEL2x1_SUB 1596 KERNEL2x1_SUB 1597 KERNEL2x1_SUB 1598 1599 subs counterL, counterL, #1 1600 bgt .Ldgemm_kernel_L1_M2_22 1601 1602 1603.Ldgemm_kernel_L1_M2_40: 1604 1605 ands counterL , origK, #7 // counterL = counterL % 8 1606 ble .Ldgemm_kernel_L1_M2_100 1607 1608.Ldgemm_kernel_L1_M2_42: 1609 1610 KERNEL2x1_SUB 1611 1612 subs counterL, counterL, #1 1613 bgt .Ldgemm_kernel_L1_M2_42 1614 1615.Ldgemm_kernel_L1_M2_100: 1616 1617 SAVE2x1 1618 1619.Ldgemm_kernel_L1_M2_END: 1620 1621 1622.Ldgemm_kernel_L1_M1_BEGIN: 1623 1624 tst counterI, #1 // counterI = counterI % 2 1625 ble .Ldgemm_kernel_L1_END 1626 1627.Ldgemm_kernel_L1_M1_20: 1628 1629 INIT1x1 1630 1631 mov pB, origPB 1632 1633 asr counterL , origK, #3 // counterL = counterL / 8 1634 
cmp counterL , #0 1635 ble .Ldgemm_kernel_L1_M1_40 1636 1637.Ldgemm_kernel_L1_M1_22: 1638 KERNEL1x1_SUB 1639 KERNEL1x1_SUB 1640 KERNEL1x1_SUB 1641 KERNEL1x1_SUB 1642 1643 KERNEL1x1_SUB 1644 KERNEL1x1_SUB 1645 KERNEL1x1_SUB 1646 KERNEL1x1_SUB 1647 1648 subs counterL, counterL, #1 1649 bgt .Ldgemm_kernel_L1_M1_22 1650 1651 1652.Ldgemm_kernel_L1_M1_40: 1653 1654 ands counterL , origK, #7 // counterL = counterL % 8 1655 ble .Ldgemm_kernel_L1_M1_100 1656 1657.Ldgemm_kernel_L1_M1_42: 1658 1659 KERNEL1x1_SUB 1660 1661 subs counterL, counterL, #1 1662 bgt .Ldgemm_kernel_L1_M1_42 1663 1664.Ldgemm_kernel_L1_M1_100: 1665 1666 SAVE1x1 1667 1668 1669.Ldgemm_kernel_L1_END: 1670 1671 1672.Ldgemm_kernel_L999: 1673 mov x0, #0 // set return value 1674 ldp d8, d9, [sp, #(0 * 16)] 1675 ldp d10, d11, [sp, #(1 * 16)] 1676 ldp d12, d13, [sp, #(2 * 16)] 1677 ldp d14, d15, [sp, #(3 * 16)] 1678 ldp d16, d17, [sp, #(4 * 16)] 1679 ldp x18, x19, [sp, #(5 * 16)] 1680 ldp x20, x21, [sp, #(6 * 16)] 1681 ldp x22, x23, [sp, #(7 * 16)] 1682 ldp x24, x25, [sp, #(8 * 16)] 1683 ldp x26, x27, [sp, #(9 * 16)] 1684 ldr x28, [sp, #(10 * 16)] 1685 add sp, sp, #(11*16) 1686 ret 1687 1688 EPILOGUE 1689 1690