//
//  MNNPackedMatMulRemainFP16.S
//  MNN
//
//  Created by MNN on 2020/06/10.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5
// FP16 packed MatMul for an arbitrary number of columns ("Remain" = eSize may be any value):
//   C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e)
// NOTE(review): the original header says "8 * 24 MatMul, hP = 24", but this kernel consumes
// 16 FP16 values (32 bytes) of B per l step, i.e. it behaves as hP = 16 -- confirm with the packer.
//
// eSize is consumed in tiles of 8 columns (looped), then at most one tile of 4, then single columns.
// For every tile, h is walked two 8-channel groups at a time, with a one-group tail.
asm_function MNNPackedMatMulRemainFP16
//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias);
//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5:postParameters, x6:bias
// parameter (size_t[]): [0] aStride, [1] l, [2] h, [3] cStride, [5] bExtraStride
//   (index 4 is not read by this kernel); aStride, cStride and bExtraStride are byte offsets.
// postParameters: read as 4 x float32 and narrowed to FP16;
//   lane 1 scales the bias, lane 2 is the lower clamp, lane 3 is the upper clamp.
//   When postParameters is NULL, neither bias nor clamping is applied.
//
// Register usage inside the kernel:
//   x7  cStride          x9  l                 x10 UP_DIV(h, 8)     x11 aStride
//   x19 bExtraStride     x20 bias cursor       x21 C base of the current column tile
//   x13 B cursor         x15 A cursor          x8  remaining 8-channel groups
//   x12 remaining l steps                      x14 cStride - 64 (8-column store step)
//   v5  postParameters (FP16)   v6 min   v7 max   v16-v31 accumulators

// x19-x21 are callee-saved. Keep the 32-byte frame allocated for the whole function:
// memory below SP is not guaranteed to be preserved (e.g. an asynchronous signal
// handler may run on this stack), so the save area must stay at or above SP.
sub sp, sp, #32
str x19, [sp, #0]
str x20, [sp, #8]
str x21, [sp, #16]

ldr x11, [x4, #0]  // aStride
ldr x9, [x4, #8]   // l
ldr x10, [x4, #16] // h

ldr x7, [x4, #24]  // cStride
ldr x19, [x4, #40] // bExtraStride

// x10 = UP_DIV(h, 8)
add x10, x10, #7
lsr x10, x10, #3

cbz x5, Start
ld1 {v5.4s}, [x5]
fcvtn v5.4h, v5.4s
dup v6.8h, v5.h[2] // Min Value
dup v7.8h, v5.h[3] // Max Value

Start:

E8:
cmp x3, #8
blt E4

// 8 columns x 16 output channels per inner iteration
LoopE8:
    mov x20, x6
    mov x8, x10
    mov x21, x0
    mov x13, x2

    LH8:
    cmp x8, #2
    blt LH4
    // Each group is stored as two 64-byte halves; the first store already advances x0 by 64.
    sub x14, x7, #64
    LoopH8x8:
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.8h}, [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]

        fmul v20.8h, v4.8h, v0.h[0]
        fmul v21.8h, v4.8h, v0.h[1]
        fmul v22.8h, v4.8h, v0.h[2]
        fmul v23.8h, v4.8h, v0.h[3]

        fmul v24.8h, v3.8h, v0.h[4]
        fmul v25.8h, v3.8h, v0.h[5]
        fmul v26.8h, v3.8h, v0.h[6]
        fmul v27.8h, v3.8h, v0.h[7]

        fmul v28.8h, v4.8h, v0.h[4]
        fmul v29.8h, v4.8h, v0.h[5]
        fmul v30.8h, v4.8h, v0.h[6]
        fmul v31.8h, v4.8h, v0.h[7]
        // Flags still come from "subs x12, x9, #1": l == 1 needs no accumulation loop.
        beq LoopLEnd

        LoopL:
            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.8h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]

            fmla v20.8h, v4.8h, v0.h[0]
            fmla v21.8h, v4.8h, v0.h[1]
            fmla v22.8h, v4.8h, v0.h[2]
            fmla v23.8h, v4.8h, v0.h[3]

            fmla v24.8h, v3.8h, v0.h[4]
            fmla v25.8h, v3.8h, v0.h[5]
            fmla v26.8h, v3.8h, v0.h[6]
            fmla v27.8h, v3.8h, v0.h[7]

            fmla v28.8h, v4.8h, v0.h[4]
            fmla v29.8h, v4.8h, v0.h[5]
            fmla v30.8h, v4.8h, v0.h[6]
            fmla v31.8h, v4.8h, v0.h[7]

            subs x12, x12, #1
            bne LoopL

        LoopLEnd:

        add x13, x13, x19
        sub x8, x8, #2

        cbz x5, StoreLH8
        AddBiasLH8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        // acc += bias * postParameters[1]
        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v1.8h, v5.h[1]
        fmla v21.8h, v1.8h, v5.h[1]
        fmla v22.8h, v1.8h, v5.h[1]
        fmla v23.8h, v1.8h, v5.h[1]

        fmla v24.8h, v0.8h, v5.h[1]
        fmla v25.8h, v0.8h, v5.h[1]
        fmla v26.8h, v0.8h, v5.h[1]
        fmla v27.8h, v0.8h, v5.h[1]

        fmla v28.8h, v1.8h, v5.h[1]
        fmla v29.8h, v1.8h, v5.h[1]
        fmla v30.8h, v1.8h, v5.h[1]
        fmla v31.8h, v1.8h, v5.h[1]

        PostTreatLH8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h
        fmax v24.8h, v24.8h, v6.8h
        fmax v25.8h, v25.8h, v6.8h
        fmax v26.8h, v26.8h, v6.8h
        fmax v27.8h, v27.8h, v6.8h
        fmax v28.8h, v28.8h, v6.8h
        fmax v29.8h, v29.8h, v6.8h
        fmax v30.8h, v30.8h, v6.8h
        fmax v31.8h, v31.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h
        fmin v24.8h, v24.8h, v7.8h
        fmin v25.8h, v25.8h, v7.8h
        fmin v26.8h, v26.8h, v7.8h
        fmin v27.8h, v27.8h, v7.8h
        fmin v28.8h, v28.8h, v7.8h
        fmin v29.8h, v29.8h, v7.8h
        fmin v30.8h, v30.8h, v7.8h
        fmin v31.8h, v31.8h, v7.8h

        StoreLH8:
        // First channel group: columns 0-3 then 4-7, then step to the next group.
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
        st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x14

        // Second channel group.
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
        st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
        cmp x8, #2
        bge LoopH8x8

    LH4:
    cbz x8, E8End
    LoopHRemain:
        // Tail: one 8-channel group. Only the first 8 values of each 32-byte B row are used.
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h}, [x13]
        ld1 {v0.8h}, [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        add x13, x13, #32
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]
        fmul v20.8h, v3.8h, v0.h[4]
        fmul v21.8h, v3.8h, v0.h[5]
        fmul v22.8h, v3.8h, v0.h[6]
        fmul v23.8h, v3.8h, v0.h[7]
        beq LoopLREnd

        LoopLR:
            ld1 {v3.8h}, [x13]
            ld1 {v0.8h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]
            add x13, x13, #32

            fmla v20.8h, v3.8h, v0.h[4]
            fmla v21.8h, v3.8h, v0.h[5]
            fmla v22.8h, v3.8h, v0.h[6]
            fmla v23.8h, v3.8h, v0.h[7]

            subs x12, x12, #1
            bne LoopLR
        LoopLREnd:

        cbz x5, StoreLH8x4
        AddBiasLH8x4:
        ld1 {v0.8h}, [x20]

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v0.8h, v5.h[1]
        fmla v21.8h, v0.8h, v5.h[1]
        fmla v22.8h, v0.8h, v5.h[1]
        fmla v23.8h, v0.8h, v5.h[1]

        PostTreatLH8x4:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h

        StoreLH8x4:

        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64

    E8End:

        sub x3, x3, #8
        // Keep using the 8-column tile while at least 8 columns remain; otherwise
        // everything past the first tile would fall through to the narrower paths.
        cmp x3, #8
        add x0, x21, #128 // next tile of C: 8 columns * 8 channels * 2 bytes
        add x1, x1, #16   // next tile of A: 8 columns * 2 bytes
        bge LoopE8

E4:
cmp x3, #4
mov x20, x6
blt E1
    mov x8, x10
    mov x21, x0
    mov x13, x2

    cmp x8, #2
    blt E4LH4

    E4LH8:
    E4LoopH8:
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.4h}, [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]

        fmul v20.8h, v4.8h, v0.h[0]
        fmul v21.8h, v4.8h, v0.h[1]
        fmul v22.8h, v4.8h, v0.h[2]
        fmul v23.8h, v4.8h, v0.h[3]

        beq E4LoopLEnd

        // The l loop below is rotated: the loads and the first two fmla of step k are
        // issued before the remaining six fmla, which run at the top of the next pass
        // (or in E4LoopLComputeEnd for the final step).
        subs x12, x12, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.4h}, [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        fmla v17.8h, v3.8h, v0.h[1]

        beq E4LoopLComputeEnd

        E4LoopL:
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]

            fmla v20.8h, v4.8h, v0.h[0]
            fmla v21.8h, v4.8h, v0.h[1]
            fmla v22.8h, v4.8h, v0.h[2]
            fmla v23.8h, v4.8h, v0.h[3]

            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.4h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]

            subs x12, x12, #1
            bne E4LoopL
        E4LoopLComputeEnd:
        fmla v18.8h, v3.8h, v0.h[2]
        fmla v19.8h, v3.8h, v0.h[3]

        fmla v20.8h, v4.8h, v0.h[0]
        fmla v21.8h, v4.8h, v0.h[1]
        fmla v22.8h, v4.8h, v0.h[2]
        fmla v23.8h, v4.8h, v0.h[3]

        E4LoopLEnd:
        add x13, x13, x19
        sub x8, x8, #2
        // This comparison feeds the "bge E4LoopH8" after the stores; nothing in
        // between (cbz, vector arithmetic, st1) modifies the condition flags.
        cmp x8, #2

        cbz x5, StoreLH4x8

        AddBiasLH4x8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v1.8h, v5.h[1]
        fmla v21.8h, v1.8h, v5.h[1]
        fmla v22.8h, v1.8h, v5.h[1]
        fmla v23.8h, v1.8h, v5.h[1]

        PostTreatLH4x8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h

        StoreLH4x8:

        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x7

        bge E4LoopH8

    E4LH4:
    cbz x8, E4End
    // Tail: one 8-channel group for the 4-column tile.
    mov x15, x1
    subs x12, x9, #1
    ld1 {v3.8h}, [x13]
    ld1 {v0.4h}, [x15], x11
    fmul v16.8h, v3.8h, v0.h[0]
    fmul v17.8h, v3.8h, v0.h[1]
    fmul v18.8h, v3.8h, v0.h[2]
    fmul v19.8h, v3.8h, v0.h[3]
    add x13, x13, #32

    beq E4LoopLREnd

    E4LoopLR:
        ld1 {v3.8h}, [x13]
        ld1 {v0.4h}, [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        fmla v17.8h, v3.8h, v0.h[1]
        fmla v18.8h, v3.8h, v0.h[2]
        fmla v19.8h, v3.8h, v0.h[3]
        add x13, x13, #32

        subs x12, x12, #1
        bne E4LoopLR
    E4LoopLREnd:

    cbz x5, StoreLH4x4
    AddBiasLH4x4:
    ld1 {v0.8h}, [x20]

    fmla v16.8h, v0.8h, v5.h[1]
    fmla v17.8h, v0.8h, v5.h[1]
    fmla v18.8h, v0.8h, v5.h[1]
    fmla v19.8h, v0.8h, v5.h[1]


    PostTreatLH4x4:
    fmax v16.8h, v16.8h, v6.8h
    fmax v17.8h, v17.8h, v6.8h
    fmax v18.8h, v18.8h, v6.8h
    fmax v19.8h, v19.8h, v6.8h

    fmin v16.8h, v16.8h, v7.8h
    fmin v17.8h, v17.8h, v7.8h
    fmin v18.8h, v18.8h, v7.8h
    fmin v19.8h, v19.8h, v7.8h

    StoreLH4x4:
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0]

    E4End:

    sub x3, x3, #4
    add x0, x21, #64 // next tile of C: 4 columns * 8 channels * 2 bytes
    add x1, x1, #8   // next tile of A: 4 columns * 2 bytes

E1:
cmp x3, #0
beq End

LoopE1:
    mov x20, x6
    mov x8, x10
    mov x21, x0
    mov x13, x2

    cmp x8, #2
    blt E1LH4

    E1LH8:
    E1LoopH8:
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.h}[0], [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v20.8h, v4.8h, v0.h[0]

        beq E1LoopLEnd

        E1LoopL:
            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.h}[0], [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v20.8h, v4.8h, v0.h[0]

            subs x12, x12, #1
            bne E1LoopL

        E1LoopLEnd:

        add x13, x13, x19
        sub x8, x8, #2
        // Consumed by "bge E1LoopH8" after the stores; flags are untouched in between.
        cmp x8, #2

        cbz x5, StoreLH1x8
        AddBiasLH1x8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v20.8h, v1.8h, v5.h[1]

        PostTreatLH1x8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmin v16.8h, v16.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h

        StoreLH1x8:

        st1 {v16.8h}, [x0], x7
        st1 {v20.8h}, [x0], x7

        bge E1LoopH8

    E1LH4:
    cbz x8, E1End
    // Tail: one 8-channel group for the single column.
    mov x15, x1
    subs x12, x9, #1
    ld1 {v3.8h}, [x13]
    ld1 {v0.h}[0], [x15], x11
    fmul v16.8h, v3.8h, v0.h[0]
    add x13, x13, #32

    beq E1LoopLREnd

    E1LoopLR:
        ld1 {v3.8h}, [x13]
        ld1 {v0.h}[0], [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        add x13, x13, #32

        subs x12, x12, #1
        bne E1LoopLR
    E1LoopLREnd:

    cbz x5, StoreLH1x4
    AddBiasLH1x4:
    ld1 {v0.8h}, [x20]
    fmla v16.8h, v0.8h, v5.h[1]

    PostTreatLH1x4:
    fmax v16.8h, v16.8h, v6.8h
    fmin v16.8h, v16.8h, v7.8h

    StoreLH1x4:
    st1 {v16.8h}, [x0]

    E1End:

    subs x3, x3, #1
    add x0, x21, #16 // next column of C: 8 channels * 2 bytes
    add x1, x1, #2   // next column of A: 2 bytes
    bne LoopE1


End:
// Restore the callee-saved registers from the frame reserved in the prologue, then release it.
ldr x19, [sp, #0]
ldr x20, [sp, #8]
ldr x21, [sp, #16]
add sp, sp, #32

ret


#endif