//
// MNNGemmInt8AddBiasScale_ARMV82_Unit.S
// MNN
//
// Created by MNN on 2019/12/17.
// Copyright © 2018, Alibaba Group Holding Limited
//

#if defined(__aarch64__) && defined(ENABLE_ARMV82)
#include "MNNAsmGlobal.h"

.text
.align 5

// Broadcast one accumulator-init vector (the bias) into four accumulators.
.macro SET_BIAS s, d0, d1, d2, d3
    mov \d0\().16b, \s\().16b
    mov \d1\().16b, \s\().16b
    mov \d2\().16b, \s\().16b
    mov \d3\().16b, \s\().16b
.endm
// One SDOT step: \s0 holds 4 output channels x 4 input channels of int8
// weights; \s1 holds 4 src pixels x 4 input channels. Each sdot accumulates
// one pixel's 4-channel dot product into one 4x int32 accumulator.
.macro COMPUTE s0, s1, d0, d1, d2, d3
    sdot \d0\().4s, \s0\().16b, \s1\().4b[0]
    sdot \d1\().4s, \s0\().16b, \s1\().4b[1]
    sdot \d2\().4s, \s0\().16b, \s1\().4b[2]
    sdot \d3\().4s, \s0\().16b, \s1\().4b[3]
.endm
// Convert four int32 accumulators to float32 in place.
.macro Int32ToFloat z0, z1, z2, z3
    scvtf \z0\().4s, \z0\().4s
    scvtf \z1\().4s, \z1\().4s
    scvtf \z2\().4s, \z2\().4s
    scvtf \z3\().4s, \z3\().4s
.endm
// Multiply four float vectors by one per-channel scale vector.
.macro MUL_SCALE s, d0, d1, d2, d3
    fmul \d0\().4s, \d0\().4s, \s\().4s
    fmul \d1\().4s, \d1\().4s, \s\().4s
    fmul \d2\().4s, \d2\().4s, \s\().4s
    fmul \d3\().4s, \d3\().4s, \s\().4s
.endm
// Round-to-nearest-away float -> int32, in place (fcvtas).
.macro FloatToInt32 z0, z1, z2, z3
    fcvtas \z0\().4s, \z0\().4s
    fcvtas \z1\().4s, \z1\().4s
    fcvtas \z2\().4s, \z2\().4s
    fcvtas \z3\().4s, \z3\().4s
.endm
// Saturating-narrow four int32 vectors into two int16 vectors.
.macro Int32ToInt16 s0, s1, s2, s3, d0, d1
    sqxtn \d0\().4h, \s0\().4s
    sqxtn2 \d0\().8h, \s1\().4s
    sqxtn \d1\().4h, \s2\().4s
    sqxtn2 \d1\().8h, \s3\().4s
.endm
// Saturating-narrow two int16 vectors into one int8 vector.
.macro Int16ToInt8_ONE s0, s1, d0
    sqxtn \d0\().8b, \s0\().8h
    sqxtn2 \d0\().16b, \s1\().8h
.endm
// Saturating-narrow four int16 vectors into two int8 vectors.
.macro Int16ToInt8 s0, s1, s2, s3, d0, d1
    Int16ToInt8_ONE \s0, \s1, \d0
    Int16ToInt8_ONE \s2, \s3, \d1
.endm

asm_function MNNGemmInt8AddBiasScale_ARMV82_Unit

//struct QuanPostTreatParameters {
//    const float* scale;
//    const int32_t* bias;
//    int32_t maxValue;
//    int32_t minValue;
//};

//void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src,
//    const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
//    const QuanPostTreatParameters* parameters, size_t realDstCount);

//Auto: x0:dst, x1:src, x2:weight, x3:src_depth_quad, x4:dst_step
//x5:dst_depth_quad, x6: parameters, x7: realDstCount

// Register roles beyond the arguments:
//   x8:  parameters->scale (NULL => emit float output, no requantization)
//   x9:  parameters->bias
//   w12/v7: maxValue (clamp hi), w13/v6: minValue (clamp lo)
//   x15: src_depth_quad * 16 = byte stride of one 4-output-channel weight plane
//   x21: bytes written to dst per source pixel (1 int8 or 4 floats, x4 lanes)
//   x22: src row step in bytes (#48 = 12 pixels? no — 16 pixels? see below)
//   x10/x12/x14/x19/x20: per-tile working copies of x0/x2/x5/x8/x9
//Load from x7: x8: scale, x9: bias, w12: maxValue, w13: minValue
ldr x8, [x6, #0]
ldr x9, [x6, #8]
ldr w12, [x6, #16]
ldr w13, [x6, #20]
dup v7.16b, w12 // max
dup v6.16b, w13 // min

// Save callee-saved x19-x22 and v8-v15 (low 128 bits saved here; AAPCS64 only
// requires the low 64 bits of v8-v15 to be preserved).
// NOTE(review): after these post-indexed stores sp is back at its entry value,
// so the 160-byte save area sits BELOW sp for the whole function. AArch64
// Linux has no red zone, so an asynchronous signal could clobber it — this
// matches the upstream pattern, but confirm it is acceptable for the target.
sub sp, sp, #160
stp x19, x20, [sp], #16
stp x21, x22, [sp], #16
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64

// Per-pixel dst advance: int8 path writes 4 bytes per pixel, float path 16.
mov x21, #4 // sizeof(int8_t) * UNIT
cbnz x8, Start
mov x21, #16 // sizeof(float) * UNIT
Start:
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT
mov x22, #48 // src_steps

// ---- 12-pixel tile: consumes the whole remaining src in one pass, then the
// ---- narrower tiles below mop up. Processes 8 output channels (two 4-channel
// ---- planes) per outer iteration, falling back to a 4-channel pass.
TILE_12:
    cmp x7, #12
    blt TILE_8
    cmp x5, #2
    blt L4LoopDz_TILE_12
L8LoopDz_TILE_12:
    ld1 {v0.4s, v1.4s}, [x9], #32 // bias
    mov x11, x1
    mov x13, x3

    // v8-v19: 12 pixels x channels 0-3; v20-v31: 12 pixels x channels 4-7.
    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v0, v16, v17, v18, v19
    SET_BIAS v1, v20, v21, v22, v23
    SET_BIAS v1, v24, v25, v26, v27
    SET_BIAS v1, v28, v29, v30, v31

    L8LoopSz_TILE_12:
        // v3 = weight plane 0, v4 = weight plane 1 (x15 apart); the
        // sub/add of x15 keeps x2 walking plane 0 linearly.
        ld1 {v3.16b}, [x2], x15 // weight
        ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        ld1 {v4.16b}, [x2], #16
        COMPUTE v3, v1, v12, v13, v14, v15
        COMPUTE v3, v2, v16, v17, v18, v19
        COMPUTE v4, v0, v20, v21, v22, v23
        sub x2, x2, x15
        COMPUTE v4, v1, v24, v25, v26, v27
        subs x13, x13, #1
        COMPUTE v4, v2, v28, v29, v30, v31
        bne L8LoopSz_TILE_12

    L8LoopSzEnd_TILE_12:
    add x2, x2, x15 // skip plane 1: it was consumed interleaved above
    sub x5, x5, #2
    cbnz x8, L8Tile12Quan
    // Float output path: 12 pixels * 16 B = 192 B per channel plane; the
    // first two st1 advance 128, so stride the third by (dst_step - 128).
    sub x4, x4, #128
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    Int32ToFloat v24, v25, v26, v27
    Int32ToFloat v28, v29, v30, v31
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
    st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
    st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x4
    add x4, x4, #128
    b L8Tile12LoopCheck

    L8Tile12Quan:
    // Requantize: int32 -> float, * scale, round to int32, saturating
    // narrow to int16 then int8, clamp to [min, max], store 12 B/pixel-plane.
    ld1 {v0.4s, v1.4s}, [x8], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    Int32ToFloat v24, v25, v26, v27
    Int32ToFloat v28, v29, v30, v31
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v0, v16, v17, v18, v19
    MUL_SCALE v1, v20, v21, v22, v23
    MUL_SCALE v1, v24, v25, v26, v27
    MUL_SCALE v1, v28, v29, v30, v31
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    FloatToInt32 v20, v21, v22, v23
    FloatToInt32 v24, v25, v26, v27
    FloatToInt32 v28, v29, v30, v31
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int32ToInt16 v20, v21, v22, v23, v8, v9
    Int32ToInt16 v24, v25, v26, v27, v10, v11
    Int32ToInt16 v28, v29, v30, v31, v12, v13
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8 v4, v5, v8, v9, v18, v19
    Int16ToInt8 v10, v11, v12, v13, v20, v21
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smax v19.16b, v6.16b, v19.16b
    smax v20.16b, v6.16b, v20.16b
    smax v21.16b, v6.16b, v21.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    smin v19.16b, v7.16b, v19.16b
    smin v20.16b, v7.16b, v20.16b
    smin v21.16b, v7.16b, v21.16b
    st1 {v16.16b, v17.16b, v18.16b}, [x0], x4
    st1 {v19.16b, v20.16b, v21.16b}, [x0], x4

    L8Tile12LoopCheck:
    cmp x5, #1
    bgt L8LoopDz_TILE_12
    blt End

// Tail: exactly 4 output channels left for the 12-pixel tile.
L4LoopDz_TILE_12:
    ld1 {v0.4s}, [x9] // bias

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v0, v16, v17, v18, v19

    L4LoopSz_TILE_12:
        ld1 {v3.16b}, [x2], #16 // weight
        ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        COMPUTE v3, v1, v12, v13, v14, v15
        subs x3, x3, #1 // x3 is consumed here; both exits below go to End
        COMPUTE v3, v2, v16, v17, v18, v19
        bne L4LoopSz_TILE_12

    L4LoopSzEnd_TILE_12:
    cbnz x8, L4Tile12Quan
    sub x4, x4, #128
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
    add x4, x4, #128
    b End

    L4Tile12Quan:
    ld1 {v0.4s}, [x8] // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v0, v16, v17, v18, v19
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8_ONE v4, v5, v18
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    st1 {v16.16b, v17.16b, v18.16b}, [x0], x4
    b End

// ---- 8-pixel tile. From here on the remaining-pixel tiles re-walk ALL
// ---- output channels, so x0/x2/x5/x8/x9 are copied into x10/x12/x14/x19/x20.
TILE_8:
    cmp x7, #8
    blt TILE_4
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8 // scale
    mov x20, x9 // bias
    cmp x5, #2
    blt L4LoopDz_TILE_8
L8LoopDz_TILE_8:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v1, v16, v17, v18, v19
    SET_BIAS v1, v20, v21, v22, v23

    L8LoopSz_TILE_8:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.16b, v1.16b}, [x11], x22 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        ld1 {v4.16b}, [x12], #16
        COMPUTE v3, v1, v12, v13, v14, v15
        sub x12, x12, x15
        COMPUTE v4, v0, v16, v17, v18, v19
        subs x13, x13, #1
        COMPUTE v4, v1, v20, v21, v22, v23
        bne L8LoopSz_TILE_8

    L8LoopSzEnd_TILE_8:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile8Quan
    sub x4, x4, #64 // 8 pixels * 16 B = 128 B; first st1 advances 64
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], #64
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4
    add x4, x4, #64
    b L8Tile8LoopCheck

    L8Tile8Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v1, v16, v17, v18, v19
    MUL_SCALE v1, v20, v21, v22, v23
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    FloatToInt32 v20, v21, v22, v23
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int32ToInt16 v20, v21, v22, v23, v8, v9
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8 v4, v5, v8, v9, v18, v19
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smax v19.16b, v6.16b, v19.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    smin v19.16b, v7.16b, v19.16b
    st1 {v16.16b, v17.16b}, [x10], x4
    st1 {v18.16b, v19.16b}, [x10], x4

    L8Tile8LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_8
    cbz x14, Tile8End

L4LoopDz_TILE_8:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15

    L4LoopSz_TILE_8:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.16b, v1.16b}, [x11], x22 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        subs x13, x13, #1
        COMPUTE v3, v1, v12, v13, v14, v15
        bne L4LoopSz_TILE_8

    L4LoopSzEnd_TILE_8:
    cbnz x8, L4Tile8Quan
    sub x4, x4, #64
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    add x4, x4, #64
    b Tile8End

    L4Tile8Quan:
    // NOTE(review): this loads 32 B of scale but only v0 (the first 4
    // channels) is used below — confirm 16 readable bytes exist past the
    // last 4-channel scale entry, or that callers always pad the scale array.
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    st1 {v16.16b, v17.16b}, [x10], x4

Tile8End:
    sub x7, x7, #8
    add x0, x0, x21, LSL #3 // advance dst by 8 pixels
    add x1, x1, #32 // advance src by 8 pixels * 4 input channels

TILE_4:
    cmp x7, #4
    blt TILE_1
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8
    mov x20, x9
    cmp x5, #2
    blt L4LoopDz_TILE_4
L8LoopDz_TILE_4:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v1, v12, v13, v14, v15

    L8LoopSz_TILE_4:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.16b}, [x11], x22 // src
        ld1 {v4.16b}, [x12], #16 // weight
        COMPUTE v3, v0, v8, v9, v10, v11
        subs x13, x13, #1
        sub x12, x12, x15
        COMPUTE v4, v0, v12, v13, v14, v15
        bne L8LoopSz_TILE_4

    L8LoopSzEnd_TILE_4:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile4Quan
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    b L8Tile4LoopCheck

    L8Tile4Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v1, v12, v13, v14, v15
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    st1 {v16.16b}, [x10], x4
    st1 {v17.16b}, [x10], x4

    L8Tile4LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_4
    cbz x14, Tile4End

L4LoopDz_TILE_4:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3
    SET_BIAS v0, v8, v9, v10, v11

    L4LoopSz_TILE_4:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.16b}, [x11], x22 // src
        subs x13, x13, #1
        COMPUTE v3, v0, v8, v9, v10, v11
        bne L4LoopSz_TILE_4

    L4LoopSzEnd_TILE_4:
    cbnz x8, L4Tile4Quan
    Int32ToFloat v8, v9, v10, v11
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
    b Tile4End

    L4Tile4Quan:
    ld1 {v0.4s}, [x19], #16 // scale
    Int32ToFloat v8, v9, v10, v11
    MUL_SCALE v0, v8, v9, v10, v11
    FloatToInt32 v8, v9, v10, v11
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int16ToInt8_ONE v0, v1, v16
    smax v16.16b, v6.16b, v16.16b
    smin v16.16b, v7.16b, v16.16b
    st1 {v16.16b}, [x10], x4

Tile4End:
    sub x7, x7, #4
    add x0, x0, x21, LSL #2 // advance dst by 4 pixels
    add x1, x1, #16 // advance src by 4 pixels * 4 input channels

// ---- single-pixel tail loop.
TILE_1:
    cbz x7, End
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8
    mov x20, x9
    cmp x5, #2
    blt L4LoopDz_TILE_1
L8LoopDz_TILE_1:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3
    mov v8.4s, v0.4s
    mov v9.4s, v1.4s
    L8LoopSz_TILE_1:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.s}[0], [x11], x22 // src
        ld1 {v4.16b}, [x12], #16 // weight
        sdot v8.4s, v3.16b, v0.4b[0]
        subs x13, x13, #1
        sub x12, x12, x15
        sdot v9.4s, v4.16b, v0.4b[0]
        bne L8LoopSz_TILE_1

    L8LoopSzEnd_TILE_1:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile1Quan
    scvtf v8.4s, v8.4s
    scvtf v9.4s, v9.4s
    st1 {v8.4s}, [x10], x4
    st1 {v9.4s}, [x10], x4
    b L8Tile1LoopCheck

    L8Tile1Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    scvtf v8.4s, v8.4s
    scvtf v9.4s, v9.4s
    fmul v8.4s, v8.4s, v0.4s
    fmul v9.4s, v9.4s, v1.4s
    fcvtas v8.4s, v8.4s
    fcvtas v9.4s, v9.4s
    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v16.8b, v0.8h
    smax v16.16b, v6.16b, v16.16b
    smin v16.16b, v7.16b, v16.16b
    st1 {v16.s}[0], [x10], x4 // channels 0-3
    st1 {v16.s}[1], [x10], x4 // channels 4-7

    L8Tile1LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_1
    cbz x14, Tile1End

L4LoopDz_TILE_1:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3
    mov v8.4s, v0.4s
    L4LoopSz_TILE_1:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.s}[0], [x11], x22 // src
        subs x13, x13, #1
        sdot v8.4s, v3.16b, v0.4b[0]
        bne L4LoopSz_TILE_1

    L4LoopSzEnd_TILE_1:
    cbnz x8, L4Tile1Quan
    scvtf v8.4s, v8.4s
    st1 {v8.4s}, [x10], x4
    b Tile1End

    L4Tile1Quan:
    ld1 {v0.4s}, [x19], #16 // scale
    scvtf v8.4s, v8.4s
    fmul v8.4s, v8.4s, v0.4s
    fcvtas v8.4s, v8.4s
    sqxtn v0.4h, v8.4s
    sqxtn v16.8b, v0.8h
    smax v16.8b, v6.8b, v16.8b
    smin v16.8b, v7.8b, v16.8b
    st1 {v16.s}[0], [x10], x4

Tile1End:
    sub x7, x7, #1
    add x0, x0, x21
    add x1, x1, #4
    b TILE_1

End:
// Restore callee-saved registers from the same below-sp save area (see the
// prologue note), leaving sp at its entry value before ret.
sub sp, sp, #160
ldp x19, x20, [sp], #16
ldp x21, x22, [sp], #16
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret

#endif // defined(__aarch64__) && defined(ENABLE_ARMV82)