//
//  MNNGemmInt8AddBiasScale_ARMV82_Unit.S
//  MNN
//
//  Created by MNN on 2019/12/17.
//  Copyright © 2018, Alibaba Group Holding Limited
//

// Only assembled for AArch64 builds that enable the ARMv8.2 dot-product path.
#if defined(__aarch64__) && defined(ENABLE_ARMV82)
#include "MNNAsmGlobal.h"

.text
.align 5

// SET_BIAS s, d0, d1, d2, d3
//   Copies the full 128-bit vector register \s into each of \d0..\d3.
//   Used to seed four accumulators with the same bias vector.
//   \s is only read; callers must not pass it as one of the destinations.
.macro SET_BIAS s, d0, d1, d2, d3
    mov \d0\().16b, \s\().16b
    mov \d1\().16b, \s\().16b
    mov \d2\().16b, \s\().16b
    mov \d3\().16b, \s\().16b
.endm
// COMPUTE s0, s1, d0, d1, d2, d3
//   Four signed int8 dot-product accumulations (SDOT, by element):
//     \d<i>.s[j] += dot(\s0 bytes [4j .. 4j+3], \s1 bytes [4i .. 4i+3])   i, j = 0..3
//   i.e. every 4-byte group of \s0 is multiplied against the i-th 4-byte
//   group of \s1 and accumulated into the matching 32-bit lane of \d<i>.
//   \s0 and \s1 are only read.
.macro COMPUTE s0, s1, d0, d1, d2, d3
    sdot \d0\().4s, \s0\().16b, \s1\().4b[0]
    sdot \d1\().4s, \s0\().16b, \s1\().4b[1]
    sdot \d2\().4s, \s0\().16b, \s1\().4b[2]
    sdot \d3\().4s, \s0\().16b, \s1\().4b[3]
.endm
// Int32ToFloat z0, z1, z2, z3
//   In-place conversion of four vectors of 4 x signed int32 to 4 x float32.
.macro Int32ToFloat z0, z1, z2, z3
    scvtf \z0\().4s, \z0\().4s
    scvtf \z1\().4s, \z1\().4s
    scvtf \z2\().4s, \z2\().4s
    scvtf \z3\().4s, \z3\().4s
.endm
// MUL_SCALE s, d0, d1, d2, d3
//   Lane-wise float32 multiply of each of \d0..\d3 by \s, in place.
//   \s is only read.
.macro MUL_SCALE s, d0, d1, d2, d3
    fmul \d0\().4s, \d0\().4s, \s\().4s
    fmul \d1\().4s, \d1\().4s, \s\().4s
    fmul \d2\().4s, \d2\().4s, \s\().4s
    fmul \d3\().4s, \d3\().4s, \s\().4s
.endm
// FloatToInt32 z0, z1, z2, z3
//   In-place conversion of four vectors of 4 x float32 to 4 x signed int32
//   using FCVTAS (round to nearest, ties away from zero).
.macro FloatToInt32 z0, z1, z2, z3
    fcvtas \z0\().4s, \z0\().4s
    fcvtas \z1\().4s, \z1\().4s
    fcvtas \z2\().4s, \z2\().4s
    fcvtas \z3\().4s, \z3\().4s
.endm
// Int32ToInt16 s0, s1, s2, s3, d0, d1
//   Saturating narrow of 4 x int32 vectors to int16:
//     \d0 = { sat16(\s0) (low half), sat16(\s1) (high half) }
//     \d1 = { sat16(\s2) (low half), sat16(\s3) (high half) }
//   A destination is written before the later sources are read, so \d0 must
//   not alias \s1..\s3 and \d1 must not alias \s3.
.macro Int32ToInt16 s0, s1, s2, s3, d0, d1
    sqxtn \d0\().4h,  \s0\().4s
    sqxtn2 \d0\().8h, \s1\().4s
    sqxtn \d1\().4h,  \s2\().4s
    sqxtn2 \d1\().8h, \s3\().4s
.endm
// Int16ToInt8_ONE s0, s1, d0
//   Saturating narrow of two 8 x int16 vectors into one 16 x int8 vector:
//     \d0 = { sat8(\s0) (low 8 bytes), sat8(\s1) (high 8 bytes) }
//   \d0 is written before \s1 is read, so \d0 must not alias \s1.
.macro Int16ToInt8_ONE s0, s1, d0
    sqxtn \d0\().8b,   \s0\().8h
    sqxtn2 \d0\().16b, \s1\().8h
.endm
// Int16ToInt8 s0, s1, s2, s3, d0, d1
//   Two Int16ToInt8_ONE steps:
//     \d0 = { sat8(\s0), sat8(\s1) },  \d1 = { sat8(\s2), sat8(\s3) }
//   \d0 is fully written before \s2/\s3 are read, so it must not alias them.
.macro Int16ToInt8 s0, s1, s2, s3, d0, d1
    Int16ToInt8_ONE \s0, \s1, \d0
    Int16ToInt8_ONE \s2, \s3, \d1
.endm

asm_function MNNGemmInt8AddBiasScale_ARMV82_Unit

//struct QuanPostTreatParameters {
//    const float* scale;
//    const int32_t* bias;
//    int32_t maxValue;
//    int32_t minValue;
//};

//void MNNGemmInt8AddBiasScale_ARMV82_Unit(int8_t* dst, const int8_t* src,
//    const int8_t* weight, size_t src_depth_quad, size_t dst_step, size_t dst_depth_quad,
//    const QuanPostTreatParameters* parameters, size_t realDstCount);

//Auto: x0:dst, x1:src, x2:weight, x3:src_depth_quad, x4:dst_step
//x5:dst_depth_quad, x6: parameters, x7: realDstCount

// Overview
//   int8 GEMM micro-kernel built on SDOT. realDstCount (x7) dst units are
//   processed in tiles of 12, 8, 4 and then 1. For every tile the output
//   channel groups (dst_depth_quad, 4 channels each) are handled two at a
//   time ("L8" loops) with a single-group tail ("L4" loops).
//   Accumulators start from the int32 bias. Post-processing depends on scale:
//     scale != NULL : acc -> float, * scale, round (fcvtas), saturate to int8,
//                     clamp to [minValue, maxValue], store 4 bytes per unit.
//     scale == NULL : acc -> float, store 16 bytes per unit (no scale, no clamp).
//   The 12-wide tile always branches to End when finished, so units beyond
//   the first 12 are not processed (presumably callers pass realDstCount <= 12
//   -- confirm against the caller).
//
// Memory strides, as used by the code below
//   src    : 48 bytes per src_depth_quad step (x22), 4 bytes per dst unit.
//   weight : 16 bytes per (channel group, src_depth_quad step);
//            x15 = src_depth_quad * 16 bytes per channel group.
//   dst    : x4 (dst_step) bytes between consecutive channel groups.
//
// Register roles
//   x8  scale pointer (NULL selects float output)   x9  bias pointer
//   v6  minValue broadcast to 16 bytes              v7  maxValue broadcast
//   x21 output bytes per dst unit (4 or 16)
//   per tile: x10 dst cursor, x11 src cursor, x12 weight cursor,
//             x13 src_depth_quad counter, x14 channel-group counter,
//             x19 scale cursor, x20 bias cursor
//   v8-v31 int32 accumulators, v0-v5 scratch.
//   Clobbers x0-x5, x7-x15, v0-v7, v16-v31 and flags; preserves x19-x22, v8-v15.

//Load from x6: x8: scale, x9: bias, w12: maxValue, w13: minValue
ldr x8, [x6, #0]
ldr x9, [x6, #8]
ldr w12, [x6, #16]
ldr w13, [x6, #20]
dup v7.16b, w12 // max
dup v6.16b, w13 // min

// Spill callee-saved registers. sp stays lowered for the whole function so
// the 160-byte save area is inside the live stack ([sp, sp + 160)); AAPCS64
// does not allow relying on memory below sp.
sub sp, sp, #160
stp x19, x20, [sp, #0]
stp x21, x22, [sp, #16]
add x10, sp, #32 // x10 is free here; it is re-initialised by every tile
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10]

mov x21, #4 // sizeof(int8_t) * UNIT
cbnz x8, Start
mov x21, #16 // sizeof(float) * UNIT
Start:
lsl x15, x3, #4 // x15 = src_depth_quad * UNIT * SRC_UNIT
mov x22, #48 // src_steps

// ---------------------------------------------------------------------------
// 12-wide tile. Runs only when realDstCount >= 12 and always exits to End,
// so it advances x0/x2/x5/x8/x9 (and x1/x3 in the tail) in place.
// ---------------------------------------------------------------------------
TILE_12:
    cmp x7, #12
    blt TILE_8
    cmp x5, #2
    blt L4LoopDz_TILE_12
L8LoopDz_TILE_12:
    ld1 {v0.4s, v1.4s}, [x9], #32 // bias
    mov x11, x1
    mov x13, x3

    // v8-v19: channel group 0, v20-v31: channel group 1
    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v0, v16, v17, v18, v19
    SET_BIAS v1, v20, v21, v22, v23
    SET_BIAS v1, v24, v25, v26, v27
    SET_BIAS v1, v28, v29, v30, v31

    L8LoopSz_TILE_12:
        ld1 {v3.16b}, [x2], x15 // weight, group 0; jump to group 1
        ld1 {v0.16b, v1.16b, v2.16b}, [x11], #48 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        ld1 {v4.16b}, [x2], #16 // weight, group 1
        COMPUTE v3, v1, v12, v13, v14, v15
        COMPUTE v3, v2, v16, v17, v18, v19
        COMPUTE v4, v0, v20, v21, v22, v23
        sub x2, x2, x15 // back to group 0, next src_depth_quad step
        COMPUTE v4, v1, v24, v25, v26, v27
        subs x13, x13, #1
        COMPUTE v4, v2, v28, v29, v30, v31
        bne L8LoopSz_TILE_12

    L8LoopSzEnd_TILE_12:
    add x2, x2, x15 // skip group 1: x2 now points at the next pair
    sub x5, x5, #2
    cbnz x8, L8Tile12Quan
    // float output: 3 x 64 bytes per group, so pre-subtract the 128 bytes
    // already advanced by the first two stores from dst_step.
    sub x4, x4, #128
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    Int32ToFloat v24, v25, v26, v27
    Int32ToFloat v28, v29, v30, v31
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x0], #64
    st1 {v24.4s, v25.4s, v26.4s, v27.4s}, [x0], #64
    st1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x0], x4
    add x4, x4, #128 // restore dst_step
    b L8Tile12LoopCheck

    L8Tile12Quan:
    ld1 {v0.4s, v1.4s}, [x8], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    Int32ToFloat v24, v25, v26, v27
    Int32ToFloat v28, v29, v30, v31
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v0, v16, v17, v18, v19
    MUL_SCALE v1, v20, v21, v22, v23
    MUL_SCALE v1, v24, v25, v26, v27
    MUL_SCALE v1, v28, v29, v30, v31
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    FloatToInt32 v20, v21, v22, v23
    FloatToInt32 v24, v25, v26, v27
    FloatToInt32 v28, v29, v30, v31
    // Narrowing reuses v0-v5 and then the already-consumed v8-v13 as temporaries.
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int32ToInt16 v20, v21, v22, v23, v8, v9
    Int32ToInt16 v24, v25, v26, v27, v10, v11
    Int32ToInt16 v28, v29, v30, v31, v12, v13
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8 v4, v5, v8, v9, v18, v19
    Int16ToInt8 v10, v11, v12, v13, v20, v21
    // clamp: max(min, x) then min(max, x)
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smax v19.16b, v6.16b, v19.16b
    smax v20.16b, v6.16b, v20.16b
    smax v21.16b, v6.16b, v21.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    smin v19.16b, v7.16b, v19.16b
    smin v20.16b, v7.16b, v20.16b
    smin v21.16b, v7.16b, v21.16b
    st1 {v16.16b, v17.16b, v18.16b}, [x0], x4 // group 0
    st1 {v19.16b, v20.16b, v21.16b}, [x0], x4 // group 1

    L8Tile12LoopCheck:
    cmp x5, #1
    bgt L8LoopDz_TILE_12 // at least two groups left
    blt End              // none left
    // exactly one group left: fall through to the single-group tail

L4LoopDz_TILE_12:
    ld1 {v0.4s}, [x9] // bias

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v0, v16, v17, v18, v19

    // Last work of the function: x1/x2/x3 are consumed directly.
    L4LoopSz_TILE_12:
        ld1 {v3.16b}, [x2], #16 // weight
        ld1 {v0.16b, v1.16b, v2.16b}, [x1], #48 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        COMPUTE v3, v1, v12, v13, v14, v15
        subs x3, x3, #1
        COMPUTE v3, v2, v16, v17, v18, v19
        bne L4LoopSz_TILE_12

    L4LoopSzEnd_TILE_12:
    cbnz x8, L4Tile12Quan
    sub x4, x4, #128
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x0], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x0], #64
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x0], x4
    add x4, x4, #128
    b End

    L4Tile12Quan:
    ld1 {v0.4s}, [x8] // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v0, v16, v17, v18, v19
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8_ONE v4, v5, v18
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    st1 {v16.16b, v17.16b, v18.16b}, [x0], x4
    b End

// ---------------------------------------------------------------------------
// 8-wide tile. Works on copies (x10/x12/x14/x19/x20) so that the narrower
// tiles that follow can restart from the original x2/x5/x8/x9.
// ---------------------------------------------------------------------------
TILE_8:
    cmp x7, #8
    blt TILE_4
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8 // scale
    mov x20, x9 // bias
    cmp x5, #2
    blt L4LoopDz_TILE_8
L8LoopDz_TILE_8:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3

    // v8-v15: channel group 0, v16-v23: channel group 1
    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15
    SET_BIAS v1, v16, v17, v18, v19
    SET_BIAS v1, v20, v21, v22, v23

    L8LoopSz_TILE_8:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.16b, v1.16b}, [x11], x22 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        ld1 {v4.16b}, [x12], #16
        COMPUTE v3, v1, v12, v13, v14, v15
        sub x12, x12, x15
        COMPUTE v4, v0, v16, v17, v18, v19
        subs x13, x13, #1
        COMPUTE v4, v1, v20, v21, v22, v23
        bne L8LoopSz_TILE_8

    L8LoopSzEnd_TILE_8:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile8Quan
    sub x4, x4, #64 // first store of each group already advances 64 bytes
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x10], #64
    st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [x10], x4
    add x4, x4, #64
    b L8Tile8LoopCheck

    L8Tile8Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    Int32ToFloat v16, v17, v18, v19
    Int32ToFloat v20, v21, v22, v23
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    MUL_SCALE v1, v16, v17, v18, v19
    MUL_SCALE v1, v20, v21, v22, v23
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    FloatToInt32 v16, v17, v18, v19
    FloatToInt32 v20, v21, v22, v23
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int32ToInt16 v16, v17, v18, v19, v4, v5
    Int32ToInt16 v20, v21, v22, v23, v8, v9
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    Int16ToInt8 v4, v5, v8, v9, v18, v19
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smax v18.16b, v6.16b, v18.16b
    smax v19.16b, v6.16b, v19.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    smin v18.16b, v7.16b, v18.16b
    smin v19.16b, v7.16b, v19.16b
    st1 {v16.16b, v17.16b}, [x10], x4
    st1 {v18.16b, v19.16b}, [x10], x4

    L8Tile8LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_8
    cbz x14, Tile8End

L4LoopDz_TILE_8:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v0, v12, v13, v14, v15

    L4LoopSz_TILE_8:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.16b, v1.16b}, [x11], x22 // src
        COMPUTE v3, v0, v8, v9, v10, v11
        subs x13, x13, #1
        COMPUTE v3, v1, v12, v13, v14, v15
        bne L4LoopSz_TILE_8

    L4LoopSzEnd_TILE_8:
    cbnz x8, L4Tile8Quan
    sub x4, x4, #64
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    add x4, x4, #64
    b Tile8End

    L4Tile8Quan:
    // Single channel group: exactly 4 scale floats. Loading more would read
    // past the last scale group.
    ld1 {v0.4s}, [x19], #16 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v0, v12, v13, v14, v15
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    st1 {v16.16b, v17.16b}, [x10], x4

Tile8End:
    sub x7, x7, #8
    add x0, x0, x21, LSL #3 // dst += 8 units
    add x1, x1, #32         // src += 8 units * 4 bytes

// ---------------------------------------------------------------------------
// 4-wide tile.
// ---------------------------------------------------------------------------
TILE_4:
    cmp x7, #4
    blt TILE_1
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8
    mov x20, x9
    cmp x5, #2
    blt L4LoopDz_TILE_4
L8LoopDz_TILE_4:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3

    SET_BIAS v0, v8, v9, v10, v11
    SET_BIAS v1, v12, v13, v14, v15

    L8LoopSz_TILE_4:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.16b}, [x11], x22 // src
        ld1 {v4.16b}, [x12], #16 // weight
        COMPUTE v3, v0, v8, v9, v10, v11
        subs x13, x13, #1
        sub x12, x12, x15
        COMPUTE v4, v0, v12, v13, v14, v15
        bne L8LoopSz_TILE_4

    L8LoopSzEnd_TILE_4:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile4Quan
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
    st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10], x4
    b L8Tile4LoopCheck

    L8Tile4Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    Int32ToFloat v8, v9, v10, v11
    Int32ToFloat v12, v13, v14, v15
    MUL_SCALE v0, v8, v9, v10, v11
    MUL_SCALE v1, v12, v13, v14, v15
    FloatToInt32 v8, v9, v10, v11
    FloatToInt32 v12, v13, v14, v15
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int32ToInt16 v12, v13, v14, v15, v2, v3
    Int16ToInt8 v0, v1, v2, v3, v16, v17
    smax v16.16b, v6.16b, v16.16b
    smax v17.16b, v6.16b, v17.16b
    smin v16.16b, v7.16b, v16.16b
    smin v17.16b, v7.16b, v17.16b
    st1 {v16.16b}, [x10], x4
    st1 {v17.16b}, [x10], x4

    L8Tile4LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_4
    cbz x14, Tile4End

L4LoopDz_TILE_4:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3
    SET_BIAS v0, v8, v9, v10, v11

    L4LoopSz_TILE_4:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.16b}, [x11], x22 // src
        subs x13, x13, #1
        COMPUTE v3, v0, v8, v9, v10, v11
        bne L4LoopSz_TILE_4

    L4LoopSzEnd_TILE_4:
    cbnz x8, L4Tile4Quan
    Int32ToFloat v8, v9, v10, v11
    st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], x4
    b Tile4End

    L4Tile4Quan:
    ld1 {v0.4s}, [x19], #16 // scale
    Int32ToFloat v8, v9, v10, v11
    MUL_SCALE v0, v8, v9, v10, v11
    FloatToInt32 v8, v9, v10, v11
    Int32ToInt16 v8, v9, v10, v11, v0, v1
    Int16ToInt8_ONE v0, v1, v16
    smax v16.16b, v6.16b, v16.16b
    smin v16.16b, v7.16b, v16.16b
    st1 {v16.16b}, [x10], x4

Tile4End:
    sub x7, x7, #4
    add x0, x0, x21, LSL #2 // dst += 4 units
    add x1, x1, #16         // src += 4 units * 4 bytes

// ---------------------------------------------------------------------------
// 1-wide tile, repeated until realDstCount reaches zero.
// ---------------------------------------------------------------------------
TILE_1:
    cbz x7, End
    mov x10, x0
    mov x12, x2
    mov x14, x5
    mov x19, x8
    mov x20, x9
    cmp x5, #2
    blt L4LoopDz_TILE_1
L8LoopDz_TILE_1:
    ld1 {v0.4s, v1.4s}, [x20], #32 // bias
    mov x11, x1
    mov x13, x3
    // Whole-register copy; .16b is the architecturally defined arrangement
    // for vector MOV and is accepted by every assembler.
    mov v8.16b, v0.16b
    mov v9.16b, v1.16b
    L8LoopSz_TILE_1:
        ld1 {v3.16b}, [x12], x15 // weight
        ld1 {v0.s}[0], [x11], x22 // src
        ld1 {v4.16b}, [x12], #16 // weight
        sdot v8.4s, v3.16b, v0.4b[0]
        subs x13, x13, #1
        sub x12, x12, x15
        sdot v9.4s, v4.16b, v0.4b[0]
        bne L8LoopSz_TILE_1

    L8LoopSzEnd_TILE_1:
    add x12, x12, x15
    sub x14, x14, #2
    cbnz x8, L8Tile1Quan
    scvtf v8.4s, v8.4s
    scvtf v9.4s, v9.4s
    st1 {v8.4s}, [x10], x4
    st1 {v9.4s}, [x10], x4
    b L8Tile1LoopCheck

    L8Tile1Quan:
    ld1 {v0.4s, v1.4s}, [x19], #32 // scale
    scvtf v8.4s, v8.4s
    scvtf v9.4s, v9.4s
    fmul v8.4s, v8.4s, v0.4s
    fmul v9.4s, v9.4s, v1.4s
    fcvtas v8.4s, v8.4s
    fcvtas v9.4s, v9.4s
    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v16.8b, v0.8h // s[0] = group 0, s[1] = group 1
    smax v16.16b, v6.16b, v16.16b
    smin v16.16b, v7.16b, v16.16b
    st1 {v16.s}[0], [x10], x4
    st1 {v16.s}[1], [x10], x4

    L8Tile1LoopCheck:
    cmp x14, #1
    bgt L8LoopDz_TILE_1
    cbz x14, Tile1End

L4LoopDz_TILE_1:
    ld1 {v0.4s}, [x20], #16 // bias
    mov x11, x1
    mov x13, x3
    mov v8.16b, v0.16b
    L4LoopSz_TILE_1:
        ld1 {v3.16b}, [x12], #16 // weight
        ld1 {v0.s}[0], [x11], x22 // src
        subs x13, x13, #1
        sdot v8.4s, v3.16b, v0.4b[0]
        bne L4LoopSz_TILE_1

    L4LoopSzEnd_TILE_1:
    cbnz x8, L4Tile1Quan
    scvtf v8.4s, v8.4s
    st1 {v8.4s}, [x10], x4
    b Tile1End

    L4Tile1Quan:
    ld1 {v0.4s}, [x19], #16 // scale
    scvtf v8.4s, v8.4s
    fmul v8.4s, v8.4s, v0.4s
    fcvtas v8.4s, v8.4s
    sqxtn v0.4h, v8.4s
    sqxtn v16.8b, v0.8h
    smax v16.8b, v6.8b, v16.8b
    smin v16.8b, v7.8b, v16.8b
    st1 {v16.s}[0], [x10], x4

Tile1End:
    sub x7, x7, #1
    add x0, x0, x21 // dst += 1 unit
    add x1, x1, #4  // src += 1 unit * 4 bytes
    b TILE_1

End:
// Reload callee-saved registers from the save area, then release it.
ldp x19, x20, [sp, #0]
ldp x21, x22, [sp, #16]
add x10, sp, #32 // x10 is dead after the last tile
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [x10], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x10]
add sp, sp, #160
ret

#endif // defined(__aarch64__) && defined(ENABLE_ARMV82)
