1//
2//  MNNGemmInt8AddBiasScale_16x4_Unit.S
3//  MNN
4//
5//  Created by MNN on 2019/06/11.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __aarch64__
10
11#include "MNNAsmGlobal.h"
12
13.text
14.align 5
15
asm_function MNNGemmInt8AddBiasScale_16x4_Unit

//struct QuanPostTreatParameters {
//    const float* scale;
//    const int32_t* bias;
//    int32_t maxValue;
//    int32_t minValue;
//};

//void MNNGemmInt8AddBiasScale_16x4_Unit(int8_t* dst, const int8_t* src, const int8_t* weight, size_t src_depth_quad, size_t dst_step,
//                                              size_t dst_depth_quad, const QuanPostTreatParameters* post, size_t realSize) {

// Purpose:
//   int8 GEMM micro-kernel. For every dz step it multiplies up to 4 source
//   columns (16 int8 values each per sz step) with 4 weight rows (16 int8
//   values each per sz step), accumulates in int32, adds the per-channel bias
//   and then either
//     - post->scale != NULL : multiplies by the per-channel scale, rounds to
//       nearest (ties away from zero), clamps to [minValue, maxValue] and
//       stores 4 int8 values per source column, or
//     - post->scale == NULL : converts (acc + bias) to float32 and stores
//       4 floats per source column.
//
// Arguments on entry:
//   x0: dst*, x1: src*, x2: weight*, x3: src_depth_quad, x4: dst_step,
//   x5: dst_depth_quad, x6: post, x7: realSize
//
// Register roles after the prologue:
//   x7 : scale pointer (0 selects the float32 output path)
//   x10: bias pointer, w11: maxValue, w6: minValue
//   x8 : realSize during dispatch, afterwards the saved src base of the
//        current dz step (src is rewound to it after every dz step)
//   x9 : remaining sz iterations
//   x4 : dst advance per dz step, in bytes (x0 is post-incremented by it)
//
// Preconditions visible from the loop shapes:
//   src_depth_quad >= 1 and dst_depth_quad >= 1 (both loops test their counter
//   only after the first pass).
//   realSize 1, 2 or 3 selects a reduced-column path; every other value takes
//   the 4-column path.
//   Every sz step consumes 64 bytes of src (also on the reduced paths, which
//   skip the unused columns) and 64 bytes of weight.
//
// Clobbers: x0-x11, v0-v7, v16-v31, flags. v8-v15 are saved and restored.

// Load from post:
// x7: scale, x10: bias, w11: maxValue, w6: minValue
mov x8, x7                  // keep realSize, x7 is reused for the scale pointer
ldr x7, [x6, #0]
ldr x10, [x6, #8]
ldr w11, [x6, #16]
ldr w6, [x6, #20]

// Save callee-saved v8-v15. sp stays lowered until the epilogue so the save
// area lies inside the allocated stack; memory below sp may be overwritten
// asynchronously (for example by a signal handler).
sub sp, sp, #128
st1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp]
add x9, sp, #64             // x9 is free here, it is reloaded before every sz loop
st1 {v12.4s, v13.4s, v14.4s, v15.4s}, [x9]

// Dispatch on the number of valid source columns.
cmp x8, #3
beq L3Dz

cmp x8, #2
beq L2Dz

cmp x8, #1
beq L1Dz

// ---------------------------------------------------------------------------
// 4 source columns
// ---------------------------------------------------------------------------
// The int8 path stores each dz step with two 8-byte stores, the first one
// post-incremented by 8, so the second one advances by dst_step - 8.
cbz x7, L4LoopDz
sub x4, x4, #8
L4LoopDz:
    mov x8, x1              // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    ld1 {v6.16b}, [x1], #16
    ld1 {v7.16b}, [x1], #16

    // v8-v15: int16 products of weights v0-v3 with columns v4 / v5.
    // v16-v31: int32 accumulators, cleared while the multiplies issue.
    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    dup v24.4s, wzr
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1
    smlal2 v8.8h, v0.16b, v4.16b
    dup v25.4s, wzr
    smlal2 v9.8h, v1.16b, v4.16b
    dup v26.4s, wzr
    smlal2 v10.8h, v2.16b, v4.16b
    dup v27.4s, wzr
    smlal2 v11.8h, v3.16b, v4.16b
    dup v28.4s, wzr
    smlal2 v12.8h, v0.16b, v5.16b
    dup v29.4s, wzr
    smlal2 v13.8h, v1.16b, v5.16b
    dup v30.4s, wzr
    smlal2 v14.8h, v2.16b, v5.16b
    dup v31.4s, wzr
    smlal2 v15.8h, v3.16b, v5.16b
    beq L4LoopSzEnd

    L4LoopSz:

        // Fold the column 0/1 products into v16-v23 while computing the
        // column 2/3 products (v6 / v7) with the current weights and
        // loading the next step's columns 0/1.
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        smull v8.8h, v0.8b, v6.8b
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        smull v9.8h, v1.8b, v6.8b
        sadalp v18.4s, v10.8h
        smull v10.8h, v2.8b, v6.8b
        sadalp v19.4s, v11.8h
        smull v11.8h, v3.8b, v6.8b
        sadalp v20.4s, v12.8h
        smull v12.8h, v0.8b, v7.8b
        sadalp v21.4s, v13.8h
        smull v13.8h, v1.8b, v7.8b
        sadalp v22.4s, v14.8h
        smull v14.8h, v2.8b, v7.8b
        sadalp v23.4s, v15.8h
        smull v15.8h, v3.8b, v7.8b

        smlal2 v8.8h,  v0.16b, v6.16b
        smlal2 v9.8h,  v1.16b, v6.16b
        smlal2 v10.8h, v2.16b, v6.16b
        smlal2 v11.8h, v3.16b, v6.16b

        ld1 {v6.16b}, [x1], #16

        // Each weight register is reloaded right after its last use.
        smlal2 v12.8h, v0.16b, v7.16b
        ld1 {v0.16b}, [x2], #16
        smlal2 v13.8h, v1.16b, v7.16b
        ld1 {v1.16b}, [x2], #16
        smlal2 v14.8h, v2.16b, v7.16b
        ld1 {v2.16b}, [x2], #16
        smlal2 v15.8h, v3.16b, v7.16b
        ld1 {v3.16b}, [x2], #16

        // Fold the column 2/3 products into v24-v31 and start the next
        // step's column 0/1 products.
        sadalp v24.4s, v8.8h
        smull v8.8h, v0.8b, v4.8b
        sadalp v25.4s, v9.8h
        ld1 {v7.16b}, [x1], #16
        smull v9.8h, v1.8b, v4.8b
        sadalp v26.4s, v10.8h
        smull v10.8h, v2.8b, v4.8b
        sadalp v27.4s, v11.8h
        smull v11.8h, v3.8b, v4.8b
        sadalp v28.4s, v12.8h
        smull v12.8h, v0.8b, v5.8b
        sadalp v29.4s, v13.8h
        smull v13.8h, v1.8b, v5.8b
        sadalp v30.4s, v14.8h
        smull v14.8h, v2.8b, v5.8b
        sadalp v31.4s, v15.8h
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L4LoopSz

    L4LoopSzEnd:
    // Drain: fold the pending column 0/1 products, then compute and fold
    // the last step's column 2/3 products.
    sadalp v16.4s, v8.8h
    smull v8.8h, v0.8b, v6.8b
    sadalp v17.4s, v9.8h
    smull v9.8h, v1.8b, v6.8b
    sadalp v18.4s, v10.8h
    smull v10.8h, v2.8b, v6.8b
    sadalp  v19.4s, v11.8h
    smull v11.8h, v3.8b, v6.8b
    sadalp  v20.4s, v12.8h
    smull v12.8h, v0.8b, v7.8b
    sadalp  v21.4s, v13.8h
    smull v13.8h, v1.8b, v7.8b
    sadalp  v22.4s, v14.8h
    smull v14.8h, v2.8b, v7.8b
    sadalp  v23.4s, v15.8h
    smull v15.8h, v3.8b, v7.8b

    smlal2 v8.8h, v0.16b, v6.16b
    smlal2 v9.8h, v1.16b, v6.16b
    smlal2 v10.8h, v2.16b, v6.16b
    smlal2 v11.8h, v3.16b, v6.16b

    smlal2 v12.8h, v0.16b, v7.16b
    smlal2 v13.8h, v1.16b, v7.16b
    smlal2 v14.8h, v2.16b, v7.16b
    smlal2 v15.8h, v3.16b, v7.16b

    sadalp v24.4s, v8.8h
    sadalp v25.4s, v9.8h
    sadalp v26.4s, v10.8h
    sadalp v27.4s, v11.8h
    sadalp v28.4s, v12.8h
    sadalp v29.4s, v13.8h
    sadalp v30.4s, v14.8h
    sadalp v31.4s, v15.8h

    // Horizontal reduction: after two addp levels v12-v15 each hold the
    // four per-weight-row sums of one source column.
    ld1 {v0.4s}, [x10], #16 // bias for these 4 output channels
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s
    addp v8.4s, v24.4s, v25.4s
    addp v9.4s, v26.4s, v27.4s
    addp v10.4s, v28.4s, v29.4s
    addp v11.4s, v30.4s, v31.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s
    addp v14.4s, v8.4s, v9.4s
    addp v15.4s, v10.4s, v11.4s

    cbnz x7, L4Quan
    // Float output: (acc + bias) converted to float32.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    add v19.4s, v15.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    scvtf v2.4s, v18.4s
    scvtf v3.4s, v19.4s
    subs x5, x5, #1         // flags are consumed at L4LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [x0], x4
    b L4LoopCheck

    L4Quan:
    // Int8 output: round((acc + bias) * scale), clamp, saturating narrow.
    ld1 {v1.4s}, [x7], #16  // scale for these 4 output channels
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    add v19.4s, v15.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s
    scvtf v6.4s, v18.4s
    scvtf v7.4s, v19.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s
    fmul v14.4s, v6.4s, v1.4s
    fmul v15.4s, v7.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s
    fcvtas v10.4s, v14.4s
    fcvtas v11.4s, v15.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s
    smin v10.4s, v30.4s, v10.4s
    smin v11.4s, v30.4s, v11.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s
    smax v10.4s, v31.4s, v10.4s
    smax v11.4s, v31.4s, v11.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v1.4h, v10.4s
    sqxtn2 v1.8h, v11.4s

    sqxtn v2.8b, v0.8h
    sqxtn v3.8b, v1.8h
    st1 {v2.8b}, [x0], #8
    subs x5, x5, #1         // flags are consumed at L4LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v3.8b}, [x0], x4
L4LoopCheck:
    bne L4LoopDz

b End

// ---------------------------------------------------------------------------
// 3 source columns (the 4th 16-byte column of every sz step is skipped)
// ---------------------------------------------------------------------------
L3Dz:
// Same dst_step adjustment as the 4-column int8 path: the first of the two
// stores is post-incremented by 8.
cbz x7, L3LoopDz
sub x4, x4, #8
L3LoopDz:
    mov x8, x1              // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    ld1 {v6.16b}, [x1], #16
    add x1, x1, #16         // skip the unused 4th column

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    dup v24.4s, wzr
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1
    smlal2 v8.8h, v0.16b, v4.16b
    dup v25.4s, wzr
    smlal2 v9.8h, v1.16b, v4.16b
    dup v26.4s, wzr
    smlal2 v10.8h, v2.16b, v4.16b
    dup v27.4s, wzr
    smlal2 v11.8h, v3.16b, v4.16b
    smlal2 v12.8h, v0.16b, v5.16b
    smlal2 v13.8h, v1.16b, v5.16b
    smlal2 v14.8h, v2.16b, v5.16b
    smlal2 v15.8h, v3.16b, v5.16b
    beq L3LoopSzEnd

    L3LoopSz:

        // Fold column 0/1 products, compute column 2 products (v6) with the
        // current weights, and load the next step's columns 0/1.
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        smull v8.8h, v0.8b, v6.8b
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        smull v9.8h, v1.8b, v6.8b
        sadalp v18.4s, v10.8h
        smull v10.8h, v2.8b, v6.8b
        sadalp v19.4s, v11.8h
        smull v11.8h, v3.8b, v6.8b
        sadalp v20.4s, v12.8h
        sadalp v21.4s, v13.8h
        sadalp v22.4s, v14.8h
        sadalp v23.4s, v15.8h

        smlal2 v8.8h,  v0.16b, v6.16b
        smlal2 v9.8h,  v1.16b, v6.16b
        smlal2 v10.8h, v2.16b, v6.16b
        smlal2 v11.8h, v3.16b, v6.16b

        ld1 {v6.16b}, [x1], #16

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #16     // skip the unused 4th column

        sadalp v24.4s, v8.8h
        smull v8.8h, v0.8b, v4.8b
        sadalp v25.4s, v9.8h
        smull v9.8h, v1.8b, v4.8b
        sadalp v26.4s, v10.8h
        smull v10.8h, v2.8b, v4.8b
        sadalp v27.4s, v11.8h
        smull v11.8h, v3.8b, v4.8b
        smull v12.8h, v0.8b, v5.8b
        smull v13.8h, v1.8b, v5.8b
        smull v14.8h, v2.8b, v5.8b
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L3LoopSz

    L3LoopSzEnd:
    // Drain the pending column 0/1 products and the last column 2 products.
    sadalp v16.4s, v8.8h
    smull v8.8h, v0.8b, v6.8b
    sadalp v17.4s, v9.8h
    smull v9.8h, v1.8b, v6.8b
    sadalp v18.4s, v10.8h
    smull v10.8h, v2.8b, v6.8b
    sadalp  v19.4s, v11.8h
    smull v11.8h, v3.8b, v6.8b
    sadalp  v20.4s, v12.8h
    sadalp  v21.4s, v13.8h
    sadalp  v22.4s, v14.8h
    sadalp  v23.4s, v15.8h

    smlal2 v8.8h, v0.16b, v6.16b
    smlal2 v9.8h, v1.16b, v6.16b
    smlal2 v10.8h, v2.16b, v6.16b
    smlal2 v11.8h, v3.16b, v6.16b

    sadalp v24.4s, v8.8h
    sadalp v25.4s, v9.8h
    sadalp v26.4s, v10.8h
    sadalp v27.4s, v11.8h

    ld1 {v0.4s}, [x10], #16 // bias for these 4 output channels
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s
    addp v8.4s, v24.4s, v25.4s
    addp v9.4s, v26.4s, v27.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s
    addp v14.4s, v8.4s, v9.4s

    cbnz x7, L3Quan
    // Float output: (acc + bias) converted to float32.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    scvtf v2.4s, v18.4s
    subs x5, x5, #1         // flags are consumed at L3LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v0.4s, v1.4s, v2.4s}, [x0], x4
    b L3LoopCheck

    L3Quan:
    // Int8 output: round((acc + bias) * scale), clamp, saturating narrow.
    ld1 {v1.4s}, [x7], #16  // scale for these 4 output channels
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    add v18.4s, v14.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s
    scvtf v6.4s, v18.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s
    fmul v14.4s, v6.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s
    fcvtas v10.4s, v14.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s
    smin v10.4s, v30.4s, v10.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s
    smax v10.4s, v31.4s, v10.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s
    sqxtn v1.4h, v10.4s

    sqxtn v2.8b, v0.8h
    sqxtn v3.8b, v1.8h
    st1 {v2.8b}, [x0], #8   // columns 0 and 1
    subs x5, x5, #1         // flags are consumed at L3LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v3.s}[0], [x0], x4 // column 2: only the low 4 bytes are stored
L3LoopCheck:
    bne L3LoopDz

b End

// ---------------------------------------------------------------------------
// 2 source columns (32 bytes of every 64-byte sz step are skipped)
// ---------------------------------------------------------------------------
// A single store per dz step, so x4 needs no adjustment here.
L2Dz:
L2LoopDz:
    mov x8, x1              // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    ld1 {v5.16b}, [x1], #16
    add x1, x1, #32         // skip the unused columns 2 and 3

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    dup v20.4s, wzr
    smull v11.8h, v3.8b, v4.8b
    dup v21.4s, wzr
    smull v12.8h, v0.8b, v5.8b
    dup v22.4s, wzr
    smull v13.8h, v1.8b, v5.8b
    dup v23.4s, wzr
    smull v14.8h, v2.8b, v5.8b
    smull v15.8h, v3.8b, v5.8b
    subs x9, x3, #1
    smlal2 v8.8h, v0.16b, v4.16b
    smlal2 v9.8h, v1.16b, v4.16b
    smlal2 v10.8h, v2.16b, v4.16b
    smlal2 v11.8h, v3.16b, v4.16b
    smlal2 v12.8h, v0.16b, v5.16b
    smlal2 v13.8h, v1.16b, v5.16b
    smlal2 v14.8h, v2.16b, v5.16b
    smlal2 v15.8h, v3.16b, v5.16b
    beq L2LoopSzEnd

    L2LoopSz:

        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        sadalp v17.4s, v9.8h
        ld1 {v5.16b}, [x1], #16
        sadalp v18.4s, v10.8h
        sadalp v19.4s, v11.8h
        sadalp v20.4s, v12.8h
        sadalp v21.4s, v13.8h
        sadalp v22.4s, v14.8h
        sadalp v23.4s, v15.8h

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #32     // skip the unused columns 2 and 3

        smull v8.8h, v0.8b, v4.8b
        smull v9.8h, v1.8b, v4.8b
        smull v10.8h, v2.8b, v4.8b
        smull v11.8h, v3.8b, v4.8b
        smull v12.8h, v0.8b, v5.8b
        smull v13.8h, v1.8b, v5.8b
        smull v14.8h, v2.8b, v5.8b
        smull v15.8h, v3.8b, v5.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        smlal2 v12.8h, v0.16b, v5.16b
        smlal2 v13.8h, v1.16b, v5.16b
        smlal2 v14.8h, v2.16b, v5.16b
        smlal2 v15.8h, v3.16b, v5.16b

        subs x9, x9, #1
        bne L2LoopSz

    L2LoopSzEnd:
    sadalp v16.4s, v8.8h
    sadalp v17.4s, v9.8h
    sadalp v18.4s, v10.8h
    sadalp  v19.4s, v11.8h
    sadalp  v20.4s, v12.8h
    sadalp  v21.4s, v13.8h
    sadalp  v22.4s, v14.8h
    sadalp  v23.4s, v15.8h

    ld1 {v0.4s}, [x10], #16 // bias for these 4 output channels
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s
    addp v6.4s, v20.4s, v21.4s
    addp v7.4s, v22.4s, v23.4s

    addp v12.4s, v4.4s, v5.4s
    addp v13.4s, v6.4s, v7.4s

    cbnz x7, L2Quan
    // Float output: (acc + bias) converted to float32.
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s
    scvtf v0.4s, v16.4s
    scvtf v1.4s, v17.4s
    subs x5, x5, #1         // flags are consumed at L2LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v0.4s, v1.4s}, [x0], x4
    b L2LoopCheck

    L2Quan:
    // Int8 output: round((acc + bias) * scale), clamp, saturating narrow.
    ld1 {v1.4s}, [x7], #16  // scale for these 4 output channels
    add v16.4s, v12.4s, v0.4s
    add v17.4s, v13.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s
    scvtf v5.4s, v17.4s

    fmul v12.4s, v4.4s, v1.4s
    fmul v13.4s, v5.4s, v1.4s

    fcvtas v8.4s, v12.4s
    fcvtas v9.4s, v13.4s

    smin v8.4s, v30.4s, v8.4s
    smin v9.4s, v30.4s, v9.4s

    smax v8.4s, v31.4s, v8.4s
    smax v9.4s, v31.4s, v9.4s

    sqxtn v0.4h, v8.4s
    sqxtn2 v0.8h, v9.4s

    sqxtn v2.8b, v0.8h
    st1 {v2.8b}, [x0], x4
    subs x5, x5, #1         // flags are consumed at L2LoopCheck
    mov x1, x8              // rewind src for the next dz step
L2LoopCheck:
    bne L2LoopDz

b End

// ---------------------------------------------------------------------------
// 1 source column (48 bytes of every 64-byte sz step are skipped)
// ---------------------------------------------------------------------------
// Only v8-v11 (products) and v16-v19 (accumulators) are live on this path.
L1Dz:
L1LoopDz:
    mov x8, x1              // remember src base for the next dz step
    ld1 {v0.16b}, [x2], #16
    ld1 {v1.16b}, [x2], #16
    ld1 {v2.16b}, [x2], #16
    ld1 {v3.16b}, [x2], #16
    dup v16.4s, wzr
    dup v17.4s, wzr
    ld1 {v4.16b}, [x1], #16
    add x1, x1, #48         // skip the unused columns 1-3

    smull v8.8h, v0.8b, v4.8b
    dup v18.4s, wzr
    smull v9.8h, v1.8b, v4.8b
    dup v19.4s, wzr
    smull v10.8h, v2.8b, v4.8b
    smull v11.8h, v3.8b, v4.8b
    subs x9, x3, #1
    smlal2 v8.8h, v0.16b, v4.16b
    smlal2 v9.8h, v1.16b, v4.16b
    smlal2 v10.8h, v2.16b, v4.16b
    smlal2 v11.8h, v3.16b, v4.16b
    beq L1LoopSzEnd

    L1LoopSz:
        sadalp v16.4s, v8.8h
        ld1 {v4.16b}, [x1], #16
        sadalp v17.4s, v9.8h
        sadalp v18.4s, v10.8h
        sadalp v19.4s, v11.8h

        ld1 {v0.16b}, [x2], #16
        ld1 {v1.16b}, [x2], #16
        ld1 {v2.16b}, [x2], #16
        ld1 {v3.16b}, [x2], #16
        add x1, x1, #48     // skip the unused columns 1-3

        smull v8.8h, v0.8b, v4.8b
        smull v9.8h, v1.8b, v4.8b
        smull v10.8h, v2.8b, v4.8b
        smull v11.8h, v3.8b, v4.8b

        smlal2 v8.8h, v0.16b, v4.16b
        smlal2 v9.8h, v1.16b, v4.16b
        smlal2 v10.8h, v2.16b, v4.16b
        smlal2 v11.8h, v3.16b, v4.16b

        subs x9, x9, #1
        bne L1LoopSz

    L1LoopSzEnd:
    sadalp v16.4s, v8.8h
    sadalp v17.4s, v9.8h
    sadalp v18.4s, v10.8h
    sadalp  v19.4s, v11.8h

    ld1 {v0.4s}, [x10], #16 // bias for these 4 output channels
    addp v4.4s, v16.4s, v17.4s
    addp v5.4s, v18.4s, v19.4s

    addp v12.4s, v4.4s, v5.4s

    cbnz x7, L1Quan
    // Float output: (acc + bias) converted to float32.
    add v16.4s, v12.4s, v0.4s
    scvtf v0.4s, v16.4s
    subs x5, x5, #1         // flags are consumed at L1LoopCheck
    mov x1, x8              // rewind src for the next dz step
    st1 {v0.4s}, [x0], x4
    b L1LoopCheck

    L1Quan:
    // Int8 output: round((acc + bias) * scale), clamp, saturating narrow.
    ld1 {v1.4s}, [x7], #16  // scale for these 4 output channels
    add v16.4s, v12.4s, v0.4s

    dup v31.4s, w6 // Min
    dup v30.4s, w11 // Max

    scvtf v4.4s, v16.4s

    fmul v12.4s, v4.4s, v1.4s

    fcvtas v8.4s, v12.4s

    smin v8.4s, v30.4s, v8.4s

    smax v8.4s, v31.4s, v8.4s

    sqxtn v0.4h, v8.4s

    sqxtn v2.8b, v0.8h
    st1 {v2.s}[0], [x0], x4 // only the low 4 bytes are stored
    subs x5, x5, #1         // flags are consumed at L1LoopCheck
    mov x1, x8              // rewind src for the next dz step
L1LoopCheck:
    bne L1LoopDz

End:
// Restore v8-v15 from the save area at sp and release it (2 x 64 bytes).
ld1 {v8.4s, v9.4s, v10.4s, v11.4s}, [sp], #64
ld1 {v12.4s, v13.4s, v14.4s, v15.4s}, [sp], #64
ret
727
728#endif
729