1//
2//  MNNPackedMatMulRemainFP16.S
3//  MNN
4//
5//  Created by MNN on 2020/06/10.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __aarch64__
10
11#include "MNNAsmGlobal.h"
12
13.text
14.align 5
// Packed FP16 MatMul: C(UP_DIV(h,8), e, h8) = B(UP_DIV(h,hP), l, hP) * A(l, e)
// NOTE(review): an earlier comment here said "8 * 24 ... hP = 24"; the code below
// consumes 16 halfs (two 8-lane vectors, 32 bytes) of B per l step, i.e. hP = 16.
// "Remain" means eSize may be any value: columns are handled as one optional
// tile of 8, then one optional tile of 4, then one column at a time.
asm_function MNNPackedMatMulRemainFP16
//void MNNPackedMatMulRemainFP16(FLOAT16* C, const FLOAT16* A, const FLOAT16* B, size_t eSize, const size_t* parameter, const FLOAT16* postParameters, const FLOAT16* bias);
//Auto x0: C, x1:A, x2:B, x3:eSize, x4:parameter, x5:postParameters, x6:bias
// parameter (size_t slots): [0] aStride, [1] l, [2] h, [3] cStride, [5] bExtraStride
//   slot [4] is not read here. aStride / cStride / bExtraStride are added to
//   pointers unscaled, so they are byte counts.
// postParameters: may be NULL (bias + clamp are then skipped). When non-NULL the
//   code loads four 32-bit floats from it and narrows them to fp16:
//   lane 1 scales the bias, lane 2 is the lower clamp, lane 3 the upper clamp.
//
// Register roles inside the function:
//   x7  cStride            x9  l                 x10 UP_DIV(h, 8)
//   x11 aStride            x19 bExtraStride      x8  remaining 8-channel blocks
//   x12 l loop counter     x13 running B ptr     x15 running A ptr
//   x20 running bias ptr   x21 C base of the current column tile
//   v5  postParameters (fp16)   v6 min   v7 max   v16-v31 accumulators
// Leaf function; v8-v15 are not touched.

// Save the callee-saved registers used below. sp stays lowered (16-byte
// aligned) until the epilogue so the save area never lies below the stack
// pointer, where a signal/interrupt frame could overwrite it.
stp x19, x20, [sp, #-32]!
str x21, [sp, #16]

ldr x11, [x4, #0] // aStride
ldr x9, [x4, #8] // l
ldr x10, [x4, #16] // h

ldr x7, [x4, #24] // cStride (x7 carries no argument, so it is free to reuse)
ldr x19, [x4, #40] // bExtraStride

add x10, x10, #7
lsr x10, x10, #3 // x10 = UP_DIV(h, 8)

cbz x5, Start
ld1 {v5.4s}, [x5]
fcvtn v5.4h, v5.4s // fp32 -> fp16
dup v6.8h, v5.h[2] // Min Value
dup v7.8h, v5.h[3] // Max Value

Start:

E8:
cmp x3, #8
blt E4

// 8 columns of A x 16 output channels per inner tile.
// Executed at most once: there is no branch back to LoopE8.
LoopE8:
    mov x20, x6
    mov x8, x10
    mov x21, x0
    mov x13, x2

    LH8:
    cmp x8, #2
    blt LH4
    sub x14, x7, #64 // second store of each h8 block advances by cStride - 64
    LoopH8x8:
        mov x15, x1
        subs x12, x9, #1 // flags consumed by "beq LoopLEnd" below
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.8h}, [x15], x11
        // First l step initialises the accumulators with fmul.
        // v16-v19: v3 x e0..3   v20-v23: v4 x e0..3
        // v24-v27: v3 x e4..7   v28-v31: v4 x e4..7
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]

        fmul v20.8h, v4.8h, v0.h[0]
        fmul v21.8h, v4.8h, v0.h[1]
        fmul v22.8h, v4.8h, v0.h[2]
        fmul v23.8h, v4.8h, v0.h[3]

        fmul v24.8h, v3.8h, v0.h[4]
        fmul v25.8h, v3.8h, v0.h[5]
        fmul v26.8h, v3.8h, v0.h[6]
        fmul v27.8h, v3.8h, v0.h[7]

        fmul v28.8h, v4.8h, v0.h[4]
        fmul v29.8h, v4.8h, v0.h[5]
        fmul v30.8h, v4.8h, v0.h[6]
        fmul v31.8h, v4.8h, v0.h[7]
        beq LoopLEnd // l == 1

        LoopL:
            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.8h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]

            fmla v20.8h, v4.8h, v0.h[0]
            fmla v21.8h, v4.8h, v0.h[1]
            fmla v22.8h, v4.8h, v0.h[2]
            fmla v23.8h, v4.8h, v0.h[3]

            fmla v24.8h, v3.8h, v0.h[4]
            fmla v25.8h, v3.8h, v0.h[5]
            fmla v26.8h, v3.8h, v0.h[6]
            fmla v27.8h, v3.8h, v0.h[7]

            fmla v28.8h, v4.8h, v0.h[4]
            fmla v29.8h, v4.8h, v0.h[5]
            fmla v30.8h, v4.8h, v0.h[6]
            fmla v31.8h, v4.8h, v0.h[7]

            subs x12, x12, #1
            bne LoopL

        LoopLEnd:

        add x13, x13, x19 // skip bExtraStride bytes to the next B block
        sub x8, x8, #2

        cbz x5, StoreLH8
        AddBiasLH8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        // acc += bias * postParameters[1]
        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v1.8h, v5.h[1]
        fmla v21.8h, v1.8h, v5.h[1]
        fmla v22.8h, v1.8h, v5.h[1]
        fmla v23.8h, v1.8h, v5.h[1]

        fmla v24.8h, v0.8h, v5.h[1]
        fmla v25.8h, v0.8h, v5.h[1]
        fmla v26.8h, v0.8h, v5.h[1]
        fmla v27.8h, v0.8h, v5.h[1]

        fmla v28.8h, v1.8h, v5.h[1]
        fmla v29.8h, v1.8h, v5.h[1]
        fmla v30.8h, v1.8h, v5.h[1]
        fmla v31.8h, v1.8h, v5.h[1]

        // Clamp to [min, max].
        PostTreatLH8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h
        fmax v24.8h, v24.8h, v6.8h
        fmax v25.8h, v25.8h, v6.8h
        fmax v26.8h, v26.8h, v6.8h
        fmax v27.8h, v27.8h, v6.8h
        fmax v28.8h, v28.8h, v6.8h
        fmax v29.8h, v29.8h, v6.8h
        fmax v30.8h, v30.8h, v6.8h
        fmax v31.8h, v31.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h
        fmin v24.8h, v24.8h, v7.8h
        fmin v25.8h, v25.8h, v7.8h
        fmin v26.8h, v26.8h, v7.8h
        fmin v27.8h, v27.8h, v7.8h
        fmin v28.8h, v28.8h, v7.8h
        fmin v29.8h, v29.8h, v7.8h
        fmin v30.8h, v30.8h, v7.8h
        fmin v31.8h, v31.8h, v7.8h

        StoreLH8:
        // First h8 block: columns e0..3 then e4..7 (64 + (cStride - 64) bytes).
        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
        st1 {v24.8h, v25.8h, v26.8h, v27.8h}, [x0], x14

        // Second h8 block.
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64
        st1 {v28.8h, v29.8h, v30.8h, v31.8h}, [x0], x14
        cmp x8, #2
        bge LoopH8x8

    LH4:
    cbz x8, E8End
    // One trailing 8-channel block: only the first 8 halfs of each 16-half
    // B row are used, but the pointer still advances by the full 32 bytes.
    LoopHRemain:
        mov x15, x1
        subs x12, x9, #1 // flags consumed by "beq LoopLREnd" (add does not set flags)
        ld1 {v3.8h}, [x13]
        ld1 {v0.8h}, [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        add x13, x13, #32
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]
        fmul v20.8h, v3.8h, v0.h[4]
        fmul v21.8h, v3.8h, v0.h[5]
        fmul v22.8h, v3.8h, v0.h[6]
        fmul v23.8h, v3.8h, v0.h[7]
        beq LoopLREnd

        LoopLR:
            ld1 {v3.8h}, [x13]
            ld1 {v0.8h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]
            add x13, x13, #32

            fmla v20.8h, v3.8h, v0.h[4]
            fmla v21.8h, v3.8h, v0.h[5]
            fmla v22.8h, v3.8h, v0.h[6]
            fmla v23.8h, v3.8h, v0.h[7]

            subs x12, x12, #1
            bne LoopLR
        LoopLREnd:

        cbz x5, StoreLH8x4
        AddBiasLH8x4:
        ld1 {v0.8h}, [x20]

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v0.8h, v5.h[1]
        fmla v21.8h, v0.8h, v5.h[1]
        fmla v22.8h, v0.8h, v5.h[1]
        fmla v23.8h, v0.8h, v5.h[1]

        PostTreatLH8x4:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h

        StoreLH8x4:

        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], #64
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], #64

    E8End:

    sub x3, x3, #8
    add x0, x21, #128 // C: 8 columns * 8 halfs * 2 bytes past the tile base
    add x1, x1, #16   // A: 8 halfs

E4:
cmp x3, #4
mov x20, x6 // mov leaves the flags from cmp intact for blt
blt E1
    mov x8, x10
    mov x21, x0
    mov x13, x2

    cmp x8, #2
    blt E4LH4

    // 4 columns x 16 output channels. The l loop is software-pipelined:
    // the v16/v17 updates of step k+1 are issued right after its loads, the
    // remaining six updates at the top of the next iteration (or in the tail).
    E4LH8:
    E4LoopH8:
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.4h}, [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v17.8h, v3.8h, v0.h[1]
        fmul v18.8h, v3.8h, v0.h[2]
        fmul v19.8h, v3.8h, v0.h[3]

        fmul v20.8h, v4.8h, v0.h[0]
        fmul v21.8h, v4.8h, v0.h[1]
        fmul v22.8h, v4.8h, v0.h[2]
        fmul v23.8h, v4.8h, v0.h[3]

        beq E4LoopLEnd // l == 1

        subs x12, x12, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.4h}, [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        fmla v17.8h, v3.8h, v0.h[1]

        beq E4LoopLComputeEnd // l == 2: only the tail remains

        E4LoopL:
            fmla v18.8h, v3.8h, v0.h[2]
            fmla v19.8h, v3.8h, v0.h[3]

            fmla v20.8h, v4.8h, v0.h[0]
            fmla v21.8h, v4.8h, v0.h[1]
            fmla v22.8h, v4.8h, v0.h[2]
            fmla v23.8h, v4.8h, v0.h[3]

            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.4h}, [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v17.8h, v3.8h, v0.h[1]

            subs x12, x12, #1
            bne E4LoopL
        E4LoopLComputeEnd:
        // Pipeline tail: finish the last loaded l step.
        fmla v18.8h, v3.8h, v0.h[2]
        fmla v19.8h, v3.8h, v0.h[3]

        fmla v20.8h, v4.8h, v0.h[0]
        fmla v21.8h, v4.8h, v0.h[1]
        fmla v22.8h, v4.8h, v0.h[2]
        fmla v23.8h, v4.8h, v0.h[3]

        E4LoopLEnd:
        add x13, x13, x19
        sub x8, x8, #2
        // Flags set here are consumed by "bge E4LoopH8" after the stores;
        // nothing in between (cbz, NEON arithmetic, ld1/st1) modifies NZCV.
        cmp x8, #2

        cbz x5, StoreLH4x8

        AddBiasLH4x8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v17.8h, v0.8h, v5.h[1]
        fmla v18.8h, v0.8h, v5.h[1]
        fmla v19.8h, v0.8h, v5.h[1]

        fmla v20.8h, v1.8h, v5.h[1]
        fmla v21.8h, v1.8h, v5.h[1]
        fmla v22.8h, v1.8h, v5.h[1]
        fmla v23.8h, v1.8h, v5.h[1]

        PostTreatLH4x8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v17.8h, v17.8h, v6.8h
        fmax v18.8h, v18.8h, v6.8h
        fmax v19.8h, v19.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmax v21.8h, v21.8h, v6.8h
        fmax v22.8h, v22.8h, v6.8h
        fmax v23.8h, v23.8h, v6.8h

        fmin v16.8h, v16.8h, v7.8h
        fmin v17.8h, v17.8h, v7.8h
        fmin v18.8h, v18.8h, v7.8h
        fmin v19.8h, v19.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h
        fmin v21.8h, v21.8h, v7.8h
        fmin v22.8h, v22.8h, v7.8h
        fmin v23.8h, v23.8h, v7.8h

        StoreLH4x8:

        st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0], x7
        st1 {v20.8h, v21.8h, v22.8h, v23.8h}, [x0], x7

        bge E4LoopH8

    // One trailing 8-channel block for the 4-column tile.
    E4LH4:
    cbz x8, E4End
    mov x15, x1
    subs x12, x9, #1 // flags consumed by "beq E4LoopLREnd"
    ld1 {v3.8h}, [x13]
    ld1 {v0.4h}, [x15], x11
    fmul v16.8h, v3.8h, v0.h[0]
    fmul v17.8h, v3.8h, v0.h[1]
    fmul v18.8h, v3.8h, v0.h[2]
    fmul v19.8h, v3.8h, v0.h[3]
    add x13, x13, #32

    beq E4LoopLREnd

    E4LoopLR:
        ld1 {v3.8h}, [x13]
        ld1 {v0.4h}, [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        fmla v17.8h, v3.8h, v0.h[1]
        fmla v18.8h, v3.8h, v0.h[2]
        fmla v19.8h, v3.8h, v0.h[3]
        add x13, x13, #32

        subs x12, x12, #1
        bne E4LoopLR
    E4LoopLREnd:

    cbz x5, StoreLH4x4
    AddBiasLH4x4:
    ld1 {v0.8h}, [x20]

    fmla v16.8h, v0.8h, v5.h[1]
    fmla v17.8h, v0.8h, v5.h[1]
    fmla v18.8h, v0.8h, v5.h[1]
    fmla v19.8h, v0.8h, v5.h[1]


    PostTreatLH4x4:
    fmax v16.8h, v16.8h, v6.8h
    fmax v17.8h, v17.8h, v6.8h
    fmax v18.8h, v18.8h, v6.8h
    fmax v19.8h, v19.8h, v6.8h

    fmin v16.8h, v16.8h, v7.8h
    fmin v17.8h, v17.8h, v7.8h
    fmin v18.8h, v18.8h, v7.8h
    fmin v19.8h, v19.8h, v7.8h

    StoreLH4x4:
    st1 {v16.8h, v17.8h, v18.8h, v19.8h}, [x0]

    E4End:

    sub x3, x3, #4
    add x0, x21, #64 // C: 4 columns * 16 bytes past the tile base
    add x1, x1, #8   // A: 4 halfs

E1:
cmp x3, #0
beq End

// Remaining columns, one at a time; this is the only column loop that repeats.
LoopE1:
    mov x20, x6
    mov x8, x10
    mov x21, x0
    mov x13, x2

    cmp x8, #2
    blt E1LH4

    E1LH8:
    E1LoopH8:
        mov x15, x1
        subs x12, x9, #1
        ld1 {v3.8h, v4.8h}, [x13], #32
        ld1 {v0.h}[0], [x15], x11
        fmul v16.8h, v3.8h, v0.h[0]
        fmul v20.8h, v4.8h, v0.h[0]

        beq E1LoopLEnd

        E1LoopL:
            ld1 {v3.8h, v4.8h}, [x13], #32
            ld1 {v0.h}[0], [x15], x11
            fmla v16.8h, v3.8h, v0.h[0]
            fmla v20.8h, v4.8h, v0.h[0]

            subs x12, x12, #1
            bne E1LoopL

        E1LoopLEnd:

        add x13, x13, x19
        sub x8, x8, #2
        // Flags consumed by "bge E1LoopH8" after the stores; NZCV is not
        // modified by the intervening cbz / NEON / ld1 / st1 instructions.
        cmp x8, #2

        cbz x5, StoreLH1x8
        AddBiasLH1x8:
        ld1 {v0.8h, v1.8h}, [x20], #32

        fmla v16.8h, v0.8h, v5.h[1]
        fmla v20.8h, v1.8h, v5.h[1]

        PostTreatLH1x8:
        fmax v16.8h, v16.8h, v6.8h
        fmax v20.8h, v20.8h, v6.8h
        fmin v16.8h, v16.8h, v7.8h
        fmin v20.8h, v20.8h, v7.8h

        StoreLH1x8:

        st1 {v16.8h}, [x0], x7
        st1 {v20.8h}, [x0], x7

        bge E1LoopH8

    // One trailing 8-channel block for the single column.
    E1LH4:
    cbz x8, E1End
    mov x15, x1
    subs x12, x9, #1 // flags consumed by "beq E1LoopLREnd"
    ld1 {v3.8h}, [x13]
    ld1 {v0.h}[0], [x15], x11
    fmul v16.8h, v3.8h, v0.h[0]
    add x13, x13, #32

    beq E1LoopLREnd

    E1LoopLR:
        ld1 {v3.8h}, [x13]
        ld1 {v0.h}[0], [x15], x11
        fmla v16.8h, v3.8h, v0.h[0]
        add x13, x13, #32

        subs x12, x12, #1
        bne E1LoopLR
    E1LoopLREnd:

    cbz x5, StoreLH1x4
    AddBiasLH1x4:
    ld1 {v0.8h}, [x20]
    fmla v16.8h, v0.8h, v5.h[1]

    PostTreatLH1x4:
    fmax v16.8h, v16.8h, v6.8h
    fmin v16.8h, v16.8h, v7.8h

    StoreLH1x4:
    st1 {v16.8h}, [x0]

    E1End:

    subs x3, x3, #1
    add x0, x21, #16 // C: next column (8 halfs)
    add x1, x1, #2   // A: next half
    bne LoopE1 // add does not set flags, so this tests the subs above


End:
// Restore callee-saved registers and release the 32-byte save area.
ldr x21, [sp, #16]
ldp x19, x20, [sp], #32

ret
537
538
539#endif
540