1//
2//  MNNConvDwF23MulTransUnit.S
3//  MNN
4//
5//  Created by MNN on 2019/4/4.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8#ifdef __aarch64__
9
10#include "MNNAsmGlobal.h"
11
12.text
13.align 5
14
asm_function MNNConvDwF23MulTransUnit
// void MNNConvDwF23MulTransUnit(float **cacheLine, const float *weight, float *dest,
//                               size_t ow, const float *bias, const float *parameters);
//
// ABI: AAPCS64 (leaf function, no stack usage).
// In:  x0 = cacheLine   three row pointers are read: [x0], [x0 + 8], [x0 + 16]
//      x1 = weight      12 float32x4 vectors (3 rows x 4 vectors, 192 bytes)
//      x2 = dest        output, 2 float32x4 per loop iteration (+1 for the tail)
//      x3 = ow          number of outputs (each output is one float32x4)
//      x4 = bias        one float32x4
//      x5 = parameters  float array; [2] (offset 8) = min, [3] (offset 12) = max
//
// For every pair of outputs, with s_r[k] the k-th vector of the current
// 4-vector unit of row r and w_r[k] the matching weight vector:
//      m_k = w_0[k]*s_0[k] + w_1[k]*s_1[k] + w_2[k]*s_2[k]      (k = 0..3)
//      o0  = clamp(m0 + m1 + m2 + bias, min, max)
//      o1  = clamp(m1 - m2 + m3 + bias, min, max)
// A leftover single output (odd ow) only produces o0.
// (Presumably the output stage of a Winograd F(2,3) depthwise convolution --
// confirm against the C reference implementation.)
//
// Clobbers: x1-x6, x9, x10, v0-v7, v16-v26, v28-v31, flags.
// Only caller-saved SIMD registers are used, so nothing has to be spilled.
// (Do not spill below sp: AAPCS64 has no red zone, so data stored there may
// be overwritten asynchronously, e.g. by a signal handler.)

    ld1 {v24.4s}, [x4]                  // v24 = bias
    ldr w9, [x5, #8]                    // raw bits of parameters[2]
    ldr w10, [x5, #12]                  // raw bits of parameters[3]
    dup v25.4s, w9                      // v25 = min (lower clamp bound)
    dup v26.4s, w10                     // v26 = max (upper clamp bound)

    // bias/parameters are consumed, x4-x6 are reused as the three row cursors
    ldr x4, [x0, #0]
    ldr x5, [x0, #8]
    ldr x6, [x0, #16]

    // Weights stay resident: row 0 -> v4-v7, row 1 -> v16-v19, row 2 -> v28-v31
    ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [x1], #64
    ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [x1], #64
    ld1 {v28.4s, v29.4s, v30.4s, v31.4s}, [x1]


L2:
    cmp x3, #2
    blt L1                              // fewer than two outputs: tail only

LoopL2:                                 // x3 >= 2 here; two outputs per pass

    // row 0: m_k = w_0[k] * s_0[k]
    ld1 {v20.4s, v21.4s}, [x4], #32
    fmul v0.4s, v4.4s, v20.4s
    ld1 {v22.4s, v23.4s}, [x4], #32
    fmul v1.4s, v5.4s, v21.4s
    fmul v2.4s, v6.4s, v22.4s
    ld1 {v20.4s, v21.4s}, [x5], #32
    fmul v3.4s, v7.4s, v23.4s

    // row 1: m_k += w_1[k] * s_1[k]
    fmla v0.4s, v16.4s, v20.4s
    ld1 {v22.4s, v23.4s}, [x5], #32
    fmla v1.4s, v17.4s, v21.4s
    fmla v2.4s, v18.4s, v22.4s
    fmla v3.4s, v19.4s, v23.4s

    // row 2: m_k += w_2[k] * s_2[k], interleaved with the output sums
    ld1 {v20.4s, v21.4s}, [x6], #32
    fmla v0.4s, v28.4s, v20.4s
    fmla v1.4s, v29.4s, v21.4s
    fadd v0.4s, v1.4s, v0.4s            // v0 = m1 + m0
    ld1 {v22.4s, v23.4s}, [x6], #32

    fmla v2.4s, v30.4s, v22.4s
    fmla v3.4s, v31.4s, v23.4s
    fadd v0.4s, v0.4s, v2.4s            // v0 = m0 + m1 + m2

    fadd v3.4s, v3.4s, v1.4s            // v3 = m3 + m1
    fsub v1.4s, v3.4s, v2.4s            // v1 = m1 - m2 + m3

    fadd v0.4s, v0.4s, v24.4s           // + bias
    fadd v1.4s, v1.4s, v24.4s

    fmin v0.4s, v0.4s, v26.4s           // clamp to max ...
    fmin v1.4s, v1.4s, v26.4s

    fmax v0.4s, v0.4s, v25.4s           // ... then to min
    fmax v1.4s, v1.4s, v25.4s

    st1 {v0.4s, v1.4s}, [x2], #32

    sub x3, x3, #2
    cmp x3, #2
    bge LoopL2


L1:
    cmp x3, #0
    beq End                             // nothing left
    // One output left: only the first three vectors of each row are needed.
    ld1 {v20.4s, v21.4s, v22.4s}, [x4]
    fmul v0.4s, v4.4s, v20.4s
    fmul v1.4s, v5.4s, v21.4s
    fmul v2.4s, v6.4s, v22.4s
    ld1 {v20.4s, v21.4s, v22.4s}, [x5]

    fmla v0.4s, v16.4s, v20.4s
    fmla v1.4s, v17.4s, v21.4s
    fmla v2.4s, v18.4s, v22.4s

    ld1 {v20.4s, v21.4s, v22.4s}, [x6]
    fmla v0.4s, v28.4s, v20.4s
    fmla v1.4s, v29.4s, v21.4s
    fadd v0.4s, v1.4s, v0.4s            // v0 = m1 + m0

    fmla v2.4s, v30.4s, v22.4s
    fadd v0.4s, v0.4s, v2.4s            // v0 = m0 + m1 + m2

    fadd v0.4s, v0.4s, v24.4s           // + bias
    fmin v0.4s, v0.4s, v26.4s           // clamp to max, then to min
    fmax v0.4s, v0.4s, v25.4s

    st1 {v0.4s}, [x2]
End:

    ret
117#endif
118