//
//  MNNWinogradMatrixProductRight.S
//  MNN
//
//  Created by MNN on 2018/08/22.
//  Copyright © 2018, Alibaba Group Holding Limited
//

#ifdef __aarch64__

#include "MNNAsmGlobal.h"

.text
.align 5

asm_function MNNWinogradMatrixProductRight
// void MNNWinogradMatrixProductRight(const float* S, const float* B, float* M, size_t w, size_t h, size_t k, size_t length);

// Register mapping on entry: x0: S, x1: B, x2: M, x3: w, x4: h, x5: k, x6: length

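// For orientation, a C-level sketch of what this kernel computes, as inferred
// from the strides and loads below (a hedged reading, not necessarily MNN's
// exact C fallback). M = S * B, where each element of S and M is a "unit" of
// length * 4 floats, S is h x k units, M is h x w units, and B holds scalars
// with a row stride of h floats:
//
//     for (int y = 0; y < h; ++y) {
//         for (int x = 0; x < w; ++x) {
//             float* dst = M + (y * (size_t)w + x) * 4 * length;
//             for (int j = 0; j < 4 * length; ++j) dst[j] = 0.0f;
//             for (int i = 0; i < k; ++i) {
//                 const float b = B[i * (size_t)h + x];
//                 const float* src = S + (y * (size_t)k + i) * 4 * length;
//                 for (int j = 0; j < 4 * length; ++j) {
//                     dst[j] += src[j] * b;
//                 }
//             }
//         }
//     }
//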
// unitStepInFloat: one unit is `length` vectors of 4 floats
mov x8, #16 // 4 * sizeof(float)
mul x8, x6, x8 // x8 = unit step in bytes = length * 16

// srcYUnitStep: one row of S holds k units
mul x9, x5, x8 // x9 = S row step in bytes

// B's step: consecutive k indices of B are h floats apart
mov x10, #4 // sizeof(float)
mul x10, x4, x10 // x10 = h * sizeof(float)

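// No stack frame is needed: spare SIMD registers (v4-v7) hold the general
// registers that get repurposed inside the loops, x11/x12 serve as inner
// loop counters, v30 is a zero vector, and v31 broadcasts a single B value.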
LoopY:
    mov v4.d[0], x1 // park B base and the w counter for this row
    mov v4.d[1], x3
    LoopX:
        mov v5.d[0], x0 // park the S and B positions for this output unit
        mov v5.d[1], x1
        movi v30.4s, #0
        mov x11, x6
        LoopUnitSetZero: // zero the destination unit: length stores of 4 zeros
            st1 {v30.4s}, [x2], #16
            subs x11, x11, #1
            bne LoopUnitSetZero
        sub x2, x2, x8 // rewind M to the start of the unit
        mov x12, x5 // x12 = remaining k

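        // The reduction over k is tiled: blocks of four rows (LK4), then one
        // block of three (LK3) if at least three remain, then single rows
        // (LK1) for the rest.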
        LK4:
        cmp x12, #4
        blt LK3
        mov v6.d[0], x3 // x3/x4 become row pointers below; park them
        mov v6.d[1], x4
        LoopK4:
            // Load B[(i+0..i+3)*h + x] into the four lanes of v0
            ld1 {v0.s}[0], [x1], x10
            ld1 {v0.s}[1], [x1], x10
            ld1 {v0.s}[2], [x1], x10
            ld1 {v0.s}[3], [x1], x10
            mov x11, x6
            mov v7.d[0], x1 // park the B cursor

            // x0/x1/x3/x4 point at source rows i+0..i+3
            add x1, x0, x8
            add x3, x1, x8
            add x4, x3, x8

            LoopUnitK4:
                ld1 {v16.4s}, [x2]
                ld1 {v20.4s}, [x0], #16
                fmla v16.4s, v20.4s, v0.s[0]
                ld1 {v21.4s}, [x1], #16
                fmul v17.4s, v21.4s, v0.s[1]
                ld1 {v20.4s}, [x3], #16
                fmla v16.4s, v20.4s, v0.s[2]
                ld1 {v21.4s}, [x4], #16
                fmla v17.4s, v21.4s, v0.s[3]

                fadd v17.4s, v16.4s, v17.4s // merge the two accumulator chains
                st1 {v17.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnitK4
            sub x2, x2, x8 // rewind M to the start of the unit
            sub x12, x12, #4
            mov x0, x4 // x4 has walked past row i+3 and now points at row i+4

            mov x1, v7.d[0]
            cmp x12, #4
            bge LoopK4
        mov x3, v6.d[0] // restore the w and h counters
        mov x4, v6.d[1]

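        // LK3 mirrors LK4 with three source rows. Note x4 is not free here
        // (it still carries the LoopY counter), so only x0/x1/x3 walk S.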
        LK3:
        cmp x12, #3
        blt LK1
        mov v6.d[0], x3 // x3 becomes a row pointer below; park the w counter
        LoopK3:
            // Load B[(i+0..i+2)*h + x] into the first three lanes of v0
            ld1 {v0.s}[0], [x1], x10
            ld1 {v0.s}[1], [x1], x10
            ld1 {v0.s}[2], [x1], x10
            mov x11, x6
            mov v7.d[0], x1 // park the B cursor

            // x0/x1/x3 point at source rows i+0..i+2
            add x1, x0, x8
            add x3, x1, x8

            LoopUnitK3:
                ld1 {v16.4s}, [x2]
                ld1 {v20.4s}, [x0], #16
                fmla v16.4s, v20.4s, v0.s[0]
                ld1 {v21.4s}, [x1], #16
                fmul v17.4s, v21.4s, v0.s[1]
                ld1 {v20.4s}, [x3], #16
                fmla v16.4s, v20.4s, v0.s[2]

                fadd v17.4s, v16.4s, v17.4s
                st1 {v17.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnitK3
            sub x2, x2, x8 // rewind M to the start of the unit
            sub x12, x12, #3
            mov x0, x3 // x3 has walked past row i+2 and now points at row i+3 (not x4, which holds the h counter)
            mov x1, v7.d[0]
            cmp x12, #3
            bge LoopK3
        mov x3, v6.d[0] // restore the w counter

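        // Scalar tail: one k row at a time, a single B value broadcast across
        // the whole vector.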
        LK1:
        cmp x12, #0
        beq LKEnd

        LoopK:
            ld1 {v31.s}[0], [x1], x10 // load B[i*h + x]

            dup v31.4s, v31.s[0] // broadcast it to all four lanes
            mov x11, x6
            LoopUnit:
                ld1 {v0.4s}, [x2]
                ld1 {v1.4s}, [x0], #16
                fmla v0.4s, v1.4s, v31.4s

                st1 {v0.4s}, [x2], #16
                subs x11, x11, #1
                bne LoopUnit
            subs x12, x12, #1

            sub x2, x2, x8 // rewind M to the start of the unit
            bne LoopK
        LKEnd:
        mov x0, v5.d[0] // restore the S and B positions for this unit
        mov x1, v5.d[1]
        subs x3, x3, #1
        add x2, x2, x8 // M: step to the next unit
        add x1, x1, #4 // B: step to the next column, sizeof(float)

        bne LoopX
    mov x1, v4.d[0] // restore B base and the w counter
    mov x3, v4.d[1]
    add x0, x0, x9 // S: step to the next row of k units

    subs x4, x4, #1
    bne LoopY

    ret

#endif