1//
2//  NEON_MNNUnPackC4_BF16.S
3//  MNN
4//
5//  Created by MNN on 2019/02/02.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __aarch64__
10#include "MNNAsmGlobal.h"
11
12.text
13.align 5
14
asm_function NEON_MNNUnpackC4_BF16
//-----------------------------------------------------------------------
// void NEON_MNNUnpackC4_BF16(float* dst, const float* src, size_t area, size_t depth)
//
// The float pointers are treated as int16_t*: every element moved is 16 bits.
//
// src is consumed sequentially. For each group of up to four channels it
// supplies `area` pixels of four interleaved 16-bit values. A trailing group
// with fewer than four channels still occupies four values per pixel in src;
// the unused lanes are read and discarded.
// dst receives `depth` contiguous planes of `area` 16-bit values each.
//
// In:    x0 = dst, x1 = src, x2 = area, x3 = depth
// Out:   none
// Clobb: x0-x8, v0-v3, flags. No stack usage.
//
// Register roles after the prologue:
//   x0 = src read cursor          x1 = write cursor, plane c+0
//   x5 = write cursor, plane c+1  x6 = write cursor, plane c+2
//   x7 = write cursor, plane c+3  x4 = bytes per dst plane (area * 2)
//   x3 = channels still to write  x8 = pixels left in the current group
//
// area and depth are size_t, so all counter tests use unsigned conditions.
//-----------------------------------------------------------------------

    // Nothing to do when either extent is zero. Each extent is tested on its
    // own so a wrapped area * depth product cannot hide non-zero input.
    cbz     x2, DownEnd
    cbz     x3, DownEnd

    // Swap x0 and x1 so that x0 = src and x1 = dst.
    mov     x4, x0
    mov     x0, x1
    mov     x1, x4

    // x4 = byte size of one dst plane = area * sizeof(int16_t)
    lsl     x4, x2, #1

// ---- Full groups of four channels -------------------------------------
DownL4:
    cmp     x3, #4
    blo     DownL3

DownL4Loop:
    add     x5, x1, x4                          // plane c+1
    add     x6, x5, x4                          // plane c+2
    add     x7, x6, x4                          // plane c+3
    mov     x8, x2
    cmp     x8, #4
    blo     DownL4AreaRemain
DownL4AreaLoop:
    // Four pixels at a time: ld4 de-interleaves so that vN holds channel N.
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32   // 16 * sizeof(int16_t)
    st1     {v0.4h}, [x1], #8                   // 4 * sizeof(int16_t)
    st1     {v1.4h}, [x5], #8
    st1     {v2.4h}, [x6], #8
    st1     {v3.4h}, [x7], #8
    sub     x8, x8, #4
    cmp     x8, #4
    bhs     DownL4AreaLoop

DownL4AreaRemain:
    cbz     x8, DownL4AreaRemainEnd
DownL4AreaRemainLoop:
    // One pixel at a time: lane N of v0 is channel N of this pixel.
    ld1     {v0.4h}, [x0], #8                   // 4 * sizeof(int16_t)
    st1     {v0.h}[0], [x1], #2                 // sizeof(int16_t)
    st1     {v0.h}[1], [x5], #2
    st1     {v0.h}[2], [x6], #2
    st1     {v0.h}[3], [x7], #2
    subs    x8, x8, #1
    bne     DownL4AreaRemainLoop

DownL4AreaRemainEnd:
    sub     x3, x3, #4
    // x7 has advanced one full plane past the start of plane c+3, i.e. it
    // now points at the first plane of the next group.
    mov     x1, x7
    cmp     x3, #4
    bhs     DownL4Loop

// ---- Tail: exactly three channels left --------------------------------
DownL3:
    cmp     x3, #3
    blo     DownL2
    add     x5, x1, x4
    add     x6, x5, x4
    mov     x8, x2
    cmp     x8, #4
    blo     DownL3AreaRemain
DownL3AreaLoop:
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32   // v3 is read but unused
    st1     {v0.4h}, [x1], #8
    st1     {v1.4h}, [x5], #8
    st1     {v2.4h}, [x6], #8
    sub     x8, x8, #4
    cmp     x8, #4
    bhs     DownL3AreaLoop

DownL3AreaRemain:
    cbz     x8, DownEnd
DownL3AreaRemainLoop:
    ld1     {v0.4h}, [x0], #8
    st1     {v0.h}[0], [x1], #2
    st1     {v0.h}[1], [x5], #2
    st1     {v0.h}[2], [x6], #2
    subs    x8, x8, #1
    bne     DownL3AreaRemainLoop
    b       DownEnd                             // tail group is the last one

// ---- Tail: exactly two channels left ----------------------------------
DownL2:
    cmp     x3, #2
    blo     DownL1
    add     x5, x1, x4
    mov     x8, x2
    cmp     x8, #4
    blo     DownL2AreaRemain
DownL2AreaLoop:
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32   // v2, v3 read but unused
    st1     {v0.4h}, [x1], #8
    st1     {v1.4h}, [x5], #8
    sub     x8, x8, #4
    cmp     x8, #4
    bhs     DownL2AreaLoop

DownL2AreaRemain:
    cbz     x8, DownEnd
DownL2AreaRemainLoop:
    ld1     {v0.4h}, [x0], #8
    st1     {v0.h}[0], [x1], #2
    st1     {v0.h}[1], [x5], #2
    subs    x8, x8, #1
    bne     DownL2AreaRemainLoop
    b       DownEnd                             // tail group is the last one

// ---- Tail: exactly one channel left -----------------------------------
DownL1:
    cbz     x3, DownEnd
    mov     x8, x2
    cmp     x8, #4
    blo     DownL1AreaRemain
DownL1AreaLoop:
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0], #32   // v1-v3 read but unused
    st1     {v0.4h}, [x1], #8
    sub     x8, x8, #4
    cmp     x8, #4
    bhs     DownL1AreaLoop

DownL1AreaRemain:
    cbz     x8, DownEnd
DownL1AreaRemainLoop:
    // ld1 overwrites all of v0, so no prior clearing of the register is needed.
    ld1     {v0.4h}, [x0], #8
    st1     {v0.h}[0], [x1], #2
    subs    x8, x8, #1
    bne     DownL1AreaRemainLoop

DownEnd:
    ret
164
165
166#endif
167
168