1//
2//  MNNUnPackC4.S
3//  MNN
4//
5//  Created by MNN on 2019/02/02.
6//  Copyright © 2018, Alibaba Group Holding Limited
7//
8
9#ifdef __arm__
10#ifndef __aarch64__
11
12#include "MNNAsmGlobal.h"
13.text
14.align 5
15
.macro transpose
// In-register 4x4 transpose of 32-bit lanes held in q0-q3 (d0-d7).
//
// On entry each q register holds one row of four lanes:
//   q0 = [a0 a1 a2 a3]   q1 = [b0 b1 b2 b3]
//   q2 = [c0 c1 c2 c3]   q3 = [d0 d1 d2 d3]
// On exit each q register holds one column:
//   q0 = [a0 b0 c0 d0]   q1 = [a1 b1 c1 d1]
//   q2 = [a2 b2 c2 d2]   q3 = [a3 b3 c3 d3]
//
// Takes no arguments; reads and writes only q0-q3. No core register and no
// condition flag is modified.

    // Step 1: transpose every 2x2 sub-block. The quad-register form applies
    // the same lane exchange to the low and the high d register of each q.
    vtrn.32 q0, q1          // q0 = [a0 b0 a2 b2], q1 = [a1 b1 a3 b3]
    vtrn.32 q2, q3          // q2 = [c0 d0 c2 d2], q3 = [c1 d1 c3 d3]

    // Step 2: exchange the off-diagonal 64-bit halves.
    vswp    d1, d4          // high(q0) <-> low(q2)
    vswp    d3, d6          // high(q1) <-> low(q3)
.endm
24
25
asm_function MNNUnpackC4
// void MNNUnpackC4(float* dst, const float* src, size_t area, size_t depth);
//
// Converts a channel-packed-by-4 float buffer into planar layout.
//
//   src : read strictly sequentially. For every group of up to four channels
//         it supplies `area` consecutive 4-float vectors, one per position.
//         A trailing group with depth % 4 in 1..3 is still read as full
//         4-float vectors; only the first depth % 4 lanes are stored.
//   dst : `depth` planes of `area` floats each, plane stride = area * 4 bytes.
//
// Arguments on entry: r0 = dst, r1 = src, r2 = area, r3 = depth
//
// Register use after the prologue:
//   r0 : src read cursor
//   r1 : dst cursor for plane 0 of the current channel group
//   r2 : area (never modified)
//   r3 : channels still to be written
//   r4 : plane stride in bytes (area * 4)
//   r5, r6, r7 : dst cursors for planes 1, 2, 3 of the current group
//   r8 : positions still to be processed in the current group
//   q0-q3 : scratch (clobbered, not restored)
//
// `area` and `depth` are size_t, so every count comparison below uses the
// unsigned conditions (bls / bhs) instead of the signed ones.

    push    {r4, r5, r6, r7, r8, lr}

    // Nothing to do when either dimension is zero. Each operand is tested on
    // its own so that a product wrapping modulo 2^32 cannot be mistaken for
    // an empty job.
    cmp     r2, #0
    beq     DownEnd
    cmp     r3, #0
    beq     DownEnd

    // Swap r0 and r1 for convenience: r0 = src, r1 = dst from here on.
    mov     r4, r0
    mov     r0, r1
    mov     r1, r4

    // r4: plane stride = area * sizeof(float)
    lsl     r4, r2, #2

// ---------------------------------------------------------------------------
// Full groups of four channels.
// ---------------------------------------------------------------------------
DownL4:
    cmp     r3, #3
    bls     DownL3                      // fewer than 4 channels left

DownL4Loop:
    add     r5, r1, r4                  // plane 1
    add     r6, r5, r4                  // plane 2
    add     r7, r6, r4                  // plane 3
    mov     r8, r2
    cmp     r8, #3
    bls     DownL4AreaRemain            // area < 4: scalar path only

DownL4AreaLoop:
    // Four positions x four channels per iteration.
    vld1.32 {q0, q1}, [r0]!
    vld1.32 {q2, q3}, [r0]!
    transpose
    sub     r8, r8, #4
    vst1.32 {q0}, [r1]!
    vst1.32 {q1}, [r5]!
    vst1.32 {q2}, [r6]!
    vst1.32 {q3}, [r7]!
    cmp     r8, #4
    bhs     DownL4AreaLoop

DownL4AreaRemain:
    cmp     r8, #0
    beq     DownL4AreaRemainEnd

DownL4AreaRemainLoop:
    // One position per iteration: scatter the four lanes to the four planes.
    vld1.32 {q0}, [r0]!
    vst1.32 {d0[0]}, [r1]!
    vst1.32 {d0[1]}, [r5]!
    vst1.32 {d1[0]}, [r6]!
    vst1.32 {d1[1]}, [r7]!
    subs    r8, r8, #1
    bne     DownL4AreaRemainLoop

DownL4AreaRemainEnd:
    sub     r3, r3, #4
    // r7 has advanced by exactly one plane, so it now points at the first
    // plane of the next channel group.
    mov     r1, r7
    cmp     r3, #4
    bhs     DownL4Loop

// ---------------------------------------------------------------------------
// Tail group with three channels: lane 3 of every source vector is dropped.
// ---------------------------------------------------------------------------
DownL3:
    cmp     r3, #2
    bls     DownL2
    add     r5, r1, r4
    add     r6, r5, r4
    mov     r8, r2
    cmp     r8, #3
    bls     DownL3AreaRemain            // r8 is 1..3 here (area != 0)

DownL3AreaLoop:
    vld1.32 {q0, q1}, [r0]!
    vld1.32 {q2, q3}, [r0]!
    transpose
    sub     r8, r8, #4
    vst1.32 {q0}, [r1]!
    vst1.32 {q1}, [r5]!
    vst1.32 {q2}, [r6]!
    cmp     r8, #4
    bhs     DownL3AreaLoop

    cmp     r8, #0
    beq     DownL3AreaRemainEnd

DownL3AreaRemain:
    vld1.32 {q0}, [r0]!
    vst1.32 {d0[0]}, [r1]!
    vst1.32 {d0[1]}, [r5]!
    vst1.32 {d1[0]}, [r6]!
    subs    r8, r8, #1
    bne     DownL3AreaRemain

DownL3AreaRemainEnd:
    // r3 becomes 0, so the two checks below fall through to DownEnd.
    sub     r3, r3, #3

// ---------------------------------------------------------------------------
// Tail group with two channels: lanes 2 and 3 are dropped.
// ---------------------------------------------------------------------------
DownL2:
    cmp     r3, #1
    bls     DownL1
    add     r5, r1, r4
    mov     r8, r2
    cmp     r8, #3
    bls     DownL2AreaRemain            // r8 is 1..3 here (area != 0)

DownL2AreaLoop:
    vld1.32 {q0, q1}, [r0]!
    vld1.32 {q2, q3}, [r0]!
    transpose
    vst1.32 {q0}, [r1]!
    vst1.32 {q1}, [r5]!
    sub     r8, r8, #4
    cmp     r8, #4
    bhs     DownL2AreaLoop

    cmp     r8, #0
    beq     DownL2AreaRemainEnd

DownL2AreaRemain:
    vld1.32 {q0}, [r0]!
    vst1.32 {d0[0]}, [r1]!
    vst1.32 {d0[1]}, [r5]!
    subs    r8, r8, #1
    bne     DownL2AreaRemain

DownL2AreaRemainEnd:
    // r3 becomes 0, so the check below falls through to DownEnd.
    sub     r3, r3, #2

// ---------------------------------------------------------------------------
// Tail group with a single channel: only lane 0 is kept.
// ---------------------------------------------------------------------------
DownL1:
    cmp     r3, #0
    beq     DownEnd
    mov     r8, r2
    cmp     r8, #3
    bls     DownL1AreaRemain            // r8 is 1..3 here (area != 0)

DownL1AreaLoop:
    vld1.32 {q0, q1}, [r0]!
    vld1.32 {q2, q3}, [r0]!
    transpose
    vst1.32 {q0}, [r1]!
    sub     r8, r8, #4
    cmp     r8, #4
    bhs     DownL1AreaLoop

    cmp     r8, #0
    beq     DownL1AreaRemainEnd

DownL1AreaRemain:
    vld1.32 {q0}, [r0]!
    vst1.32 {d0[0]}, [r1]!
    subs    r8, r8, #1
    bne     DownL1AreaRemain

DownL1AreaRemainEnd:

DownEnd:
    // Restore the saved core registers and return by popping the saved lr
    // straight into pc.
    pop     {r4, r5, r6, r7, r8, pc}
185
186
187
188#endif
189#endif
190