1//
2//  MNNPackC4_BF16.S
3//  MNN
4//
5//  Created by MNN on 2021/02/26.
6//  Copyright © 2018-2021 Alibaba Group Holding Limited
7//
8
9
10
11
12#ifdef __arm__
13#ifndef __aarch64__
14
15#include "MNNAsmGlobal.h"
16.text
17.align 5
18
19// .macro transpose
20// vtrn.16 d0, d1
21// vtrn.16 d2, d3
// vswp d0[2-3], d1[2-3] // pseudo-code: the upper 32-bit halves of the two d registers would have to be swapped here; vswp has no such form, so vst4.16 is used to interleave on store instead
23// vswp d2[2-3], d3[2-3]
24// .endm
25
asm_function MNNPackC4_BF16
// void MNNPackC4_BF16(float* dst, const float* src, size_t area, size_t depth)
//
// Converts a planar buffer into a 4-channel interleaved one, working on
// 16-bit elements (the float pointers are treated as int16_t*).
// src holds `depth` planes of `area` elements each. Planes are consumed four
// at a time; for every position i the values plane0[i], plane1[i], plane2[i],
// plane3[i] are written next to each other in dst. When depth is not a
// multiple of 4, the missing channels of the last group are written as zero.
//
// Auto load:
//   r0: dst, r1: src, r2: area, r3: depth
// Register roles inside the function:
//   r0      write cursor, advanced by every store
//   r1      read cursor of plane 0 of the current group
//   r5-r7   read cursors of planes 1..3 of the current group
//   r3      number of planes still to be packed
//   r4      distance between two planes in bytes (area * 2)
//   r8      positions still to be packed in the current group
//   d0-d3   data / zero-fill scratch
// r4-r8 are saved and restored; r0-r3, d0-d3 and the flags are clobbered.

push {r4, r5, r6, r7, r8, lr}

// Nothing to do when either dimension is zero. Each operand is tested on its
// own, which needs no multiply and cannot be fooled by area * depth wrapping
// around to zero.
cmp r2, #0
beq UpEnd
cmp r3, #0
beq UpEnd

// r4: src plane stride in bytes = area * sizeof(int16_t)
lsl r4, r2, #1

UpL4:
cmp r3, #3
ble UpL3

// ---- full groups of four planes ----
UpL4Loop:
add r5, r1, r4 // plane 1
add r6, r4, r5 // plane 2
add r7, r4, r6 // plane 3
mov r8, r2
cmp r8, #3
ble UpL4AreaRemain
UpL4AreaLoop:
vld1.16 {d0}, [r1]! // four consecutive 16-bit elements of each plane
vld1.16 {d1}, [r5]!
vld1.16 {d2}, [r6]!
vld1.16 {d3}, [r7]!
// vst4 interleaves lane-wise (d0[0], d1[0], d2[0], d3[0], d0[1], ...),
// so the 4x4 transpose happens as part of the store.
vst4.16 {d0, d1, d2, d3}, [r0]!
sub r8, r8, #4
cmp r8, #4
bge UpL4AreaLoop

UpL4AreaRemain:
cmp r8, #0
beq UpL4AreaRemainEnd
UpL4AreaRemainLoop:
// one position at a time: gather one element per plane into d0
vld1.16 {d0[0]}, [r1]!
vld1.16 {d0[1]}, [r5]!
vld1.16 {d0[2]}, [r6]!
vld1.16 {d0[3]}, [r7]!

vst1.16 {d0}, [r0]!

subs r8, r8, #1
bne UpL4AreaRemainLoop
UpL4AreaRemainEnd:
sub r3, r3, #4
mov r1, r7 // r7 walked through plane 3, so it now points at the next group
cmp r3, #4
bge UpL4Loop

// ---- tail: exactly three planes left, channel 3 is zero ----
UpL3:
cmp r3, #2
ble UpL2
add r5, r1, r4
add r6, r4, r5
mov r8, r2
cmp r8, #3
ble UpL3AreaRemain
vmov.i16 d3, #0 // never overwritten by the loads below, so zero it once
UpL3AreaLoop:
vld1.16 {d0}, [r1]!
vld1.16 {d1}, [r5]!
vld1.16 {d2}, [r6]!
vst4.16 {d0, d1, d2, d3}, [r0]!
sub r8, r8, #4
cmp r8, #4
bge UpL3AreaLoop

cmp r8, #0
beq UpL3AreaRemainEnd
UpL3AreaRemain:
vmov.i16 d0, #0 // lane 3 is never loaded and therefore stays zero
UpL3AreaRemainLoop:
vld1.16 {d0[0]}, [r1]!
vld1.16 {d0[1]}, [r5]!
vld1.16 {d0[2]}, [r6]!

vst1.16 {d0}, [r0]!

subs r8, r8, #1
bne UpL3AreaRemainLoop

UpL3AreaRemainEnd:
// r3 was exactly 3 on this path, so no planes are left
b UpEnd

// ---- tail: exactly two planes left, channels 2 and 3 are zero ----
UpL2:
cmp r3, #1
ble UpL1
add r5, r1, r4
mov r8, r2
cmp r8, #3
ble UpL2AreaRemain
vmov.i16 d2, #0 // zero-fill registers are loop invariant
vmov.i16 d3, #0
UpL2AreaLoop:
vld1.16 {d0}, [r1]!
vld1.16 {d1}, [r5]!
vst4.16 {d0, d1, d2, d3}, [r0]!
sub r8, r8, #4
cmp r8, #4
bge UpL2AreaLoop

cmp r8, #0
beq UpL2AreaRemainEnd
UpL2AreaRemain:
vmov.i16 d0, #0 // lanes 2 and 3 are never loaded and therefore stay zero
UpL2AreaRemainLoop:
vld1.16 {d0[0]}, [r1]!
vld1.16 {d0[1]}, [r5]!

vst1.16 {d0}, [r0]!

subs r8, r8, #1
bne UpL2AreaRemainLoop

UpL2AreaRemainEnd:
// r3 was exactly 2 on this path, so no planes are left
b UpEnd

// ---- tail: one plane left, channels 1..3 are zero ----
UpL1:
cmp r3, #0
beq UpEnd
mov r8, r2
cmp r8, #3
ble UpL1AreaRemain
vmov.i16 d1, #0 // zero-fill registers are loop invariant
vmov.i16 d2, #0
vmov.i16 d3, #0
UpL1AreaLoop:
vld1.16 {d0}, [r1]!
vst4.16 {d0, d1, d2, d3}, [r0]!
sub r8, r8, #4
cmp r8, #4
bge UpL1AreaLoop

cmp r8, #0
beq UpL1AreaRemainEnd
UpL1AreaRemain:
vmov.i16 d0, #0 // lanes 1..3 are never loaded and therefore stay zero
UpL1AreaRemainLoop:
vld1.16 {d0[0]}, [r1]!

vst1.16 {d0}, [r0]!

subs r8, r8, #1
bne UpL1AreaRemainLoop

UpL1AreaRemainEnd:

UpEnd:

pop {r4, r5, r6, r7, r8, pc}
186
187#endif
188#endif
189