// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// Max pooling with a 3x3 kernel and stride 2 over pack4 data (each logical
// element is a float32x4 lane group, so one "column" below = 4 floats).
//
// bottom_blob: input blob; elempack is presumably 4 — confirm at call sites.
// top_blob:    output blob, already allocated to outw x outh x inch by caller.
// opt:         per-call options; only opt.num_threads is used (OpenMP).
//
// NOTE(review): assumes the caller padded/sized bottom_blob so that every 3x3
// window is in bounds — the asm reads 9 consecutive pack4 columns per group
// of 4 outputs (adjacent stride-2 windows share their edge column). Verify
// against the padding logic in the calling Pooling layer.
static void pooling3x3s2_max_pack4_neon(const Mat& bottom_blob, Mat& top_blob, const Option& opt)
{
    int w = bottom_blob.w;
    int inch = bottom_blob.c;

    int outw = top_blob.w;
    int outh = top_blob.h;

    // After one output row the input row pointers have advanced 2*outw pack4
    // columns; skip the remainder of that input row plus one full extra row
    // (stride 2 moves the window down two rows). The *4 converts pack4
    // columns to float counts, since r0/r1/r2 are float pointers.
    const int tailstep = (w - 2 * outw + w) * 4;

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int q = 0; q < inch; q++)
    {
        const Mat img0 = bottom_blob.channel(q);
        float* outptr = top_blob.channel(q);

        // The three consecutive input rows feeding the current output row.
        const float* r0 = img0.row(0);
        const float* r1 = img0.row(1);
        const float* r2 = img0.row(2);

        for (int i = 0; i < outh; i++)
        {
            int j = 0;

            // Fast path: 4 outputs per iteration. For each of the three rows,
            // load columns 0..8 (the final single-vector load deliberately
            // does NOT post-increment — column 8 is re-read as column 0 of
            // the next iteration), reduce the four 3-column windows
            // (0-2, 2-4, 4-6, 6-8) per row, then max across the three rows.
            for (; j + 3 < outw; j += 4)
            {
#if __aarch64__
                asm volatile(
                    // r0: v0..v7 = columns 0..7, v8 = column 8
                    "prfm pldl1keep, [%1, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"

                    "fmax v16.4s, v0.4s, v1.4s \n"
                    "fmax v17.4s, v2.4s, v3.4s \n"

                    "prfm pldl1keep, [%1, #512] \n"
                    "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%1], #64 \n"

                    "fmax v18.4s, v4.4s, v5.4s \n"
                    "fmax v19.4s, v6.4s, v7.4s \n"

                    "ld1 {v8.4s}, [%1] \n"

                    // v20..v23 = horizontal max of r0 windows 0..3
                    "fmax v20.4s, v16.4s, v2.4s \n"
                    "fmax v21.4s, v17.4s, v4.4s \n"

                    // r1: same 9-column load/reduce into v24..v27
                    "prfm pldl1keep, [%2, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%2], #64 \n"

                    "fmax v22.4s, v18.4s, v6.4s \n"
                    "fmax v23.4s, v19.4s, v8.4s \n"

                    "prfm pldl1keep, [%2, #512] \n"
                    "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"

                    "fmax v16.4s, v0.4s, v1.4s \n"
                    "fmax v17.4s, v2.4s, v3.4s \n"

                    "fmax v18.4s, v4.4s, v5.4s \n"
                    "fmax v19.4s, v6.4s, v7.4s \n"

                    "ld1 {v8.4s}, [%2] \n"

                    "fmax v24.4s, v16.4s, v2.4s \n"
                    "fmax v25.4s, v17.4s, v4.4s \n"

                    // r2: same 9-column load/reduce into v28..v31
                    "prfm pldl1keep, [%3, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%3], #64 \n"

                    "fmax v26.4s, v18.4s, v6.4s \n"
                    "fmax v27.4s, v19.4s, v8.4s \n"

                    "prfm pldl1keep, [%3, #512] \n"
                    "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%3], #64 \n"

                    "fmax v16.4s, v0.4s, v1.4s \n"
                    "fmax v17.4s, v2.4s, v3.4s \n"

                    "fmax v18.4s, v4.4s, v5.4s \n"
                    "fmax v19.4s, v6.4s, v7.4s \n"

                    "ld1 {v8.4s}, [%3] \n"

                    "fmax v28.4s, v16.4s, v2.4s \n"
                    "fmax v29.4s, v17.4s, v4.4s \n"
                    "fmax v30.4s, v18.4s, v6.4s \n"
                    "fmax v31.4s, v19.4s, v8.4s \n"

                    // Vertical max across the three rows -> 4 pack4 outputs
                    "fmax v20.4s, v20.4s, v24.4s \n"
                    "fmax v21.4s, v21.4s, v25.4s \n"
                    "fmax v22.4s, v22.4s, v26.4s \n"
                    "fmax v23.4s, v23.4s, v27.4s \n"

                    "fmax v20.4s, v20.4s, v28.4s \n"
                    "fmax v21.4s, v21.4s, v29.4s \n"
                    "fmax v22.4s, v22.4s, v30.4s \n"
                    "fmax v23.4s, v23.4s, v31.4s \n"

                    "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31");
#else // __aarch64__
                // armv7 variant: same windows, different schedule. Vertical
                // (across-row) maxes are formed first per column, then the
                // horizontal 3-column window reduction happens at the end
                // (q12..q15 hold the 4 outputs, q8 the column-8 vertical max).
                asm volatile(
                    "pld [%1, #512] \n"
                    "vldm %1!, {d0-d7} \n"

                    "pld [%2, #512] \n"
                    "vldm %2!, {d8-d15} \n"

                    "vmax.f32 q0, q0, q4 \n"
                    "vmax.f32 q1, q1, q5 \n"

                    "pld [%3, #512] \n"
                    "vldm %3!, {d16-d23} \n"

                    "vmax.f32 q2, q2, q6 \n"
                    "vmax.f32 q3, q3, q7 \n"

                    "vmax.f32 q0, q0, q8 \n"
                    "vmax.f32 q1, q1, q9 \n"

                    "pld [%1, #512] \n"
                    "vldm %1!, {d8-d15} \n"

                    "vmax.f32 q2, q2, q10 \n"
                    "vmax.f32 q3, q3, q11 \n"

                    "pld [%2, #512] \n"
                    "vldm %2!, {d16-d23} \n"

                    "vmax.f32 q4, q4, q8 \n"
                    "vmax.f32 q5, q5, q9 \n"

                    "pld [%3, #512] \n"
                    "vldm %3!, {d24-d31} \n"

                    "vmax.f32 q6, q6, q10 \n"
                    "vmax.f32 q7, q7, q11 \n"

                    "vmax.f32 q4, q4, q12 \n"
                    "vmax.f32 q5, q5, q13 \n"

                    // Column 8 of each row (no post-increment: shared with
                    // the next iteration's column 0)
                    "vld1.f32 {d24-d25}, [%1 :128] \n"
                    "vld1.f32 {d26-d27}, [%2 :128] \n"

                    "vmax.f32 q6, q6, q14 \n"
                    "vmax.f32 q7, q7, q15 \n"

                    "vld1.f32 {d28-d29}, [%3 :128] \n"

                    "vmax.f32 q8, q12, q13 \n"
                    "vmax.f32 q8, q8, q14 \n"

                    // q0..q7 now hold vertical maxes of columns 0..7;
                    // combine into the four overlapping 3-column windows
                    "vmax.f32 q12, q0, q1 \n"
                    "vmax.f32 q13, q2, q3 \n"
                    "vmax.f32 q14, q4, q5 \n"
                    "vmax.f32 q15, q6, q7 \n"

                    "vmax.f32 q12, q12, q2 \n"
                    "vmax.f32 q13, q13, q4 \n"
                    "vmax.f32 q14, q14, q6 \n"
                    "vmax.f32 q15, q15, q8 \n"

                    "vstm %0!, {d24-d31} \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            // 2 outputs per iteration: loads columns 0..4 of each row
            // (again, the column-4 load is not post-incremented).
            for (; j + 1 < outw; j += 2)
            {
#if __aarch64__
                asm volatile(
                    "prfm pldl1keep, [%1, #512] \n"
                    "ld1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%1], #64 \n"

                    "prfm pldl1keep, [%2, #512] \n"
                    "ld1 {v4.4s, v5.4s, v6.4s, v7.4s}, [%2], #64 \n"

                    "fmax v16.4s, v0.4s, v4.4s \n"
                    "fmax v17.4s, v1.4s, v5.4s \n"

                    "prfm pldl1keep, [%3, #512] \n"
                    "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%3], #64 \n"

                    "fmax v18.4s, v2.4s, v6.4s \n"
                    "fmax v19.4s, v3.4s, v7.4s \n"

                    "ld1 {v0.4s}, [%1] \n"

                    // v16..v19 = vertical maxes of columns 0..3
                    "fmax v16.4s, v16.4s, v20.4s \n"
                    "fmax v17.4s, v17.4s, v21.4s \n"

                    "ld1 {v1.4s}, [%2] \n"

                    "fmax v18.4s, v18.4s, v22.4s \n"
                    "fmax v19.4s, v19.4s, v23.4s \n"

                    "ld1 {v2.4s}, [%3] \n"

                    // v3 = vertical max of column 4
                    "fmax v3.4s, v0.4s, v1.4s \n"

                    "fmax v20.4s, v16.4s, v17.4s \n"
                    "fmax v21.4s, v18.4s, v19.4s \n"

                    "fmax v3.4s, v3.4s, v2.4s \n"

                    // out0 = max(cols 0,1,2), out1 = max(cols 2,3,4)
                    "fmax v20.4s, v20.4s, v18.4s \n"
                    "fmax v21.4s, v21.4s, v3.4s \n"

                    "st1 {v20.4s, v21.4s}, [%0], #32 \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else // __aarch64__
                asm volatile(
                    "pld [%1, #512] \n"
                    "vldm %1!, {d0-d7} \n"

                    "pld [%2, #512] \n"
                    "vldm %2!, {d8-d15} \n"

                    "vmax.f32 q12, q0, q4 \n"
                    "vmax.f32 q13, q1, q5 \n"

                    "pld [%3, #512] \n"
                    "vldm %3!, {d16-d23} \n"

                    "vmax.f32 q14, q2, q6 \n"
                    "vmax.f32 q15, q3, q7 \n"

                    "vld1.f32 {d0-d1}, [%1 :128] \n"

                    // q12..q15 = vertical maxes of columns 0..3
                    "vmax.f32 q12, q12, q8 \n"
                    "vmax.f32 q13, q13, q9 \n"

                    "vld1.f32 {d2-d3}, [%2 :128] \n"

                    "vmax.f32 q14, q14, q10 \n"
                    "vmax.f32 q15, q15, q11 \n"

                    "vld1.f32 {d4-d5}, [%3 :128] \n"

                    // q3 = vertical max of column 4
                    "vmax.f32 q3, q0, q1 \n"

                    "vmax.f32 q4, q12, q13 \n"
                    "vmax.f32 q5, q14, q15 \n"

                    "vmax.f32 q3, q3, q2 \n"

                    // q4 = max(cols 0,1,2), q5 = max(cols 2,3,4)
                    "vmax.f32 q4, q4, q14 \n"
                    "vmax.f32 q5, q5, q3 \n"

                    "vst1.f32 {d8-d11}, [%0 :128]! \n"

                    : "=r"(outptr), // %0
                    "=r"(r0),     // %1
                    "=r"(r1),     // %2
                    "=r"(r2)      // %3
                    : "0"(outptr),
                    "1"(r0),
                    "2"(r1),
                    "3"(r2)
                    : "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
            }
            // Scalar (intrinsics) tail: one output at a time — max of the
            // 3x3 pack4 window, then advance 2 columns (8 floats) per row.
            for (; j < outw; j++)
            {
                float32x4_t _r00 = vld1q_f32(r0);
                float32x4_t _r01 = vld1q_f32(r0 + 4);
                float32x4_t _r02 = vld1q_f32(r0 + 8);
                float32x4_t _r10 = vld1q_f32(r1);
                float32x4_t _r11 = vld1q_f32(r1 + 4);
                float32x4_t _r12 = vld1q_f32(r1 + 8);
                float32x4_t _r20 = vld1q_f32(r2);
                float32x4_t _r21 = vld1q_f32(r2 + 4);
                float32x4_t _r22 = vld1q_f32(r2 + 8);

                float32x4_t _max0 = vmaxq_f32(vmaxq_f32(_r00, _r01), _r02);
                float32x4_t _max1 = vmaxq_f32(vmaxq_f32(_r10, _r11), _r12);
                float32x4_t _max2 = vmaxq_f32(vmaxq_f32(_r20, _r21), _r22);

                float32x4_t _max = vmaxq_f32(vmaxq_f32(_max0, _max1), _max2);

                vst1q_f32(outptr, _max);

                r0 += 8;
                r1 += 8;
                r2 += 8;
                outptr += 4;
            }

            // Hop the row pointers down to the next stride-2 row triple.
            r0 += tailstep;
            r1 += tailstep;
            r2 += tailstep;
        }
    }
}
334