// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

// Copy src into dst at offset (top, left) and fill the top/bottom/left/right
// borders with the constant pack4 value v.
//
// Layout is pack4: one "element" is 4 floats (16 bytes) and rows are stored
// contiguously. dst is expected to be pre-allocated with
// dst.w == left + src.w + right (top_size/bottom_size below rely on dst.w),
// and all border sizes are measured in pack4 elements.
static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float32x4_t v)
{
    const float* ptr = src; // read cursor (implicit Mat -> const float* conversion)
    float* outptr = dst;    // write cursor

    int w = src.w;
    int h = src.h;

    // total pack4-element counts of the full-width top and bottom border bands
    int top_size = top * dst.w;
    int bottom_size = bottom * dst.w;

    // The asm kernels broadcast v into four vector registers (v0-v3 / q0-q3)
    // so border fill can use 64-byte stores; each fill/copy loop is unrolled
    // 8 pack4 elements per iteration with 4 / 2 / 1 remainder steps.
    //
    // NOTE(review): %5 (h) is decremented in place, although it is declared as
    // a plain "r" input operand (outptr/ptr are covered by the "0"/"1" ties).
    // This is the long-standing ncnn inline-asm pattern — confirm against the
    // project's supported compilers before changing the constraints.
#if __aarch64__
    asm volatile(
        "mov v0.16b, %10.16b \n"
        "mov v1.16b, %10.16b \n"
        "mov v2.16b, %10.16b \n"
        "mov v3.16b, %10.16b \n"

        // fill top
        "lsr w4, %w8, #3 \n" // w4 = nn = top_size >> 3
        "cmp w4, #0 \n"
        "beq 1f \n"

        "0: \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "subs w4, w4, #1 \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "bne 0b \n"

        "1: \n"

        // fill top remain
        "and w4, %w8, #7 \n" // w4 = remain = top_size & 7

        "cmp w4, #4 \n" // w4 >= 4
        "blt 2f \n"
        "sub w4, w4, #4 \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "2: \n"

        "cmp w4, #2 \n" // w4 >= 2
        "blt 3f \n"
        "sub w4, w4, #2 \n"
        "st1 {v0.4s, v1.4s}, [%0], #32 \n"
        "3: \n"

        "cmp w4, #0 \n" // w4 > 0
        "beq 4f \n"
        "st1 {v0.4s}, [%0], #16 \n"
        "4: \n"

        // fill center h loop
        "cmp %w5, #0 \n"
        "beq 15f \n"
        "5: \n"

        // fill left
        "mov w4, %w6 \n" // w4 = left
        "cmp w4, #0 \n"
        "beq 7f \n"

        "6: \n"
        "st1 {v0.4s}, [%0], #16 \n"
        "subs w4, w4, #1 \n"
        "bne 6b \n"

        "7: \n"

        // fill middle (copy one source row, 8 elements at a time)
        "lsr w4, %w4, #3 \n" // w4 = nn = w >> 3
        "cmp w4, #0 \n"
        "beq 9f \n"

        "8: \n"
        "prfm pldl1keep, [%1, #512] \n"
        "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
        "prfm pldl1keep, [%1, #512] \n"
        "ld1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
        "subs w4, w4, #1 \n"
        "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
        "st1 {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
        "bne 8b \n"

        "9: \n"

        "and w4, %w4, #7 \n" // w4 = remain = w & 7

        "cmp w4, #4 \n" // w4 >= 4
        "blt 10f \n"
        "prfm pldl1keep, [%1, #512] \n"
        "ld1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
        "sub w4, w4, #4 \n"
        "st1 {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
        "10: \n"

        "cmp w4, #2 \n" // w4 >= 2
        "blt 11f \n"
        "prfm pldl1keep, [%1, #256] \n"
        "ld1 {v16.4s, v17.4s}, [%1], #32 \n"
        "sub w4, w4, #2 \n"
        "st1 {v16.4s, v17.4s}, [%0], #32 \n"
        "11: \n"

        "cmp w4, #0 \n" // w4 > 0
        "beq 12f \n"
        "prfm pldl1keep, [%1, #128] \n"
        "ld1 {v16.4s}, [%1], #16 \n"
        "st1 {v16.4s}, [%0], #16 \n"
        "12: \n"

        // fill right
        "mov w4, %w7 \n" // w4 = right
        "cmp w4, #0 \n"
        "beq 14f \n"

        "13: \n"
        "subs w4, w4, #1 \n"
        "st1 {v0.4s}, [%0], #16 \n"
        "bne 13b \n"
        "14: \n"

        "subs %w5, %w5, #1 \n"
        "bne 5b \n"

        "15: \n"

        // fill bottom
        "lsr w4, %w9, #3 \n" // w4 = nn = bottom_size >> 3
        "cmp w4, #0 \n"
        "beq 17f \n"

        "16: \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "subs w4, w4, #1 \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "bne 16b \n"
        "17: \n"

        // fill bottom remain
        "and w4, %w9, #7 \n" // w4 = remain = bottom_size & 7

        "cmp w4, #4 \n" // w4 >= 4
        "blt 18f \n"
        "sub w4, w4, #4 \n"
        "st1 {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
        "18: \n"

        "cmp w4, #2 \n" // w4 >= 2
        "blt 19f \n"
        "sub w4, w4, #2 \n"
        "st1 {v0.4s, v1.4s}, [%0], #32 \n"
        "19: \n"

        "cmp w4, #0 \n" // w4 > 0
        "beq 20f \n"
        "st1 {v0.4s}, [%0], #16 \n"
        "20: \n"

        : "=r"(outptr), // %0
        "=r"(ptr)       // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
#else  // __aarch64__
    // armv7 variant: identical structure, q0-q3 hold the border value,
    // q8-q15 stage the row copies.
    asm volatile(
        "vmov q0, %q10 \n"
        "vmov q1, %q10 \n"
        "vmov q2, %q10 \n"
        "vmov q3, %q10 \n"

        // fill top
        "lsr r4, %8, #3 \n" // r4 = nn = top_size >> 3
        "cmp r4, #0 \n"
        "beq 1f \n"

        "0: \n"
        "vstm %0!, {d0-d7} \n"
        "subs r4, r4, #1 \n"
        "vstm %0!, {d0-d7} \n"
        "bne 0b \n"

        "1: \n"

        // fill top remain
        "and r4, %8, #7 \n" // r4 = remain = top_size & 7

        "cmp r4, #4 \n" // r4 >= 4
        "blt 2f \n"
        "sub r4, r4, #4 \n"
        "vstm %0!, {d0-d7} \n"
        "2: \n"

        "cmp r4, #2 \n" // r4 >= 2
        "blt 3f \n"
        "sub r4, r4, #2 \n"
        "vst1.f32 {d0-d3}, [%0 :128]! \n"
        "3: \n"

        "cmp r4, #0 \n" // r4 > 0
        "beq 4f \n"
        "vst1.f32 {d0-d1}, [%0 :128]! \n"
        "4: \n"

        // fill center h loop
        "cmp %5, #0 \n"
        "beq 15f \n"
        "5: \n"

        // fill left
        "mov r4, %6 \n" // r4 = left
        "cmp r4, #0 \n"
        "beq 7f \n"

        "6: \n"
        "vst1.f32 {d0-d1}, [%0 :128]! \n"
        "subs r4, r4, #1 \n"
        "bne 6b \n"

        "7: \n"

        // fill middle (copy one source row, 8 elements at a time)
        "lsr r4, %4, #3 \n" // r4 = nn = w >> 3
        "cmp r4, #0 \n"
        "beq 9f \n"

        "8: \n"
        "pld [%1, #512] \n"
        "vldm %1!, {d16-d23} \n"
        "pld [%1, #512] \n"
        "vldm %1!, {d24-d31} \n"
        "subs r4, r4, #1 \n"
        "vstm %0!, {d16-d23} \n"
        "vstm %0!, {d24-d31} \n"
        "bne 8b \n"

        "9: \n"

        "and r4, %4, #7 \n" // r4 = remain = w & 7

        "cmp r4, #4 \n" // r4 >= 4
        "blt 10f \n"
        "pld [%1, #512] \n"
        "vldm %1!, {d16-d23} \n"
        "sub r4, r4, #4 \n"
        "vstm %0!, {d16-d23} \n"
        "10: \n"

        "cmp r4, #2 \n" // r4 >= 2
        "blt 11f \n"
        "pld [%1, #256] \n"
        "vld1.f32 {d16-d19}, [%1 :128]! \n"
        "sub r4, r4, #2 \n"
        "vst1.f32 {d16-d19}, [%0 :128]! \n"
        "11: \n"

        "cmp r4, #0 \n" // r4 > 0
        "beq 12f \n"
        "pld [%1, #128] \n"
        "vld1.f32 {d16-d17}, [%1 :128]! \n"
        "vst1.f32 {d16-d17}, [%0 :128]! \n"
        "12: \n"

        // fill right
        "mov r4, %7 \n" // r4 = right
        "cmp r4, #0 \n"
        "beq 14f \n"

        "13: \n"
        "subs r4, r4, #1 \n"
        "vst1.f32 {d0-d1}, [%0 :128]! \n"
        "bne 13b \n"
        "14: \n"

        "subs %5, %5, #1 \n"
        "bne 5b \n"

        "15: \n"

        // fill bottom
        "lsr r4, %9, #3 \n" // r4 = nn = bottom_size >> 3
        "cmp r4, #0 \n"
        "beq 17f \n"

        "16: \n"
        "vstm %0!, {d0-d7} \n"
        "subs r4, r4, #1 \n"
        "vstm %0!, {d0-d7} \n"
        "bne 16b \n"
        "17: \n"

        // fill bottom remain
        "and r4, %9, #7 \n" // r4 = remain = bottom_size & 7

        "cmp r4, #4 \n" // r4 >= 4
        "blt 18f \n"
        "sub r4, r4, #4 \n"
        "vstm %0!, {d0-d7} \n"
        "18: \n"

        "cmp r4, #2 \n" // r4 >= 2
        "blt 19f \n"
        "sub r4, r4, #2 \n"
        "vst1.f32 {d0-d3}, [%0 :128]! \n"
        "19: \n"

        "cmp r4, #0 \n" // r4 > 0
        "beq 20f \n"
        "vst1.f32 {d0-d1}, [%0 :128]! \n"
        "20: \n"

        : "=r"(outptr), // %0
        "=r"(ptr)       // %1
        : "0"(outptr),
        "1"(ptr),
        "r"(w),           // %4
        "r"(h),           // %5
        "r"(left),        // %6
        "r"(right),       // %7
        "r"(top_size),    // %8
        "r"(bottom_size), // %9
        "w"(v)            // %10
        : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
#endif // __aarch64__
}
346
// Pad src into dst by replicating the nearest edge element (pack4 layout:
// one element is 4 floats). Borders are measured in pack4 elements and dst
// must be pre-allocated as (left + src.w + right) x (top + src.h + bottom).
static void padding_replicate_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const float* p = src;
    float* out = dst;

    // top border: every padded row is a copy of the first source row,
    // with its first/last element extended into the corners
    for (int y = 0; y < top; y++)
    {
        const float* row = p;
        float32x4_t _v = vld1q_f32(row);
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, _v);
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _v = vld1q_f32(row);
            vst1q_f32(out, _v);
            row += 4;
            out += 4;
        }
        for (int x = right; x > 0; x--)
        {
            vst1q_f32(out, _v); // _v still holds the last row element
            out += 4;
        }
    }
    // body rows: copy the row, extending its edge elements sideways
    for (int y = 0; y < src.h; y++)
    {
        float32x4_t _v = vld1q_f32(p);
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, _v);
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _v = vld1q_f32(p);
            vst1q_f32(out, _v);
            p += 4;
            out += 4;
        }
        for (int x = right; x > 0; x--)
        {
            vst1q_f32(out, _v);
            out += 4;
        }
    }
    // bottom border: rewind p to the start of the last source row and
    // replicate that row for every padded line
    p -= src.w * 4;
    for (int y = 0; y < bottom; y++)
    {
        const float* row = p;
        float32x4_t _v = vld1q_f32(row);
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, _v);
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            _v = vld1q_f32(row);
            vst1q_f32(out, _v);
            row += 4;
            out += 4;
        }
        for (int x = right; x > 0; x--)
        {
            vst1q_f32(out, _v);
            out += 4;
        }
    }
}
422
// Pad src into dst with 101-style reflection (the edge element itself is not
// repeated), pack4 layout: one element is 4 floats. Assumes top <= src.h - 1,
// left/right <= src.w - 1 so every mirrored index stays inside src.
static void padding_reflect_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
{
    const float* p = src;
    float* out = dst;

    // top border: emit source rows top, top-1, ..., 1 (mirror about row 0)
    p += top * src.w * 4;
    for (int y = 0; y < top; y++)
    {
        const float* row = p;
        // left border mirrors columns left, left-1, ..., 1
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, vld1q_f32(row + x * 4));
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            vst1q_f32(out, vld1q_f32(row));
            row += 4;
            out += 4;
        }
        // row now points one past the last element; mirror columns w-2, w-3, ...
        for (int x = 0; x < right; x++)
        {
            vst1q_f32(out, vld1q_f32(row - (x + 2) * 4));
            out += 4;
        }
        p -= src.w * 4; // walk upward through the mirrored rows
    }
    // body rows: copy each row, mirroring into the left/right borders
    for (int y = 0; y < src.h; y++)
    {
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, vld1q_f32(p + x * 4));
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            vst1q_f32(out, vld1q_f32(p));
            p += 4;
            out += 4;
        }
        for (int x = 0; x < right; x++)
        {
            vst1q_f32(out, vld1q_f32(p - (x + 2) * 4));
            out += 4;
        }
    }
    // bottom border: emit rows h-2, h-3, ... (mirror about the last row);
    // p sits one row past the end, so step back two rows first
    p -= 2 * src.w * 4;
    for (int y = 0; y < bottom; y++)
    {
        const float* row = p;
        for (int x = left; x > 0; x--)
        {
            vst1q_f32(out, vld1q_f32(row + x * 4));
            out += 4;
        }
        for (int x = 0; x < src.w; x++)
        {
            vst1q_f32(out, vld1q_f32(row));
            row += 4;
            out += 4;
        }
        for (int x = 0; x < right; x++)
        {
            vst1q_f32(out, vld1q_f32(row - (x + 2) * 4));
            out += 4;
        }
        p -= src.w * 4;
    }
}
504