// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
14 
padding_constant_pack4_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right,float32x4_t v)15 static void padding_constant_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, float32x4_t v)
16 {
17     const float* ptr = src;
18     float* outptr = dst;
19 
20     int w = src.w;
21     int h = src.h;
22 
23     int top_size = top * dst.w;
24     int bottom_size = bottom * dst.w;
25 
26 #if __aarch64__
27     asm volatile(
28         "mov    v0.16b, %10.16b         \n"
29         "mov    v1.16b, %10.16b         \n"
30         "mov    v2.16b, %10.16b         \n"
31         "mov    v3.16b, %10.16b         \n"
32 
33         // fill top
34         "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
35         "cmp    w4, #0                  \n"
36         "beq    1f                      \n"
37 
38         "0:                             \n"
39         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
40         "subs   w4, w4, #1              \n"
41         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
42         "bne    0b                      \n"
43 
44         "1:                             \n"
45 
46         // fill top remain
47         "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7
48 
49         "cmp    w4, #4                  \n" // w4 >= 4
50         "blt    2f                      \n"
51         "sub    w4, w4, #4              \n"
52         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
53         "2:                             \n"
54 
55         "cmp    w4, #2                  \n" // w4 >= 2
56         "blt    3f                      \n"
57         "sub    w4, w4, #2              \n"
58         "st1    {v0.4s, v1.4s}, [%0], #32 \n"
59         "3:                             \n"
60 
61         "cmp    w4, #0                  \n" // w4 > 0
62         "beq    4f                      \n"
63         "st1    {v0.4s}, [%0], #16      \n"
64         "4:                             \n"
65 
66         // fill center h loop
67         "cmp    %w5, #0                 \n"
68         "beq    15f                     \n"
69         "5:                             \n"
70 
71         // fill left
72         "mov    w4, %w6                 \n" // w4 = left
73         "cmp    w4, #0                  \n"
74         "beq    7f                      \n"
75 
76         "6:                             \n"
77         "st1    {v0.4s}, [%0], #16      \n"
78         "subs   w4, w4, #1              \n"
79         "bne    6b                      \n"
80 
81         "7:                             \n"
82 
83         // fill middle
84         "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
85         "cmp    w4, #0                  \n"
86         "beq    9f                      \n"
87 
88         "8:                             \n"
89         "prfm   pldl1keep, [%1, #512]   \n"
90         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
91         "prfm   pldl1keep, [%1, #512]   \n"
92         "ld1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%1], #64 \n"
93         "subs   w4, w4, #1              \n"
94         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
95         "st1    {v20.4s, v21.4s, v22.4s, v23.4s}, [%0], #64 \n"
96         "bne    8b                      \n"
97 
98         "9:                             \n"
99 
100         "and    w4, %w4, #7             \n" // w4 = remain = w & 7
101 
102         "cmp    w4, #4                  \n" // w4 >= 4
103         "blt    10f                     \n"
104         "prfm   pldl1keep, [%1, #512]   \n"
105         "ld1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%1], #64 \n"
106         "sub    w4, w4, #4              \n"
107         "st1    {v16.4s, v17.4s, v18.4s, v19.4s}, [%0], #64 \n"
108         "10:                            \n"
109 
110         "cmp    w4, #2                  \n" // w4 >= 2
111         "blt    11f                     \n"
112         "prfm   pldl1keep, [%1, #256]   \n"
113         "ld1    {v16.4s, v17.4s}, [%1], #32 \n"
114         "sub    w4, w4, #2              \n"
115         "st1    {v16.4s, v17.4s}, [%0], #32 \n"
116         "11:                            \n"
117 
118         "cmp    w4, #0                  \n" // w4 > 0
119         "beq    12f                     \n"
120         "prfm   pldl1keep, [%1, #128]   \n"
121         "ld1    {v16.4s}, [%1], #16     \n"
122         "st1    {v16.4s}, [%0], #16     \n"
123         "12:                            \n"
124 
125         // fill right
126         "mov    w4, %w7                 \n" // w4 = right
127         "cmp    w4, #0                  \n"
128         "beq    14f                     \n"
129 
130         "13:                            \n"
131         "subs   w4, w4, #1              \n"
132         "st1    {v0.4s}, [%0], #16      \n"
133         "bne    13b                     \n"
134         "14:                            \n"
135 
136         "subs   %w5, %w5, #1            \n"
137         "bne    5b                      \n"
138 
139         "15:                            \n"
140 
141         // fill bottom
142         "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
143         "cmp    w4, #0                  \n"
144         "beq    17f                     \n"
145 
146         "16:                            \n"
147         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
148         "subs   w4, w4, #1              \n"
149         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
150         "bne    16b                     \n"
151         "17:                            \n"
152 
153         // fill bottom remain
154         "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7
155 
156         "cmp    w4, #4                  \n" // w4 >= 4
157         "blt    18f                     \n"
158         "sub    w4, w4, #4              \n"
159         "st1    {v0.4s, v1.4s, v2.4s, v3.4s}, [%0], #64 \n"
160         "18:                            \n"
161 
162         "cmp    w4, #2                  \n" // w4 >= 2
163         "blt    19f                     \n"
164         "sub    w4, w4, #2              \n"
165         "st1    {v0.4s, v1.4s}, [%0], #32 \n"
166         "19:                            \n"
167 
168         "cmp    w4, #0                  \n" // w4 > 0
169         "beq    20f                     \n"
170         "st1    {v0.4s}, [%0], #16      \n"
171         "20:                            \n"
172 
173         : "=r"(outptr), // %0
174         "=r"(ptr)     // %1
175         : "0"(outptr),
176         "1"(ptr),
177         "r"(w),           // %4
178         "r"(h),           // %5
179         "r"(left),        // %6
180         "r"(right),       // %7
181         "r"(top_size),    // %8
182         "r"(bottom_size), // %9
183         "w"(v)            // %10
184         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23");
185 #else  // __aarch64__
186     asm volatile(
187         "vmov       q0, %q10            \n"
188         "vmov       q1, %q10            \n"
189         "vmov       q2, %q10            \n"
190         "vmov       q3, %q10            \n"
191 
192         // fill top
193         "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
194         "cmp        r4, #0              \n"
195         "beq        1f                  \n"
196 
197         "0:                             \n"
198         "vstm       %0!, {d0-d7}        \n"
199         "subs       r4, r4, #1          \n"
200         "vstm       %0!, {d0-d7}        \n"
201         "bne        0b                  \n"
202 
203         "1:                             \n"
204 
205         // fill top remain
206         "and        r4, %8, #7          \n" // r4 = remain = top_size & 7
207 
208         "cmp        r4, #4              \n" // r4 >= 4
209         "blt        2f                  \n"
210         "sub        r4, r4, #4          \n"
211         "vstm       %0!, {d0-d7}        \n"
212         "2:                             \n"
213 
214         "cmp        r4, #2              \n" // r4 >= 2
215         "blt        3f                  \n"
216         "sub        r4, r4, #2          \n"
217         "vst1.f32   {d0-d3}, [%0 :128]! \n"
218         "3:                             \n"
219 
220         "cmp        r4, #0              \n" // r4 > 0
221         "beq        4f                  \n"
222         "vst1.f32   {d0-d1}, [%0 :128]! \n"
223         "4:                             \n"
224 
225         // fill center h loop
226         "cmp        %5, #0              \n"
227         "beq        15f                 \n"
228         "5:                             \n"
229 
230         // fill left
231         "mov        r4, %6              \n" // r4 = left
232         "cmp        r4, #0              \n"
233         "beq        7f                  \n"
234 
235         "6:                             \n"
236         "vst1.f32   {d0-d1}, [%0 :128]! \n"
237         "subs       r4, r4, #1          \n"
238         "bne        6b                  \n"
239 
240         "7:                             \n"
241 
242         // fill middle
243         "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
244         "cmp        r4, #0              \n"
245         "beq        9f                  \n"
246 
247         "8:                             \n"
248         "pld        [%1, #512]          \n"
249         "vldm       %1!, {d16-d23}      \n"
250         "pld        [%1, #512]          \n"
251         "vldm       %1!, {d24-d31}      \n"
252         "subs       r4, r4, #1          \n"
253         "vstm       %0!, {d16-d23}      \n"
254         "vstm       %0!, {d24-d31}      \n"
255         "bne        8b                  \n"
256 
257         "9:                             \n"
258 
259         "and        r4, %4, #7          \n" // r4 = remain = w & 7
260 
261         "cmp        r4, #4              \n" // r4 >= 4
262         "blt        10f                 \n"
263         "pld        [%1, #512]          \n"
264         "vldm       %1!, {d16-d23}      \n"
265         "sub        r4, r4, #4          \n"
266         "vstm       %0!, {d16-d23}      \n"
267         "10:                            \n"
268 
269         "cmp        r4, #2              \n" // r4 >= 2
270         "blt        11f                 \n"
271         "pld        [%1, #256]          \n"
272         "vld1.f32   {d16-d19}, [%1 :128]! \n"
273         "sub        r4, r4, #2          \n"
274         "vst1.f32   {d16-d19}, [%0 :128]! \n"
275         "11:                            \n"
276 
277         "cmp        r4, #0              \n" // r4 > 0
278         "beq        12f                 \n"
279         "pld        [%1, #128]          \n"
280         "vld1.f32   {d16-d17}, [%1 :128]! \n"
281         "vst1.f32   {d16-d17}, [%0 :128]! \n"
282         "12:                            \n"
283 
284         // fill right
285         "mov        r4, %7              \n" // r4 = right
286         "cmp        r4, #0              \n"
287         "beq        14f                 \n"
288 
289         "13:                            \n"
290         "subs       r4, r4, #1          \n"
291         "vst1.f32   {d0-d1}, [%0 :128]! \n"
292         "bne        13b                 \n"
293         "14:                            \n"
294 
295         "subs       %5, %5, #1          \n"
296         "bne        5b                  \n"
297 
298         "15:                            \n"
299 
300         // fill bottom
301         "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
302         "cmp        r4, #0              \n"
303         "beq        17f                 \n"
304 
305         "16:                            \n"
306         "vstm       %0!, {d0-d7}        \n"
307         "subs       r4, r4, #1          \n"
308         "vstm       %0!, {d0-d7}        \n"
309         "bne        16b                 \n"
310         "17:                            \n"
311 
312         // fill bottom remain
313         "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7
314 
315         "cmp        r4, #4              \n" // r4 >= 4
316         "blt        18f                 \n"
317         "sub        r4, r4, #4          \n"
318         "vstm       %0!, {d0-d7}        \n"
319         "18:                            \n"
320 
321         "cmp        r4, #2              \n" // r4 >= 2
322         "blt        19f                 \n"
323         "sub        r4, r4, #2          \n"
324         "vst1.f32   {d0-d3}, [%0 :128]! \n"
325         "19:                            \n"
326 
327         "cmp        r4, #0              \n" // r4 > 0
328         "beq        20f                 \n"
329         "vst1.f32   {d0-d1}, [%0 :128]! \n"
330         "20:                            \n"
331 
332         : "=r"(outptr), // %0
333         "=r"(ptr)     // %1
334         : "0"(outptr),
335         "1"(ptr),
336         "r"(w),           // %4
337         "r"(h),           // %5
338         "r"(left),        // %6
339         "r"(right),       // %7
340         "r"(top_size),    // %8
341         "r"(bottom_size), // %9
342         "w"(v)            // %10
343         : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15");
344 #endif // __aarch64__
345 }
346 
padding_replicate_pack4_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)347 static void padding_replicate_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
348 {
349     const float* ptr = src;
350     float* outptr = dst;
351 
352     // fill top
353     for (int y = 0; y < top; y++)
354     {
355         const float* ptr0 = ptr;
356         float32x4_t _p = vld1q_f32(ptr0);
357         for (int x = 0; x < left; x++)
358         {
359             vst1q_f32(outptr, _p);
360             outptr += 4;
361         }
362         for (int x = 0; x < src.w; x++)
363         {
364             _p = vld1q_f32(ptr0);
365             vst1q_f32(outptr, _p);
366             ptr0 += 4;
367             outptr += 4;
368         }
369         for (int x = 0; x < right; x++)
370         {
371             vst1q_f32(outptr, _p);
372             outptr += 4;
373         }
374     }
375     // fill center
376     for (int y = 0; y < src.h; y++)
377     {
378         float32x4_t _p = vld1q_f32(ptr);
379         for (int x = 0; x < left; x++)
380         {
381             vst1q_f32(outptr, _p);
382             outptr += 4;
383         }
384         for (int x = 0; x < src.w; x++)
385         {
386             _p = vld1q_f32(ptr);
387             vst1q_f32(outptr, _p);
388             ptr += 4;
389             outptr += 4;
390         }
391         for (int x = 0; x < right; x++)
392         {
393             vst1q_f32(outptr, _p);
394             outptr += 4;
395         }
396     }
397     // fill bottom
398     ptr -= src.w * 4;
399     for (int y = 0; y < bottom; y++)
400     {
401         const float* ptr0 = ptr;
402         float32x4_t _p = vld1q_f32(ptr0);
403         for (int x = 0; x < left; x++)
404         {
405             vst1q_f32(outptr, _p);
406             outptr += 4;
407         }
408         for (int x = 0; x < src.w; x++)
409         {
410             _p = vld1q_f32(ptr0);
411             vst1q_f32(outptr, _p);
412             ptr0 += 4;
413             outptr += 4;
414         }
415         for (int x = 0; x < right; x++)
416         {
417             vst1q_f32(outptr, _p);
418             outptr += 4;
419         }
420     }
421 }
422 
padding_reflect_pack4_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)423 static void padding_reflect_pack4_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
424 {
425     const float* ptr = src;
426     float* outptr = dst;
427 
428     // fill top
429     ptr += top * src.w * 4;
430     for (int y = 0; y < top; y++)
431     {
432         const float* ptr0 = ptr;
433         for (int x = 0; x < left; x++)
434         {
435             float32x4_t _p = vld1q_f32(ptr0 + (left - x) * 4);
436             vst1q_f32(outptr, _p);
437             outptr += 4;
438         }
439         for (int x = 0; x < src.w; x++)
440         {
441             float32x4_t _p = vld1q_f32(ptr0);
442             vst1q_f32(outptr, _p);
443             ptr0 += 4;
444             outptr += 4;
445         }
446         for (int x = 0; x < right; x++)
447         {
448             float32x4_t _p = vld1q_f32(ptr0 - 8 - x * 4);
449             vst1q_f32(outptr, _p);
450             outptr += 4;
451         }
452         ptr -= src.w * 4;
453     }
454     // fill center
455     for (int y = 0; y < src.h; y++)
456     {
457         for (int x = 0; x < left; x++)
458         {
459             float32x4_t _p = vld1q_f32(ptr + (left - x) * 4);
460             vst1q_f32(outptr, _p);
461             outptr += 4;
462         }
463         for (int x = 0; x < src.w; x++)
464         {
465             float32x4_t _p = vld1q_f32(ptr);
466             vst1q_f32(outptr, _p);
467             ptr += 4;
468             outptr += 4;
469         }
470         for (int x = 0; x < right; x++)
471         {
472             float32x4_t _p = vld1q_f32(ptr - 8 - x * 4);
473             vst1q_f32(outptr, _p);
474             outptr += 4;
475         }
476     }
477     // fill bottom
478     ptr -= 2 * src.w * 4;
479     for (int y = 0; y < bottom; y++)
480     {
481         const float* ptr0 = ptr;
482         for (int x = 0; x < left; x++)
483         {
484             float32x4_t _p = vld1q_f32(ptr0 + (left - x) * 4);
485             vst1q_f32(outptr, _p);
486             outptr += 4;
487         }
488         for (int x = 0; x < src.w; x++)
489         {
490             float32x4_t _p = vld1q_f32(ptr0);
491             vst1q_f32(outptr, _p);
492             ptr0 += 4;
493             outptr += 4;
494         }
495         for (int x = 0; x < right; x++)
496         {
497             float32x4_t _p = vld1q_f32(ptr0 - 8 - x * 4);
498             vst1q_f32(outptr, _p);
499             outptr += 4;
500         }
501         ptr -= src.w * 4;
502     }
503 }
504