1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
padding_constant_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right,int8x8_t v)15 static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t v)
16 {
17     const signed char* ptr = src;
18     signed char* outptr = dst;
19 
20     int w = src.w;
21     int h = src.h;
22 
23     int top_size = top * dst.w;
24     int bottom_size = bottom * dst.w;
25 
26 #if __aarch64__
27     asm volatile(
28         "mov    v0.8b, %10.8b           \n"
29         "mov    v0.d[1], v0.d[0]        \n"
30         "mov    v1.16b, v0.16b          \n"
31         "mov    v2.16b, v0.16b          \n"
32         "mov    v3.16b, v0.16b          \n"
33 
34         // fill top
35         "lsr    w4, %w8, #3             \n" // w4 = nn = top_size >> 3
36         "cmp    w4, #0                  \n"
37         "beq    1f                      \n"
38 
39         "0:                             \n"
40         "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
41         "subs   w4, w4, #1              \n"
42         "bne    0b                      \n"
43 
44         "1:                             \n"
45 
46         // fill top remain
47         "and    w4, %w8, #7             \n" // w4 = remain = top_size & 7
48 
49         "cmp    w4, #4                  \n" // w4 >= 4
50         "blt    2f                      \n"
51         "sub    w4, w4, #4              \n"
52         "st1    {v0.16b, v1.16b}, [%0], #32 \n"
53         "2:                             \n"
54 
55         "cmp    w4, #2                  \n" // w4 >= 2
56         "blt    3f                      \n"
57         "sub    w4, w4, #2              \n"
58         "st1    {v0.16b}, [%0], #16     \n"
59         "3:                             \n"
60 
61         "cmp    w4, #0                  \n" // w4 > 0
62         "beq    4f                      \n"
63         "st1    {v0.8b}, [%0], #8       \n"
64         "4:                             \n"
65 
66         // fill center h loop
67         "cmp    %w5, #0                 \n"
68         "beq    15f                     \n"
69         "5:                             \n"
70 
71         // fill left
72         "mov    w4, %w6                 \n" // w4 = left
73         "cmp    w4, #0                  \n"
74         "beq    7f                      \n"
75 
76         "6:                             \n"
77         "st1    {v0.8b}, [%0], #8       \n"
78         "subs   w4, w4, #1              \n"
79         "bne    6b                      \n"
80 
81         "7:                             \n"
82 
83         // fill middle
84         "lsr    w4, %w4, #3             \n" // w4 = nn = w >> 3
85         "cmp    w4, #0                  \n"
86         "beq    9f                      \n"
87 
88         "8:                             \n"
89         "prfm   pldl1keep, [%1, #512]   \n"
90         "ld1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
91         "subs   w4, w4, #1              \n"
92         "st1    {v16.16b, v17.16b, v18.16b, v19.16b}, [%0], #64 \n"
93         "bne    8b                      \n"
94 
95         "9:                             \n"
96 
97         "and    w4, %w4, #7             \n" // w4 = remain = w & 7
98 
99         "cmp    w4, #4                  \n" // w4 >= 4
100         "blt    10f                     \n"
101         "prfm   pldl1keep, [%1, #256]   \n"
102         "ld1    {v16.16b, v17.16b}, [%1], #32 \n"
103         "sub    w4, w4, #4              \n"
104         "st1    {v16.16b, v17.16b}, [%0], #32 \n"
105         "10:                            \n"
106 
107         "cmp    w4, #2                  \n" // w4 >= 2
108         "blt    11f                     \n"
109         "prfm   pldl1keep, [%1, #128]   \n"
110         "ld1    {v16.16b}, [%1], #16    \n"
111         "sub    w4, w4, #2              \n"
112         "st1    {v16.16b}, [%0], #16    \n"
113         "11:                            \n"
114 
115         "cmp    w4, #0                  \n" // w4 > 0
116         "beq    12f                     \n"
117         "prfm   pldl1keep, [%1, #64]    \n"
118         "ld1    {v16.8b}, [%1], #8      \n"
119         "st1    {v16.8b}, [%0], #8      \n"
120         "12:                            \n"
121 
122         // fill right
123         "mov    w4, %w7                 \n" // w4 = right
124         "cmp    w4, #0                  \n"
125         "beq    14f                     \n"
126 
127         "13:                            \n"
128         "subs   w4, w4, #1              \n"
129         "st1    {v0.8b}, [%0], #8       \n"
130         "bne    13b                     \n"
131         "14:                            \n"
132 
133         "subs   %w5, %w5, #1            \n"
134         "bne    5b                      \n"
135 
136         "15:                            \n"
137 
138         // fill bottom
139         "lsr    w4, %w9, #3             \n" // w4 = nn = bottom_size >> 3
140         "cmp    w4, #0                  \n"
141         "beq    17f                     \n"
142 
143         "16:                            \n"
144         "st1    {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
145         "subs   w4, w4, #1              \n"
146         "bne    16b                     \n"
147         "17:                            \n"
148 
149         // fill bottom remain
150         "and    w4, %w9, #7             \n" // w4 = remain = bottom_size & 7
151 
152         "cmp    w4, #4                  \n" // w4 >= 4
153         "blt    18f                     \n"
154         "sub    w4, w4, #4              \n"
155         "st1    {v0.16b, v1.16b}, [%0], #32 \n"
156         "18:                            \n"
157 
158         "cmp    w4, #2                  \n" // w4 >= 2
159         "blt    19f                     \n"
160         "sub    w4, w4, #2              \n"
161         "st1    {v0.16b}, [%0], #16     \n"
162         "19:                            \n"
163 
164         "cmp    w4, #0                  \n" // w4 > 0
165         "beq    20f                     \n"
166         "st1    {v0.8b}, [%0], #8       \n"
167         "20:                            \n"
168 
169         : "=r"(outptr), // %0
170         "=r"(ptr)     // %1
171         : "0"(outptr),
172         "1"(ptr),
173         "r"(w),           // %4
174         "r"(h),           // %5
175         "r"(left),        // %6
176         "r"(right),       // %7
177         "r"(top_size),    // %8
178         "r"(bottom_size), // %9
179         "w"(v)            // %10
180         : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
181 #else  // __aarch64__
182     asm volatile(
183         "vmov       d0, %P10            \n"
184         "vmov       d1, d0              \n"
185         "vmov       q1, q0              \n"
186         "vmov       q2, q0              \n"
187         "vmov       q3, q0              \n"
188 
189         // fill top
190         "lsr        r4, %8, #3          \n" // r4 = nn = top_size >> 3
191         "cmp        r4, #0              \n"
192         "beq        1f                  \n"
193 
194         "0:                             \n"
195         "vstm       %0!, {d0-d7}        \n"
196         "subs       r4, r4, #1          \n"
197         "bne        0b                  \n"
198 
199         "1:                             \n"
200 
201         // fill top remain
202         "and        r4, %8, #7          \n" // r4 = remain = top_size & 7
203 
204         "cmp        r4, #4              \n" // r4 >= 4
205         "blt        2f                  \n"
206         "sub        r4, r4, #4          \n"
207         "vst1.s8    {d0-d3}, [%0 :128]! \n"
208         "2:                             \n"
209 
210         "cmp        r4, #2              \n" // r4 >= 2
211         "blt        3f                  \n"
212         "sub        r4, r4, #2          \n"
213         "vst1.s8    {d0-d1}, [%0 :128]! \n"
214         "3:                             \n"
215 
216         "cmp        r4, #0              \n" // r4 > 0
217         "beq        4f                  \n"
218         "vst1.s8    {d0}, [%0 :64]!     \n"
219         "4:                             \n"
220 
221         // fill center h loop
222         "cmp        %5, #0              \n"
223         "beq        15f                 \n"
224         "5:                             \n"
225 
226         // fill left
227         "mov        r4, %6              \n" // r4 = left
228         "cmp        r4, #0              \n"
229         "beq        7f                  \n"
230 
231         "6:                             \n"
232         "vst1.s8    {d0}, [%0 :64]!     \n"
233         "subs       r4, r4, #1          \n"
234         "bne        6b                  \n"
235 
236         "7:                             \n"
237 
238         // fill middle
239         "lsr        r4, %4, #3          \n" // r4 = nn = w >> 3
240         "cmp        r4, #0              \n"
241         "beq        9f                  \n"
242 
243         "8:                             \n"
244         "pld        [%1, #512]          \n"
245         "vldm       %1!, {d16-d23}      \n"
246         "subs       r4, r4, #1          \n"
247         "vstm       %0!, {d16-d23}      \n"
248         "bne        8b                  \n"
249 
250         "9:                             \n"
251 
252         "and        r4, %4, #7          \n" // r4 = remain = w & 7
253 
254         "cmp        r4, #4              \n" // r4 >= 4
255         "blt        10f                 \n"
256         "pld        [%1, #256]          \n"
257         "vld1.s8    {d16-d19}, [%1 :64]! \n"
258         "sub        r4, r4, #4          \n"
259         "vst1.s8    {d16-d19}, [%0 :64]! \n"
260         "10:                            \n"
261 
262         "cmp        r4, #2              \n" // r4 >= 2
263         "blt        11f                 \n"
264         "pld        [%1, #128]          \n"
265         "vld1.s8    {d16-d17}, [%1 :64]! \n"
266         "sub        r4, r4, #2          \n"
267         "vst1.s8    {d16-d17}, [%0 :64]! \n"
268         "11:                            \n"
269 
270         "cmp        r4, #0              \n" // r4 > 0
271         "beq        12f                 \n"
272         "pld        [%1, #64]           \n"
273         "vld1.s8    {d16}, [%1 :64]!    \n"
274         "vst1.s8    {d16}, [%0 :64]!    \n"
275         "12:                            \n"
276 
277         // fill right
278         "mov        r4, %7              \n" // r4 = right
279         "cmp        r4, #0              \n"
280         "beq        14f                 \n"
281 
282         "13:                            \n"
283         "subs       r4, r4, #1          \n"
284         "vst1.s8    {d0}, [%0 :64]!     \n"
285         "bne        13b                 \n"
286         "14:                            \n"
287 
288         "subs       %5, %5, #1          \n"
289         "bne        5b                  \n"
290 
291         "15:                            \n"
292 
293         // fill bottom
294         "lsr        r4, %9, #3          \n" // r4 = nn = bottom_size >> 3
295         "cmp        r4, #0              \n"
296         "beq        17f                 \n"
297 
298         "16:                            \n"
299         "vstm       %0!, {d0-d7}        \n"
300         "subs       r4, r4, #1          \n"
301         "bne        16b                 \n"
302         "17:                            \n"
303 
304         // fill bottom remain
305         "and        r4, %9, #7          \n" // r4 = remain = bottom_size & 7
306 
307         "cmp        r4, #4              \n" // r4 >= 4
308         "blt        18f                 \n"
309         "sub        r4, r4, #4          \n"
310         "vst1.s8    {d0-d3}, [%0 :64]!  \n"
311         "18:                            \n"
312 
313         "cmp        r4, #2              \n" // r4 >= 2
314         "blt        19f                 \n"
315         "sub        r4, r4, #2          \n"
316         "vst1.s8    {d0-d1}, [%0 :64]!  \n"
317         "19:                            \n"
318 
319         "cmp        r4, #0              \n" // r4 > 0
320         "beq        20f                 \n"
321         "vst1.s8    {d0}, [%0 :64]!     \n"
322         "20:                            \n"
323 
324         : "=r"(outptr), // %0
325         "=r"(ptr)     // %1
326         : "0"(outptr),
327         "1"(ptr),
328         "r"(w),           // %4
329         "r"(h),           // %5
330         "r"(left),        // %6
331         "r"(right),       // %7
332         "r"(top_size),    // %8
333         "r"(bottom_size), // %9
334         "w"(v)            // %10
335         : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
336 #endif // __aarch64__
337 }
338 
padding_replicate_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)339 static void padding_replicate_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
340 {
341     const signed char* ptr = src;
342     signed char* outptr = dst;
343 
344     // fill top
345     for (int y = 0; y < top; y++)
346     {
347         const signed char* ptr0 = ptr;
348         int8x8_t _p = vld1_s8(ptr0);
349         for (int x = 0; x < left; x++)
350         {
351             vst1_s8(outptr, _p);
352             outptr += 8;
353         }
354         for (int x = 0; x < src.w; x++)
355         {
356             _p = vld1_s8(ptr0);
357             vst1_s8(outptr, _p);
358             ptr0 += 8;
359             outptr += 8;
360         }
361         for (int x = 0; x < right; x++)
362         {
363             vst1_s8(outptr, _p);
364             outptr += 8;
365         }
366     }
367     // fill center
368     for (int y = 0; y < src.h; y++)
369     {
370         int8x8_t _p = vld1_s8(ptr);
371         for (int x = 0; x < left; x++)
372         {
373             vst1_s8(outptr, _p);
374             outptr += 8;
375         }
376         for (int x = 0; x < src.w; x++)
377         {
378             _p = vld1_s8(ptr);
379             vst1_s8(outptr, _p);
380             ptr += 8;
381             outptr += 8;
382         }
383         for (int x = 0; x < right; x++)
384         {
385             vst1_s8(outptr, _p);
386             outptr += 8;
387         }
388     }
389     // fill bottom
390     ptr -= src.w * 8;
391     for (int y = 0; y < bottom; y++)
392     {
393         const signed char* ptr0 = ptr;
394         int8x8_t _p = vld1_s8(ptr0);
395         for (int x = 0; x < left; x++)
396         {
397             vst1_s8(outptr, _p);
398             outptr += 8;
399         }
400         for (int x = 0; x < src.w; x++)
401         {
402             _p = vld1_s8(ptr0);
403             vst1_s8(outptr, _p);
404             ptr0 += 8;
405             outptr += 8;
406         }
407         for (int x = 0; x < right; x++)
408         {
409             vst1_s8(outptr, _p);
410             outptr += 8;
411         }
412     }
413 }
414 
padding_reflect_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)415 static void padding_reflect_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
416 {
417     const signed char* ptr = src;
418     signed char* outptr = dst;
419 
420     // fill top
421     ptr += top * src.w * 8;
422     for (int y = 0; y < top; y++)
423     {
424         const signed char* ptr0 = ptr;
425         for (int x = 0; x < left; x++)
426         {
427             int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
428             vst1_s8(outptr, _p);
429             outptr += 8;
430         }
431         for (int x = 0; x < src.w; x++)
432         {
433             int8x8_t _p = vld1_s8(ptr0);
434             vst1_s8(outptr, _p);
435             ptr0 += 8;
436             outptr += 8;
437         }
438         for (int x = 0; x < right; x++)
439         {
440             int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
441             vst1_s8(outptr, _p);
442             outptr += 8;
443         }
444         ptr -= src.w * 8;
445     }
446     // fill center
447     for (int y = 0; y < src.h; y++)
448     {
449         for (int x = 0; x < left; x++)
450         {
451             int8x8_t _p = vld1_s8(ptr + (left - x) * 8);
452             vst1_s8(outptr, _p);
453             outptr += 8;
454         }
455         for (int x = 0; x < src.w; x++)
456         {
457             int8x8_t _p = vld1_s8(ptr);
458             vst1_s8(outptr, _p);
459             ptr += 8;
460             outptr += 8;
461         }
462         for (int x = 0; x < right; x++)
463         {
464             int8x8_t _p = vld1_s8(ptr - 16 - x * 8);
465             vst1_s8(outptr, _p);
466             outptr += 8;
467         }
468     }
469     // fill bottom
470     ptr -= 2 * src.w * 8;
471     for (int y = 0; y < bottom; y++)
472     {
473         const signed char* ptr0 = ptr;
474         for (int x = 0; x < left; x++)
475         {
476             int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
477             vst1_s8(outptr, _p);
478             outptr += 8;
479         }
480         for (int x = 0; x < src.w; x++)
481         {
482             int8x8_t _p = vld1_s8(ptr0);
483             vst1_s8(outptr, _p);
484             ptr0 += 8;
485             outptr += 8;
486         }
487         for (int x = 0; x < right; x++)
488         {
489             int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
490             vst1_s8(outptr, _p);
491             outptr += 8;
492         }
493         ptr -= src.w * 8;
494     }
495 }
496