// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

padding_constant_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right,int8x8_t v)15 static void padding_constant_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right, int8x8_t v)
16 {
17 const signed char* ptr = src;
18 signed char* outptr = dst;
19
20 int w = src.w;
21 int h = src.h;
22
23 int top_size = top * dst.w;
24 int bottom_size = bottom * dst.w;
25
26 #if __aarch64__
27 asm volatile(
28 "mov v0.8b, %10.8b \n"
29 "mov v0.d[1], v0.d[0] \n"
30 "mov v1.16b, v0.16b \n"
31 "mov v2.16b, v0.16b \n"
32 "mov v3.16b, v0.16b \n"
33
34 // fill top
35 "lsr w4, %w8, #3 \n" // w4 = nn = top_size >> 3
36 "cmp w4, #0 \n"
37 "beq 1f \n"
38
39 "0: \n"
40 "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
41 "subs w4, w4, #1 \n"
42 "bne 0b \n"
43
44 "1: \n"
45
46 // fill top remain
47 "and w4, %w8, #7 \n" // w4 = remain = top_size & 7
48
49 "cmp w4, #4 \n" // w4 >= 4
50 "blt 2f \n"
51 "sub w4, w4, #4 \n"
52 "st1 {v0.16b, v1.16b}, [%0], #32 \n"
53 "2: \n"
54
55 "cmp w4, #2 \n" // w4 >= 2
56 "blt 3f \n"
57 "sub w4, w4, #2 \n"
58 "st1 {v0.16b}, [%0], #16 \n"
59 "3: \n"
60
61 "cmp w4, #0 \n" // w4 > 0
62 "beq 4f \n"
63 "st1 {v0.8b}, [%0], #8 \n"
64 "4: \n"
65
66 // fill center h loop
67 "cmp %w5, #0 \n"
68 "beq 15f \n"
69 "5: \n"
70
71 // fill left
72 "mov w4, %w6 \n" // w4 = left
73 "cmp w4, #0 \n"
74 "beq 7f \n"
75
76 "6: \n"
77 "st1 {v0.8b}, [%0], #8 \n"
78 "subs w4, w4, #1 \n"
79 "bne 6b \n"
80
81 "7: \n"
82
83 // fill middle
84 "lsr w4, %w4, #3 \n" // w4 = nn = w >> 3
85 "cmp w4, #0 \n"
86 "beq 9f \n"
87
88 "8: \n"
89 "prfm pldl1keep, [%1, #512] \n"
90 "ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%1], #64 \n"
91 "subs w4, w4, #1 \n"
92 "st1 {v16.16b, v17.16b, v18.16b, v19.16b}, [%0], #64 \n"
93 "bne 8b \n"
94
95 "9: \n"
96
97 "and w4, %w4, #7 \n" // w4 = remain = w & 7
98
99 "cmp w4, #4 \n" // w4 >= 4
100 "blt 10f \n"
101 "prfm pldl1keep, [%1, #256] \n"
102 "ld1 {v16.16b, v17.16b}, [%1], #32 \n"
103 "sub w4, w4, #4 \n"
104 "st1 {v16.16b, v17.16b}, [%0], #32 \n"
105 "10: \n"
106
107 "cmp w4, #2 \n" // w4 >= 2
108 "blt 11f \n"
109 "prfm pldl1keep, [%1, #128] \n"
110 "ld1 {v16.16b}, [%1], #16 \n"
111 "sub w4, w4, #2 \n"
112 "st1 {v16.16b}, [%0], #16 \n"
113 "11: \n"
114
115 "cmp w4, #0 \n" // w4 > 0
116 "beq 12f \n"
117 "prfm pldl1keep, [%1, #64] \n"
118 "ld1 {v16.8b}, [%1], #8 \n"
119 "st1 {v16.8b}, [%0], #8 \n"
120 "12: \n"
121
122 // fill right
123 "mov w4, %w7 \n" // w4 = right
124 "cmp w4, #0 \n"
125 "beq 14f \n"
126
127 "13: \n"
128 "subs w4, w4, #1 \n"
129 "st1 {v0.8b}, [%0], #8 \n"
130 "bne 13b \n"
131 "14: \n"
132
133 "subs %w5, %w5, #1 \n"
134 "bne 5b \n"
135
136 "15: \n"
137
138 // fill bottom
139 "lsr w4, %w9, #3 \n" // w4 = nn = bottom_size >> 3
140 "cmp w4, #0 \n"
141 "beq 17f \n"
142
143 "16: \n"
144 "st1 {v0.16b, v1.16b, v2.16b, v3.16b}, [%0], #64 \n"
145 "subs w4, w4, #1 \n"
146 "bne 16b \n"
147 "17: \n"
148
149 // fill bottom remain
150 "and w4, %w9, #7 \n" // w4 = remain = bottom_size & 7
151
152 "cmp w4, #4 \n" // w4 >= 4
153 "blt 18f \n"
154 "sub w4, w4, #4 \n"
155 "st1 {v0.16b, v1.16b}, [%0], #32 \n"
156 "18: \n"
157
158 "cmp w4, #2 \n" // w4 >= 2
159 "blt 19f \n"
160 "sub w4, w4, #2 \n"
161 "st1 {v0.16b}, [%0], #16 \n"
162 "19: \n"
163
164 "cmp w4, #0 \n" // w4 > 0
165 "beq 20f \n"
166 "st1 {v0.8b}, [%0], #8 \n"
167 "20: \n"
168
169 : "=r"(outptr), // %0
170 "=r"(ptr) // %1
171 : "0"(outptr),
172 "1"(ptr),
173 "r"(w), // %4
174 "r"(h), // %5
175 "r"(left), // %6
176 "r"(right), // %7
177 "r"(top_size), // %8
178 "r"(bottom_size), // %9
179 "w"(v) // %10
180 : "cc", "memory", "x4", "v0", "v1", "v2", "v3", "v16", "v17", "v18", "v19");
181 #else // __aarch64__
182 asm volatile(
183 "vmov d0, %P10 \n"
184 "vmov d1, d0 \n"
185 "vmov q1, q0 \n"
186 "vmov q2, q0 \n"
187 "vmov q3, q0 \n"
188
189 // fill top
190 "lsr r4, %8, #3 \n" // r4 = nn = top_size >> 3
191 "cmp r4, #0 \n"
192 "beq 1f \n"
193
194 "0: \n"
195 "vstm %0!, {d0-d7} \n"
196 "subs r4, r4, #1 \n"
197 "bne 0b \n"
198
199 "1: \n"
200
201 // fill top remain
202 "and r4, %8, #7 \n" // r4 = remain = top_size & 7
203
204 "cmp r4, #4 \n" // r4 >= 4
205 "blt 2f \n"
206 "sub r4, r4, #4 \n"
207 "vst1.s8 {d0-d3}, [%0 :128]! \n"
208 "2: \n"
209
210 "cmp r4, #2 \n" // r4 >= 2
211 "blt 3f \n"
212 "sub r4, r4, #2 \n"
213 "vst1.s8 {d0-d1}, [%0 :128]! \n"
214 "3: \n"
215
216 "cmp r4, #0 \n" // r4 > 0
217 "beq 4f \n"
218 "vst1.s8 {d0}, [%0 :64]! \n"
219 "4: \n"
220
221 // fill center h loop
222 "cmp %5, #0 \n"
223 "beq 15f \n"
224 "5: \n"
225
226 // fill left
227 "mov r4, %6 \n" // r4 = left
228 "cmp r4, #0 \n"
229 "beq 7f \n"
230
231 "6: \n"
232 "vst1.s8 {d0}, [%0 :64]! \n"
233 "subs r4, r4, #1 \n"
234 "bne 6b \n"
235
236 "7: \n"
237
238 // fill middle
239 "lsr r4, %4, #3 \n" // r4 = nn = w >> 3
240 "cmp r4, #0 \n"
241 "beq 9f \n"
242
243 "8: \n"
244 "pld [%1, #512] \n"
245 "vldm %1!, {d16-d23} \n"
246 "subs r4, r4, #1 \n"
247 "vstm %0!, {d16-d23} \n"
248 "bne 8b \n"
249
250 "9: \n"
251
252 "and r4, %4, #7 \n" // r4 = remain = w & 7
253
254 "cmp r4, #4 \n" // r4 >= 4
255 "blt 10f \n"
256 "pld [%1, #256] \n"
257 "vld1.s8 {d16-d19}, [%1 :64]! \n"
258 "sub r4, r4, #4 \n"
259 "vst1.s8 {d16-d19}, [%0 :64]! \n"
260 "10: \n"
261
262 "cmp r4, #2 \n" // r4 >= 2
263 "blt 11f \n"
264 "pld [%1, #128] \n"
265 "vld1.s8 {d16-d17}, [%1 :64]! \n"
266 "sub r4, r4, #2 \n"
267 "vst1.s8 {d16-d17}, [%0 :64]! \n"
268 "11: \n"
269
270 "cmp r4, #0 \n" // r4 > 0
271 "beq 12f \n"
272 "pld [%1, #64] \n"
273 "vld1.s8 {d16}, [%1 :64]! \n"
274 "vst1.s8 {d16}, [%0 :64]! \n"
275 "12: \n"
276
277 // fill right
278 "mov r4, %7 \n" // r4 = right
279 "cmp r4, #0 \n"
280 "beq 14f \n"
281
282 "13: \n"
283 "subs r4, r4, #1 \n"
284 "vst1.s8 {d0}, [%0 :64]! \n"
285 "bne 13b \n"
286 "14: \n"
287
288 "subs %5, %5, #1 \n"
289 "bne 5b \n"
290
291 "15: \n"
292
293 // fill bottom
294 "lsr r4, %9, #3 \n" // r4 = nn = bottom_size >> 3
295 "cmp r4, #0 \n"
296 "beq 17f \n"
297
298 "16: \n"
299 "vstm %0!, {d0-d7} \n"
300 "subs r4, r4, #1 \n"
301 "bne 16b \n"
302 "17: \n"
303
304 // fill bottom remain
305 "and r4, %9, #7 \n" // r4 = remain = bottom_size & 7
306
307 "cmp r4, #4 \n" // r4 >= 4
308 "blt 18f \n"
309 "sub r4, r4, #4 \n"
310 "vst1.s8 {d0-d3}, [%0 :64]! \n"
311 "18: \n"
312
313 "cmp r4, #2 \n" // r4 >= 2
314 "blt 19f \n"
315 "sub r4, r4, #2 \n"
316 "vst1.s8 {d0-d1}, [%0 :64]! \n"
317 "19: \n"
318
319 "cmp r4, #0 \n" // r4 > 0
320 "beq 20f \n"
321 "vst1.s8 {d0}, [%0 :64]! \n"
322 "20: \n"
323
324 : "=r"(outptr), // %0
325 "=r"(ptr) // %1
326 : "0"(outptr),
327 "1"(ptr),
328 "r"(w), // %4
329 "r"(h), // %5
330 "r"(left), // %6
331 "r"(right), // %7
332 "r"(top_size), // %8
333 "r"(bottom_size), // %9
334 "w"(v) // %10
335 : "cc", "memory", "r4", "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11");
336 #endif // __aarch64__
337 }
338
padding_replicate_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)339 static void padding_replicate_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
340 {
341 const signed char* ptr = src;
342 signed char* outptr = dst;
343
344 // fill top
345 for (int y = 0; y < top; y++)
346 {
347 const signed char* ptr0 = ptr;
348 int8x8_t _p = vld1_s8(ptr0);
349 for (int x = 0; x < left; x++)
350 {
351 vst1_s8(outptr, _p);
352 outptr += 8;
353 }
354 for (int x = 0; x < src.w; x++)
355 {
356 _p = vld1_s8(ptr0);
357 vst1_s8(outptr, _p);
358 ptr0 += 8;
359 outptr += 8;
360 }
361 for (int x = 0; x < right; x++)
362 {
363 vst1_s8(outptr, _p);
364 outptr += 8;
365 }
366 }
367 // fill center
368 for (int y = 0; y < src.h; y++)
369 {
370 int8x8_t _p = vld1_s8(ptr);
371 for (int x = 0; x < left; x++)
372 {
373 vst1_s8(outptr, _p);
374 outptr += 8;
375 }
376 for (int x = 0; x < src.w; x++)
377 {
378 _p = vld1_s8(ptr);
379 vst1_s8(outptr, _p);
380 ptr += 8;
381 outptr += 8;
382 }
383 for (int x = 0; x < right; x++)
384 {
385 vst1_s8(outptr, _p);
386 outptr += 8;
387 }
388 }
389 // fill bottom
390 ptr -= src.w * 8;
391 for (int y = 0; y < bottom; y++)
392 {
393 const signed char* ptr0 = ptr;
394 int8x8_t _p = vld1_s8(ptr0);
395 for (int x = 0; x < left; x++)
396 {
397 vst1_s8(outptr, _p);
398 outptr += 8;
399 }
400 for (int x = 0; x < src.w; x++)
401 {
402 _p = vld1_s8(ptr0);
403 vst1_s8(outptr, _p);
404 ptr0 += 8;
405 outptr += 8;
406 }
407 for (int x = 0; x < right; x++)
408 {
409 vst1_s8(outptr, _p);
410 outptr += 8;
411 }
412 }
413 }
414
padding_reflect_pack8_int8_neon(const Mat & src,Mat & dst,int top,int bottom,int left,int right)415 static void padding_reflect_pack8_int8_neon(const Mat& src, Mat& dst, int top, int bottom, int left, int right)
416 {
417 const signed char* ptr = src;
418 signed char* outptr = dst;
419
420 // fill top
421 ptr += top * src.w * 8;
422 for (int y = 0; y < top; y++)
423 {
424 const signed char* ptr0 = ptr;
425 for (int x = 0; x < left; x++)
426 {
427 int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
428 vst1_s8(outptr, _p);
429 outptr += 8;
430 }
431 for (int x = 0; x < src.w; x++)
432 {
433 int8x8_t _p = vld1_s8(ptr0);
434 vst1_s8(outptr, _p);
435 ptr0 += 8;
436 outptr += 8;
437 }
438 for (int x = 0; x < right; x++)
439 {
440 int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
441 vst1_s8(outptr, _p);
442 outptr += 8;
443 }
444 ptr -= src.w * 8;
445 }
446 // fill center
447 for (int y = 0; y < src.h; y++)
448 {
449 for (int x = 0; x < left; x++)
450 {
451 int8x8_t _p = vld1_s8(ptr + (left - x) * 8);
452 vst1_s8(outptr, _p);
453 outptr += 8;
454 }
455 for (int x = 0; x < src.w; x++)
456 {
457 int8x8_t _p = vld1_s8(ptr);
458 vst1_s8(outptr, _p);
459 ptr += 8;
460 outptr += 8;
461 }
462 for (int x = 0; x < right; x++)
463 {
464 int8x8_t _p = vld1_s8(ptr - 16 - x * 8);
465 vst1_s8(outptr, _p);
466 outptr += 8;
467 }
468 }
469 // fill bottom
470 ptr -= 2 * src.w * 8;
471 for (int y = 0; y < bottom; y++)
472 {
473 const signed char* ptr0 = ptr;
474 for (int x = 0; x < left; x++)
475 {
476 int8x8_t _p = vld1_s8(ptr0 + (left - x) * 8);
477 vst1_s8(outptr, _p);
478 outptr += 8;
479 }
480 for (int x = 0; x < src.w; x++)
481 {
482 int8x8_t _p = vld1_s8(ptr0);
483 vst1_s8(outptr, _p);
484 ptr0 += 8;
485 outptr += 8;
486 }
487 for (int x = 0; x < right; x++)
488 {
489 int8x8_t _p = vld1_s8(ptr0 - 16 - x * 8);
490 vst1_s8(outptr, _p);
491 outptr += 8;
492 }
493 ptr -= src.w * 8;
494 }
495 }
496