1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "crop_arm.h"
16
17 #if __ARM_NEON
18 #include <arm_neon.h>
19 #endif // __ARM_NEON
20
21 namespace ncnn {
22
Crop_arm()23 Crop_arm::Crop_arm()
24 {
25 #if __ARM_NEON
26 support_packing = true;
27 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
28 support_fp16_storage = true;
29 #endif
30 #endif // __ARM_NEON
31
32 #if NCNN_BF16
33 support_bf16_storage = true;
34 #endif
35 }
36
37 #if __ARM_NEON
crop_pack8_neon(const Mat & src,Mat & dst,int top,int left)38 static void crop_pack8_neon(const Mat& src, Mat& dst, int top, int left)
39 {
40 int w = dst.w;
41 int h = dst.h;
42 int right = src.w - dst.w - left;
43
44 const float* ptr = src.row(top) + left * 8;
45 float* outptr = dst;
46
47 for (int y = 0; y < h; y++)
48 {
49 for (int x = 0; x < w; x++)
50 {
51 float32x4_t _p0 = vld1q_f32(ptr);
52 float32x4_t _p1 = vld1q_f32(ptr + 4);
53 vst1q_f32(outptr, _p0);
54 vst1q_f32(outptr + 4, _p1);
55 ptr += 8;
56 outptr += 8;
57 }
58
59 ptr += (left + right) * 8;
60 }
61 }
62
crop_pack8_bf16_fp16s_neon(const Mat & src,Mat & dst,int top,int left)63 static void crop_pack8_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int left)
64 {
65 int w = dst.w;
66 int h = dst.h;
67 int right = src.w - dst.w - left;
68
69 const unsigned short* ptr = src.row<unsigned short>(top) + left * 8;
70 unsigned short* outptr = dst;
71
72 for (int y = 0; y < h; y++)
73 {
74 for (int x = 0; x < w; x++)
75 {
76 uint16x8_t _p = vld1q_u16(ptr);
77 vst1q_u16(outptr, _p);
78 ptr += 8;
79 outptr += 8;
80 }
81
82 ptr += (left + right) * 8;
83 }
84 }
85
crop_pack4_neon(const Mat & src,Mat & dst,int top,int left)86 static void crop_pack4_neon(const Mat& src, Mat& dst, int top, int left)
87 {
88 int w = dst.w;
89 int h = dst.h;
90 int right = src.w - dst.w - left;
91
92 const float* ptr = src.row(top) + left * 4;
93 float* outptr = dst;
94
95 for (int y = 0; y < h; y++)
96 {
97 for (int x = 0; x < w; x++)
98 {
99 float32x4_t _p = vld1q_f32(ptr);
100 vst1q_f32(outptr, _p);
101 ptr += 4;
102 outptr += 4;
103 }
104
105 ptr += (left + right) * 4;
106 }
107 }
108
crop_pack4_bf16_fp16s_neon(const Mat & src,Mat & dst,int top,int left)109 static void crop_pack4_bf16_fp16s_neon(const Mat& src, Mat& dst, int top, int left)
110 {
111 int w = dst.w;
112 int h = dst.h;
113 int right = src.w - dst.w - left;
114
115 const unsigned short* ptr = src.row<unsigned short>(top) + left * 4;
116 unsigned short* outptr = dst;
117
118 for (int y = 0; y < h; y++)
119 {
120 for (int x = 0; x < w; x++)
121 {
122 uint16x4_t _p = vld1_u16(ptr);
123 vst1_u16(outptr, _p);
124 ptr += 4;
125 outptr += 4;
126 }
127
128 ptr += (left + right) * 4;
129 }
130 }
131 #endif // __ARM_NEON
132
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const133 int Crop_arm::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
134 {
135 int w = bottom_blob.w;
136 int h = bottom_blob.h;
137 int d = bottom_blob.d;
138 int channels = bottom_blob.c;
139 int dims = bottom_blob.dims;
140 size_t elemsize = bottom_blob.elemsize;
141 int elempack = bottom_blob.elempack;
142
143 #if __ARM_NEON
144 if (elempack == 8)
145 {
146 int _woffset, _hoffset, _doffset, _coffset;
147 int _outw, _outh, _outd, _outc;
148 resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
149
150 if (dims == 1)
151 {
152 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
153 size_t out_elemsize = elemsize / elempack * out_elempack;
154
155 if (_outw / out_elempack == w && out_elempack == 8)
156 {
157 top_blob = bottom_blob;
158 return 0;
159 }
160
161 if (_woffset % 8 == 0 && out_elempack == 8)
162 {
163 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
164 if (top_blob.empty())
165 return -100;
166
167 if (elemsize == 16u)
168 crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
169 else
170 crop_pack8_neon(bottom_blob, top_blob, 0, _woffset / elempack);
171
172 return 0;
173 }
174 }
175
176 if (dims == 2)
177 {
178 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
179 size_t out_elemsize = elemsize / elempack * out_elempack;
180
181 if (_outw == w && _outh / out_elempack == h && out_elempack == 8)
182 {
183 top_blob = bottom_blob;
184 return 0;
185 }
186
187 if (_hoffset % 8 == 0 && out_elempack == 8)
188 {
189 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
190 if (top_blob.empty())
191 return -100;
192
193 if (elemsize == 16u)
194 crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
195 else
196 crop_pack8_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
197
198 return 0;
199 }
200 }
201
202 if (dims == 3)
203 {
204 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
205 size_t out_elemsize = elemsize / elempack * out_elempack;
206
207 if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 8)
208 {
209 top_blob = bottom_blob;
210 return 0;
211 }
212
213 if (_coffset % 8 == 0 && out_elempack == 8)
214 {
215 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
216
217 if (_outw == w && _outh == h)
218 {
219 top_blob = bottom_blob_sliced.clone();
220 if (top_blob.empty())
221 return -100;
222 }
223
224 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
225 if (top_blob.empty())
226 return -100;
227
228 #pragma omp parallel for num_threads(opt.num_threads)
229 for (int q = 0; q < top_blob.c; q++)
230 {
231 const Mat m = bottom_blob_sliced.channel(q);
232 Mat borderm = top_blob.channel(q);
233
234 if (elemsize == 16u)
235 crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
236 else
237 crop_pack8_neon(m, borderm, _hoffset, _woffset);
238 }
239
240 return 0;
241 }
242 }
243
244 if (dims == 4)
245 {
246 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
247 size_t out_elemsize = elemsize / elempack * out_elempack;
248
249 if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 8)
250 {
251 top_blob = bottom_blob;
252 return 0;
253 }
254
255 if (_coffset % 8 == 0 && out_elempack == 8)
256 {
257 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
258
259 if (_outw == w && _outh == h && _outd == d)
260 {
261 top_blob = bottom_blob_sliced.clone();
262 if (top_blob.empty())
263 return -100;
264 }
265
266 top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
267 if (top_blob.empty())
268 return -100;
269
270 #pragma omp parallel for num_threads(opt.num_threads)
271 for (int q = 0; q < top_blob.c; q++)
272 {
273 for (int z = 0; z < _outd; z++)
274 {
275 const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
276 Mat borderm = top_blob.channel(q).depth(z);
277
278 if (elemsize == 16u)
279 crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
280 else
281 crop_pack8_neon(m, borderm, _hoffset, _woffset);
282 }
283 }
284
285 return 0;
286 }
287 }
288 }
289
290 if (elempack == 4)
291 {
292 int _woffset, _hoffset, _doffset, _coffset;
293 int _outw, _outh, _outd, _outc;
294 resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
295
296 if (dims == 1)
297 {
298 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
299 int out_elempack = opt.use_fp16_arithmetic && _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
300 #else
301 int out_elempack = _outw % 4 == 0 ? 4 : 1;
302 #endif
303 size_t out_elemsize = elemsize / elempack * out_elempack;
304
305 if (_outw / out_elempack == w && out_elempack == 4)
306 {
307 top_blob = bottom_blob;
308 return 0;
309 }
310
311 if (_woffset % 4 == 0 && out_elempack == 4)
312 {
313 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
314 if (top_blob.empty())
315 return -100;
316
317 if (elemsize == 8u)
318 crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
319 else
320 crop_pack4_neon(bottom_blob, top_blob, 0, _woffset / elempack);
321
322 return 0;
323 }
324 }
325
326 if (dims == 2)
327 {
328 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
329 int out_elempack = opt.use_fp16_arithmetic && _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
330 #else
331 int out_elempack = _outh % 4 == 0 ? 4 : 1;
332 #endif
333 size_t out_elemsize = elemsize / elempack * out_elempack;
334
335 if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
336 {
337 top_blob = bottom_blob;
338 return 0;
339 }
340
341 if (_hoffset % 4 == 0 && out_elempack == 4)
342 {
343 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
344 if (top_blob.empty())
345 return -100;
346
347 if (elemsize == 8u)
348 crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
349 else
350 crop_pack4_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
351
352 return 0;
353 }
354 }
355
356 if (dims == 3)
357 {
358 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
359 int out_elempack = opt.use_fp16_arithmetic && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
360 #else
361 int out_elempack = _outc % 4 == 0 ? 4 : 1;
362 #endif
363 size_t out_elemsize = elemsize / elempack * out_elempack;
364
365 if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
366 {
367 top_blob = bottom_blob;
368 return 0;
369 }
370
371 if (_coffset % 4 == 0 && out_elempack == 4)
372 {
373 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
374
375 if (_outw == w && _outh == h)
376 {
377 top_blob = bottom_blob_sliced.clone();
378 if (top_blob.empty())
379 return -100;
380 }
381
382 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
383 if (top_blob.empty())
384 return -100;
385
386 #pragma omp parallel for num_threads(opt.num_threads)
387 for (int q = 0; q < top_blob.c; q++)
388 {
389 const Mat m = bottom_blob_sliced.channel(q);
390 Mat borderm = top_blob.channel(q);
391
392 if (elemsize == 8u)
393 crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
394 else
395 crop_pack4_neon(m, borderm, _hoffset, _woffset);
396 }
397
398 return 0;
399 }
400 }
401
402 if (dims == 4)
403 {
404 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
405 int out_elempack = opt.use_fp16_arithmetic && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
406 #else
407 int out_elempack = _outc % 4 == 0 ? 4 : 1;
408 #endif
409 size_t out_elemsize = elemsize / elempack * out_elempack;
410
411 if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
412 {
413 top_blob = bottom_blob;
414 return 0;
415 }
416
417 if (_coffset % 4 == 0 && out_elempack == 4)
418 {
419 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
420
421 if (_outw == w && _outh == h && _outd == d)
422 {
423 top_blob = bottom_blob_sliced.clone();
424 if (top_blob.empty())
425 return -100;
426 }
427
428 top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
429 if (top_blob.empty())
430 return -100;
431
432 #pragma omp parallel for num_threads(opt.num_threads)
433 for (int q = 0; q < top_blob.c; q++)
434 {
435 for (int z = 0; z < _outd; z++)
436 {
437 const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
438 Mat borderm = top_blob.channel(q).depth(z);
439
440 if (elemsize == 8u)
441 crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
442 else
443 crop_pack4_neon(m, borderm, _hoffset, _woffset);
444 }
445 }
446
447 return 0;
448 }
449 }
450 }
451 #endif // __ARM_NEON
452
453 Mat bottom_blob_unpacked = bottom_blob;
454 if (elempack != 1)
455 {
456 Option opt_pack1 = opt;
457 opt_pack1.blob_allocator = opt.workspace_allocator;
458
459 convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
460 }
461
462 return Crop::forward(bottom_blob_unpacked, top_blob, opt);
463 }
464
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const465 int Crop_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
466 {
467 const Mat& bottom_blob = bottom_blobs[0];
468 const Mat& reference_blob = bottom_blobs[1];
469
470 int w = bottom_blob.w;
471 int h = bottom_blob.h;
472 int d = bottom_blob.d;
473 int channels = bottom_blob.c;
474 int dims = bottom_blob.dims;
475 size_t elemsize = bottom_blob.elemsize;
476 int elempack = bottom_blob.elempack;
477
478 int ref_elempack = reference_blob.elempack;
479
480 Mat& top_blob = top_blobs[0];
481
482 #if __ARM_NEON
483 if (elempack == 8)
484 {
485 int _woffset, _hoffset, _doffset, _coffset;
486 int _outw, _outh, _outd, _outc;
487 if (woffset == -233)
488 {
489 resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
490 }
491 else
492 {
493 resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
494 }
495
496 if (dims == 1)
497 {
498 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
499 size_t out_elemsize = elemsize / elempack * out_elempack;
500
501 if (_outw / out_elempack == w && out_elempack == 8)
502 {
503 top_blob = bottom_blob;
504 return 0;
505 }
506
507 if (_woffset % 8 == 0 && out_elempack == 8)
508 {
509 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
510 if (top_blob.empty())
511 return -100;
512
513 if (elemsize == 16u)
514 crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
515 else
516 crop_pack8_neon(bottom_blob, top_blob, 0, _woffset / elempack);
517
518 return 0;
519 }
520 }
521
522 if (dims == 2)
523 {
524 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
525 size_t out_elemsize = elemsize / elempack * out_elempack;
526
527 if (_outw == w && _outh / out_elempack == h && out_elempack == 8)
528 {
529 top_blob = bottom_blob;
530 return 0;
531 }
532
533 if (_hoffset % 8 == 0 && out_elempack == 8)
534 {
535 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
536 if (top_blob.empty())
537 return -100;
538
539 if (elemsize == 16u)
540 crop_pack8_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
541 else
542 crop_pack8_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
543
544 return 0;
545 }
546 }
547
548 if (dims == 3)
549 {
550 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
551 size_t out_elemsize = elemsize / elempack * out_elempack;
552
553 if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 8)
554 {
555 top_blob = bottom_blob;
556 return 0;
557 }
558
559 if (_coffset % 8 == 0 && out_elempack == 8)
560 {
561 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
562
563 if (_outw == w && _outh == h)
564 {
565 top_blob = bottom_blob_sliced.clone();
566 if (top_blob.empty())
567 return -100;
568 }
569
570 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
571 if (top_blob.empty())
572 return -100;
573
574 #pragma omp parallel for num_threads(opt.num_threads)
575 for (int q = 0; q < top_blob.c; q++)
576 {
577 const Mat m = bottom_blob_sliced.channel(q);
578 Mat borderm = top_blob.channel(q);
579
580 if (elemsize == 16u)
581 crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
582 else
583 crop_pack8_neon(m, borderm, _hoffset, _woffset);
584 }
585
586 return 0;
587 }
588 }
589
590 if (dims == 4)
591 {
592 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
593 size_t out_elemsize = elemsize / elempack * out_elempack;
594
595 if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 8)
596 {
597 top_blob = bottom_blob;
598 return 0;
599 }
600
601 if (_coffset % 8 == 0 && out_elempack == 8)
602 {
603 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
604
605 if (_outw == w && _outh == h && _outd == d)
606 {
607 top_blob = bottom_blob_sliced.clone();
608 if (top_blob.empty())
609 return -100;
610 }
611
612 top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
613 if (top_blob.empty())
614 return -100;
615
616 #pragma omp parallel for num_threads(opt.num_threads)
617 for (int q = 0; q < top_blob.c; q++)
618 {
619 for (int z = 0; z < _outd; z++)
620 {
621 const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
622 Mat borderm = top_blob.channel(q).depth(z);
623
624 if (elemsize == 16u)
625 crop_pack8_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
626 else
627 crop_pack8_neon(m, borderm, _hoffset, _woffset);
628 }
629 }
630
631 return 0;
632 }
633 }
634 }
635
636 if (elempack == 4)
637 {
638 int _woffset, _hoffset, _doffset, _coffset;
639 int _outw, _outh, _outd, _outc;
640 if (woffset == -233)
641 {
642 resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
643 }
644 else
645 {
646 resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _doffset, _coffset, _outw, _outh, _outd, _outc);
647 }
648
649 if (dims == 1)
650 {
651 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
652 int out_elempack = opt.use_fp16_arithmetic && _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
653 #else
654 int out_elempack = _outw % 4 == 0 ? 4 : 1;
655 #endif
656 size_t out_elemsize = elemsize / elempack * out_elempack;
657
658 if (_outw / out_elempack == w && out_elempack == 4)
659 {
660 top_blob = bottom_blob;
661 return 0;
662 }
663
664 if (_woffset % 4 == 0 && out_elempack == 4)
665 {
666 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
667 if (top_blob.empty())
668 return -100;
669
670 if (elemsize == 8u)
671 crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, 0, _woffset / elempack);
672 else
673 crop_pack4_neon(bottom_blob, top_blob, 0, _woffset / elempack);
674
675 return 0;
676 }
677 }
678
679 if (dims == 2)
680 {
681 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
682 int out_elempack = opt.use_fp16_arithmetic && _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
683 #else
684 int out_elempack = _outh % 4 == 0 ? 4 : 1;
685 #endif
686 size_t out_elemsize = elemsize / elempack * out_elempack;
687
688 if (_outw == w && _outh / out_elempack == h && out_elempack == 4)
689 {
690 top_blob = bottom_blob;
691 return 0;
692 }
693
694 if (_hoffset % 4 == 0 && out_elempack == 4)
695 {
696 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
697 if (top_blob.empty())
698 return -100;
699
700 if (elemsize == 8u)
701 crop_pack4_bf16_fp16s_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
702 else
703 crop_pack4_neon(bottom_blob, top_blob, _hoffset / elempack, _woffset);
704
705 return 0;
706 }
707 }
708
709 if (dims == 3)
710 {
711 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
712 int out_elempack = opt.use_fp16_arithmetic && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
713 #else
714 int out_elempack = _outc % 4 == 0 ? 4 : 1;
715 #endif
716 size_t out_elemsize = elemsize / elempack * out_elempack;
717
718 if (_outw == w && _outh == h && _outc / out_elempack == channels && out_elempack == 4)
719 {
720 top_blob = bottom_blob;
721 return 0;
722 }
723
724 if (_coffset % 4 == 0 && out_elempack == 4)
725 {
726 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
727
728 if (_outw == w && _outh == h)
729 {
730 top_blob = bottom_blob_sliced.clone();
731 if (top_blob.empty())
732 return -100;
733 }
734
735 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
736 if (top_blob.empty())
737 return -100;
738
739 #pragma omp parallel for num_threads(opt.num_threads)
740 for (int q = 0; q < top_blob.c; q++)
741 {
742 const Mat m = bottom_blob_sliced.channel(q);
743 Mat borderm = top_blob.channel(q);
744
745 if (elemsize == 8u)
746 crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
747 else
748 crop_pack4_neon(m, borderm, _hoffset, _woffset);
749 }
750
751 return 0;
752 }
753 }
754
755 if (dims == 4)
756 {
757 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
758 int out_elempack = opt.use_fp16_arithmetic && _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
759 #else
760 int out_elempack = _outc % 4 == 0 ? 4 : 1;
761 #endif
762 size_t out_elemsize = elemsize / elempack * out_elempack;
763
764 if (_outw == w && _outh == h && _outd == d && _outc / out_elempack == channels && out_elempack == 4)
765 {
766 top_blob = bottom_blob;
767 return 0;
768 }
769
770 if (_coffset % 4 == 0 && out_elempack == 4)
771 {
772 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
773
774 if (_outw == w && _outh == h && _outd == d)
775 {
776 top_blob = bottom_blob_sliced.clone();
777 if (top_blob.empty())
778 return -100;
779 }
780
781 top_blob.create(_outw, _outh, _outd, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
782 if (top_blob.empty())
783 return -100;
784
785 #pragma omp parallel for num_threads(opt.num_threads)
786 for (int q = 0; q < top_blob.c; q++)
787 {
788 for (int z = 0; z < _outd; z++)
789 {
790 const Mat m = bottom_blob_sliced.channel(q).depth(z + _doffset);
791 Mat borderm = top_blob.channel(q).depth(z);
792
793 if (elemsize == 8u)
794 crop_pack4_bf16_fp16s_neon(m, borderm, _hoffset, _woffset);
795 else
796 crop_pack4_neon(m, borderm, _hoffset, _woffset);
797 }
798 }
799
800 return 0;
801 }
802 }
803 }
804 #endif // __ARM_NEON
805
806 Mat bottom_blob_unpacked = bottom_blob;
807 if (elempack != 1)
808 {
809 Option opt_pack1 = opt;
810 opt_pack1.blob_allocator = opt.workspace_allocator;
811
812 convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
813 }
814
815 Mat reference_blob_unpacked = reference_blob;
816 if (ref_elempack != 1)
817 {
818 Option opt_pack1 = opt;
819 opt_pack1.blob_allocator = opt.workspace_allocator;
820
821 convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1);
822 }
823
824 std::vector<Mat> bottom_blobs_unpacked(2);
825 bottom_blobs_unpacked[0] = bottom_blob_unpacked;
826 bottom_blobs_unpacked[1] = reference_blob_unpacked;
827
828 return Crop::forward(bottom_blobs_unpacked, top_blobs, opt);
829 }
830
831 } // namespace ncnn
832