1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "crop_x86.h"
16
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif
22 #endif // __SSE2__
23
24 namespace ncnn {
25
Crop_x86()26 Crop_x86::Crop_x86()
27 {
28 #if __SSE2__
29 support_packing = true;
30 #endif // __SSE2__
31 }
32
33 #if __SSE2__
34 #if __AVX__
crop_pack8_avx(const Mat & src,Mat & dst,int top,int left)35 static void crop_pack8_avx(const Mat& src, Mat& dst, int top, int left)
36 {
37 int w = dst.w;
38 int h = dst.h;
39 int right = src.w - dst.w - left;
40
41 const float* ptr = src.row(top) + left * 8;
42 float* outptr = dst;
43
44 for (int y = 0; y < h; y++)
45 {
46 for (int x = 0; x < w; x++)
47 {
48 __m256 _p = _mm256_loadu_ps(ptr);
49 _mm256_storeu_ps(outptr, _p);
50 ptr += 8;
51 outptr += 8;
52 }
53
54 ptr += (left + right) * 8;
55 }
56 }
57 #endif // __AVX__
58
crop_pack4_sse(const Mat & src,Mat & dst,int top,int left)59 static void crop_pack4_sse(const Mat& src, Mat& dst, int top, int left)
60 {
61 int w = dst.w;
62 int h = dst.h;
63 int right = src.w - dst.w - left;
64
65 const float* ptr = src.row(top) + left * 4;
66 float* outptr = dst;
67
68 for (int y = 0; y < h; y++)
69 {
70 for (int x = 0; x < w; x++)
71 {
72 __m128 _p = _mm_loadu_ps(ptr);
73 _mm_storeu_ps(outptr, _p);
74 ptr += 4;
75 outptr += 4;
76 }
77
78 ptr += (left + right) * 4;
79 }
80 }
81 #endif // __SSE2__
82
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const83 int Crop_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
84 {
85 int w = bottom_blob.w;
86 int h = bottom_blob.h;
87 int channels = bottom_blob.c;
88 int dims = bottom_blob.dims;
89 size_t elemsize = bottom_blob.elemsize;
90 int elempack = bottom_blob.elempack;
91
92 #if __SSE2__
93 #if __AVX__
94 if (elempack == 8)
95 {
96 int _woffset, _hoffset, _coffset;
97 int _outw, _outh, _outc;
98 resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
99
100 if (dims == 1)
101 {
102 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
103 size_t out_elemsize = elemsize / elempack * out_elempack;
104
105 if (_outw / out_elempack == w)
106 {
107 top_blob = bottom_blob;
108 return 0;
109 }
110
111 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
112 if (top_blob.empty())
113 return -100;
114
115 if (_woffset % 8 == 0 && out_elempack == 8)
116 {
117 crop_pack8_avx(bottom_blob, top_blob, 0, _woffset / elempack);
118
119 return 0;
120 }
121 }
122
123 if (dims == 2)
124 {
125 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
126 size_t out_elemsize = elemsize / elempack * out_elempack;
127
128 if (_outw == w && _outh / out_elempack == h)
129 {
130 top_blob = bottom_blob;
131 return 0;
132 }
133
134 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
135 if (top_blob.empty())
136 return -100;
137
138 if (_hoffset % 8 == 0 && out_elempack == 8)
139 {
140 crop_pack8_avx(bottom_blob, top_blob, _hoffset / elempack, _woffset);
141
142 return 0;
143 }
144 }
145
146 if (dims == 3)
147 {
148 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
149 size_t out_elemsize = elemsize / elempack * out_elempack;
150
151 if (_coffset % 8 == 0 && out_elempack == 8)
152 {
153 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
154
155 if (_outw == w && _outh == h)
156 {
157 top_blob = bottom_blob_sliced.clone();
158 if (top_blob.empty())
159 return -100;
160 }
161
162 if (_outw == w && _outh == h && _outc / out_elempack == channels)
163 {
164 top_blob = bottom_blob;
165 return 0;
166 }
167
168 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
169 if (top_blob.empty())
170 return -100;
171
172 #pragma omp parallel for num_threads(opt.num_threads)
173 for (int q = 0; q < top_blob.c; q++)
174 {
175 const Mat m = bottom_blob_sliced.channel(q);
176 Mat borderm = top_blob.channel(q);
177 crop_pack8_avx(m, borderm, _hoffset, _woffset);
178 }
179
180 return 0;
181 }
182 }
183 }
184 #endif // __AVX__
185
186 if (elempack == 4)
187 {
188 int _woffset, _hoffset, _coffset;
189 int _outw, _outh, _outc;
190 resolve_crop_roi(bottom_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
191
192 if (dims == 1)
193 {
194 #if __AVX__
195 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
196 #else
197 int out_elempack = _outw % 4 == 0 ? 4 : 1;
198 #endif
199 size_t out_elemsize = elemsize / elempack * out_elempack;
200
201 if (_outw / out_elempack == w)
202 {
203 top_blob = bottom_blob;
204 return 0;
205 }
206
207 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
208 if (top_blob.empty())
209 return -100;
210
211 if (_woffset % 4 == 0 && out_elempack == 4)
212 {
213 crop_pack4_sse(bottom_blob, top_blob, 0, _woffset / elempack);
214
215 return 0;
216 }
217 }
218
219 if (dims == 2)
220 {
221 #if __AVX__
222 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
223 #else
224 int out_elempack = _outh % 4 == 0 ? 4 : 1;
225 #endif
226 size_t out_elemsize = elemsize / elempack * out_elempack;
227
228 if (_outw == w && _outh / out_elempack == h)
229 {
230 top_blob = bottom_blob;
231 return 0;
232 }
233
234 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
235 if (top_blob.empty())
236 return -100;
237
238 if (_hoffset % 4 == 0 && out_elempack == 4)
239 {
240 crop_pack4_sse(bottom_blob, top_blob, _hoffset / elempack, _woffset);
241
242 return 0;
243 }
244 }
245
246 if (dims == 3)
247 {
248 #if __AVX__
249 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
250 #else
251 int out_elempack = _outc % 4 == 0 ? 4 : 1;
252 #endif
253 size_t out_elemsize = elemsize / elempack * out_elempack;
254
255 if (_coffset % 4 == 0 && out_elempack == 4)
256 {
257 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
258
259 if (_outw == w && _outh == h)
260 {
261 top_blob = bottom_blob_sliced.clone();
262 if (top_blob.empty())
263 return -100;
264 }
265
266 if (_outw == w && _outh == h && _outc / out_elempack == channels)
267 {
268 top_blob = bottom_blob;
269 return 0;
270 }
271
272 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
273 if (top_blob.empty())
274 return -100;
275
276 #pragma omp parallel for num_threads(opt.num_threads)
277 for (int q = 0; q < top_blob.c; q++)
278 {
279 const Mat m = bottom_blob_sliced.channel(q);
280 Mat borderm = top_blob.channel(q);
281
282 crop_pack4_sse(m, borderm, _hoffset, _woffset);
283 }
284
285 return 0;
286 }
287 }
288 }
289 #endif // __SSE2__
290
291 Mat bottom_blob_unpacked = bottom_blob;
292 if (elempack != 1)
293 {
294 Option opt_pack1 = opt;
295 opt_pack1.blob_allocator = opt.workspace_allocator;
296
297 convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
298 }
299
300 return Crop::forward(bottom_blob_unpacked, top_blob, opt);
301 }
302
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const303 int Crop_x86::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
304 {
305 const Mat& bottom_blob = bottom_blobs[0];
306 const Mat& reference_blob = bottom_blobs[1];
307
308 int w = bottom_blob.w;
309 int h = bottom_blob.h;
310 int channels = bottom_blob.c;
311 int dims = bottom_blob.dims;
312 size_t elemsize = bottom_blob.elemsize;
313 int elempack = bottom_blob.elempack;
314
315 int ref_elempack = reference_blob.elempack;
316
317 Mat& top_blob = top_blobs[0];
318
319 #if __SSE2__
320 #if __AVX__
321 if (elempack == 8)
322 {
323 int _woffset, _hoffset, _coffset;
324 int _outw, _outh, _outc;
325 if (woffset == -233)
326 {
327 resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _coffset, _outw, _outh, _outc);
328 }
329 else
330 {
331 resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
332 }
333
334 if (dims == 1)
335 {
336 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
337 size_t out_elemsize = elemsize / elempack * out_elempack;
338
339 if (_outw / out_elempack == w)
340 {
341 top_blob = bottom_blob;
342 return 0;
343 }
344
345 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
346 if (top_blob.empty())
347 return -100;
348
349 if (_woffset % 8 == 0 && out_elempack == 8)
350 {
351 crop_pack8_avx(bottom_blob, top_blob, 0, _woffset / elempack);
352
353 return 0;
354 }
355 }
356
357 if (dims == 2)
358 {
359 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
360 size_t out_elemsize = elemsize / elempack * out_elempack;
361
362 if (_outw == w && _outh / out_elempack == h)
363 {
364 top_blob = bottom_blob;
365 return 0;
366 }
367
368 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
369 if (top_blob.empty())
370 return -100;
371
372 if (_hoffset % 8 == 0 && out_elempack == 8)
373 {
374 crop_pack8_avx(bottom_blob, top_blob, _hoffset / elempack, _woffset);
375
376 return 0;
377 }
378 }
379
380 if (dims == 3)
381 {
382 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
383 size_t out_elemsize = elemsize / elempack * out_elempack;
384
385 if (_coffset % 8 == 0 && out_elempack == 8)
386 {
387 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
388
389 if (_outw == w && _outh == h)
390 {
391 top_blob = bottom_blob_sliced.clone();
392 if (top_blob.empty())
393 return -100;
394 }
395
396 if (_outw == w && _outh == h && _outc / out_elempack == channels)
397 {
398 top_blob = bottom_blob;
399 return 0;
400 }
401
402 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
403 if (top_blob.empty())
404 return -100;
405
406 #pragma omp parallel for num_threads(opt.num_threads)
407 for (int q = 0; q < top_blob.c; q++)
408 {
409 const Mat m = bottom_blob_sliced.channel(q);
410 Mat borderm = top_blob.channel(q);
411 crop_pack8_avx(m, borderm, _hoffset, _woffset);
412 }
413
414 return 0;
415 }
416 }
417 }
418 #endif // __AVX__
419
420 if (elempack == 4)
421 {
422 int _woffset, _hoffset, _coffset;
423 int _outw, _outh, _outc;
424 if (woffset == -233)
425 {
426 resolve_crop_roi(bottom_blob.shape(), (const int*)reference_blob, _woffset, _hoffset, _coffset, _outw, _outh, _outc);
427 }
428 else
429 {
430 resolve_crop_roi(bottom_blob.shape(), reference_blob.shape(), _woffset, _hoffset, _coffset, _outw, _outh, _outc);
431 }
432
433 if (dims == 1)
434 {
435 #if __AVX__
436 int out_elempack = _outw % 8 == 0 ? 8 : _outw % 4 == 0 ? 4 : 1;
437 #else
438 int out_elempack = _outw % 4 == 0 ? 4 : 1;
439 #endif
440 size_t out_elemsize = elemsize / elempack * out_elempack;
441
442 if (_outw / out_elempack == w)
443 {
444 top_blob = bottom_blob;
445 return 0;
446 }
447
448 top_blob.create(_outw / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
449 if (top_blob.empty())
450 return -100;
451
452 if (_woffset % 4 == 0 && out_elempack == 4)
453 {
454 crop_pack4_sse(bottom_blob, top_blob, 0, _woffset / elempack);
455
456 return 0;
457 }
458 }
459
460 if (dims == 2)
461 {
462 #if __AVX__
463 int out_elempack = _outh % 8 == 0 ? 8 : _outh % 4 == 0 ? 4 : 1;
464 #else
465 int out_elempack = _outh % 4 == 0 ? 4 : 1;
466 #endif
467 size_t out_elemsize = elemsize / elempack * out_elempack;
468
469 if (_outw == w && _outh / out_elempack == h)
470 {
471 top_blob = bottom_blob;
472 return 0;
473 }
474
475 top_blob.create(_outw, _outh / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
476 if (top_blob.empty())
477 return -100;
478
479 if (_hoffset % 4 == 0 && out_elempack == 4)
480 {
481 crop_pack4_sse(bottom_blob, top_blob, _hoffset / elempack, _woffset);
482
483 return 0;
484 }
485 }
486
487 if (dims == 3)
488 {
489 #if __AVX__
490 int out_elempack = _outc % 8 == 0 ? 8 : _outc % 4 == 0 ? 4 : 1;
491 #else
492 int out_elempack = _outc % 4 == 0 ? 4 : 1;
493 #endif
494 size_t out_elemsize = elemsize / elempack * out_elempack;
495
496 if (_coffset % 4 == 0 && out_elempack == 4)
497 {
498 const Mat bottom_blob_sliced = bottom_blob.channel_range(_coffset / out_elempack, _outc / out_elempack);
499
500 if (_outw == w && _outh == h)
501 {
502 top_blob = bottom_blob_sliced.clone();
503 if (top_blob.empty())
504 return -100;
505 }
506
507 if (_outw == w && _outh == h && _outc / out_elempack == channels)
508 {
509 top_blob = bottom_blob;
510 return 0;
511 }
512
513 top_blob.create(_outw, _outh, _outc / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
514 if (top_blob.empty())
515 return -100;
516
517 #pragma omp parallel for num_threads(opt.num_threads)
518 for (int q = 0; q < top_blob.c; q++)
519 {
520 const Mat m = bottom_blob_sliced.channel(q);
521 Mat borderm = top_blob.channel(q);
522
523 crop_pack4_sse(m, borderm, _hoffset, _woffset);
524 }
525
526 return 0;
527 }
528 }
529 }
530 #endif // __SSE2__
531
532 Mat bottom_blob_unpacked = bottom_blob;
533 if (elempack != 1)
534 {
535 Option opt_pack1 = opt;
536 opt_pack1.blob_allocator = opt.workspace_allocator;
537
538 convert_packing(bottom_blob, bottom_blob_unpacked, 1, opt_pack1);
539 }
540
541 Mat reference_blob_unpacked = reference_blob;
542 if (ref_elempack != 1)
543 {
544 Option opt_pack1 = opt;
545 opt_pack1.blob_allocator = opt.workspace_allocator;
546
547 convert_packing(reference_blob, reference_blob_unpacked, 1, opt_pack1);
548 }
549
550 std::vector<Mat> bottom_blobs_unpacked(2);
551 bottom_blobs_unpacked[0] = bottom_blob_unpacked;
552 bottom_blobs_unpacked[1] = reference_blob_unpacked;
553
554 return Crop::forward(bottom_blobs_unpacked, top_blobs, opt);
555 }
556
557 } // namespace ncnn
558