1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "concat_arm.h"
16
17 namespace ncnn {
18
Concat_arm()19 Concat_arm::Concat_arm()
20 {
21 #if __ARM_NEON
22 support_packing = true;
23 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
24 support_fp16_storage = true;
25 #endif
26 #endif // __ARM_NEON
27
28 support_bf16_storage = true;
29 }
30
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const31 int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
32 {
33 int elembits = bottom_blobs[0].elembits();
34
35 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
36 if (opt.use_fp16_storage && elembits == 16)
37 return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
38 #endif
39
40 if (opt.use_bf16_storage && elembits == 16)
41 return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
42
43 int dims = bottom_blobs[0].dims;
44 int positive_axis = axis < 0 ? dims + axis : axis;
45
46 if (dims == 1) // positive_axis == 0
47 {
48 // concat vector
49 // total length
50 size_t elemsize = bottom_blobs[0].elemsize;
51 int elempack = bottom_blobs[0].elempack;
52 int top_w = 0;
53 for (size_t b = 0; b < bottom_blobs.size(); b++)
54 {
55 const Mat& bottom_blob = bottom_blobs[b];
56 top_w += bottom_blob.w * bottom_blob.elempack;
57 }
58
59 int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
60 size_t out_elemsize = elemsize / elempack * out_elempack;
61
62 Mat& top_blob = top_blobs[0];
63 top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
64 if (top_blob.empty())
65 return -100;
66
67 float* outptr = top_blob;
68 for (size_t b = 0; b < bottom_blobs.size(); b++)
69 {
70 const Mat& bottom_blob = bottom_blobs[b];
71
72 const float* ptr = bottom_blob;
73 memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
74
75 outptr += bottom_blob.w * bottom_blob.elempack;
76 }
77 }
78
79 if (dims == 2 && positive_axis == 0)
80 {
81 // concat image
82 int w = bottom_blobs[0].w;
83
84 // total height
85 size_t elemsize = bottom_blobs[0].elemsize;
86 int elempack = bottom_blobs[0].elempack;
87 int top_h = 0;
88 for (size_t b = 0; b < bottom_blobs.size(); b++)
89 {
90 const Mat& bottom_blob = bottom_blobs[b];
91 elemsize = std::min(elemsize, bottom_blob.elemsize);
92 elempack = std::min(elempack, bottom_blob.elempack);
93 top_h += bottom_blob.h * bottom_blob.elempack;
94 }
95
96 int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
97 size_t out_elemsize = elemsize / elempack * out_elempack;
98
99 Mat& top_blob = top_blobs[0];
100 top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
101 if (top_blob.empty())
102 return -100;
103
104 Mat top_blob_unpacked = top_blob;
105 if (elempack < out_elempack)
106 {
107 top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
108 if (top_blob_unpacked.empty())
109 return -100;
110 }
111
112 float* outptr = top_blob_unpacked;
113 for (size_t b = 0; b < bottom_blobs.size(); b++)
114 {
115 const Mat& bottom_blob = bottom_blobs[b];
116
117 if (bottom_blob.elempack == 4 && elempack == 1)
118 {
119 for (int i = 0; i < bottom_blob.h; i++)
120 {
121 const float* r0 = bottom_blob.row(i);
122
123 float* outptr0 = outptr;
124 float* outptr1 = outptr + w;
125 float* outptr2 = outptr + w * 2;
126 float* outptr3 = outptr + w * 3;
127
128 for (int j = 0; j < w; j++)
129 {
130 *outptr0++ = r0[0];
131 *outptr1++ = r0[1];
132 *outptr2++ = r0[2];
133 *outptr3++ = r0[3];
134
135 r0 += 4;
136 }
137
138 outptr += w * 4;
139 }
140 }
141 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
142 {
143 int size = w * bottom_blob.h;
144
145 const float* ptr = bottom_blob;
146 memcpy(outptr, ptr, size * bottom_blob.elemsize);
147
148 outptr += size * bottom_blob.elempack;
149 }
150 }
151
152 // packing
153 if (elempack < out_elempack)
154 {
155 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
156 }
157 }
158
159 if (dims == 2 && positive_axis == 1)
160 {
161 // interleave image row
162 int h = bottom_blobs[0].h;
163 size_t elemsize = bottom_blobs[0].elemsize;
164 int elempack = bottom_blobs[0].elempack;
165
166 // total width
167 int top_w = 0;
168 for (size_t b = 0; b < bottom_blobs.size(); b++)
169 {
170 const Mat& bottom_blob = bottom_blobs[b];
171 top_w += bottom_blob.w;
172 }
173
174 Mat& top_blob = top_blobs[0];
175 top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
176 if (top_blob.empty())
177 return -100;
178
179 #pragma omp parallel for num_threads(opt.num_threads)
180 for (int i = 0; i < h; i++)
181 {
182 float* outptr = top_blob.row(i);
183 for (size_t b = 0; b < bottom_blobs.size(); b++)
184 {
185 const Mat& bottom_blob = bottom_blobs[b];
186
187 const float* ptr = bottom_blob.row(i);
188 memcpy(outptr, ptr, bottom_blob.w * elemsize);
189
190 outptr += bottom_blob.w * elempack;
191 }
192 }
193 }
194
195 if (dims == 3 && positive_axis == 0)
196 {
197 // concat dim
198 int w = bottom_blobs[0].w;
199 int h = bottom_blobs[0].h;
200
201 // total channels
202 size_t elemsize = bottom_blobs[0].elemsize;
203 int elempack = bottom_blobs[0].elempack;
204 int top_channels = 0;
205 for (size_t b = 0; b < bottom_blobs.size(); b++)
206 {
207 const Mat& bottom_blob = bottom_blobs[b];
208 elemsize = std::min(elemsize, bottom_blob.elemsize);
209 elempack = std::min(elempack, bottom_blob.elempack);
210 top_channels += bottom_blob.c * bottom_blob.elempack;
211 }
212
213 int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
214 size_t out_elemsize = elemsize / elempack * out_elempack;
215
216 Mat& top_blob = top_blobs[0];
217 top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
218 if (top_blob.empty())
219 return -100;
220
221 Mat top_blob_unpacked = top_blob;
222 if (elempack < out_elempack)
223 {
224 top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
225 if (top_blob_unpacked.empty())
226 return -100;
227 }
228
229 int p = 0;
230 for (size_t b = 0; b < bottom_blobs.size(); b++)
231 {
232 const Mat& bottom_blob = bottom_blobs[b];
233
234 if (bottom_blob.elempack == 4 && elempack == 1)
235 {
236 int size = bottom_blob.w * bottom_blob.h;
237
238 for (int q = 0; q < bottom_blob.c; q++)
239 {
240 const float* r0 = bottom_blob.channel(q);
241
242 float* outptr0 = top_blob_unpacked.channel(p);
243 float* outptr1 = top_blob_unpacked.channel(p + 1);
244 float* outptr2 = top_blob_unpacked.channel(p + 2);
245 float* outptr3 = top_blob_unpacked.channel(p + 3);
246
247 for (int i = 0; i < size; i++)
248 {
249 *outptr0++ = r0[0];
250 *outptr1++ = r0[1];
251 *outptr2++ = r0[2];
252 *outptr3++ = r0[3];
253
254 r0 += 4;
255 }
256
257 p += 4;
258 }
259 }
260 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
261 {
262 int size = bottom_blob.total();
263
264 const float* ptr = bottom_blob;
265 float* outptr = top_blob_unpacked.channel(p);
266 memcpy(outptr, ptr, size * bottom_blob.elemsize);
267
268 p += bottom_blob.c;
269 }
270 }
271
272 // packing
273 if (elempack < out_elempack)
274 {
275 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
276 }
277 }
278
279 if (dims == 3 && positive_axis == 1)
280 {
281 // interleave dim height
282 int w = bottom_blobs[0].w;
283 int channels = bottom_blobs[0].c;
284 size_t elemsize = bottom_blobs[0].elemsize;
285 int elempack = bottom_blobs[0].elempack;
286
287 // total height
288 int top_h = 0;
289 for (size_t b = 0; b < bottom_blobs.size(); b++)
290 {
291 const Mat& bottom_blob = bottom_blobs[b];
292 top_h += bottom_blob.h;
293 }
294
295 Mat& top_blob = top_blobs[0];
296 top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
297 if (top_blob.empty())
298 return -100;
299
300 #pragma omp parallel for num_threads(opt.num_threads)
301 for (int q = 0; q < channels; q++)
302 {
303 float* outptr = top_blob.channel(q);
304
305 for (size_t b = 0; b < bottom_blobs.size(); b++)
306 {
307 const Mat& bottom_blob = bottom_blobs[b];
308
309 int size = bottom_blob.w * bottom_blob.h;
310
311 const float* ptr = bottom_blob.channel(q);
312 memcpy(outptr, ptr, size * elemsize);
313
314 outptr += size * elempack;
315 }
316 }
317 }
318
319 if (dims == 3 && positive_axis == 2)
320 {
321 // interleave dim width
322 int h = bottom_blobs[0].h;
323 int channels = bottom_blobs[0].c;
324 size_t elemsize = bottom_blobs[0].elemsize;
325 int elempack = bottom_blobs[0].elempack;
326
327 // total height
328 int top_w = 0;
329 for (size_t b = 0; b < bottom_blobs.size(); b++)
330 {
331 const Mat& bottom_blob = bottom_blobs[b];
332 top_w += bottom_blob.w;
333 }
334
335 Mat& top_blob = top_blobs[0];
336 top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
337 if (top_blob.empty())
338 return -100;
339
340 #pragma omp parallel for num_threads(opt.num_threads)
341 for (int q = 0; q < channels; q++)
342 {
343 float* outptr = top_blob.channel(q);
344
345 for (int i = 0; i < h; i++)
346 {
347 for (size_t b = 0; b < bottom_blobs.size(); b++)
348 {
349 const Mat& bottom_blob = bottom_blobs[b];
350
351 const float* ptr = bottom_blob.channel(q).row(i);
352 memcpy(outptr, ptr, bottom_blob.w * elemsize);
353
354 outptr += bottom_blob.w * elempack;
355 }
356 }
357 }
358 }
359
360 return 0;
361 }
362
forward_bf16s_fp16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const363 int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
364 {
365 int dims = bottom_blobs[0].dims;
366 int positive_axis = axis < 0 ? dims + axis : axis;
367
368 if (dims == 1) // positive_axis == 0
369 {
370 // concat vector
371 // total length
372 size_t elemsize = bottom_blobs[0].elemsize;
373 int elempack = bottom_blobs[0].elempack;
374 int top_w = 0;
375 for (size_t b = 0; b < bottom_blobs.size(); b++)
376 {
377 const Mat& bottom_blob = bottom_blobs[b];
378 top_w += bottom_blob.w * bottom_blob.elempack;
379 }
380
381 int out_elempack = 1;
382 if (opt.use_packing_layout)
383 {
384 out_elempack = opt.use_fp16_arithmetic && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1;
385 }
386 size_t out_elemsize = elemsize / elempack * out_elempack;
387
388 Mat& top_blob = top_blobs[0];
389 top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
390 if (top_blob.empty())
391 return -100;
392
393 unsigned short* outptr = top_blob;
394 for (size_t b = 0; b < bottom_blobs.size(); b++)
395 {
396 const Mat& bottom_blob = bottom_blobs[b];
397
398 const unsigned short* ptr = bottom_blob;
399 memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
400
401 outptr += bottom_blob.w * bottom_blob.elempack;
402 }
403 }
404
405 if (dims == 2 && positive_axis == 0)
406 {
407 // concat image
408 int w = bottom_blobs[0].w;
409
410 // total height
411 size_t elemsize = bottom_blobs[0].elemsize;
412 int elempack = bottom_blobs[0].elempack;
413 int top_h = 0;
414 for (size_t b = 0; b < bottom_blobs.size(); b++)
415 {
416 const Mat& bottom_blob = bottom_blobs[b];
417 elemsize = std::min(elemsize, bottom_blob.elemsize);
418 elempack = std::min(elempack, bottom_blob.elempack);
419 top_h += bottom_blob.h * bottom_blob.elempack;
420 }
421
422 int out_elempack = 1;
423 if (opt.use_packing_layout)
424 {
425 out_elempack = opt.use_fp16_arithmetic && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1;
426 }
427 size_t out_elemsize = elemsize / elempack * out_elempack;
428
429 Mat& top_blob = top_blobs[0];
430 top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
431 if (top_blob.empty())
432 return -100;
433
434 Mat top_blob_unpacked = top_blob;
435 if (elempack < out_elempack)
436 {
437 top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
438 if (top_blob_unpacked.empty())
439 return -100;
440 }
441
442 unsigned short* outptr = top_blob_unpacked;
443 for (size_t b = 0; b < bottom_blobs.size(); b++)
444 {
445 const Mat& bottom_blob = bottom_blobs[b];
446
447 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
448 if (bottom_blob.elempack == 8 && elempack == 4)
449 {
450 for (int i = 0; i < bottom_blob.h; i++)
451 {
452 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
453
454 unsigned short* outptr0 = outptr;
455 unsigned short* outptr1 = outptr + w * 4;
456
457 for (int j = 0; j < w; j++)
458 {
459 outptr0[0] = r0[0];
460 outptr0[1] = r0[1];
461 outptr0[2] = r0[2];
462 outptr0[3] = r0[3];
463 outptr1[0] = r0[4];
464 outptr1[1] = r0[5];
465 outptr1[2] = r0[6];
466 outptr1[3] = r0[7];
467
468 outptr0 += 4;
469 outptr1 += 4;
470 r0 += 8;
471 }
472
473 outptr += w * 8;
474 }
475 }
476 if (bottom_blob.elempack == 8 && elempack == 1)
477 {
478 for (int i = 0; i < bottom_blob.h; i++)
479 {
480 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
481
482 unsigned short* outptr0 = outptr;
483 unsigned short* outptr1 = outptr + w;
484 unsigned short* outptr2 = outptr + w * 2;
485 unsigned short* outptr3 = outptr + w * 3;
486 unsigned short* outptr4 = outptr + w * 4;
487 unsigned short* outptr5 = outptr + w * 5;
488 unsigned short* outptr6 = outptr + w * 6;
489 unsigned short* outptr7 = outptr + w * 7;
490
491 for (int j = 0; j < w; j++)
492 {
493 *outptr0++ = r0[0];
494 *outptr1++ = r0[1];
495 *outptr2++ = r0[2];
496 *outptr3++ = r0[3];
497 *outptr4++ = r0[4];
498 *outptr5++ = r0[5];
499 *outptr6++ = r0[6];
500 *outptr7++ = r0[7];
501
502 r0 += 8;
503 }
504
505 outptr += w * 8;
506 }
507 }
508 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
509 if (bottom_blob.elempack == 4 && elempack == 1)
510 {
511 for (int i = 0; i < bottom_blob.h; i++)
512 {
513 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
514
515 unsigned short* outptr0 = outptr;
516 unsigned short* outptr1 = outptr + w;
517 unsigned short* outptr2 = outptr + w * 2;
518 unsigned short* outptr3 = outptr + w * 3;
519
520 for (int j = 0; j < w; j++)
521 {
522 *outptr0++ = r0[0];
523 *outptr1++ = r0[1];
524 *outptr2++ = r0[2];
525 *outptr3++ = r0[3];
526
527 r0 += 4;
528 }
529
530 outptr += w * 4;
531 }
532 }
533 if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
534 {
535 int size = w * bottom_blob.h;
536
537 const unsigned short* ptr = bottom_blob;
538 memcpy(outptr, ptr, size * bottom_blob.elemsize);
539
540 outptr += size * bottom_blob.elempack;
541 }
542 }
543
544 // packing
545 if (elempack < out_elempack)
546 {
547 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
548 }
549 }
550
551 if (dims == 2 && positive_axis == 1)
552 {
553 // interleave image row
554 int h = bottom_blobs[0].h;
555 size_t elemsize = bottom_blobs[0].elemsize;
556 int elempack = bottom_blobs[0].elempack;
557
558 // total width
559 int top_w = 0;
560 for (size_t b = 0; b < bottom_blobs.size(); b++)
561 {
562 const Mat& bottom_blob = bottom_blobs[b];
563 top_w += bottom_blob.w;
564 }
565
566 Mat& top_blob = top_blobs[0];
567 top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
568 if (top_blob.empty())
569 return -100;
570
571 #pragma omp parallel for num_threads(opt.num_threads)
572 for (int i = 0; i < h; i++)
573 {
574 unsigned short* outptr = top_blob.row<unsigned short>(i);
575 for (size_t b = 0; b < bottom_blobs.size(); b++)
576 {
577 const Mat& bottom_blob = bottom_blobs[b];
578
579 const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
580 memcpy(outptr, ptr, bottom_blob.w * elemsize);
581
582 outptr += bottom_blob.w * elempack;
583 }
584 }
585 }
586
587 if (dims == 3 && positive_axis == 0)
588 {
589 // concat dim
590 int w = bottom_blobs[0].w;
591 int h = bottom_blobs[0].h;
592
593 // total channels
594 size_t elemsize = bottom_blobs[0].elemsize;
595 int elempack = bottom_blobs[0].elempack;
596 int top_channels = 0;
597 for (size_t b = 0; b < bottom_blobs.size(); b++)
598 {
599 const Mat& bottom_blob = bottom_blobs[b];
600 elemsize = std::min(elemsize, bottom_blob.elemsize);
601 elempack = std::min(elempack, bottom_blob.elempack);
602 top_channels += bottom_blob.c * bottom_blob.elempack;
603 }
604
605 int out_elempack = 1;
606 if (opt.use_packing_layout)
607 {
608 out_elempack = opt.use_fp16_arithmetic && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1;
609 }
610 size_t out_elemsize = elemsize / elempack * out_elempack;
611
612 Mat& top_blob = top_blobs[0];
613 top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
614 if (top_blob.empty())
615 return -100;
616
617 Mat top_blob_unpacked = top_blob;
618 if (elempack < out_elempack)
619 {
620 top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
621 if (top_blob_unpacked.empty())
622 return -100;
623 }
624
625 int p = 0;
626 for (size_t b = 0; b < bottom_blobs.size(); b++)
627 {
628 const Mat& bottom_blob = bottom_blobs[b];
629
630 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
631 if (bottom_blob.elempack == 8 && elempack == 4)
632 {
633 int size = bottom_blob.w * bottom_blob.h;
634
635 for (int q = 0; q < bottom_blob.c; q++)
636 {
637 const unsigned short* r0 = bottom_blob.channel(q);
638
639 unsigned short* outptr0 = top_blob_unpacked.channel(p);
640 unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
641
642 for (int i = 0; i < size; i++)
643 {
644 outptr0[0] = r0[0];
645 outptr0[1] = r0[1];
646 outptr0[2] = r0[2];
647 outptr0[3] = r0[3];
648 outptr1[0] = r0[4];
649 outptr1[1] = r0[5];
650 outptr1[2] = r0[6];
651 outptr1[3] = r0[7];
652
653 outptr0 += 4;
654 outptr1 += 4;
655 r0 += 8;
656 }
657
658 p += 2;
659 }
660 }
661 if (bottom_blob.elempack == 8 && elempack == 1)
662 {
663 int size = bottom_blob.w * bottom_blob.h;
664
665 for (int q = 0; q < bottom_blob.c; q++)
666 {
667 const unsigned short* r0 = bottom_blob.channel(q);
668
669 unsigned short* outptr0 = top_blob_unpacked.channel(p);
670 unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
671 unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
672 unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);
673 unsigned short* outptr4 = top_blob_unpacked.channel(p + 4);
674 unsigned short* outptr5 = top_blob_unpacked.channel(p + 5);
675 unsigned short* outptr6 = top_blob_unpacked.channel(p + 6);
676 unsigned short* outptr7 = top_blob_unpacked.channel(p + 7);
677
678 for (int i = 0; i < size; i++)
679 {
680 *outptr0++ = r0[0];
681 *outptr1++ = r0[1];
682 *outptr2++ = r0[2];
683 *outptr3++ = r0[3];
684 *outptr4++ = r0[4];
685 *outptr5++ = r0[5];
686 *outptr6++ = r0[6];
687 *outptr7++ = r0[7];
688
689 r0 += 8;
690 }
691
692 p += 8;
693 }
694 }
695 #endif
696 if (bottom_blob.elempack == 4 && elempack == 1)
697 {
698 int size = bottom_blob.w * bottom_blob.h;
699
700 for (int q = 0; q < bottom_blob.c; q++)
701 {
702 const unsigned short* r0 = bottom_blob.channel(q);
703
704 unsigned short* outptr0 = top_blob_unpacked.channel(p);
705 unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
706 unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
707 unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);
708
709 for (int i = 0; i < size; i++)
710 {
711 *outptr0++ = r0[0];
712 *outptr1++ = r0[1];
713 *outptr2++ = r0[2];
714 *outptr3++ = r0[3];
715
716 r0 += 4;
717 }
718
719 p += 4;
720 }
721 }
722 if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
723 {
724 int size = bottom_blob.total();
725
726 const unsigned short* ptr = bottom_blob;
727 unsigned short* outptr = top_blob_unpacked.channel(p);
728 memcpy(outptr, ptr, size * bottom_blob.elemsize);
729
730 p += bottom_blob.c;
731 }
732 }
733
734 // packing
735 if (elempack < out_elempack)
736 {
737 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
738 }
739 }
740
741 if (dims == 3 && positive_axis == 1)
742 {
743 // interleave dim height
744 int w = bottom_blobs[0].w;
745 int channels = bottom_blobs[0].c;
746 size_t elemsize = bottom_blobs[0].elemsize;
747 int elempack = bottom_blobs[0].elempack;
748
749 // total height
750 int top_h = 0;
751 for (size_t b = 0; b < bottom_blobs.size(); b++)
752 {
753 const Mat& bottom_blob = bottom_blobs[b];
754 top_h += bottom_blob.h;
755 }
756
757 Mat& top_blob = top_blobs[0];
758 top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
759 if (top_blob.empty())
760 return -100;
761
762 #pragma omp parallel for num_threads(opt.num_threads)
763 for (int q = 0; q < channels; q++)
764 {
765 unsigned short* outptr = top_blob.channel(q);
766
767 for (size_t b = 0; b < bottom_blobs.size(); b++)
768 {
769 const Mat& bottom_blob = bottom_blobs[b];
770
771 int size = bottom_blob.w * bottom_blob.h;
772
773 const unsigned short* ptr = bottom_blob.channel(q);
774 memcpy(outptr, ptr, size * elemsize);
775
776 outptr += size * elempack;
777 }
778 }
779 }
780
781 if (dims == 3 && positive_axis == 2)
782 {
783 // interleave dim width
784 int h = bottom_blobs[0].h;
785 int channels = bottom_blobs[0].c;
786 size_t elemsize = bottom_blobs[0].elemsize;
787 int elempack = bottom_blobs[0].elempack;
788
789 // total height
790 int top_w = 0;
791 for (size_t b = 0; b < bottom_blobs.size(); b++)
792 {
793 const Mat& bottom_blob = bottom_blobs[b];
794 top_w += bottom_blob.w;
795 }
796
797 Mat& top_blob = top_blobs[0];
798 top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
799 if (top_blob.empty())
800 return -100;
801
802 #pragma omp parallel for num_threads(opt.num_threads)
803 for (int q = 0; q < channels; q++)
804 {
805 unsigned short* outptr = top_blob.channel(q);
806
807 for (int i = 0; i < h; i++)
808 {
809 for (size_t b = 0; b < bottom_blobs.size(); b++)
810 {
811 const Mat& bottom_blob = bottom_blobs[b];
812
813 const unsigned short* ptr = bottom_blob.channel(q).row<const unsigned short>(i);
814 memcpy(outptr, ptr, bottom_blob.w * elemsize);
815
816 outptr += bottom_blob.w * elempack;
817 }
818 }
819 }
820 }
821
822 return 0;
823 }
824
825 } // namespace ncnn
826