// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "concat_arm.h"

17 namespace ncnn {
18 
Concat_arm()19 Concat_arm::Concat_arm()
20 {
21 #if __ARM_NEON
22     support_packing = true;
23 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
24     support_fp16_storage = true;
25 #endif
26 #endif // __ARM_NEON
27 
28     support_bf16_storage = true;
29 }
30 
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const31 int Concat_arm::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
32 {
33     int elembits = bottom_blobs[0].elembits();
34 
35 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
36     if (opt.use_fp16_storage && elembits == 16)
37         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
38 #endif
39 
40     if (opt.use_bf16_storage && elembits == 16)
41         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
42 
43     int dims = bottom_blobs[0].dims;
44     int positive_axis = axis < 0 ? dims + axis : axis;
45 
46     if (dims == 1) // positive_axis == 0
47     {
48         // concat vector
49         // total length
50         size_t elemsize = bottom_blobs[0].elemsize;
51         int elempack = bottom_blobs[0].elempack;
52         int top_w = 0;
53         for (size_t b = 0; b < bottom_blobs.size(); b++)
54         {
55             const Mat& bottom_blob = bottom_blobs[b];
56             top_w += bottom_blob.w * bottom_blob.elempack;
57         }
58 
59         int out_elempack = opt.use_packing_layout && top_w % 4 == 0 ? 4 : 1;
60         size_t out_elemsize = elemsize / elempack * out_elempack;
61 
62         Mat& top_blob = top_blobs[0];
63         top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
64         if (top_blob.empty())
65             return -100;
66 
67         float* outptr = top_blob;
68         for (size_t b = 0; b < bottom_blobs.size(); b++)
69         {
70             const Mat& bottom_blob = bottom_blobs[b];
71 
72             const float* ptr = bottom_blob;
73             memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
74 
75             outptr += bottom_blob.w * bottom_blob.elempack;
76         }
77     }
78 
79     if (dims == 2 && positive_axis == 0)
80     {
81         // concat image
82         int w = bottom_blobs[0].w;
83 
84         // total height
85         size_t elemsize = bottom_blobs[0].elemsize;
86         int elempack = bottom_blobs[0].elempack;
87         int top_h = 0;
88         for (size_t b = 0; b < bottom_blobs.size(); b++)
89         {
90             const Mat& bottom_blob = bottom_blobs[b];
91             elemsize = std::min(elemsize, bottom_blob.elemsize);
92             elempack = std::min(elempack, bottom_blob.elempack);
93             top_h += bottom_blob.h * bottom_blob.elempack;
94         }
95 
96         int out_elempack = opt.use_packing_layout && top_h % 4 == 0 ? 4 : 1;
97         size_t out_elemsize = elemsize / elempack * out_elempack;
98 
99         Mat& top_blob = top_blobs[0];
100         top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
101         if (top_blob.empty())
102             return -100;
103 
104         Mat top_blob_unpacked = top_blob;
105         if (elempack < out_elempack)
106         {
107             top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
108             if (top_blob_unpacked.empty())
109                 return -100;
110         }
111 
112         float* outptr = top_blob_unpacked;
113         for (size_t b = 0; b < bottom_blobs.size(); b++)
114         {
115             const Mat& bottom_blob = bottom_blobs[b];
116 
117             if (bottom_blob.elempack == 4 && elempack == 1)
118             {
119                 for (int i = 0; i < bottom_blob.h; i++)
120                 {
121                     const float* r0 = bottom_blob.row(i);
122 
123                     float* outptr0 = outptr;
124                     float* outptr1 = outptr + w;
125                     float* outptr2 = outptr + w * 2;
126                     float* outptr3 = outptr + w * 3;
127 
128                     for (int j = 0; j < w; j++)
129                     {
130                         *outptr0++ = r0[0];
131                         *outptr1++ = r0[1];
132                         *outptr2++ = r0[2];
133                         *outptr3++ = r0[3];
134 
135                         r0 += 4;
136                     }
137 
138                     outptr += w * 4;
139                 }
140             }
141             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
142             {
143                 int size = w * bottom_blob.h;
144 
145                 const float* ptr = bottom_blob;
146                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
147 
148                 outptr += size * bottom_blob.elempack;
149             }
150         }
151 
152         // packing
153         if (elempack < out_elempack)
154         {
155             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
156         }
157     }
158 
159     if (dims == 2 && positive_axis == 1)
160     {
161         // interleave image row
162         int h = bottom_blobs[0].h;
163         size_t elemsize = bottom_blobs[0].elemsize;
164         int elempack = bottom_blobs[0].elempack;
165 
166         // total width
167         int top_w = 0;
168         for (size_t b = 0; b < bottom_blobs.size(); b++)
169         {
170             const Mat& bottom_blob = bottom_blobs[b];
171             top_w += bottom_blob.w;
172         }
173 
174         Mat& top_blob = top_blobs[0];
175         top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
176         if (top_blob.empty())
177             return -100;
178 
179         #pragma omp parallel for num_threads(opt.num_threads)
180         for (int i = 0; i < h; i++)
181         {
182             float* outptr = top_blob.row(i);
183             for (size_t b = 0; b < bottom_blobs.size(); b++)
184             {
185                 const Mat& bottom_blob = bottom_blobs[b];
186 
187                 const float* ptr = bottom_blob.row(i);
188                 memcpy(outptr, ptr, bottom_blob.w * elemsize);
189 
190                 outptr += bottom_blob.w * elempack;
191             }
192         }
193     }
194 
195     if (dims == 3 && positive_axis == 0)
196     {
197         // concat dim
198         int w = bottom_blobs[0].w;
199         int h = bottom_blobs[0].h;
200 
201         // total channels
202         size_t elemsize = bottom_blobs[0].elemsize;
203         int elempack = bottom_blobs[0].elempack;
204         int top_channels = 0;
205         for (size_t b = 0; b < bottom_blobs.size(); b++)
206         {
207             const Mat& bottom_blob = bottom_blobs[b];
208             elemsize = std::min(elemsize, bottom_blob.elemsize);
209             elempack = std::min(elempack, bottom_blob.elempack);
210             top_channels += bottom_blob.c * bottom_blob.elempack;
211         }
212 
213         int out_elempack = opt.use_packing_layout && top_channels % 4 == 0 ? 4 : 1;
214         size_t out_elemsize = elemsize / elempack * out_elempack;
215 
216         Mat& top_blob = top_blobs[0];
217         top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
218         if (top_blob.empty())
219             return -100;
220 
221         Mat top_blob_unpacked = top_blob;
222         if (elempack < out_elempack)
223         {
224             top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
225             if (top_blob_unpacked.empty())
226                 return -100;
227         }
228 
229         int p = 0;
230         for (size_t b = 0; b < bottom_blobs.size(); b++)
231         {
232             const Mat& bottom_blob = bottom_blobs[b];
233 
234             if (bottom_blob.elempack == 4 && elempack == 1)
235             {
236                 int size = bottom_blob.w * bottom_blob.h;
237 
238                 for (int q = 0; q < bottom_blob.c; q++)
239                 {
240                     const float* r0 = bottom_blob.channel(q);
241 
242                     float* outptr0 = top_blob_unpacked.channel(p);
243                     float* outptr1 = top_blob_unpacked.channel(p + 1);
244                     float* outptr2 = top_blob_unpacked.channel(p + 2);
245                     float* outptr3 = top_blob_unpacked.channel(p + 3);
246 
247                     for (int i = 0; i < size; i++)
248                     {
249                         *outptr0++ = r0[0];
250                         *outptr1++ = r0[1];
251                         *outptr2++ = r0[2];
252                         *outptr3++ = r0[3];
253 
254                         r0 += 4;
255                     }
256 
257                     p += 4;
258                 }
259             }
260             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
261             {
262                 int size = bottom_blob.total();
263 
264                 const float* ptr = bottom_blob;
265                 float* outptr = top_blob_unpacked.channel(p);
266                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
267 
268                 p += bottom_blob.c;
269             }
270         }
271 
272         // packing
273         if (elempack < out_elempack)
274         {
275             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
276         }
277     }
278 
279     if (dims == 3 && positive_axis == 1)
280     {
281         // interleave dim height
282         int w = bottom_blobs[0].w;
283         int channels = bottom_blobs[0].c;
284         size_t elemsize = bottom_blobs[0].elemsize;
285         int elempack = bottom_blobs[0].elempack;
286 
287         // total height
288         int top_h = 0;
289         for (size_t b = 0; b < bottom_blobs.size(); b++)
290         {
291             const Mat& bottom_blob = bottom_blobs[b];
292             top_h += bottom_blob.h;
293         }
294 
295         Mat& top_blob = top_blobs[0];
296         top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
297         if (top_blob.empty())
298             return -100;
299 
300         #pragma omp parallel for num_threads(opt.num_threads)
301         for (int q = 0; q < channels; q++)
302         {
303             float* outptr = top_blob.channel(q);
304 
305             for (size_t b = 0; b < bottom_blobs.size(); b++)
306             {
307                 const Mat& bottom_blob = bottom_blobs[b];
308 
309                 int size = bottom_blob.w * bottom_blob.h;
310 
311                 const float* ptr = bottom_blob.channel(q);
312                 memcpy(outptr, ptr, size * elemsize);
313 
314                 outptr += size * elempack;
315             }
316         }
317     }
318 
319     if (dims == 3 && positive_axis == 2)
320     {
321         // interleave dim width
322         int h = bottom_blobs[0].h;
323         int channels = bottom_blobs[0].c;
324         size_t elemsize = bottom_blobs[0].elemsize;
325         int elempack = bottom_blobs[0].elempack;
326 
327         // total height
328         int top_w = 0;
329         for (size_t b = 0; b < bottom_blobs.size(); b++)
330         {
331             const Mat& bottom_blob = bottom_blobs[b];
332             top_w += bottom_blob.w;
333         }
334 
335         Mat& top_blob = top_blobs[0];
336         top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
337         if (top_blob.empty())
338             return -100;
339 
340         #pragma omp parallel for num_threads(opt.num_threads)
341         for (int q = 0; q < channels; q++)
342         {
343             float* outptr = top_blob.channel(q);
344 
345             for (int i = 0; i < h; i++)
346             {
347                 for (size_t b = 0; b < bottom_blobs.size(); b++)
348                 {
349                     const Mat& bottom_blob = bottom_blobs[b];
350 
351                     const float* ptr = bottom_blob.channel(q).row(i);
352                     memcpy(outptr, ptr, bottom_blob.w * elemsize);
353 
354                     outptr += bottom_blob.w * elempack;
355                 }
356             }
357         }
358     }
359 
360     return 0;
361 }
362 
forward_bf16s_fp16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const363 int Concat_arm::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
364 {
365     int dims = bottom_blobs[0].dims;
366     int positive_axis = axis < 0 ? dims + axis : axis;
367 
368     if (dims == 1) // positive_axis == 0
369     {
370         // concat vector
371         // total length
372         size_t elemsize = bottom_blobs[0].elemsize;
373         int elempack = bottom_blobs[0].elempack;
374         int top_w = 0;
375         for (size_t b = 0; b < bottom_blobs.size(); b++)
376         {
377             const Mat& bottom_blob = bottom_blobs[b];
378             top_w += bottom_blob.w * bottom_blob.elempack;
379         }
380 
381         int out_elempack = 1;
382         if (opt.use_packing_layout)
383         {
384             out_elempack = opt.use_fp16_arithmetic && top_w % 8 == 0 ? 8 : top_w % 4 == 0 ? 4 : 1;
385         }
386         size_t out_elemsize = elemsize / elempack * out_elempack;
387 
388         Mat& top_blob = top_blobs[0];
389         top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
390         if (top_blob.empty())
391             return -100;
392 
393         unsigned short* outptr = top_blob;
394         for (size_t b = 0; b < bottom_blobs.size(); b++)
395         {
396             const Mat& bottom_blob = bottom_blobs[b];
397 
398             const unsigned short* ptr = bottom_blob;
399             memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
400 
401             outptr += bottom_blob.w * bottom_blob.elempack;
402         }
403     }
404 
405     if (dims == 2 && positive_axis == 0)
406     {
407         // concat image
408         int w = bottom_blobs[0].w;
409 
410         // total height
411         size_t elemsize = bottom_blobs[0].elemsize;
412         int elempack = bottom_blobs[0].elempack;
413         int top_h = 0;
414         for (size_t b = 0; b < bottom_blobs.size(); b++)
415         {
416             const Mat& bottom_blob = bottom_blobs[b];
417             elemsize = std::min(elemsize, bottom_blob.elemsize);
418             elempack = std::min(elempack, bottom_blob.elempack);
419             top_h += bottom_blob.h * bottom_blob.elempack;
420         }
421 
422         int out_elempack = 1;
423         if (opt.use_packing_layout)
424         {
425             out_elempack = opt.use_fp16_arithmetic && top_h % 8 == 0 ? 8 : top_h % 4 == 0 ? 4 : 1;
426         }
427         size_t out_elemsize = elemsize / elempack * out_elempack;
428 
429         Mat& top_blob = top_blobs[0];
430         top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
431         if (top_blob.empty())
432             return -100;
433 
434         Mat top_blob_unpacked = top_blob;
435         if (elempack < out_elempack)
436         {
437             top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
438             if (top_blob_unpacked.empty())
439                 return -100;
440         }
441 
442         unsigned short* outptr = top_blob_unpacked;
443         for (size_t b = 0; b < bottom_blobs.size(); b++)
444         {
445             const Mat& bottom_blob = bottom_blobs[b];
446 
447 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
448             if (bottom_blob.elempack == 8 && elempack == 4)
449             {
450                 for (int i = 0; i < bottom_blob.h; i++)
451                 {
452                     const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
453 
454                     unsigned short* outptr0 = outptr;
455                     unsigned short* outptr1 = outptr + w * 4;
456 
457                     for (int j = 0; j < w; j++)
458                     {
459                         outptr0[0] = r0[0];
460                         outptr0[1] = r0[1];
461                         outptr0[2] = r0[2];
462                         outptr0[3] = r0[3];
463                         outptr1[0] = r0[4];
464                         outptr1[1] = r0[5];
465                         outptr1[2] = r0[6];
466                         outptr1[3] = r0[7];
467 
468                         outptr0 += 4;
469                         outptr1 += 4;
470                         r0 += 8;
471                     }
472 
473                     outptr += w * 8;
474                 }
475             }
476             if (bottom_blob.elempack == 8 && elempack == 1)
477             {
478                 for (int i = 0; i < bottom_blob.h; i++)
479                 {
480                     const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
481 
482                     unsigned short* outptr0 = outptr;
483                     unsigned short* outptr1 = outptr + w;
484                     unsigned short* outptr2 = outptr + w * 2;
485                     unsigned short* outptr3 = outptr + w * 3;
486                     unsigned short* outptr4 = outptr + w * 4;
487                     unsigned short* outptr5 = outptr + w * 5;
488                     unsigned short* outptr6 = outptr + w * 6;
489                     unsigned short* outptr7 = outptr + w * 7;
490 
491                     for (int j = 0; j < w; j++)
492                     {
493                         *outptr0++ = r0[0];
494                         *outptr1++ = r0[1];
495                         *outptr2++ = r0[2];
496                         *outptr3++ = r0[3];
497                         *outptr4++ = r0[4];
498                         *outptr5++ = r0[5];
499                         *outptr6++ = r0[6];
500                         *outptr7++ = r0[7];
501 
502                         r0 += 8;
503                     }
504 
505                     outptr += w * 8;
506                 }
507             }
508 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
509             if (bottom_blob.elempack == 4 && elempack == 1)
510             {
511                 for (int i = 0; i < bottom_blob.h; i++)
512                 {
513                     const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
514 
515                     unsigned short* outptr0 = outptr;
516                     unsigned short* outptr1 = outptr + w;
517                     unsigned short* outptr2 = outptr + w * 2;
518                     unsigned short* outptr3 = outptr + w * 3;
519 
520                     for (int j = 0; j < w; j++)
521                     {
522                         *outptr0++ = r0[0];
523                         *outptr1++ = r0[1];
524                         *outptr2++ = r0[2];
525                         *outptr3++ = r0[3];
526 
527                         r0 += 4;
528                     }
529 
530                     outptr += w * 4;
531                 }
532             }
533             if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
534             {
535                 int size = w * bottom_blob.h;
536 
537                 const unsigned short* ptr = bottom_blob;
538                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
539 
540                 outptr += size * bottom_blob.elempack;
541             }
542         }
543 
544         // packing
545         if (elempack < out_elempack)
546         {
547             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
548         }
549     }
550 
551     if (dims == 2 && positive_axis == 1)
552     {
553         // interleave image row
554         int h = bottom_blobs[0].h;
555         size_t elemsize = bottom_blobs[0].elemsize;
556         int elempack = bottom_blobs[0].elempack;
557 
558         // total width
559         int top_w = 0;
560         for (size_t b = 0; b < bottom_blobs.size(); b++)
561         {
562             const Mat& bottom_blob = bottom_blobs[b];
563             top_w += bottom_blob.w;
564         }
565 
566         Mat& top_blob = top_blobs[0];
567         top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
568         if (top_blob.empty())
569             return -100;
570 
571         #pragma omp parallel for num_threads(opt.num_threads)
572         for (int i = 0; i < h; i++)
573         {
574             unsigned short* outptr = top_blob.row<unsigned short>(i);
575             for (size_t b = 0; b < bottom_blobs.size(); b++)
576             {
577                 const Mat& bottom_blob = bottom_blobs[b];
578 
579                 const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
580                 memcpy(outptr, ptr, bottom_blob.w * elemsize);
581 
582                 outptr += bottom_blob.w * elempack;
583             }
584         }
585     }
586 
587     if (dims == 3 && positive_axis == 0)
588     {
589         // concat dim
590         int w = bottom_blobs[0].w;
591         int h = bottom_blobs[0].h;
592 
593         // total channels
594         size_t elemsize = bottom_blobs[0].elemsize;
595         int elempack = bottom_blobs[0].elempack;
596         int top_channels = 0;
597         for (size_t b = 0; b < bottom_blobs.size(); b++)
598         {
599             const Mat& bottom_blob = bottom_blobs[b];
600             elemsize = std::min(elemsize, bottom_blob.elemsize);
601             elempack = std::min(elempack, bottom_blob.elempack);
602             top_channels += bottom_blob.c * bottom_blob.elempack;
603         }
604 
605         int out_elempack = 1;
606         if (opt.use_packing_layout)
607         {
608             out_elempack = opt.use_fp16_arithmetic && top_channels % 8 == 0 ? 8 : top_channels % 4 == 0 ? 4 : 1;
609         }
610         size_t out_elemsize = elemsize / elempack * out_elempack;
611 
612         Mat& top_blob = top_blobs[0];
613         top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
614         if (top_blob.empty())
615             return -100;
616 
617         Mat top_blob_unpacked = top_blob;
618         if (elempack < out_elempack)
619         {
620             top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
621             if (top_blob_unpacked.empty())
622                 return -100;
623         }
624 
625         int p = 0;
626         for (size_t b = 0; b < bottom_blobs.size(); b++)
627         {
628             const Mat& bottom_blob = bottom_blobs[b];
629 
630 #if __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
631             if (bottom_blob.elempack == 8 && elempack == 4)
632             {
633                 int size = bottom_blob.w * bottom_blob.h;
634 
635                 for (int q = 0; q < bottom_blob.c; q++)
636                 {
637                     const unsigned short* r0 = bottom_blob.channel(q);
638 
639                     unsigned short* outptr0 = top_blob_unpacked.channel(p);
640                     unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
641 
642                     for (int i = 0; i < size; i++)
643                     {
644                         outptr0[0] = r0[0];
645                         outptr0[1] = r0[1];
646                         outptr0[2] = r0[2];
647                         outptr0[3] = r0[3];
648                         outptr1[0] = r0[4];
649                         outptr1[1] = r0[5];
650                         outptr1[2] = r0[6];
651                         outptr1[3] = r0[7];
652 
653                         outptr0 += 4;
654                         outptr1 += 4;
655                         r0 += 8;
656                     }
657 
658                     p += 2;
659                 }
660             }
661             if (bottom_blob.elempack == 8 && elempack == 1)
662             {
663                 int size = bottom_blob.w * bottom_blob.h;
664 
665                 for (int q = 0; q < bottom_blob.c; q++)
666                 {
667                     const unsigned short* r0 = bottom_blob.channel(q);
668 
669                     unsigned short* outptr0 = top_blob_unpacked.channel(p);
670                     unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
671                     unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
672                     unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);
673                     unsigned short* outptr4 = top_blob_unpacked.channel(p + 4);
674                     unsigned short* outptr5 = top_blob_unpacked.channel(p + 5);
675                     unsigned short* outptr6 = top_blob_unpacked.channel(p + 6);
676                     unsigned short* outptr7 = top_blob_unpacked.channel(p + 7);
677 
678                     for (int i = 0; i < size; i++)
679                     {
680                         *outptr0++ = r0[0];
681                         *outptr1++ = r0[1];
682                         *outptr2++ = r0[2];
683                         *outptr3++ = r0[3];
684                         *outptr4++ = r0[4];
685                         *outptr5++ = r0[5];
686                         *outptr6++ = r0[6];
687                         *outptr7++ = r0[7];
688 
689                         r0 += 8;
690                     }
691 
692                     p += 8;
693                 }
694             }
695 #endif
696             if (bottom_blob.elempack == 4 && elempack == 1)
697             {
698                 int size = bottom_blob.w * bottom_blob.h;
699 
700                 for (int q = 0; q < bottom_blob.c; q++)
701                 {
702                     const unsigned short* r0 = bottom_blob.channel(q);
703 
704                     unsigned short* outptr0 = top_blob_unpacked.channel(p);
705                     unsigned short* outptr1 = top_blob_unpacked.channel(p + 1);
706                     unsigned short* outptr2 = top_blob_unpacked.channel(p + 2);
707                     unsigned short* outptr3 = top_blob_unpacked.channel(p + 3);
708 
709                     for (int i = 0; i < size; i++)
710                     {
711                         *outptr0++ = r0[0];
712                         *outptr1++ = r0[1];
713                         *outptr2++ = r0[2];
714                         *outptr3++ = r0[3];
715 
716                         r0 += 4;
717                     }
718 
719                     p += 4;
720                 }
721             }
722             if (bottom_blob.elempack == elempack) // 1-1 4-4 8-8
723             {
724                 int size = bottom_blob.total();
725 
726                 const unsigned short* ptr = bottom_blob;
727                 unsigned short* outptr = top_blob_unpacked.channel(p);
728                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
729 
730                 p += bottom_blob.c;
731             }
732         }
733 
734         // packing
735         if (elempack < out_elempack)
736         {
737             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
738         }
739     }
740 
741     if (dims == 3 && positive_axis == 1)
742     {
743         // interleave dim height
744         int w = bottom_blobs[0].w;
745         int channels = bottom_blobs[0].c;
746         size_t elemsize = bottom_blobs[0].elemsize;
747         int elempack = bottom_blobs[0].elempack;
748 
749         // total height
750         int top_h = 0;
751         for (size_t b = 0; b < bottom_blobs.size(); b++)
752         {
753             const Mat& bottom_blob = bottom_blobs[b];
754             top_h += bottom_blob.h;
755         }
756 
757         Mat& top_blob = top_blobs[0];
758         top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
759         if (top_blob.empty())
760             return -100;
761 
762         #pragma omp parallel for num_threads(opt.num_threads)
763         for (int q = 0; q < channels; q++)
764         {
765             unsigned short* outptr = top_blob.channel(q);
766 
767             for (size_t b = 0; b < bottom_blobs.size(); b++)
768             {
769                 const Mat& bottom_blob = bottom_blobs[b];
770 
771                 int size = bottom_blob.w * bottom_blob.h;
772 
773                 const unsigned short* ptr = bottom_blob.channel(q);
774                 memcpy(outptr, ptr, size * elemsize);
775 
776                 outptr += size * elempack;
777             }
778         }
779     }
780 
781     if (dims == 3 && positive_axis == 2)
782     {
783         // interleave dim width
784         int h = bottom_blobs[0].h;
785         int channels = bottom_blobs[0].c;
786         size_t elemsize = bottom_blobs[0].elemsize;
787         int elempack = bottom_blobs[0].elempack;
788 
789         // total height
790         int top_w = 0;
791         for (size_t b = 0; b < bottom_blobs.size(); b++)
792         {
793             const Mat& bottom_blob = bottom_blobs[b];
794             top_w += bottom_blob.w;
795         }
796 
797         Mat& top_blob = top_blobs[0];
798         top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
799         if (top_blob.empty())
800             return -100;
801 
802         #pragma omp parallel for num_threads(opt.num_threads)
803         for (int q = 0; q < channels; q++)
804         {
805             unsigned short* outptr = top_blob.channel(q);
806 
807             for (int i = 0; i < h; i++)
808             {
809                 for (size_t b = 0; b < bottom_blobs.size(); b++)
810                 {
811                     const Mat& bottom_blob = bottom_blobs[b];
812 
813                     const unsigned short* ptr = bottom_blob.channel(q).row<const unsigned short>(i);
814                     memcpy(outptr, ptr, bottom_blob.w * elemsize);
815 
816                     outptr += bottom_blob.w * elempack;
817                 }
818             }
819         }
820     }
821 
822     return 0;
823 }
824 
825 } // namespace ncnn
826