1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "concat_riscv.h"
16 
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #endif // __riscv_vector
24 
25 #include "riscv_usability.h"
26 
27 namespace ncnn {
28 
Concat_riscv()29 Concat_riscv::Concat_riscv()
30 {
31 #if __riscv_vector
32     support_packing = true;
33 #if __riscv_zfh
34     support_fp16_storage = true;
35 #endif
36 #endif // __riscv_vector
37 
38 #if NCNN_BF16
39     support_bf16_storage = true;
40 #endif
41 }
42 
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const43 int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
44 {
45     int elembits = bottom_blobs[0].elembits();
46 
47 #if __riscv_vector && __riscv_zfh
48     if (opt.use_fp16_storage && elembits == 16)
49         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
50 #endif
51 
52 #if NCNN_BF16
53     if (opt.use_bf16_storage && elembits == 16)
54         return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
55 #endif
56 
57 #if __riscv_vector
58     const int packn = csrr_vlenb() / 4;
59 #endif
60 
61     int dims = bottom_blobs[0].dims;
62     int positive_axis = axis < 0 ? dims + axis : axis;
63 
64     if (dims == 1) // positive_axis == 0
65     {
66         // concat vector
67         // total length
68         size_t elemsize = bottom_blobs[0].elemsize;
69         int elempack = bottom_blobs[0].elempack;
70         int top_w = 0;
71         for (size_t b = 0; b < bottom_blobs.size(); b++)
72         {
73             const Mat& bottom_blob = bottom_blobs[b];
74             top_w += bottom_blob.w * bottom_blob.elempack;
75         }
76 
77         int out_elempack = 1;
78 #if __riscv_vector
79         if (opt.use_packing_layout)
80         {
81             out_elempack = top_w % packn == 0 ? packn : 1;
82         }
83 #endif
84         size_t out_elemsize = elemsize / elempack * out_elempack;
85 
86         Mat& top_blob = top_blobs[0];
87         top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
88         if (top_blob.empty())
89             return -100;
90 
91         float* outptr = top_blob;
92         for (size_t b = 0; b < bottom_blobs.size(); b++)
93         {
94             const Mat& bottom_blob = bottom_blobs[b];
95 
96             const float* ptr = bottom_blob;
97             memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
98 
99             outptr += bottom_blob.w * bottom_blob.elempack;
100         }
101     }
102 
103     if (dims == 2 && positive_axis == 0)
104     {
105         // concat image
106         int w = bottom_blobs[0].w;
107 
108         // total height
109         size_t elemsize = bottom_blobs[0].elemsize;
110         int elempack = bottom_blobs[0].elempack;
111         int top_h = 0;
112         for (size_t b = 0; b < bottom_blobs.size(); b++)
113         {
114             const Mat& bottom_blob = bottom_blobs[b];
115             elemsize = std::min(elemsize, bottom_blob.elemsize);
116             elempack = std::min(elempack, bottom_blob.elempack);
117             top_h += bottom_blob.h * bottom_blob.elempack;
118         }
119 
120         int out_elempack = 1;
121 #if __riscv_vector
122         if (opt.use_packing_layout)
123         {
124             out_elempack = top_h % packn == 0 ? packn : 1;
125         }
126 #endif
127         size_t out_elemsize = elemsize / elempack * out_elempack;
128 
129         Mat& top_blob = top_blobs[0];
130         top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
131         if (top_blob.empty())
132             return -100;
133 
134         Mat top_blob_unpacked = top_blob;
135         if (elempack < out_elempack)
136         {
137             top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
138             if (top_blob_unpacked.empty())
139                 return -100;
140         }
141 
142         float* outptr = top_blob_unpacked;
143         for (size_t b = 0; b < bottom_blobs.size(); b++)
144         {
145             const Mat& bottom_blob = bottom_blobs[b];
146 
147 #if __riscv_vector
148             if (bottom_blob.elempack == packn && elempack == 1)
149             {
150                 const word_type vl = vsetvl_e32m1(packn);
151 
152                 for (int i = 0; i < bottom_blob.h; i++)
153                 {
154                     const float* r0 = bottom_blob.row(i);
155 
156                     float* outptr0 = outptr;
157 
158                     for (int j = 0; j < w; j++)
159                     {
160                         vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
161                         vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl);
162 
163                         r0 += packn;
164                         outptr0 += 1;
165                     }
166 
167                     outptr += w * packn;
168                 }
169             }
170             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
171 #endif           // __riscv_vector
172             {
173                 int size = w * bottom_blob.h;
174 
175                 const float* ptr = bottom_blob;
176                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
177 
178                 outptr += size * bottom_blob.elempack;
179             }
180         }
181 
182         // packing
183         if (elempack < out_elempack)
184         {
185             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
186         }
187     }
188 
189     if (dims == 2 && positive_axis == 1)
190     {
191         // interleave image row
192         int h = bottom_blobs[0].h;
193         size_t elemsize = bottom_blobs[0].elemsize;
194         int elempack = bottom_blobs[0].elempack;
195 
196         // total width
197         int top_w = 0;
198         for (size_t b = 0; b < bottom_blobs.size(); b++)
199         {
200             const Mat& bottom_blob = bottom_blobs[b];
201             top_w += bottom_blob.w;
202         }
203 
204         Mat& top_blob = top_blobs[0];
205         top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
206         if (top_blob.empty())
207             return -100;
208 
209         #pragma omp parallel for num_threads(opt.num_threads)
210         for (int i = 0; i < h; i++)
211         {
212             float* outptr = top_blob.row(i);
213             for (size_t b = 0; b < bottom_blobs.size(); b++)
214             {
215                 const Mat& bottom_blob = bottom_blobs[b];
216 
217                 const float* ptr = bottom_blob.row(i);
218                 memcpy(outptr, ptr, bottom_blob.w * elemsize);
219 
220                 outptr += bottom_blob.w * elempack;
221             }
222         }
223     }
224 
225     if (dims == 3 && positive_axis == 0)
226     {
227         // concat dim
228         int w = bottom_blobs[0].w;
229         int h = bottom_blobs[0].h;
230 
231         // total channels
232         size_t elemsize = bottom_blobs[0].elemsize;
233         int elempack = bottom_blobs[0].elempack;
234         int top_channels = 0;
235         for (size_t b = 0; b < bottom_blobs.size(); b++)
236         {
237             const Mat& bottom_blob = bottom_blobs[b];
238             elemsize = std::min(elemsize, bottom_blob.elemsize);
239             elempack = std::min(elempack, bottom_blob.elempack);
240             top_channels += bottom_blob.c * bottom_blob.elempack;
241         }
242 
243         int out_elempack = 1;
244 #if __riscv_vector
245         if (opt.use_packing_layout)
246         {
247             out_elempack = top_channels % packn == 0 ? packn : 1;
248         }
249 #endif
250         size_t out_elemsize = elemsize / elempack * out_elempack;
251 
252         Mat& top_blob = top_blobs[0];
253         top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
254         if (top_blob.empty())
255             return -100;
256 
257         Mat top_blob_unpacked = top_blob;
258         if (elempack < out_elempack)
259         {
260             top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
261             if (top_blob_unpacked.empty())
262                 return -100;
263         }
264 
265         int p = 0;
266         for (size_t b = 0; b < bottom_blobs.size(); b++)
267         {
268             const Mat& bottom_blob = bottom_blobs[b];
269 
270 #if __riscv_vector
271             if (bottom_blob.elempack == packn && elempack == 1)
272             {
273                 const word_type vl = vsetvl_e32m1(packn);
274 
275                 int size = bottom_blob.w * bottom_blob.h;
276 
277                 for (int q = 0; q < bottom_blob.c; q++)
278                 {
279                     const float* r0 = bottom_blob.channel(q);
280 
281                     float* outptr0 = top_blob_unpacked.channel(p);
282 
283                     for (int i = 0; i < size; i++)
284                     {
285                         vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
286                         vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl);
287 
288                         r0 += packn;
289                         outptr0 += 1;
290                     }
291 
292                     p += packn;
293                 }
294             }
295             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
296 #endif           // __riscv_vector
297             {
298                 int size = bottom_blob.total();
299 
300                 const float* ptr = bottom_blob;
301                 float* outptr = top_blob_unpacked.channel(p);
302                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
303 
304                 p += bottom_blob.c;
305             }
306         }
307 
308         // packing
309         if (elempack < out_elempack)
310         {
311             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
312         }
313     }
314 
315     if (dims == 3 && positive_axis == 1)
316     {
317         // interleave dim height
318         int w = bottom_blobs[0].w;
319         int channels = bottom_blobs[0].c;
320         size_t elemsize = bottom_blobs[0].elemsize;
321         int elempack = bottom_blobs[0].elempack;
322 
323         // total height
324         int top_h = 0;
325         for (size_t b = 0; b < bottom_blobs.size(); b++)
326         {
327             const Mat& bottom_blob = bottom_blobs[b];
328             top_h += bottom_blob.h;
329         }
330 
331         Mat& top_blob = top_blobs[0];
332         top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
333         if (top_blob.empty())
334             return -100;
335 
336         #pragma omp parallel for num_threads(opt.num_threads)
337         for (int q = 0; q < channels; q++)
338         {
339             float* outptr = top_blob.channel(q);
340 
341             for (size_t b = 0; b < bottom_blobs.size(); b++)
342             {
343                 const Mat& bottom_blob = bottom_blobs[b];
344 
345                 int size = bottom_blob.w * bottom_blob.h;
346 
347                 const float* ptr = bottom_blob.channel(q);
348                 memcpy(outptr, ptr, size * elemsize);
349 
350                 outptr += size * elempack;
351             }
352         }
353     }
354 
355     if (dims == 3 && positive_axis == 2)
356     {
357         // interleave dim width
358         int h = bottom_blobs[0].h;
359         int channels = bottom_blobs[0].c;
360         size_t elemsize = bottom_blobs[0].elemsize;
361         int elempack = bottom_blobs[0].elempack;
362 
363         // total height
364         int top_w = 0;
365         for (size_t b = 0; b < bottom_blobs.size(); b++)
366         {
367             const Mat& bottom_blob = bottom_blobs[b];
368             top_w += bottom_blob.w;
369         }
370 
371         Mat& top_blob = top_blobs[0];
372         top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
373         if (top_blob.empty())
374             return -100;
375 
376         #pragma omp parallel for num_threads(opt.num_threads)
377         for (int q = 0; q < channels; q++)
378         {
379             float* outptr = top_blob.channel(q);
380 
381             for (int i = 0; i < h; i++)
382             {
383                 for (size_t b = 0; b < bottom_blobs.size(); b++)
384                 {
385                     const Mat& bottom_blob = bottom_blobs[b];
386 
387                     const float* ptr = bottom_blob.channel(q).row(i);
388                     memcpy(outptr, ptr, bottom_blob.w * elemsize);
389 
390                     outptr += bottom_blob.w * elempack;
391                 }
392             }
393         }
394     }
395 
396     return 0;
397 }
398 
forward_bf16s_fp16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const399 int Concat_riscv::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
400 {
401 #if __riscv_vector
402     const int packn = csrr_vlenb() / 2;
403 #endif
404 
405     int dims = bottom_blobs[0].dims;
406     int positive_axis = axis < 0 ? dims + axis : axis;
407 
408     if (dims == 1) // positive_axis == 0
409     {
410         // concat vector
411         // total length
412         size_t elemsize = bottom_blobs[0].elemsize;
413         int elempack = bottom_blobs[0].elempack;
414         int top_w = 0;
415         for (size_t b = 0; b < bottom_blobs.size(); b++)
416         {
417             const Mat& bottom_blob = bottom_blobs[b];
418             top_w += bottom_blob.w * bottom_blob.elempack;
419         }
420 
421         int out_elempack = 1;
422 #if __riscv_vector
423         if (opt.use_packing_layout)
424         {
425             out_elempack = top_w % packn == 0 ? packn : 1;
426         }
427 #endif
428         size_t out_elemsize = elemsize / elempack * out_elempack;
429 
430         Mat& top_blob = top_blobs[0];
431         top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
432         if (top_blob.empty())
433             return -100;
434 
435         unsigned short* outptr = top_blob;
436         for (size_t b = 0; b < bottom_blobs.size(); b++)
437         {
438             const Mat& bottom_blob = bottom_blobs[b];
439 
440             const unsigned short* ptr = bottom_blob;
441             memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
442 
443             outptr += bottom_blob.w * bottom_blob.elempack;
444         }
445     }
446 
447     if (dims == 2 && positive_axis == 0)
448     {
449         // concat image
450         int w = bottom_blobs[0].w;
451 
452         // total height
453         size_t elemsize = bottom_blobs[0].elemsize;
454         int elempack = bottom_blobs[0].elempack;
455         int top_h = 0;
456         for (size_t b = 0; b < bottom_blobs.size(); b++)
457         {
458             const Mat& bottom_blob = bottom_blobs[b];
459             elemsize = std::min(elemsize, bottom_blob.elemsize);
460             elempack = std::min(elempack, bottom_blob.elempack);
461             top_h += bottom_blob.h * bottom_blob.elempack;
462         }
463 
464         int out_elempack = 1;
465 #if __riscv_vector
466         if (opt.use_packing_layout)
467         {
468             out_elempack = top_h % packn == 0 ? packn : 1;
469         }
470 #endif
471         size_t out_elemsize = elemsize / elempack * out_elempack;
472 
473         Mat& top_blob = top_blobs[0];
474         top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
475         if (top_blob.empty())
476             return -100;
477 
478         Mat top_blob_unpacked = top_blob;
479         if (elempack < out_elempack)
480         {
481             top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
482             if (top_blob_unpacked.empty())
483                 return -100;
484         }
485 
486         unsigned short* outptr = top_blob_unpacked;
487         for (size_t b = 0; b < bottom_blobs.size(); b++)
488         {
489             const Mat& bottom_blob = bottom_blobs[b];
490 
491 #if __riscv_vector
492             if (bottom_blob.elempack == packn && elempack == 1)
493             {
494                 const word_type vl = vsetvl_e16m1(packn);
495 
496                 for (int i = 0; i < bottom_blob.h; i++)
497                 {
498                     const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
499 
500                     unsigned short* outptr0 = outptr;
501 
502                     for (int j = 0; j < w; j++)
503                     {
504                         vuint16m1_t _p = vle16_v_u16m1(r0, vl);
505                         vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl);
506 
507                         r0 += packn;
508                         outptr0 += 1;
509                     }
510 
511                     outptr += w * packn;
512                 }
513             }
514             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
515 #endif           // __riscv_vector
516             {
517                 int size = w * bottom_blob.h;
518 
519                 const unsigned short* ptr = bottom_blob;
520                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
521 
522                 outptr += size * bottom_blob.elempack;
523             }
524         }
525 
526         // packing
527         if (elempack < out_elempack)
528         {
529             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
530         }
531     }
532 
533     if (dims == 2 && positive_axis == 1)
534     {
535         // interleave image row
536         int h = bottom_blobs[0].h;
537         size_t elemsize = bottom_blobs[0].elemsize;
538         int elempack = bottom_blobs[0].elempack;
539 
540         // total width
541         int top_w = 0;
542         for (size_t b = 0; b < bottom_blobs.size(); b++)
543         {
544             const Mat& bottom_blob = bottom_blobs[b];
545             top_w += bottom_blob.w;
546         }
547 
548         Mat& top_blob = top_blobs[0];
549         top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
550         if (top_blob.empty())
551             return -100;
552 
553         #pragma omp parallel for num_threads(opt.num_threads)
554         for (int i = 0; i < h; i++)
555         {
556             unsigned short* outptr = top_blob.row<unsigned short>(i);
557             for (size_t b = 0; b < bottom_blobs.size(); b++)
558             {
559                 const Mat& bottom_blob = bottom_blobs[b];
560 
561                 const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
562                 memcpy(outptr, ptr, bottom_blob.w * elemsize);
563 
564                 outptr += bottom_blob.w * elempack;
565             }
566         }
567     }
568 
569     if (dims == 3 && positive_axis == 0)
570     {
571         // concat dim
572         int w = bottom_blobs[0].w;
573         int h = bottom_blobs[0].h;
574 
575         // total channels
576         size_t elemsize = bottom_blobs[0].elemsize;
577         int elempack = bottom_blobs[0].elempack;
578         int top_channels = 0;
579         for (size_t b = 0; b < bottom_blobs.size(); b++)
580         {
581             const Mat& bottom_blob = bottom_blobs[b];
582             elemsize = std::min(elemsize, bottom_blob.elemsize);
583             elempack = std::min(elempack, bottom_blob.elempack);
584             top_channels += bottom_blob.c * bottom_blob.elempack;
585         }
586 
587         int out_elempack = 1;
588 #if __riscv_vector
589         if (opt.use_packing_layout)
590         {
591             out_elempack = top_channels % packn == 0 ? packn : 1;
592         }
593 #endif
594         size_t out_elemsize = elemsize / elempack * out_elempack;
595 
596         Mat& top_blob = top_blobs[0];
597         top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
598         if (top_blob.empty())
599             return -100;
600 
601         Mat top_blob_unpacked = top_blob;
602         if (elempack < out_elempack)
603         {
604             top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
605             if (top_blob_unpacked.empty())
606                 return -100;
607         }
608 
609         int p = 0;
610         for (size_t b = 0; b < bottom_blobs.size(); b++)
611         {
612             const Mat& bottom_blob = bottom_blobs[b];
613 
614 #if __riscv_vector
615             if (bottom_blob.elempack == packn && elempack == 1)
616             {
617                 const word_type vl = vsetvl_e16m1(packn);
618 
619                 int size = bottom_blob.w * bottom_blob.h;
620 
621                 for (int q = 0; q < bottom_blob.c; q++)
622                 {
623                     const unsigned short* r0 = bottom_blob.channel(q);
624 
625                     unsigned short* outptr0 = top_blob_unpacked.channel(p);
626 
627                     for (int i = 0; i < size; i++)
628                     {
629                         vuint16m1_t _p = vle16_v_u16m1(r0, vl);
630                         vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl);
631 
632                         r0 += packn;
633                         outptr0 += 1;
634                     }
635 
636                     p += packn;
637                 }
638             }
639             else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
640 #endif           // __riscv_vector
641             {
642                 int size = bottom_blob.total();
643 
644                 const unsigned short* ptr = bottom_blob;
645                 unsigned short* outptr = top_blob_unpacked.channel(p);
646                 memcpy(outptr, ptr, size * bottom_blob.elemsize);
647 
648                 p += bottom_blob.c;
649             }
650         }
651 
652         // packing
653         if (elempack < out_elempack)
654         {
655             convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
656         }
657     }
658 
659     if (dims == 3 && positive_axis == 1)
660     {
661         // interleave dim height
662         int w = bottom_blobs[0].w;
663         int channels = bottom_blobs[0].c;
664         size_t elemsize = bottom_blobs[0].elemsize;
665         int elempack = bottom_blobs[0].elempack;
666 
667         // total height
668         int top_h = 0;
669         for (size_t b = 0; b < bottom_blobs.size(); b++)
670         {
671             const Mat& bottom_blob = bottom_blobs[b];
672             top_h += bottom_blob.h;
673         }
674 
675         Mat& top_blob = top_blobs[0];
676         top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
677         if (top_blob.empty())
678             return -100;
679 
680         #pragma omp parallel for num_threads(opt.num_threads)
681         for (int q = 0; q < channels; q++)
682         {
683             unsigned short* outptr = top_blob.channel(q);
684 
685             for (size_t b = 0; b < bottom_blobs.size(); b++)
686             {
687                 const Mat& bottom_blob = bottom_blobs[b];
688 
689                 int size = bottom_blob.w * bottom_blob.h;
690 
691                 const unsigned short* ptr = bottom_blob.channel(q);
692                 memcpy(outptr, ptr, size * elemsize);
693 
694                 outptr += size * elempack;
695             }
696         }
697     }
698 
699     if (dims == 3 && positive_axis == 2)
700     {
701         // interleave dim width
702         int h = bottom_blobs[0].h;
703         int channels = bottom_blobs[0].c;
704         size_t elemsize = bottom_blobs[0].elemsize;
705         int elempack = bottom_blobs[0].elempack;
706 
707         // total height
708         int top_w = 0;
709         for (size_t b = 0; b < bottom_blobs.size(); b++)
710         {
711             const Mat& bottom_blob = bottom_blobs[b];
712             top_w += bottom_blob.w;
713         }
714 
715         Mat& top_blob = top_blobs[0];
716         top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
717         if (top_blob.empty())
718             return -100;
719 
720         #pragma omp parallel for num_threads(opt.num_threads)
721         for (int q = 0; q < channels; q++)
722         {
723             unsigned short* outptr = top_blob.channel(q);
724 
725             for (int i = 0; i < h; i++)
726             {
727                 for (size_t b = 0; b < bottom_blobs.size(); b++)
728                 {
729                     const Mat& bottom_blob = bottom_blobs[b];
730 
731                     const unsigned short* ptr = bottom_blob.channel(q).row<const unsigned short>(i);
732                     memcpy(outptr, ptr, bottom_blob.w * elemsize);
733 
734                     outptr += bottom_blob.w * elempack;
735                 }
736             }
737         }
738     }
739 
740     return 0;
741 }
742 
743 } // namespace ncnn
744