// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.
14
15 #include "concat_riscv.h"
16
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #endif // __riscv_vector
24
25 #include "riscv_usability.h"
26
27 namespace ncnn {
28
Concat_riscv()29 Concat_riscv::Concat_riscv()
30 {
31 #if __riscv_vector
32 support_packing = true;
33 #if __riscv_zfh
34 support_fp16_storage = true;
35 #endif
36 #endif // __riscv_vector
37
38 #if NCNN_BF16
39 support_bf16_storage = true;
40 #endif
41 }
42
forward(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const43 int Concat_riscv::forward(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
44 {
45 int elembits = bottom_blobs[0].elembits();
46
47 #if __riscv_vector && __riscv_zfh
48 if (opt.use_fp16_storage && elembits == 16)
49 return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
50 #endif
51
52 #if NCNN_BF16
53 if (opt.use_bf16_storage && elembits == 16)
54 return forward_bf16s_fp16s(bottom_blobs, top_blobs, opt);
55 #endif
56
57 #if __riscv_vector
58 const int packn = csrr_vlenb() / 4;
59 #endif
60
61 int dims = bottom_blobs[0].dims;
62 int positive_axis = axis < 0 ? dims + axis : axis;
63
64 if (dims == 1) // positive_axis == 0
65 {
66 // concat vector
67 // total length
68 size_t elemsize = bottom_blobs[0].elemsize;
69 int elempack = bottom_blobs[0].elempack;
70 int top_w = 0;
71 for (size_t b = 0; b < bottom_blobs.size(); b++)
72 {
73 const Mat& bottom_blob = bottom_blobs[b];
74 top_w += bottom_blob.w * bottom_blob.elempack;
75 }
76
77 int out_elempack = 1;
78 #if __riscv_vector
79 if (opt.use_packing_layout)
80 {
81 out_elempack = top_w % packn == 0 ? packn : 1;
82 }
83 #endif
84 size_t out_elemsize = elemsize / elempack * out_elempack;
85
86 Mat& top_blob = top_blobs[0];
87 top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
88 if (top_blob.empty())
89 return -100;
90
91 float* outptr = top_blob;
92 for (size_t b = 0; b < bottom_blobs.size(); b++)
93 {
94 const Mat& bottom_blob = bottom_blobs[b];
95
96 const float* ptr = bottom_blob;
97 memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
98
99 outptr += bottom_blob.w * bottom_blob.elempack;
100 }
101 }
102
103 if (dims == 2 && positive_axis == 0)
104 {
105 // concat image
106 int w = bottom_blobs[0].w;
107
108 // total height
109 size_t elemsize = bottom_blobs[0].elemsize;
110 int elempack = bottom_blobs[0].elempack;
111 int top_h = 0;
112 for (size_t b = 0; b < bottom_blobs.size(); b++)
113 {
114 const Mat& bottom_blob = bottom_blobs[b];
115 elemsize = std::min(elemsize, bottom_blob.elemsize);
116 elempack = std::min(elempack, bottom_blob.elempack);
117 top_h += bottom_blob.h * bottom_blob.elempack;
118 }
119
120 int out_elempack = 1;
121 #if __riscv_vector
122 if (opt.use_packing_layout)
123 {
124 out_elempack = top_h % packn == 0 ? packn : 1;
125 }
126 #endif
127 size_t out_elemsize = elemsize / elempack * out_elempack;
128
129 Mat& top_blob = top_blobs[0];
130 top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
131 if (top_blob.empty())
132 return -100;
133
134 Mat top_blob_unpacked = top_blob;
135 if (elempack < out_elempack)
136 {
137 top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
138 if (top_blob_unpacked.empty())
139 return -100;
140 }
141
142 float* outptr = top_blob_unpacked;
143 for (size_t b = 0; b < bottom_blobs.size(); b++)
144 {
145 const Mat& bottom_blob = bottom_blobs[b];
146
147 #if __riscv_vector
148 if (bottom_blob.elempack == packn && elempack == 1)
149 {
150 const word_type vl = vsetvl_e32m1(packn);
151
152 for (int i = 0; i < bottom_blob.h; i++)
153 {
154 const float* r0 = bottom_blob.row(i);
155
156 float* outptr0 = outptr;
157
158 for (int j = 0; j < w; j++)
159 {
160 vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
161 vsse32_v_f32m1(outptr0, w * sizeof(float), _p, vl);
162
163 r0 += packn;
164 outptr0 += 1;
165 }
166
167 outptr += w * packn;
168 }
169 }
170 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
171 #endif // __riscv_vector
172 {
173 int size = w * bottom_blob.h;
174
175 const float* ptr = bottom_blob;
176 memcpy(outptr, ptr, size * bottom_blob.elemsize);
177
178 outptr += size * bottom_blob.elempack;
179 }
180 }
181
182 // packing
183 if (elempack < out_elempack)
184 {
185 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
186 }
187 }
188
189 if (dims == 2 && positive_axis == 1)
190 {
191 // interleave image row
192 int h = bottom_blobs[0].h;
193 size_t elemsize = bottom_blobs[0].elemsize;
194 int elempack = bottom_blobs[0].elempack;
195
196 // total width
197 int top_w = 0;
198 for (size_t b = 0; b < bottom_blobs.size(); b++)
199 {
200 const Mat& bottom_blob = bottom_blobs[b];
201 top_w += bottom_blob.w;
202 }
203
204 Mat& top_blob = top_blobs[0];
205 top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
206 if (top_blob.empty())
207 return -100;
208
209 #pragma omp parallel for num_threads(opt.num_threads)
210 for (int i = 0; i < h; i++)
211 {
212 float* outptr = top_blob.row(i);
213 for (size_t b = 0; b < bottom_blobs.size(); b++)
214 {
215 const Mat& bottom_blob = bottom_blobs[b];
216
217 const float* ptr = bottom_blob.row(i);
218 memcpy(outptr, ptr, bottom_blob.w * elemsize);
219
220 outptr += bottom_blob.w * elempack;
221 }
222 }
223 }
224
225 if (dims == 3 && positive_axis == 0)
226 {
227 // concat dim
228 int w = bottom_blobs[0].w;
229 int h = bottom_blobs[0].h;
230
231 // total channels
232 size_t elemsize = bottom_blobs[0].elemsize;
233 int elempack = bottom_blobs[0].elempack;
234 int top_channels = 0;
235 for (size_t b = 0; b < bottom_blobs.size(); b++)
236 {
237 const Mat& bottom_blob = bottom_blobs[b];
238 elemsize = std::min(elemsize, bottom_blob.elemsize);
239 elempack = std::min(elempack, bottom_blob.elempack);
240 top_channels += bottom_blob.c * bottom_blob.elempack;
241 }
242
243 int out_elempack = 1;
244 #if __riscv_vector
245 if (opt.use_packing_layout)
246 {
247 out_elempack = top_channels % packn == 0 ? packn : 1;
248 }
249 #endif
250 size_t out_elemsize = elemsize / elempack * out_elempack;
251
252 Mat& top_blob = top_blobs[0];
253 top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
254 if (top_blob.empty())
255 return -100;
256
257 Mat top_blob_unpacked = top_blob;
258 if (elempack < out_elempack)
259 {
260 top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
261 if (top_blob_unpacked.empty())
262 return -100;
263 }
264
265 int p = 0;
266 for (size_t b = 0; b < bottom_blobs.size(); b++)
267 {
268 const Mat& bottom_blob = bottom_blobs[b];
269
270 #if __riscv_vector
271 if (bottom_blob.elempack == packn && elempack == 1)
272 {
273 const word_type vl = vsetvl_e32m1(packn);
274
275 int size = bottom_blob.w * bottom_blob.h;
276
277 for (int q = 0; q < bottom_blob.c; q++)
278 {
279 const float* r0 = bottom_blob.channel(q);
280
281 float* outptr0 = top_blob_unpacked.channel(p);
282
283 for (int i = 0; i < size; i++)
284 {
285 vfloat32m1_t _p = vle32_v_f32m1(r0, vl);
286 vsse32_v_f32m1(outptr0, top_blob_unpacked.cstep * sizeof(float), _p, vl);
287
288 r0 += packn;
289 outptr0 += 1;
290 }
291
292 p += packn;
293 }
294 }
295 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == 4 && elempack == 4)
296 #endif // __riscv_vector
297 {
298 int size = bottom_blob.total();
299
300 const float* ptr = bottom_blob;
301 float* outptr = top_blob_unpacked.channel(p);
302 memcpy(outptr, ptr, size * bottom_blob.elemsize);
303
304 p += bottom_blob.c;
305 }
306 }
307
308 // packing
309 if (elempack < out_elempack)
310 {
311 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
312 }
313 }
314
315 if (dims == 3 && positive_axis == 1)
316 {
317 // interleave dim height
318 int w = bottom_blobs[0].w;
319 int channels = bottom_blobs[0].c;
320 size_t elemsize = bottom_blobs[0].elemsize;
321 int elempack = bottom_blobs[0].elempack;
322
323 // total height
324 int top_h = 0;
325 for (size_t b = 0; b < bottom_blobs.size(); b++)
326 {
327 const Mat& bottom_blob = bottom_blobs[b];
328 top_h += bottom_blob.h;
329 }
330
331 Mat& top_blob = top_blobs[0];
332 top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
333 if (top_blob.empty())
334 return -100;
335
336 #pragma omp parallel for num_threads(opt.num_threads)
337 for (int q = 0; q < channels; q++)
338 {
339 float* outptr = top_blob.channel(q);
340
341 for (size_t b = 0; b < bottom_blobs.size(); b++)
342 {
343 const Mat& bottom_blob = bottom_blobs[b];
344
345 int size = bottom_blob.w * bottom_blob.h;
346
347 const float* ptr = bottom_blob.channel(q);
348 memcpy(outptr, ptr, size * elemsize);
349
350 outptr += size * elempack;
351 }
352 }
353 }
354
355 if (dims == 3 && positive_axis == 2)
356 {
357 // interleave dim width
358 int h = bottom_blobs[0].h;
359 int channels = bottom_blobs[0].c;
360 size_t elemsize = bottom_blobs[0].elemsize;
361 int elempack = bottom_blobs[0].elempack;
362
363 // total height
364 int top_w = 0;
365 for (size_t b = 0; b < bottom_blobs.size(); b++)
366 {
367 const Mat& bottom_blob = bottom_blobs[b];
368 top_w += bottom_blob.w;
369 }
370
371 Mat& top_blob = top_blobs[0];
372 top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
373 if (top_blob.empty())
374 return -100;
375
376 #pragma omp parallel for num_threads(opt.num_threads)
377 for (int q = 0; q < channels; q++)
378 {
379 float* outptr = top_blob.channel(q);
380
381 for (int i = 0; i < h; i++)
382 {
383 for (size_t b = 0; b < bottom_blobs.size(); b++)
384 {
385 const Mat& bottom_blob = bottom_blobs[b];
386
387 const float* ptr = bottom_blob.channel(q).row(i);
388 memcpy(outptr, ptr, bottom_blob.w * elemsize);
389
390 outptr += bottom_blob.w * elempack;
391 }
392 }
393 }
394 }
395
396 return 0;
397 }
398
forward_bf16s_fp16s(const std::vector<Mat> & bottom_blobs,std::vector<Mat> & top_blobs,const Option & opt) const399 int Concat_riscv::forward_bf16s_fp16s(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& top_blobs, const Option& opt) const
400 {
401 #if __riscv_vector
402 const int packn = csrr_vlenb() / 2;
403 #endif
404
405 int dims = bottom_blobs[0].dims;
406 int positive_axis = axis < 0 ? dims + axis : axis;
407
408 if (dims == 1) // positive_axis == 0
409 {
410 // concat vector
411 // total length
412 size_t elemsize = bottom_blobs[0].elemsize;
413 int elempack = bottom_blobs[0].elempack;
414 int top_w = 0;
415 for (size_t b = 0; b < bottom_blobs.size(); b++)
416 {
417 const Mat& bottom_blob = bottom_blobs[b];
418 top_w += bottom_blob.w * bottom_blob.elempack;
419 }
420
421 int out_elempack = 1;
422 #if __riscv_vector
423 if (opt.use_packing_layout)
424 {
425 out_elempack = top_w % packn == 0 ? packn : 1;
426 }
427 #endif
428 size_t out_elemsize = elemsize / elempack * out_elempack;
429
430 Mat& top_blob = top_blobs[0];
431 top_blob.create(top_w / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
432 if (top_blob.empty())
433 return -100;
434
435 unsigned short* outptr = top_blob;
436 for (size_t b = 0; b < bottom_blobs.size(); b++)
437 {
438 const Mat& bottom_blob = bottom_blobs[b];
439
440 const unsigned short* ptr = bottom_blob;
441 memcpy(outptr, ptr, bottom_blob.w * bottom_blob.elemsize);
442
443 outptr += bottom_blob.w * bottom_blob.elempack;
444 }
445 }
446
447 if (dims == 2 && positive_axis == 0)
448 {
449 // concat image
450 int w = bottom_blobs[0].w;
451
452 // total height
453 size_t elemsize = bottom_blobs[0].elemsize;
454 int elempack = bottom_blobs[0].elempack;
455 int top_h = 0;
456 for (size_t b = 0; b < bottom_blobs.size(); b++)
457 {
458 const Mat& bottom_blob = bottom_blobs[b];
459 elemsize = std::min(elemsize, bottom_blob.elemsize);
460 elempack = std::min(elempack, bottom_blob.elempack);
461 top_h += bottom_blob.h * bottom_blob.elempack;
462 }
463
464 int out_elempack = 1;
465 #if __riscv_vector
466 if (opt.use_packing_layout)
467 {
468 out_elempack = top_h % packn == 0 ? packn : 1;
469 }
470 #endif
471 size_t out_elemsize = elemsize / elempack * out_elempack;
472
473 Mat& top_blob = top_blobs[0];
474 top_blob.create(w, top_h / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
475 if (top_blob.empty())
476 return -100;
477
478 Mat top_blob_unpacked = top_blob;
479 if (elempack < out_elempack)
480 {
481 top_blob_unpacked.create(w, top_h / elempack, elemsize, elempack, opt.workspace_allocator);
482 if (top_blob_unpacked.empty())
483 return -100;
484 }
485
486 unsigned short* outptr = top_blob_unpacked;
487 for (size_t b = 0; b < bottom_blobs.size(); b++)
488 {
489 const Mat& bottom_blob = bottom_blobs[b];
490
491 #if __riscv_vector
492 if (bottom_blob.elempack == packn && elempack == 1)
493 {
494 const word_type vl = vsetvl_e16m1(packn);
495
496 for (int i = 0; i < bottom_blob.h; i++)
497 {
498 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
499
500 unsigned short* outptr0 = outptr;
501
502 for (int j = 0; j < w; j++)
503 {
504 vuint16m1_t _p = vle16_v_u16m1(r0, vl);
505 vsse16_v_u16m1(outptr0, w * sizeof(unsigned short), _p, vl);
506
507 r0 += packn;
508 outptr0 += 1;
509 }
510
511 outptr += w * packn;
512 }
513 }
514 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
515 #endif // __riscv_vector
516 {
517 int size = w * bottom_blob.h;
518
519 const unsigned short* ptr = bottom_blob;
520 memcpy(outptr, ptr, size * bottom_blob.elemsize);
521
522 outptr += size * bottom_blob.elempack;
523 }
524 }
525
526 // packing
527 if (elempack < out_elempack)
528 {
529 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
530 }
531 }
532
533 if (dims == 2 && positive_axis == 1)
534 {
535 // interleave image row
536 int h = bottom_blobs[0].h;
537 size_t elemsize = bottom_blobs[0].elemsize;
538 int elempack = bottom_blobs[0].elempack;
539
540 // total width
541 int top_w = 0;
542 for (size_t b = 0; b < bottom_blobs.size(); b++)
543 {
544 const Mat& bottom_blob = bottom_blobs[b];
545 top_w += bottom_blob.w;
546 }
547
548 Mat& top_blob = top_blobs[0];
549 top_blob.create(top_w, h, elemsize, elempack, opt.blob_allocator);
550 if (top_blob.empty())
551 return -100;
552
553 #pragma omp parallel for num_threads(opt.num_threads)
554 for (int i = 0; i < h; i++)
555 {
556 unsigned short* outptr = top_blob.row<unsigned short>(i);
557 for (size_t b = 0; b < bottom_blobs.size(); b++)
558 {
559 const Mat& bottom_blob = bottom_blobs[b];
560
561 const unsigned short* ptr = bottom_blob.row<unsigned short>(i);
562 memcpy(outptr, ptr, bottom_blob.w * elemsize);
563
564 outptr += bottom_blob.w * elempack;
565 }
566 }
567 }
568
569 if (dims == 3 && positive_axis == 0)
570 {
571 // concat dim
572 int w = bottom_blobs[0].w;
573 int h = bottom_blobs[0].h;
574
575 // total channels
576 size_t elemsize = bottom_blobs[0].elemsize;
577 int elempack = bottom_blobs[0].elempack;
578 int top_channels = 0;
579 for (size_t b = 0; b < bottom_blobs.size(); b++)
580 {
581 const Mat& bottom_blob = bottom_blobs[b];
582 elemsize = std::min(elemsize, bottom_blob.elemsize);
583 elempack = std::min(elempack, bottom_blob.elempack);
584 top_channels += bottom_blob.c * bottom_blob.elempack;
585 }
586
587 int out_elempack = 1;
588 #if __riscv_vector
589 if (opt.use_packing_layout)
590 {
591 out_elempack = top_channels % packn == 0 ? packn : 1;
592 }
593 #endif
594 size_t out_elemsize = elemsize / elempack * out_elempack;
595
596 Mat& top_blob = top_blobs[0];
597 top_blob.create(w, h, top_channels / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
598 if (top_blob.empty())
599 return -100;
600
601 Mat top_blob_unpacked = top_blob;
602 if (elempack < out_elempack)
603 {
604 top_blob_unpacked.create(w, h, top_channels / elempack, elemsize, elempack, opt.workspace_allocator);
605 if (top_blob_unpacked.empty())
606 return -100;
607 }
608
609 int p = 0;
610 for (size_t b = 0; b < bottom_blobs.size(); b++)
611 {
612 const Mat& bottom_blob = bottom_blobs[b];
613
614 #if __riscv_vector
615 if (bottom_blob.elempack == packn && elempack == 1)
616 {
617 const word_type vl = vsetvl_e16m1(packn);
618
619 int size = bottom_blob.w * bottom_blob.h;
620
621 for (int q = 0; q < bottom_blob.c; q++)
622 {
623 const unsigned short* r0 = bottom_blob.channel(q);
624
625 unsigned short* outptr0 = top_blob_unpacked.channel(p);
626
627 for (int i = 0; i < size; i++)
628 {
629 vuint16m1_t _p = vle16_v_u16m1(r0, vl);
630 vsse16_v_u16m1(outptr0, top_blob_unpacked.cstep * sizeof(unsigned short), _p, vl);
631
632 r0 += packn;
633 outptr0 += 1;
634 }
635
636 p += packn;
637 }
638 }
639 else // if (bottom_blob.elempack == 1 && elempack == 1) if (bottom_blob.elempack == packn && elempack == packn)
640 #endif // __riscv_vector
641 {
642 int size = bottom_blob.total();
643
644 const unsigned short* ptr = bottom_blob;
645 unsigned short* outptr = top_blob_unpacked.channel(p);
646 memcpy(outptr, ptr, size * bottom_blob.elemsize);
647
648 p += bottom_blob.c;
649 }
650 }
651
652 // packing
653 if (elempack < out_elempack)
654 {
655 convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
656 }
657 }
658
659 if (dims == 3 && positive_axis == 1)
660 {
661 // interleave dim height
662 int w = bottom_blobs[0].w;
663 int channels = bottom_blobs[0].c;
664 size_t elemsize = bottom_blobs[0].elemsize;
665 int elempack = bottom_blobs[0].elempack;
666
667 // total height
668 int top_h = 0;
669 for (size_t b = 0; b < bottom_blobs.size(); b++)
670 {
671 const Mat& bottom_blob = bottom_blobs[b];
672 top_h += bottom_blob.h;
673 }
674
675 Mat& top_blob = top_blobs[0];
676 top_blob.create(w, top_h, channels, elemsize, elempack, opt.blob_allocator);
677 if (top_blob.empty())
678 return -100;
679
680 #pragma omp parallel for num_threads(opt.num_threads)
681 for (int q = 0; q < channels; q++)
682 {
683 unsigned short* outptr = top_blob.channel(q);
684
685 for (size_t b = 0; b < bottom_blobs.size(); b++)
686 {
687 const Mat& bottom_blob = bottom_blobs[b];
688
689 int size = bottom_blob.w * bottom_blob.h;
690
691 const unsigned short* ptr = bottom_blob.channel(q);
692 memcpy(outptr, ptr, size * elemsize);
693
694 outptr += size * elempack;
695 }
696 }
697 }
698
699 if (dims == 3 && positive_axis == 2)
700 {
701 // interleave dim width
702 int h = bottom_blobs[0].h;
703 int channels = bottom_blobs[0].c;
704 size_t elemsize = bottom_blobs[0].elemsize;
705 int elempack = bottom_blobs[0].elempack;
706
707 // total height
708 int top_w = 0;
709 for (size_t b = 0; b < bottom_blobs.size(); b++)
710 {
711 const Mat& bottom_blob = bottom_blobs[b];
712 top_w += bottom_blob.w;
713 }
714
715 Mat& top_blob = top_blobs[0];
716 top_blob.create(top_w, h, channels, elemsize, elempack, opt.blob_allocator);
717 if (top_blob.empty())
718 return -100;
719
720 #pragma omp parallel for num_threads(opt.num_threads)
721 for (int q = 0; q < channels; q++)
722 {
723 unsigned short* outptr = top_blob.channel(q);
724
725 for (int i = 0; i < h; i++)
726 {
727 for (size_t b = 0; b < bottom_blobs.size(); b++)
728 {
729 const Mat& bottom_blob = bottom_blobs[b];
730
731 const unsigned short* ptr = bottom_blob.channel(q).row<const unsigned short>(i);
732 memcpy(outptr, ptr, bottom_blob.w * elemsize);
733
734 outptr += bottom_blob.w * elempack;
735 }
736 }
737 }
738 }
739
740 return 0;
741 }
742
743 } // namespace ncnn
744