1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "packing_riscv.h"
16 
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #endif // __riscv_vector
24 
25 namespace ncnn {
26 
Packing_riscv()27 Packing_riscv::Packing_riscv()
28 {
29     support_packing = true;
30     support_fp16_storage = true;
31     support_bf16_storage = true;
32 }
33 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const34 int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
35 {
36     int elembits = bottom_blob.elembits();
37 
38     if (opt.use_fp16_storage && elembits == 16)
39         return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
40 
41     if (opt.use_bf16_storage && elembits == 16)
42         return forward_bf16s_fp16s(bottom_blob, top_blob, opt);
43 
44     if (use_padding)
45     {
46         return Packing::forward(bottom_blob, top_blob, opt);
47     }
48 
49     if (elembits != 32)
50     {
51         // non-fp32 type
52         return Packing::forward(bottom_blob, top_blob, opt);
53     }
54 
55     size_t elemsize = bottom_blob.elemsize;
56     int elempack = bottom_blob.elempack;
57 
58     if (elempack == out_elempack)
59     {
60         top_blob = bottom_blob;
61         return 0;
62     }
63 
64     bool pack1to4 = elempack == 1 && out_elempack == 4;
65     bool pack4to1 = elempack == 4 && out_elempack == 1;
66 
67     if (!pack1to4 && !pack4to1)
68     {
69         return Packing::forward(bottom_blob, top_blob, opt);
70     }
71 
72     int w = bottom_blob.w;
73     int h = bottom_blob.h;
74     int channels = bottom_blob.c;
75     int dims = bottom_blob.dims;
76 
77     if (!use_padding)
78     {
79         // identity if use_padding not allowed
80         if (dims == 1 && w * elempack % out_elempack != 0)
81         {
82             top_blob = bottom_blob;
83             return 0;
84         }
85         if (dims == 2 && h * elempack % out_elempack != 0)
86         {
87             top_blob = bottom_blob;
88             return 0;
89         }
90         if (dims == 3 && channels * elempack % out_elempack != 0)
91         {
92             top_blob = bottom_blob;
93             return 0;
94         }
95     }
96 
97     if (dims == 1)
98     {
99         top_blob = bottom_blob;
100         top_blob.w = w * elempack / out_elempack;
101         top_blob.cstep = w * elempack / out_elempack;
102         top_blob.elemsize = elemsize / elempack * out_elempack;
103         top_blob.elempack = out_elempack;
104         return 0;
105     }
106 
107     if (dims == 2)
108     {
109         int outh = h * elempack / out_elempack;
110         size_t out_elemsize = elemsize / elempack * out_elempack;
111 
112         top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
113         if (top_blob.empty())
114             return -100;
115 
116         if (pack1to4)
117         {
118             #pragma omp parallel for num_threads(opt.num_threads)
119             for (int i = 0; i < outh; i++)
120             {
121                 const float* r0 = bottom_blob.row(i * 4);
122                 const float* r1 = bottom_blob.row(i * 4 + 1);
123                 const float* r2 = bottom_blob.row(i * 4 + 2);
124                 const float* r3 = bottom_blob.row(i * 4 + 3);
125 
126                 float* outptr = top_blob.row(i);
127 
128 #if __riscv_vector
129                 int n = w;
130                 while (n > 0)
131                 {
132                     word_type vl = vsetvl_e32m2(n);
133 
134                     vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl);
135                     vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl);
136                     vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl);
137                     vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl);
138                     vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl);
139 
140                     r0 += vl;
141                     r1 += vl;
142                     r2 += vl;
143                     r3 += vl;
144                     outptr += vl * 4;
145                     n -= vl;
146                 }
147 #else  // __riscv_vector
148                 for (int j = 0; j < w; j++)
149                 {
150                     outptr[0] = *r0++;
151                     outptr[1] = *r1++;
152                     outptr[2] = *r2++;
153                     outptr[3] = *r3++;
154 
155                     outptr += 4;
156                 }
157 #endif // __riscv_vector
158             }
159         }
160         if (pack4to1)
161         {
162             #pragma omp parallel for num_threads(opt.num_threads)
163             for (int i = 0; i < h; i++)
164             {
165                 const float* r0 = bottom_blob.row(i);
166 
167                 float* outptr0 = top_blob.row(i * 4);
168                 float* outptr1 = top_blob.row(i * 4 + 1);
169                 float* outptr2 = top_blob.row(i * 4 + 2);
170                 float* outptr3 = top_blob.row(i * 4 + 3);
171 
172 #if __riscv_vector
173                 int n = w;
174                 while (n > 0)
175                 {
176                     word_type vl = vsetvl_e32m2(n);
177 
178                     vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl);
179                     vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl);
180                     vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl);
181                     vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl);
182                     vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl);
183 
184                     r0 += vl * 4;
185                     outptr0 += vl;
186                     outptr1 += vl;
187                     outptr2 += vl;
188                     outptr3 += vl;
189                     n -= vl;
190                 }
191 #else  // __riscv_vector
192                 for (int j = 0; j < w; j++)
193                 {
194                     *outptr0++ = r0[0];
195                     *outptr1++ = r0[1];
196                     *outptr2++ = r0[2];
197                     *outptr3++ = r0[3];
198 
199                     r0 += 4;
200                 }
201 #endif // __riscv_vector
202             }
203         }
204 
205         return 0;
206     }
207 
208     if (dims == 3)
209     {
210         int size = w * h;
211         int outc = channels * elempack / out_elempack;
212         size_t out_elemsize = elemsize / elempack * out_elempack;
213 
214         top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
215         if (top_blob.empty())
216             return -100;
217 
218         if (pack1to4)
219         {
220             #pragma omp parallel for num_threads(opt.num_threads)
221             for (int q = 0; q < outc; q++)
222             {
223                 const float* r0 = bottom_blob.channel(q * 4);
224                 const float* r1 = bottom_blob.channel(q * 4 + 1);
225                 const float* r2 = bottom_blob.channel(q * 4 + 2);
226                 const float* r3 = bottom_blob.channel(q * 4 + 3);
227 
228                 float* outptr = top_blob.channel(q);
229 
230 #if __riscv_vector
231                 int n = size;
232                 while (n > 0)
233                 {
234                     word_type vl = vsetvl_e32m2(n);
235 
236                     vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl);
237                     vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl);
238                     vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl);
239                     vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl);
240                     vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl);
241 
242                     r0 += vl;
243                     r1 += vl;
244                     r2 += vl;
245                     r3 += vl;
246                     outptr += vl * 4;
247                     n -= vl;
248                 }
249 #else  // __riscv_vector
250                 for (int i = 0; i < size; i++)
251                 {
252                     outptr[0] = *r0++;
253                     outptr[1] = *r1++;
254                     outptr[2] = *r2++;
255                     outptr[3] = *r3++;
256 
257                     outptr += 4;
258                 }
259 #endif // __riscv_vector
260             }
261         }
262         if (pack4to1)
263         {
264             #pragma omp parallel for num_threads(opt.num_threads)
265             for (int q = 0; q < channels; q++)
266             {
267                 const float* r0 = bottom_blob.channel(q);
268 
269                 float* outptr0 = top_blob.channel(q * 4);
270                 float* outptr1 = top_blob.channel(q * 4 + 1);
271                 float* outptr2 = top_blob.channel(q * 4 + 2);
272                 float* outptr3 = top_blob.channel(q * 4 + 3);
273 
274 #if __riscv_vector
275                 int n = size;
276                 while (n > 0)
277                 {
278                     word_type vl = vsetvl_e32m2(n);
279 
280                     vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl);
281                     vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl);
282                     vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl);
283                     vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl);
284                     vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl);
285 
286                     r0 += vl * 4;
287                     outptr0 += vl;
288                     outptr1 += vl;
289                     outptr2 += vl;
290                     outptr3 += vl;
291                     n -= vl;
292                 }
293 #else  // __riscv_vector
294                 for (int i = 0; i < size; i++)
295                 {
296                     *outptr0++ = r0[0];
297                     *outptr1++ = r0[1];
298                     *outptr2++ = r0[2];
299                     *outptr3++ = r0[3];
300 
301                     r0 += 4;
302                 }
303 #endif // __riscv_vector
304             }
305         }
306 
307         return 0;
308     }
309 
310     return 0;
311 }
312 
forward_bf16s_fp16s(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const313 int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
314 {
315     if (use_padding)
316     {
317         return Packing::forward(bottom_blob, top_blob, opt);
318     }
319 
320     size_t elemsize = bottom_blob.elemsize;
321     int elempack = bottom_blob.elempack;
322 
323     if (elempack == out_elempack)
324     {
325         top_blob = bottom_blob;
326         return 0;
327     }
328 
329     bool pack1to4 = elempack == 1 && out_elempack == 4;
330     bool pack4to1 = elempack == 4 && out_elempack == 1;
331     bool pack1to8 = elempack == 1 && out_elempack == 8;
332     bool pack8to1 = elempack == 8 && out_elempack == 1;
333     bool pack4to8 = elempack == 4 && out_elempack == 8;
334     bool pack8to4 = elempack == 8 && out_elempack == 4;
335 
336     if (!pack1to4 && !pack4to1 && !pack1to8 && !pack8to1 && !pack4to8 && !pack8to4)
337     {
338         return Packing::forward(bottom_blob, top_blob, opt);
339     }
340 
341     int w = bottom_blob.w;
342     int h = bottom_blob.h;
343     int channels = bottom_blob.c;
344     int dims = bottom_blob.dims;
345 
346     if (!use_padding)
347     {
348         // identity if use_padding not allowed
349         if (dims == 1 && w * elempack % out_elempack != 0)
350         {
351             top_blob = bottom_blob;
352             return 0;
353         }
354         if (dims == 2 && h * elempack % out_elempack != 0)
355         {
356             top_blob = bottom_blob;
357             return 0;
358         }
359         if (dims == 3 && channels * elempack % out_elempack != 0)
360         {
361             top_blob = bottom_blob;
362             return 0;
363         }
364     }
365 
366     if (dims == 1)
367     {
368         top_blob = bottom_blob;
369         top_blob.w = w * elempack / out_elempack;
370         top_blob.cstep = w * elempack / out_elempack;
371         top_blob.elemsize = elemsize / elempack * out_elempack;
372         top_blob.elempack = out_elempack;
373         return 0;
374     }
375 
376     if (dims == 2)
377     {
378         int outh = h * elempack / out_elempack;
379         size_t out_elemsize = elemsize / elempack * out_elempack;
380 
381         top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
382         if (top_blob.empty())
383             return -100;
384 
385         if (pack1to4)
386         {
387             #pragma omp parallel for num_threads(opt.num_threads)
388             for (int i = 0; i < outh; i++)
389             {
390                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 4);
391                 const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 4 + 1);
392                 const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 4 + 2);
393                 const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 4 + 3);
394 
395                 unsigned short* outptr = top_blob.row<unsigned short>(i);
396 
397 #if __riscv_vector
398                 int n = w;
399                 while (n > 0)
400                 {
401                     word_type vl = vsetvl_e16m2(n);
402 
403                     vuint16m2_t _p0 = vle16_v_u16m2(r0, vl);
404                     vuint16m2_t _p1 = vle16_v_u16m2(r1, vl);
405                     vuint16m2_t _p2 = vle16_v_u16m2(r2, vl);
406                     vuint16m2_t _p3 = vle16_v_u16m2(r3, vl);
407                     vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl);
408 
409                     r0 += vl;
410                     r1 += vl;
411                     r2 += vl;
412                     r3 += vl;
413                     outptr += vl * 4;
414                     n -= vl;
415                 }
416 #else  // __riscv_vector
417                 for (int j = 0; j < w; j++)
418                 {
419                     outptr[0] = *r0++;
420                     outptr[1] = *r1++;
421                     outptr[2] = *r2++;
422                     outptr[3] = *r3++;
423 
424                     outptr += 4;
425                 }
426 #endif // __riscv_vector
427             }
428         }
429         if (pack4to1)
430         {
431             #pragma omp parallel for num_threads(opt.num_threads)
432             for (int i = 0; i < h; i++)
433             {
434                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
435 
436                 unsigned short* outptr0 = top_blob.row<unsigned short>(i * 4);
437                 unsigned short* outptr1 = top_blob.row<unsigned short>(i * 4 + 1);
438                 unsigned short* outptr2 = top_blob.row<unsigned short>(i * 4 + 2);
439                 unsigned short* outptr3 = top_blob.row<unsigned short>(i * 4 + 3);
440 
441 #if __riscv_vector
442                 int n = w;
443                 while (n > 0)
444                 {
445                     word_type vl = vsetvl_e16m2(n);
446 
447                     vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl);
448                     vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl);
449                     vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl);
450                     vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl);
451                     vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl);
452 
453                     r0 += vl * 4;
454                     outptr0 += vl;
455                     outptr1 += vl;
456                     outptr2 += vl;
457                     outptr3 += vl;
458                     n -= vl;
459                 }
460 #else  // __riscv_vector
461                 for (int j = 0; j < w; j++)
462                 {
463                     *outptr0++ = r0[0];
464                     *outptr1++ = r0[1];
465                     *outptr2++ = r0[2];
466                     *outptr3++ = r0[3];
467 
468                     r0 += 4;
469                 }
470 #endif // __riscv_vector
471             }
472         }
473         if (pack1to8)
474         {
475             #pragma omp parallel for num_threads(opt.num_threads)
476             for (int i = 0; i < outh; i++)
477             {
478                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 8);
479                 const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 8 + 1);
480                 const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 8 + 2);
481                 const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 8 + 3);
482                 const unsigned short* r4 = bottom_blob.row<const unsigned short>(i * 8 + 4);
483                 const unsigned short* r5 = bottom_blob.row<const unsigned short>(i * 8 + 5);
484                 const unsigned short* r6 = bottom_blob.row<const unsigned short>(i * 8 + 6);
485                 const unsigned short* r7 = bottom_blob.row<const unsigned short>(i * 8 + 7);
486 
487                 unsigned short* outptr = top_blob.row<unsigned short>(i);
488 
489 #if __riscv_vector
490                 int n = w;
491                 while (n > 0)
492                 {
493                     word_type vl = vsetvl_e16m1(n);
494 
495                     vuint16m1_t _p0 = vle16_v_u16m1(r0, vl);
496                     vuint16m1_t _p1 = vle16_v_u16m1(r1, vl);
497                     vuint16m1_t _p2 = vle16_v_u16m1(r2, vl);
498                     vuint16m1_t _p3 = vle16_v_u16m1(r3, vl);
499                     vuint16m1_t _p4 = vle16_v_u16m1(r4, vl);
500                     vuint16m1_t _p5 = vle16_v_u16m1(r5, vl);
501                     vuint16m1_t _p6 = vle16_v_u16m1(r6, vl);
502                     vuint16m1_t _p7 = vle16_v_u16m1(r7, vl);
503                     vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl);
504 
505                     r0 += vl;
506                     r1 += vl;
507                     r2 += vl;
508                     r3 += vl;
509                     r4 += vl;
510                     r5 += vl;
511                     r6 += vl;
512                     r7 += vl;
513                     outptr += vl * 8;
514                     n -= vl;
515                 }
516 #else  // __riscv_vector
517                 for (int j = 0; j < w; j++)
518                 {
519                     outptr[0] = *r0++;
520                     outptr[1] = *r1++;
521                     outptr[2] = *r2++;
522                     outptr[3] = *r3++;
523                     outptr[4] = *r4++;
524                     outptr[5] = *r5++;
525                     outptr[6] = *r6++;
526                     outptr[7] = *r7++;
527 
528                     outptr += 8;
529                 }
530 #endif // __riscv_vector
531             }
532         }
533         if (pack8to1)
534         {
535             #pragma omp parallel for num_threads(opt.num_threads)
536             for (int i = 0; i < h; i++)
537             {
538                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
539 
540                 unsigned short* outptr0 = top_blob.row<unsigned short>(i * 8);
541                 unsigned short* outptr1 = top_blob.row<unsigned short>(i * 8 + 1);
542                 unsigned short* outptr2 = top_blob.row<unsigned short>(i * 8 + 2);
543                 unsigned short* outptr3 = top_blob.row<unsigned short>(i * 8 + 3);
544                 unsigned short* outptr4 = top_blob.row<unsigned short>(i * 8 + 4);
545                 unsigned short* outptr5 = top_blob.row<unsigned short>(i * 8 + 5);
546                 unsigned short* outptr6 = top_blob.row<unsigned short>(i * 8 + 6);
547                 unsigned short* outptr7 = top_blob.row<unsigned short>(i * 8 + 7);
548 
549 #if __riscv_vector
550                 int n = w;
551                 while (n > 0)
552                 {
553                     word_type vl = vsetvl_e16m1(n);
554 
555                     vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
556                     vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl);
557                     vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl);
558                     vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl);
559                     vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl);
560                     vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl);
561                     vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl);
562                     vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl);
563                     vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl);
564 
565                     r0 += vl * 8;
566                     outptr0 += vl;
567                     outptr1 += vl;
568                     outptr2 += vl;
569                     outptr3 += vl;
570                     outptr4 += vl;
571                     outptr5 += vl;
572                     outptr6 += vl;
573                     outptr7 += vl;
574                     n -= vl;
575                 }
576 #else  // __riscv_vector
577                 for (int j = 0; j < w; j++)
578                 {
579                     *outptr0++ = r0[0];
580                     *outptr1++ = r0[1];
581                     *outptr2++ = r0[2];
582                     *outptr3++ = r0[3];
583                     *outptr4++ = r0[4];
584                     *outptr5++ = r0[5];
585                     *outptr6++ = r0[6];
586                     *outptr7++ = r0[7];
587 
588                     r0 += 8;
589                 }
590 #endif // __riscv_vector
591             }
592         }
593         if (pack4to8)
594         {
595             #pragma omp parallel for num_threads(opt.num_threads)
596             for (int i = 0; i < outh; i++)
597             {
598                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 2);
599                 const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 2 + 1);
600 
601                 unsigned short* outptr = top_blob.row<unsigned short>(i);
602 
603 #if __riscv_vector
604                 int n = w;
605                 while (n > 0)
606                 {
607                     word_type vl = vsetvl_e16m1(n);
608 
609                     vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl);
610                     vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl);
611                     vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0);
612                     vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1);
613                     vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2);
614                     vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3);
615                     vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0);
616                     vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1);
617                     vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2);
618                     vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3);
619                     vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl);
620 
621                     r0 += vl * 4;
622                     r1 += vl * 4;
623                     outptr += vl * 8;
624                     n -= vl;
625                 }
626 #else  // __riscv_vector
627                 for (int j = 0; j < w; j++)
628                 {
629                     outptr[0] = r0[0];
630                     outptr[1] = r0[1];
631                     outptr[2] = r0[2];
632                     outptr[3] = r0[3];
633                     outptr[4] = r1[0];
634                     outptr[5] = r1[1];
635                     outptr[6] = r1[2];
636                     outptr[7] = r1[3];
637 
638                     r0 += 4;
639                     r1 += 4;
640                     outptr += 8;
641                 }
642 #endif // __riscv_vector
643             }
644         }
645         if (pack8to4)
646         {
647             #pragma omp parallel for num_threads(opt.num_threads)
648             for (int i = 0; i < h; i++)
649             {
650                 const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);
651 
652                 unsigned short* outptr0 = top_blob.row<unsigned short>(i * 2);
653                 unsigned short* outptr1 = top_blob.row<unsigned short>(i * 2 + 1);
654 
655 #if __riscv_vector
656                 int n = w;
657                 while (n > 0)
658                 {
659                     word_type vl = vsetvl_e16m1(n);
660 
661                     vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
662                     vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0);
663                     vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1);
664                     vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2);
665                     vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3);
666                     vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4);
667                     vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5);
668                     vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6);
669                     vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7);
670                     vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl);
671                     vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl);
672 
673                     r0 += vl * 8;
674                     outptr0 += vl * 4;
675                     outptr1 += vl * 4;
676                     n -= vl;
677                 }
678 #else  // __riscv_vector
679                 for (int j = 0; j < w; j++)
680                 {
681                     outptr0[0] = r0[0];
682                     outptr0[1] = r0[1];
683                     outptr0[2] = r0[2];
684                     outptr0[3] = r0[3];
685                     outptr1[0] = r0[4];
686                     outptr1[1] = r0[5];
687                     outptr1[2] = r0[6];
688                     outptr1[3] = r0[7];
689 
690                     r0 += 8;
691                     outptr0 += 4;
692                     outptr1 += 4;
693                 }
694 #endif // __riscv_vector
695             }
696         }
697 
698         return 0;
699     }
700 
701     if (dims == 3)
702     {
703         int size = w * h;
704         int outc = channels * elempack / out_elempack;
705         size_t out_elemsize = elemsize / elempack * out_elempack;
706 
707         top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
708         if (top_blob.empty())
709             return -100;
710 
711         if (pack1to4)
712         {
713             #pragma omp parallel for num_threads(opt.num_threads)
714             for (int q = 0; q < outc; q++)
715             {
716                 const unsigned short* r0 = bottom_blob.channel(q * 4);
717                 const unsigned short* r1 = bottom_blob.channel(q * 4 + 1);
718                 const unsigned short* r2 = bottom_blob.channel(q * 4 + 2);
719                 const unsigned short* r3 = bottom_blob.channel(q * 4 + 3);
720 
721                 unsigned short* outptr = top_blob.channel(q);
722 
723 #if __riscv_vector
724                 int n = size;
725                 while (n > 0)
726                 {
727                     word_type vl = vsetvl_e16m2(n);
728 
729                     vuint16m2_t _p0 = vle16_v_u16m2(r0, vl);
730                     vuint16m2_t _p1 = vle16_v_u16m2(r1, vl);
731                     vuint16m2_t _p2 = vle16_v_u16m2(r2, vl);
732                     vuint16m2_t _p3 = vle16_v_u16m2(r3, vl);
733                     vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl);
734 
735                     r0 += vl;
736                     r1 += vl;
737                     r2 += vl;
738                     r3 += vl;
739                     outptr += vl * 4;
740                     n -= vl;
741                 }
742 #else  // __riscv_vector
743                 for (int i = 0; i < size; i++)
744                 {
745                     outptr[0] = *r0++;
746                     outptr[1] = *r1++;
747                     outptr[2] = *r2++;
748                     outptr[3] = *r3++;
749 
750                     outptr += 4;
751                 }
752 #endif // __riscv_vector
753             }
754         }
755         if (pack4to1)
756         {
757             #pragma omp parallel for num_threads(opt.num_threads)
758             for (int q = 0; q < channels; q++)
759             {
760                 const unsigned short* r0 = bottom_blob.channel(q);
761 
762                 unsigned short* outptr0 = top_blob.channel(q * 4);
763                 unsigned short* outptr1 = top_blob.channel(q * 4 + 1);
764                 unsigned short* outptr2 = top_blob.channel(q * 4 + 2);
765                 unsigned short* outptr3 = top_blob.channel(q * 4 + 3);
766 
767 #if __riscv_vector
768                 int n = size;
769                 while (n > 0)
770                 {
771                     word_type vl = vsetvl_e16m2(n);
772 
773                     vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl);
774                     vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl);
775                     vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl);
776                     vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl);
777                     vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl);
778 
779                     r0 += vl * 4;
780                     outptr0 += vl;
781                     outptr1 += vl;
782                     outptr2 += vl;
783                     outptr3 += vl;
784                     n -= vl;
785                 }
786 #else  // __riscv_vector
787                 for (int i = 0; i < size; i++)
788                 {
789                     *outptr0++ = r0[0];
790                     *outptr1++ = r0[1];
791                     *outptr2++ = r0[2];
792                     *outptr3++ = r0[3];
793 
794                     r0 += 4;
795                 }
796 #endif // __riscv_vector
797             }
798         }
799         if (pack1to8)
800         {
801             #pragma omp parallel for num_threads(opt.num_threads)
802             for (int q = 0; q < outc; q++)
803             {
804                 const unsigned short* r0 = bottom_blob.channel(q * 8);
805                 const unsigned short* r1 = bottom_blob.channel(q * 8 + 1);
806                 const unsigned short* r2 = bottom_blob.channel(q * 8 + 2);
807                 const unsigned short* r3 = bottom_blob.channel(q * 8 + 3);
808                 const unsigned short* r4 = bottom_blob.channel(q * 8 + 4);
809                 const unsigned short* r5 = bottom_blob.channel(q * 8 + 5);
810                 const unsigned short* r6 = bottom_blob.channel(q * 8 + 6);
811                 const unsigned short* r7 = bottom_blob.channel(q * 8 + 7);
812 
813                 unsigned short* outptr = top_blob.channel(q);
814 
815 #if __riscv_vector
816                 int n = size;
817                 while (n > 0)
818                 {
819                     word_type vl = vsetvl_e16m1(n);
820 
821                     vuint16m1_t _p0 = vle16_v_u16m1(r0, vl);
822                     vuint16m1_t _p1 = vle16_v_u16m1(r1, vl);
823                     vuint16m1_t _p2 = vle16_v_u16m1(r2, vl);
824                     vuint16m1_t _p3 = vle16_v_u16m1(r3, vl);
825                     vuint16m1_t _p4 = vle16_v_u16m1(r4, vl);
826                     vuint16m1_t _p5 = vle16_v_u16m1(r5, vl);
827                     vuint16m1_t _p6 = vle16_v_u16m1(r6, vl);
828                     vuint16m1_t _p7 = vle16_v_u16m1(r7, vl);
829                     vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl);
830 
831                     r0 += vl;
832                     r1 += vl;
833                     r2 += vl;
834                     r3 += vl;
835                     r4 += vl;
836                     r5 += vl;
837                     r6 += vl;
838                     r7 += vl;
839                     outptr += vl * 8;
840                     n -= vl;
841                 }
842 #else  // __riscv_vector
843                 for (int i = 0; i < size; i++)
844                 {
845                     outptr[0] = *r0++;
846                     outptr[1] = *r1++;
847                     outptr[2] = *r2++;
848                     outptr[3] = *r3++;
849                     outptr[4] = *r4++;
850                     outptr[5] = *r5++;
851                     outptr[6] = *r6++;
852                     outptr[7] = *r7++;
853 
854                     outptr += 8;
855                 }
856 #endif // __riscv_vector
857             }
858         }
859         if (pack8to1)
860         {
861             #pragma omp parallel for num_threads(opt.num_threads)
862             for (int q = 0; q < channels; q++)
863             {
864                 const unsigned short* r0 = bottom_blob.channel(q);
865 
866                 unsigned short* outptr0 = top_blob.channel(q * 8);
867                 unsigned short* outptr1 = top_blob.channel(q * 8 + 1);
868                 unsigned short* outptr2 = top_blob.channel(q * 8 + 2);
869                 unsigned short* outptr3 = top_blob.channel(q * 8 + 3);
870                 unsigned short* outptr4 = top_blob.channel(q * 8 + 4);
871                 unsigned short* outptr5 = top_blob.channel(q * 8 + 5);
872                 unsigned short* outptr6 = top_blob.channel(q * 8 + 6);
873                 unsigned short* outptr7 = top_blob.channel(q * 8 + 7);
874 
875 #if __riscv_vector
876                 int n = size;
877                 while (n > 0)
878                 {
879                     word_type vl = vsetvl_e16m1(n);
880 
881                     vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
882                     vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl);
883                     vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl);
884                     vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl);
885                     vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl);
886                     vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl);
887                     vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl);
888                     vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl);
889                     vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl);
890 
891                     r0 += vl * 8;
892                     outptr0 += vl;
893                     outptr1 += vl;
894                     outptr2 += vl;
895                     outptr3 += vl;
896                     outptr4 += vl;
897                     outptr5 += vl;
898                     outptr6 += vl;
899                     outptr7 += vl;
900                     n -= vl;
901                 }
902 #else  // __riscv_vector
903                 for (int i = 0; i < size; i++)
904                 {
905                     *outptr0++ = r0[0];
906                     *outptr1++ = r0[1];
907                     *outptr2++ = r0[2];
908                     *outptr3++ = r0[3];
909                     *outptr4++ = r0[4];
910                     *outptr5++ = r0[5];
911                     *outptr6++ = r0[6];
912                     *outptr7++ = r0[7];
913 
914                     r0 += 8;
915                 }
916 #endif // __riscv_vector
917             }
918         }
919         if (pack4to8)
920         {
921             #pragma omp parallel for num_threads(opt.num_threads)
922             for (int q = 0; q < outc; q++)
923             {
924                 const unsigned short* r0 = bottom_blob.channel(q * 2);
925                 const unsigned short* r1 = bottom_blob.channel(q * 2 + 1);
926 
927                 unsigned short* outptr = top_blob.channel(q);
928 
929 #if __riscv_vector
930                 int n = size;
931                 while (n > 0)
932                 {
933                     word_type vl = vsetvl_e16m1(n);
934 
935                     vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl);
936                     vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl);
937 
938                     vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0);
939                     vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1);
940                     vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2);
941                     vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3);
942                     vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0);
943                     vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1);
944                     vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2);
945                     vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3);
946                     vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl);
947 
948                     r0 += vl * 4;
949                     r1 += vl * 4;
950                     outptr += vl * 8;
951                     n -= vl;
952                 }
953 #else  // __riscv_vector
954                 for (int i = 0; i < size; i++)
955                 {
956                     outptr[0] = r0[0];
957                     outptr[1] = r0[1];
958                     outptr[2] = r0[2];
959                     outptr[3] = r0[3];
960                     outptr[4] = r1[0];
961                     outptr[5] = r1[1];
962                     outptr[6] = r1[2];
963                     outptr[7] = r1[3];
964 
965                     r0 += 4;
966                     r1 += 4;
967                     outptr += 8;
968                 }
969 #endif // __riscv_vector
970             }
971         }
972         if (pack8to4)
973         {
974             #pragma omp parallel for num_threads(opt.num_threads)
975             for (int q = 0; q < channels; q++)
976             {
977                 const unsigned short* r0 = bottom_blob.channel(q);
978 
979                 unsigned short* outptr0 = top_blob.channel(q * 2);
980                 unsigned short* outptr1 = top_blob.channel(q * 2 + 1);
981 
982 #if __riscv_vector
983                 int n = size;
984                 while (n > 0)
985                 {
986                     word_type vl = vsetvl_e16m1(n);
987 
988                     vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
989                     vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0);
990                     vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1);
991                     vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2);
992                     vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3);
993                     vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4);
994                     vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5);
995                     vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6);
996                     vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7);
997                     vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl);
998                     vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl);
999 
1000                     r0 += vl * 8;
1001                     outptr0 += vl * 4;
1002                     outptr1 += vl * 4;
1003                     n -= vl;
1004                 }
1005 #else  // __riscv_vector
1006                 for (int i = 0; i < size; i++)
1007                 {
1008                     outptr0[0] = r0[0];
1009                     outptr0[1] = r0[1];
1010                     outptr0[2] = r0[2];
1011                     outptr0[3] = r0[3];
1012                     outptr1[0] = r0[4];
1013                     outptr1[1] = r0[5];
1014                     outptr1[2] = r0[6];
1015                     outptr1[3] = r0[7];
1016 
1017                     r0 += 8;
1018                     outptr0 += 4;
1019                     outptr1 += 4;
1020                 }
1021 #endif // __riscv_vector
1022             }
1023         }
1024 
1025         return 0;
1026     }
1027 
1028     return 0;
1029 }
1030 
1031 } // namespace ncnn
1032