// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2021 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

15 #include "packing_riscv.h"
16
17 #if __riscv_vector
18 #ifdef RVV_SPEC_0_7
19 #include "riscv_v_071_fix.h"
20 #else
21 #include <riscv_vector.h>
22 #endif
23 #endif // __riscv_vector
24
25 namespace ncnn {
26
Packing_riscv()27 Packing_riscv::Packing_riscv()
28 {
29 support_packing = true;
30 support_fp16_storage = true;
31 support_bf16_storage = true;
32 }
33
// Repack a 32-bit (fp32) blob between elempack layouts.
//
// Dispatch order:
//   - 16-bit element storage (fp16 or bf16) is routed to the bit-exact
//     16-bit repacking routine forward_bf16s_fp16s.
//   - use_padding and non-fp32 data fall back to the generic
//     Packing::forward reference implementation.
//   - Only 1<->4 repacking is specialized here; any other conversion
//     also falls back to Packing::forward.
//
// @param bottom_blob  input blob (dims 1..3)
// @param top_blob     output blob, written with elempack == out_elempack
// @param opt          runtime options (thread count, blob allocator, storage flags)
// @return 0 on success, -100 on allocation failure
int Packing_riscv::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int elembits = bottom_blob.elembits();

    // fp16 and bf16 share the same 16-bit bit-copy repacking path
    if (opt.use_fp16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);

    if (opt.use_bf16_storage && elembits == 16)
        return forward_bf16s_fp16s(bottom_blob, top_blob, opt);

    if (use_padding)
    {
        // padding semantics are handled by the reference implementation
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    if (elembits != 32)
    {
        // non-fp32 type
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        // already in the requested layout - shallow copy
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to4 = elempack == 1 && out_elempack == 4;
    bool pack4to1 = elempack == 4 && out_elempack == 1;

    if (!pack1to4 && !pack4to1)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding) // NOTE: always true here (use_padding returned above); kept for parity with the 16-bit path
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 3 && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        // 1-D data is contiguous regardless of packing, so only the
        // shape/elemsize metadata needs to change - no data movement.
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = w * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            // interleave 4 consecutive rows into one pack-4 row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const float* r0 = bottom_blob.row(i * 4);
                const float* r1 = bottom_blob.row(i * 4 + 1);
                const float* r2 = bottom_blob.row(i * 4 + 2);
                const float* r3 = bottom_blob.row(i * 4 + 3);

                float* outptr = top_blob.row(i);

#if __riscv_vector
                // strip-mine over w; vsseg4 stores the 4 lanes interleaved
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e32m2(n);

                    vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl);
                    vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl);
                    vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl);
                    vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl);
                    vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    outptr += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to1)
        {
            // de-interleave one pack-4 row into 4 consecutive rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const float* r0 = bottom_blob.row(i);

                float* outptr0 = top_blob.row(i * 4);
                float* outptr1 = top_blob.row(i * 4 + 1);
                float* outptr2 = top_blob.row(i * 4 + 2);
                float* outptr3 = top_blob.row(i * 4 + 3);

#if __riscv_vector
                // vlseg4 loads the interleaved lanes back into 4 registers
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e32m2(n);

                    vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl);
                    vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl);
                    vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl);
                    vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl);
                    vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl);

                    r0 += vl * 4;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
#endif // __riscv_vector
            }
        }

        return 0;
    }

    if (dims == 3)
    {
        int size = w * h;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            // interleave 4 consecutive channels into one pack-4 channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const float* r0 = bottom_blob.channel(q * 4);
                const float* r1 = bottom_blob.channel(q * 4 + 1);
                const float* r2 = bottom_blob.channel(q * 4 + 2);
                const float* r3 = bottom_blob.channel(q * 4 + 3);

                float* outptr = top_blob.channel(q);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e32m2(n);

                    vfloat32m2_t _p0 = vle32_v_f32m2(r0, vl);
                    vfloat32m2_t _p1 = vle32_v_f32m2(r1, vl);
                    vfloat32m2_t _p2 = vle32_v_f32m2(r2, vl);
                    vfloat32m2_t _p3 = vle32_v_f32m2(r3, vl);
                    vsseg4e32_v_f32m2x4(outptr, vcreate_f32m2x4(_p0, _p1, _p2, _p3), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    outptr += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to1)
        {
            // de-interleave one pack-4 channel into 4 consecutive channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const float* r0 = bottom_blob.channel(q);

                float* outptr0 = top_blob.channel(q * 4);
                float* outptr1 = top_blob.channel(q * 4 + 1);
                float* outptr2 = top_blob.channel(q * 4 + 2);
                float* outptr3 = top_blob.channel(q * 4 + 3);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e32m2(n);

                    vfloat32m2x4_t _p = vlseg4e32_v_f32m2x4(r0, vl);
                    vse32_v_f32m2(outptr0, vget_f32m2x4_f32m2(_p, 0), vl);
                    vse32_v_f32m2(outptr1, vget_f32m2x4_f32m2(_p, 1), vl);
                    vse32_v_f32m2(outptr2, vget_f32m2x4_f32m2(_p, 2), vl);
                    vse32_v_f32m2(outptr3, vget_f32m2x4_f32m2(_p, 3), vl);

                    r0 += vl * 4;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
#endif // __riscv_vector
            }
        }

        return 0;
    }

    return 0;
}
312
// Repack a 16-bit blob (fp16 or bf16 storage) between elempack layouts.
//
// Works purely on the 16-bit patterns (unsigned short), so the same code
// serves both fp16 and bf16 data. Handles 1<->4, 1<->8 and 4<->8
// conversions; use_padding and any other conversion fall back to the
// generic Packing::forward.
//
// @param bottom_blob  input blob (dims 1..3), 16-bit elements
// @param top_blob     output blob, written with elempack == out_elempack
// @param opt          runtime options (thread count, blob allocator)
// @return 0 on success, -100 on allocation failure
int Packing_riscv::forward_bf16s_fp16s(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    if (use_padding)
    {
        // padding semantics are handled by the reference implementation
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    if (elempack == out_elempack)
    {
        // already in the requested layout - shallow copy
        top_blob = bottom_blob;
        return 0;
    }

    bool pack1to4 = elempack == 1 && out_elempack == 4;
    bool pack4to1 = elempack == 4 && out_elempack == 1;
    bool pack1to8 = elempack == 1 && out_elempack == 8;
    bool pack8to1 = elempack == 8 && out_elempack == 1;
    bool pack4to8 = elempack == 4 && out_elempack == 8;
    bool pack8to4 = elempack == 8 && out_elempack == 4;

    if (!pack1to4 && !pack4to1 && !pack1to8 && !pack8to1 && !pack4to8 && !pack8to4)
    {
        return Packing::forward(bottom_blob, top_blob, opt);
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    int dims = bottom_blob.dims;

    if (!use_padding) // NOTE: always true here (use_padding returned above)
    {
        // identity if use_padding not allowed
        if (dims == 1 && w * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 2 && h * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
        if (dims == 3 && channels * elempack % out_elempack != 0)
        {
            top_blob = bottom_blob;
            return 0;
        }
    }

    if (dims == 1)
    {
        // 1-D data is contiguous regardless of packing, so only the
        // shape/elemsize metadata needs to change - no data movement.
        top_blob = bottom_blob;
        top_blob.w = w * elempack / out_elempack;
        top_blob.cstep = w * elempack / out_elempack;
        top_blob.elemsize = elemsize / elempack * out_elempack;
        top_blob.elempack = out_elempack;
        return 0;
    }

    if (dims == 2)
    {
        int outh = h * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            // interleave 4 consecutive rows into one pack-4 row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 4);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 4 + 1);
                const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 4 + 2);
                const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 4 + 3);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

#if __riscv_vector
                // strip-mine over w; vsseg4 stores the 4 lanes interleaved
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m2(n);

                    vuint16m2_t _p0 = vle16_v_u16m2(r0, vl);
                    vuint16m2_t _p1 = vle16_v_u16m2(r1, vl);
                    vuint16m2_t _p2 = vle16_v_u16m2(r2, vl);
                    vuint16m2_t _p3 = vle16_v_u16m2(r3, vl);
                    vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    outptr += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to1)
        {
            // de-interleave one pack-4 row into 4 consecutive rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 4);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 4 + 1);
                unsigned short* outptr2 = top_blob.row<unsigned short>(i * 4 + 2);
                unsigned short* outptr3 = top_blob.row<unsigned short>(i * 4 + 3);

#if __riscv_vector
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m2(n);

                    vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl);
                    vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl);
                    vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl);
                    vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl);
                    vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl);

                    r0 += vl * 4;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack1to8)
        {
            // interleave 8 consecutive rows into one pack-8 row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 8);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 8 + 1);
                const unsigned short* r2 = bottom_blob.row<const unsigned short>(i * 8 + 2);
                const unsigned short* r3 = bottom_blob.row<const unsigned short>(i * 8 + 3);
                const unsigned short* r4 = bottom_blob.row<const unsigned short>(i * 8 + 4);
                const unsigned short* r5 = bottom_blob.row<const unsigned short>(i * 8 + 5);
                const unsigned short* r6 = bottom_blob.row<const unsigned short>(i * 8 + 6);
                const unsigned short* r7 = bottom_blob.row<const unsigned short>(i * 8 + 7);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

#if __riscv_vector
                // 8-way segment stores need LMUL=1 registers (m1)
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1_t _p0 = vle16_v_u16m1(r0, vl);
                    vuint16m1_t _p1 = vle16_v_u16m1(r1, vl);
                    vuint16m1_t _p2 = vle16_v_u16m1(r2, vl);
                    vuint16m1_t _p3 = vle16_v_u16m1(r3, vl);
                    vuint16m1_t _p4 = vle16_v_u16m1(r4, vl);
                    vuint16m1_t _p5 = vle16_v_u16m1(r5, vl);
                    vuint16m1_t _p6 = vle16_v_u16m1(r6, vl);
                    vuint16m1_t _p7 = vle16_v_u16m1(r7, vl);
                    vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    r4 += vl;
                    r5 += vl;
                    r6 += vl;
                    r7 += vl;
                    outptr += vl * 8;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack8to1)
        {
            // de-interleave one pack-8 row into 8 consecutive rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 8);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 8 + 1);
                unsigned short* outptr2 = top_blob.row<unsigned short>(i * 8 + 2);
                unsigned short* outptr3 = top_blob.row<unsigned short>(i * 8 + 3);
                unsigned short* outptr4 = top_blob.row<unsigned short>(i * 8 + 4);
                unsigned short* outptr5 = top_blob.row<unsigned short>(i * 8 + 5);
                unsigned short* outptr6 = top_blob.row<unsigned short>(i * 8 + 6);
                unsigned short* outptr7 = top_blob.row<unsigned short>(i * 8 + 7);

#if __riscv_vector
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
                    vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl);
                    vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl);
                    vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl);
                    vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl);
                    vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl);
                    vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl);
                    vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl);
                    vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl);

                    r0 += vl * 8;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    outptr4 += vl;
                    outptr5 += vl;
                    outptr6 += vl;
                    outptr7 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to8)
        {
            // merge 2 pack-4 rows into one pack-8 row
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < outh; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i * 2);
                const unsigned short* r1 = bottom_blob.row<const unsigned short>(i * 2 + 1);

                unsigned short* outptr = top_blob.row<unsigned short>(i);

#if __riscv_vector
                // de-interleave each pack-4 row, then re-interleave 8-wide
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl);
                    vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl);
                    vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0);
                    vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1);
                    vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2);
                    vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3);
                    vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0);
                    vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1);
                    vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2);
                    vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3);
                    vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl);

                    r0 += vl * 4;
                    r1 += vl * 4;
                    outptr += vl * 8;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    outptr[0] = r0[0];
                    outptr[1] = r0[1];
                    outptr[2] = r0[2];
                    outptr[3] = r0[3];
                    outptr[4] = r1[0];
                    outptr[5] = r1[1];
                    outptr[6] = r1[2];
                    outptr[7] = r1[3];

                    r0 += 4;
                    r1 += 4;
                    outptr += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack8to4)
        {
            // split one pack-8 row into 2 pack-4 rows
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int i = 0; i < h; i++)
            {
                const unsigned short* r0 = bottom_blob.row<const unsigned short>(i);

                unsigned short* outptr0 = top_blob.row<unsigned short>(i * 2);
                unsigned short* outptr1 = top_blob.row<unsigned short>(i * 2 + 1);

#if __riscv_vector
                int n = w;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
                    vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0);
                    vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1);
                    vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2);
                    vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3);
                    vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4);
                    vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5);
                    vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6);
                    vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7);
                    vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl);
                    vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl);

                    r0 += vl * 8;
                    outptr0 += vl * 4;
                    outptr1 += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int j = 0; j < w; j++)
                {
                    outptr0[0] = r0[0];
                    outptr0[1] = r0[1];
                    outptr0[2] = r0[2];
                    outptr0[3] = r0[3];
                    outptr1[0] = r0[4];
                    outptr1[1] = r0[5];
                    outptr1[2] = r0[6];
                    outptr1[3] = r0[7];

                    r0 += 8;
                    outptr0 += 4;
                    outptr1 += 4;
                }
#endif // __riscv_vector
            }
        }

        return 0;
    }

    if (dims == 3)
    {
        int size = w * h;
        int outc = channels * elempack / out_elempack;
        size_t out_elemsize = elemsize / elempack * out_elempack;

        top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

        if (pack1to4)
        {
            // interleave 4 consecutive channels into one pack-4 channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 4);
                const unsigned short* r1 = bottom_blob.channel(q * 4 + 1);
                const unsigned short* r2 = bottom_blob.channel(q * 4 + 2);
                const unsigned short* r3 = bottom_blob.channel(q * 4 + 3);

                unsigned short* outptr = top_blob.channel(q);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m2(n);

                    vuint16m2_t _p0 = vle16_v_u16m2(r0, vl);
                    vuint16m2_t _p1 = vle16_v_u16m2(r1, vl);
                    vuint16m2_t _p2 = vle16_v_u16m2(r2, vl);
                    vuint16m2_t _p3 = vle16_v_u16m2(r3, vl);
                    vsseg4e16_v_u16m2x4(outptr, vcreate_u16m2x4(_p0, _p1, _p2, _p3), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    outptr += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;

                    outptr += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to1)
        {
            // de-interleave one pack-4 channel into 4 consecutive channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 4);
                unsigned short* outptr1 = top_blob.channel(q * 4 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 4 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 4 + 3);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m2(n);

                    vuint16m2x4_t _p = vlseg4e16_v_u16m2x4(r0, vl);
                    vse16_v_u16m2(outptr0, vget_u16m2x4_u16m2(_p, 0), vl);
                    vse16_v_u16m2(outptr1, vget_u16m2x4_u16m2(_p, 1), vl);
                    vse16_v_u16m2(outptr2, vget_u16m2x4_u16m2(_p, 2), vl);
                    vse16_v_u16m2(outptr3, vget_u16m2x4_u16m2(_p, 3), vl);

                    r0 += vl * 4;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];

                    r0 += 4;
                }
#endif // __riscv_vector
            }
        }
        if (pack1to8)
        {
            // interleave 8 consecutive channels into one pack-8 channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 8);
                const unsigned short* r1 = bottom_blob.channel(q * 8 + 1);
                const unsigned short* r2 = bottom_blob.channel(q * 8 + 2);
                const unsigned short* r3 = bottom_blob.channel(q * 8 + 3);
                const unsigned short* r4 = bottom_blob.channel(q * 8 + 4);
                const unsigned short* r5 = bottom_blob.channel(q * 8 + 5);
                const unsigned short* r6 = bottom_blob.channel(q * 8 + 6);
                const unsigned short* r7 = bottom_blob.channel(q * 8 + 7);

                unsigned short* outptr = top_blob.channel(q);

#if __riscv_vector
                // 8-way segment stores need LMUL=1 registers (m1)
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1_t _p0 = vle16_v_u16m1(r0, vl);
                    vuint16m1_t _p1 = vle16_v_u16m1(r1, vl);
                    vuint16m1_t _p2 = vle16_v_u16m1(r2, vl);
                    vuint16m1_t _p3 = vle16_v_u16m1(r3, vl);
                    vuint16m1_t _p4 = vle16_v_u16m1(r4, vl);
                    vuint16m1_t _p5 = vle16_v_u16m1(r5, vl);
                    vuint16m1_t _p6 = vle16_v_u16m1(r6, vl);
                    vuint16m1_t _p7 = vle16_v_u16m1(r7, vl);
                    vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p0, _p1, _p2, _p3, _p4, _p5, _p6, _p7), vl);

                    r0 += vl;
                    r1 += vl;
                    r2 += vl;
                    r3 += vl;
                    r4 += vl;
                    r5 += vl;
                    r6 += vl;
                    r7 += vl;
                    outptr += vl * 8;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    outptr[0] = *r0++;
                    outptr[1] = *r1++;
                    outptr[2] = *r2++;
                    outptr[3] = *r3++;
                    outptr[4] = *r4++;
                    outptr[5] = *r5++;
                    outptr[6] = *r6++;
                    outptr[7] = *r7++;

                    outptr += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack8to1)
        {
            // de-interleave one pack-8 channel into 8 consecutive channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 8);
                unsigned short* outptr1 = top_blob.channel(q * 8 + 1);
                unsigned short* outptr2 = top_blob.channel(q * 8 + 2);
                unsigned short* outptr3 = top_blob.channel(q * 8 + 3);
                unsigned short* outptr4 = top_blob.channel(q * 8 + 4);
                unsigned short* outptr5 = top_blob.channel(q * 8 + 5);
                unsigned short* outptr6 = top_blob.channel(q * 8 + 6);
                unsigned short* outptr7 = top_blob.channel(q * 8 + 7);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
                    vse16_v_u16m1(outptr0, vget_u16m1x8_u16m1(_p, 0), vl);
                    vse16_v_u16m1(outptr1, vget_u16m1x8_u16m1(_p, 1), vl);
                    vse16_v_u16m1(outptr2, vget_u16m1x8_u16m1(_p, 2), vl);
                    vse16_v_u16m1(outptr3, vget_u16m1x8_u16m1(_p, 3), vl);
                    vse16_v_u16m1(outptr4, vget_u16m1x8_u16m1(_p, 4), vl);
                    vse16_v_u16m1(outptr5, vget_u16m1x8_u16m1(_p, 5), vl);
                    vse16_v_u16m1(outptr6, vget_u16m1x8_u16m1(_p, 6), vl);
                    vse16_v_u16m1(outptr7, vget_u16m1x8_u16m1(_p, 7), vl);

                    r0 += vl * 8;
                    outptr0 += vl;
                    outptr1 += vl;
                    outptr2 += vl;
                    outptr3 += vl;
                    outptr4 += vl;
                    outptr5 += vl;
                    outptr6 += vl;
                    outptr7 += vl;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    *outptr0++ = r0[0];
                    *outptr1++ = r0[1];
                    *outptr2++ = r0[2];
                    *outptr3++ = r0[3];
                    *outptr4++ = r0[4];
                    *outptr5++ = r0[5];
                    *outptr6++ = r0[6];
                    *outptr7++ = r0[7];

                    r0 += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack4to8)
        {
            // merge 2 pack-4 channels into one pack-8 channel
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < outc; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q * 2);
                const unsigned short* r1 = bottom_blob.channel(q * 2 + 1);

                unsigned short* outptr = top_blob.channel(q);

#if __riscv_vector
                // de-interleave each pack-4 channel, then re-interleave 8-wide
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x4_t _p0 = vlseg4e16_v_u16m1x4(r0, vl);
                    vuint16m1x4_t _p1 = vlseg4e16_v_u16m1x4(r1, vl);

                    vuint16m1_t _p00 = vget_u16m1x4_u16m1(_p0, 0);
                    vuint16m1_t _p01 = vget_u16m1x4_u16m1(_p0, 1);
                    vuint16m1_t _p02 = vget_u16m1x4_u16m1(_p0, 2);
                    vuint16m1_t _p03 = vget_u16m1x4_u16m1(_p0, 3);
                    vuint16m1_t _p10 = vget_u16m1x4_u16m1(_p1, 0);
                    vuint16m1_t _p11 = vget_u16m1x4_u16m1(_p1, 1);
                    vuint16m1_t _p12 = vget_u16m1x4_u16m1(_p1, 2);
                    vuint16m1_t _p13 = vget_u16m1x4_u16m1(_p1, 3);
                    vsseg8e16_v_u16m1x8(outptr, vcreate_u16m1x8(_p00, _p01, _p02, _p03, _p10, _p11, _p12, _p13), vl);

                    r0 += vl * 4;
                    r1 += vl * 4;
                    outptr += vl * 8;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    outptr[0] = r0[0];
                    outptr[1] = r0[1];
                    outptr[2] = r0[2];
                    outptr[3] = r0[3];
                    outptr[4] = r1[0];
                    outptr[5] = r1[1];
                    outptr[6] = r1[2];
                    outptr[7] = r1[3];

                    r0 += 4;
                    r1 += 4;
                    outptr += 8;
                }
#endif // __riscv_vector
            }
        }
        if (pack8to4)
        {
            // split one pack-8 channel into 2 pack-4 channels
            #pragma omp parallel for num_threads(opt.num_threads)
            for (int q = 0; q < channels; q++)
            {
                const unsigned short* r0 = bottom_blob.channel(q);

                unsigned short* outptr0 = top_blob.channel(q * 2);
                unsigned short* outptr1 = top_blob.channel(q * 2 + 1);

#if __riscv_vector
                int n = size;
                while (n > 0)
                {
                    word_type vl = vsetvl_e16m1(n);

                    vuint16m1x8_t _p = vlseg8e16_v_u16m1x8(r0, vl);
                    vuint16m1_t _p0 = vget_u16m1x8_u16m1(_p, 0);
                    vuint16m1_t _p1 = vget_u16m1x8_u16m1(_p, 1);
                    vuint16m1_t _p2 = vget_u16m1x8_u16m1(_p, 2);
                    vuint16m1_t _p3 = vget_u16m1x8_u16m1(_p, 3);
                    vuint16m1_t _p4 = vget_u16m1x8_u16m1(_p, 4);
                    vuint16m1_t _p5 = vget_u16m1x8_u16m1(_p, 5);
                    vuint16m1_t _p6 = vget_u16m1x8_u16m1(_p, 6);
                    vuint16m1_t _p7 = vget_u16m1x8_u16m1(_p, 7);
                    vsseg4e16_v_u16m1x4(outptr0, vcreate_u16m1x4(_p0, _p1, _p2, _p3), vl);
                    vsseg4e16_v_u16m1x4(outptr1, vcreate_u16m1x4(_p4, _p5, _p6, _p7), vl);

                    r0 += vl * 8;
                    outptr0 += vl * 4;
                    outptr1 += vl * 4;
                    n -= vl;
                }
#else  // __riscv_vector
                for (int i = 0; i < size; i++)
                {
                    outptr0[0] = r0[0];
                    outptr0[1] = r0[1];
                    outptr0[2] = r0[2];
                    outptr0[3] = r0[3];
                    outptr1[0] = r0[4];
                    outptr1[1] = r0[5];
                    outptr1[2] = r0[6];
                    outptr1[3] = r0[7];

                    r0 += 8;
                    outptr0 += 4;
                    outptr1 += 4;
                }
#endif // __riscv_vector
            }
        }

        return 0;
    }

    return 0;
}
1030
1031 } // namespace ncnn
1032