1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "packing_x86.h"
16
17 #include "x86_usability.h"
18
19 namespace ncnn {
20
Packing_x86()21 Packing_x86::Packing_x86()
22 {
23 support_packing = true;
24 }
25
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const26 int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
27 {
28 int elembits = bottom_blob.elembits();
29
30 if (elembits == 8)
31 return forward_int8(bottom_blob, top_blob, opt);
32
33 if (use_padding)
34 {
35 return Packing::forward(bottom_blob, top_blob, opt);
36 }
37
38 if (elembits != 32)
39 {
40 // non-fp32 type
41 return Packing::forward(bottom_blob, top_blob, opt);
42 }
43
44 size_t elemsize = bottom_blob.elemsize;
45 int elempack = bottom_blob.elempack;
46
47 if (elempack == out_elempack)
48 {
49 top_blob = bottom_blob;
50 return 0;
51 }
52
53 bool pack1to4 = elempack == 1 && out_elempack == 4;
54 bool pack4to1 = elempack == 4 && out_elempack == 1;
55 bool pack1to8 = elempack == 1 && out_elempack == 8;
56 bool pack8to1 = elempack == 8 && out_elempack == 1;
57 bool pack4to8 = elempack == 4 && out_elempack == 8;
58 bool pack8to4 = elempack == 8 && out_elempack == 4;
59
60 if (!pack1to4 && !pack4to1 && !pack1to8 && !pack8to1 && !pack4to8 && !pack8to4)
61 {
62 return Packing::forward(bottom_blob, top_blob, opt);
63 }
64
65 int w = bottom_blob.w;
66 int h = bottom_blob.h;
67 int channels = bottom_blob.c;
68 int dims = bottom_blob.dims;
69
70 if (!use_padding)
71 {
72 // identity if use_padding not allowed
73 if (dims == 1 && w * elempack % out_elempack != 0)
74 {
75 top_blob = bottom_blob;
76 return 0;
77 }
78 if (dims == 2 && h * elempack % out_elempack != 0)
79 {
80 top_blob = bottom_blob;
81 return 0;
82 }
83 if (dims == 3 && channels * elempack % out_elempack != 0)
84 {
85 top_blob = bottom_blob;
86 return 0;
87 }
88 }
89
90 if (dims == 1)
91 {
92 top_blob = bottom_blob;
93 top_blob.w = w * elempack / out_elempack;
94 top_blob.cstep = w * elempack / out_elempack;
95 top_blob.elemsize = elemsize / elempack * out_elempack;
96 top_blob.elempack = out_elempack;
97 return 0;
98 }
99
100 if (dims == 2)
101 {
102 int outh = h * elempack / out_elempack;
103 size_t out_elemsize = elemsize / elempack * out_elempack;
104
105 top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
106 if (top_blob.empty())
107 return -100;
108
109 if (pack1to4)
110 {
111 #pragma omp parallel for num_threads(opt.num_threads)
112 for (int i = 0; i < outh; i++)
113 {
114 const float* r0 = bottom_blob.row(i * 4);
115 const float* r1 = bottom_blob.row(i * 4 + 1);
116 const float* r2 = bottom_blob.row(i * 4 + 2);
117 const float* r3 = bottom_blob.row(i * 4 + 3);
118
119 float* outptr = top_blob.row(i);
120
121 for (int j = 0; j < w; j++)
122 {
123 outptr[0] = *r0++;
124 outptr[1] = *r1++;
125 outptr[2] = *r2++;
126 outptr[3] = *r3++;
127
128 outptr += 4;
129 }
130 }
131 }
132 if (pack4to1)
133 {
134 #pragma omp parallel for num_threads(opt.num_threads)
135 for (int i = 0; i < h; i++)
136 {
137 const float* r0 = bottom_blob.row(i);
138
139 float* outptr0 = top_blob.row(i * 4);
140 float* outptr1 = top_blob.row(i * 4 + 1);
141 float* outptr2 = top_blob.row(i * 4 + 2);
142 float* outptr3 = top_blob.row(i * 4 + 3);
143
144 for (int j = 0; j < w; j++)
145 {
146 *outptr0++ = r0[0];
147 *outptr1++ = r0[1];
148 *outptr2++ = r0[2];
149 *outptr3++ = r0[3];
150
151 r0 += 4;
152 }
153 }
154 }
155 if (pack1to8)
156 {
157 #pragma omp parallel for num_threads(opt.num_threads)
158 for (int i = 0; i < outh; i++)
159 {
160 const float* r0 = bottom_blob.row(i * 8);
161 const float* r1 = bottom_blob.row(i * 8 + 1);
162 const float* r2 = bottom_blob.row(i * 8 + 2);
163 const float* r3 = bottom_blob.row(i * 8 + 3);
164 const float* r4 = bottom_blob.row(i * 8 + 4);
165 const float* r5 = bottom_blob.row(i * 8 + 5);
166 const float* r6 = bottom_blob.row(i * 8 + 6);
167 const float* r7 = bottom_blob.row(i * 8 + 7);
168
169 float* outptr = top_blob.row(i);
170
171 #if __AVX__
172 int nn = w >> 3;
173 int remain = w & 7;
174 for (; nn > 0; nn--)
175 {
176 __m256 _row0 = _mm256_loadu_ps(r0);
177 __m256 _row1 = _mm256_loadu_ps(r1);
178 __m256 _row2 = _mm256_loadu_ps(r2);
179 __m256 _row3 = _mm256_loadu_ps(r3);
180 __m256 _row4 = _mm256_loadu_ps(r4);
181 __m256 _row5 = _mm256_loadu_ps(r5);
182 __m256 _row6 = _mm256_loadu_ps(r6);
183 __m256 _row7 = _mm256_loadu_ps(r7);
184 transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
185 _mm256_storeu_ps(outptr, _row0);
186 _mm256_storeu_ps(outptr + 8, _row1);
187 _mm256_storeu_ps(outptr + 16, _row2);
188 _mm256_storeu_ps(outptr + 24, _row3);
189 _mm256_storeu_ps(outptr + 32, _row4);
190 _mm256_storeu_ps(outptr + 40, _row5);
191 _mm256_storeu_ps(outptr + 48, _row6);
192 _mm256_storeu_ps(outptr + 56, _row7);
193 r0 += 8;
194 r1 += 8;
195 r2 += 8;
196 r3 += 8;
197 r4 += 8;
198 r5 += 8;
199 r6 += 8;
200 r7 += 8;
201 outptr += 64;
202 }
203 #else
204 int remain = w;
205 #endif
206
207 for (; remain > 0; remain--)
208 {
209 outptr[0] = *r0++;
210 outptr[1] = *r1++;
211 outptr[2] = *r2++;
212 outptr[3] = *r3++;
213 outptr[4] = *r4++;
214 outptr[5] = *r5++;
215 outptr[6] = *r6++;
216 outptr[7] = *r7++;
217
218 outptr += 8;
219 }
220 }
221 }
222 if (pack8to1)
223 {
224 #pragma omp parallel for num_threads(opt.num_threads)
225 for (int i = 0; i < h; i++)
226 {
227 const float* r0 = bottom_blob.row(i);
228
229 float* outptr0 = top_blob.row(i * 8);
230 float* outptr1 = top_blob.row(i * 8 + 1);
231 float* outptr2 = top_blob.row(i * 8 + 2);
232 float* outptr3 = top_blob.row(i * 8 + 3);
233 float* outptr4 = top_blob.row(i * 8 + 4);
234 float* outptr5 = top_blob.row(i * 8 + 5);
235 float* outptr6 = top_blob.row(i * 8 + 6);
236 float* outptr7 = top_blob.row(i * 8 + 7);
237 #if __AVX__
238 int nn = w >> 3;
239 int remain = w & 7;
240 #else
241 int remain = w;
242 #endif
243
244 #if __AVX__
245 for (; nn > 0; nn--)
246 {
247 __m256 _row0 = _mm256_loadu_ps(r0);
248 __m256 _row1 = _mm256_loadu_ps(r0 + 8);
249 __m256 _row2 = _mm256_loadu_ps(r0 + 16);
250 __m256 _row3 = _mm256_loadu_ps(r0 + 24);
251 __m256 _row4 = _mm256_loadu_ps(r0 + 32);
252 __m256 _row5 = _mm256_loadu_ps(r0 + 40);
253 __m256 _row6 = _mm256_loadu_ps(r0 + 48);
254 __m256 _row7 = _mm256_loadu_ps(r0 + 56);
255 transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
256 _mm256_storeu_ps(outptr0, _row0);
257 _mm256_storeu_ps(outptr1, _row1);
258 _mm256_storeu_ps(outptr2, _row2);
259 _mm256_storeu_ps(outptr3, _row3);
260 _mm256_storeu_ps(outptr4, _row4);
261 _mm256_storeu_ps(outptr5, _row5);
262 _mm256_storeu_ps(outptr6, _row6);
263 _mm256_storeu_ps(outptr7, _row7);
264
265 r0 += 64;
266 outptr0 += 8;
267 outptr1 += 8;
268 outptr2 += 8;
269 outptr3 += 8;
270 outptr4 += 8;
271 outptr5 += 8;
272 outptr6 += 8;
273 outptr7 += 8;
274 }
275 #endif
276 for (; remain > 0; remain--)
277 {
278 *outptr0++ = r0[0];
279 *outptr1++ = r0[1];
280 *outptr2++ = r0[2];
281 *outptr3++ = r0[3];
282 *outptr4++ = r0[4];
283 *outptr5++ = r0[5];
284 *outptr6++ = r0[6];
285 *outptr7++ = r0[7];
286
287 r0 += 8;
288 }
289 }
290 }
291 if (pack4to8)
292 {
293 #pragma omp parallel for num_threads(opt.num_threads)
294 for (int i = 0; i < outh; i++)
295 {
296 const float* r0 = bottom_blob.row(i * 2);
297 const float* r1 = bottom_blob.row(i * 2 + 1);
298
299 float* outptr = top_blob.row(i);
300
301 for (int j = 0; j < w; j++)
302 {
303 outptr[0] = r0[0];
304 outptr[1] = r0[1];
305 outptr[2] = r0[2];
306 outptr[3] = r0[3];
307 outptr[4] = r1[0];
308 outptr[5] = r1[1];
309 outptr[6] = r1[2];
310 outptr[7] = r1[3];
311
312 r0 += 4;
313 r1 += 4;
314 outptr += 8;
315 }
316 }
317 }
318 if (pack8to4)
319 {
320 #pragma omp parallel for num_threads(opt.num_threads)
321 for (int i = 0; i < h; i++)
322 {
323 const float* r0 = bottom_blob.row(i);
324
325 float* outptr0 = top_blob.row(i * 2);
326 float* outptr1 = top_blob.row(i * 2 + 1);
327
328 for (int j = 0; j < w; j++)
329 {
330 outptr0[0] = r0[0];
331 outptr0[1] = r0[1];
332 outptr0[2] = r0[2];
333 outptr0[3] = r0[3];
334 outptr1[0] = r0[4];
335 outptr1[1] = r0[5];
336 outptr1[2] = r0[6];
337 outptr1[3] = r0[7];
338
339 r0 += 8;
340 outptr0 += 4;
341 outptr1 += 4;
342 }
343 }
344 }
345
346 return 0;
347 }
348
349 if (dims == 3)
350 {
351 int size = w * h;
352 int outc = channels * elempack / out_elempack;
353 size_t out_elemsize = elemsize / elempack * out_elempack;
354
355 top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
356 if (top_blob.empty())
357 return -100;
358
359 if (pack1to4)
360 {
361 #pragma omp parallel for num_threads(opt.num_threads)
362 for (int q = 0; q < outc; q++)
363 {
364 const float* r0 = bottom_blob.channel(q * 4);
365 const float* r1 = bottom_blob.channel(q * 4 + 1);
366 const float* r2 = bottom_blob.channel(q * 4 + 2);
367 const float* r3 = bottom_blob.channel(q * 4 + 3);
368
369 float* outptr = top_blob.channel(q);
370
371 for (int i = 0; i < size; i++)
372 {
373 outptr[0] = *r0++;
374 outptr[1] = *r1++;
375 outptr[2] = *r2++;
376 outptr[3] = *r3++;
377
378 outptr += 4;
379 }
380 }
381 }
382 if (pack4to1)
383 {
384 #pragma omp parallel for num_threads(opt.num_threads)
385 for (int q = 0; q < channels; q++)
386 {
387 const float* r0 = bottom_blob.channel(q);
388
389 float* outptr0 = top_blob.channel(q * 4);
390 float* outptr1 = top_blob.channel(q * 4 + 1);
391 float* outptr2 = top_blob.channel(q * 4 + 2);
392 float* outptr3 = top_blob.channel(q * 4 + 3);
393
394 for (int i = 0; i < size; i++)
395 {
396 *outptr0++ = r0[0];
397 *outptr1++ = r0[1];
398 *outptr2++ = r0[2];
399 *outptr3++ = r0[3];
400
401 r0 += 4;
402 }
403 }
404 }
405 if (pack1to8)
406 {
407 #pragma omp parallel for num_threads(opt.num_threads)
408 for (int q = 0; q < outc; q++)
409 {
410 const float* r0 = bottom_blob.channel(q * 8);
411 const float* r1 = bottom_blob.channel(q * 8 + 1);
412 const float* r2 = bottom_blob.channel(q * 8 + 2);
413 const float* r3 = bottom_blob.channel(q * 8 + 3);
414 const float* r4 = bottom_blob.channel(q * 8 + 4);
415 const float* r5 = bottom_blob.channel(q * 8 + 5);
416 const float* r6 = bottom_blob.channel(q * 8 + 6);
417 const float* r7 = bottom_blob.channel(q * 8 + 7);
418
419 float* outptr = top_blob.channel(q);
420
421 #if __AVX__
422 int nn = size >> 3;
423 int remain = size & 7;
424 #else
425 int remain = size;
426 #endif
427
428 #if __AVX__
429 for (; nn > 0; nn--)
430 {
431 __m256 _row0 = _mm256_loadu_ps(r0);
432 __m256 _row1 = _mm256_loadu_ps(r1);
433 __m256 _row2 = _mm256_loadu_ps(r2);
434 __m256 _row3 = _mm256_loadu_ps(r3);
435 __m256 _row4 = _mm256_loadu_ps(r4);
436 __m256 _row5 = _mm256_loadu_ps(r5);
437 __m256 _row6 = _mm256_loadu_ps(r6);
438 __m256 _row7 = _mm256_loadu_ps(r7);
439 transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
440 _mm256_storeu_ps(outptr, _row0);
441 _mm256_storeu_ps(outptr + 8, _row1);
442 _mm256_storeu_ps(outptr + 16, _row2);
443 _mm256_storeu_ps(outptr + 24, _row3);
444 _mm256_storeu_ps(outptr + 32, _row4);
445 _mm256_storeu_ps(outptr + 40, _row5);
446 _mm256_storeu_ps(outptr + 48, _row6);
447 _mm256_storeu_ps(outptr + 56, _row7);
448 r0 += 8;
449 r1 += 8;
450 r2 += 8;
451 r3 += 8;
452 r4 += 8;
453 r5 += 8;
454 r6 += 8;
455 r7 += 8;
456 outptr += 64;
457 }
458 #endif
459 for (; remain > 0; remain--)
460 {
461 outptr[0] = *r0++;
462 outptr[1] = *r1++;
463 outptr[2] = *r2++;
464 outptr[3] = *r3++;
465 outptr[4] = *r4++;
466 outptr[5] = *r5++;
467 outptr[6] = *r6++;
468 outptr[7] = *r7++;
469
470 outptr += 8;
471 }
472 }
473 }
474 if (pack8to1)
475 {
476 #pragma omp parallel for num_threads(opt.num_threads)
477 for (int q = 0; q < channels; q++)
478 {
479 const float* r0 = bottom_blob.channel(q);
480
481 float* outptr0 = top_blob.channel(q * 8);
482 float* outptr1 = top_blob.channel(q * 8 + 1);
483 float* outptr2 = top_blob.channel(q * 8 + 2);
484 float* outptr3 = top_blob.channel(q * 8 + 3);
485 float* outptr4 = top_blob.channel(q * 8 + 4);
486 float* outptr5 = top_blob.channel(q * 8 + 5);
487 float* outptr6 = top_blob.channel(q * 8 + 6);
488 float* outptr7 = top_blob.channel(q * 8 + 7);
489 #if __AVX__
490 int nn = size >> 3;
491 int remain = size & 7;
492 #else
493 int remain = size;
494 #endif
495
496 #if __AVX__
497 for (; nn > 0; nn--)
498 {
499 __m256 _row0 = _mm256_loadu_ps(r0);
500 __m256 _row1 = _mm256_loadu_ps(r0 + 8);
501 __m256 _row2 = _mm256_loadu_ps(r0 + 16);
502 __m256 _row3 = _mm256_loadu_ps(r0 + 24);
503 __m256 _row4 = _mm256_loadu_ps(r0 + 32);
504 __m256 _row5 = _mm256_loadu_ps(r0 + 40);
505 __m256 _row6 = _mm256_loadu_ps(r0 + 48);
506 __m256 _row7 = _mm256_loadu_ps(r0 + 56);
507 transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
508 _mm256_storeu_ps(outptr0, _row0);
509 _mm256_storeu_ps(outptr1, _row1);
510 _mm256_storeu_ps(outptr2, _row2);
511 _mm256_storeu_ps(outptr3, _row3);
512 _mm256_storeu_ps(outptr4, _row4);
513 _mm256_storeu_ps(outptr5, _row5);
514 _mm256_storeu_ps(outptr6, _row6);
515 _mm256_storeu_ps(outptr7, _row7);
516
517 r0 += 64;
518 outptr0 += 8;
519 outptr1 += 8;
520 outptr2 += 8;
521 outptr3 += 8;
522 outptr4 += 8;
523 outptr5 += 8;
524 outptr6 += 8;
525 outptr7 += 8;
526 }
527 #endif
528
529 for (; remain > 0; remain--)
530 {
531 *outptr0++ = r0[0];
532 *outptr1++ = r0[1];
533 *outptr2++ = r0[2];
534 *outptr3++ = r0[3];
535 *outptr4++ = r0[4];
536 *outptr5++ = r0[5];
537 *outptr6++ = r0[6];
538 *outptr7++ = r0[7];
539
540 r0 += 8;
541 }
542 }
543 }
544 if (pack4to8)
545 {
546 #pragma omp parallel for num_threads(opt.num_threads)
547 for (int q = 0; q < outc; q++)
548 {
549 const float* r0 = bottom_blob.channel(q * 2);
550 const float* r1 = bottom_blob.channel(q * 2 + 1);
551
552 float* outptr = top_blob.channel(q);
553
554 for (int i = 0; i < size; i++)
555 {
556 outptr[0] = r0[0];
557 outptr[1] = r0[1];
558 outptr[2] = r0[2];
559 outptr[3] = r0[3];
560 outptr[4] = r1[0];
561 outptr[5] = r1[1];
562 outptr[6] = r1[2];
563 outptr[7] = r1[3];
564
565 r0 += 4;
566 r1 += 4;
567 outptr += 8;
568 }
569 }
570 }
571 if (pack8to4)
572 {
573 #pragma omp parallel for num_threads(opt.num_threads)
574 for (int q = 0; q < channels; q++)
575 {
576 const float* r0 = bottom_blob.channel(q);
577
578 float* outptr0 = top_blob.channel(q * 2);
579 float* outptr1 = top_blob.channel(q * 2 + 1);
580
581 for (int i = 0; i < size; i++)
582 {
583 outptr0[0] = r0[0];
584 outptr0[1] = r0[1];
585 outptr0[2] = r0[2];
586 outptr0[3] = r0[3];
587 outptr1[0] = r0[4];
588 outptr1[1] = r0[5];
589 outptr1[2] = r0[6];
590 outptr1[3] = r0[7];
591
592 r0 += 8;
593 outptr0 += 4;
594 outptr1 += 4;
595 }
596 }
597 }
598
599 return 0;
600 }
601
602 return 0;
603 }
604
forward_int8(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const605 int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
606 {
607 if (use_padding)
608 {
609 return Packing::forward(bottom_blob, top_blob, opt);
610 }
611
612 size_t elemsize = bottom_blob.elemsize;
613 int elempack = bottom_blob.elempack;
614
615 if (elempack == out_elempack)
616 {
617 top_blob = bottom_blob;
618 return 0;
619 }
620
621 bool pack1to8 = elempack == 1 && out_elempack == 8;
622 bool pack8to1 = elempack == 8 && out_elempack == 1;
623
624 if (!pack1to8 && !pack8to1)
625 {
626 return Packing::forward(bottom_blob, top_blob, opt);
627 }
628
629 int w = bottom_blob.w;
630 int h = bottom_blob.h;
631 int channels = bottom_blob.c;
632 int dims = bottom_blob.dims;
633
634 if (!use_padding)
635 {
636 // identity if use_padding not allowed
637 if (dims == 1 && w * elempack % out_elempack != 0)
638 {
639 top_blob = bottom_blob;
640 return 0;
641 }
642 if (dims == 2 && h * elempack % out_elempack != 0)
643 {
644 top_blob = bottom_blob;
645 return 0;
646 }
647 if (dims == 3 && channels * elempack % out_elempack != 0)
648 {
649 top_blob = bottom_blob;
650 return 0;
651 }
652 }
653
654 if (dims == 1)
655 {
656 top_blob = bottom_blob;
657 top_blob.w = w * elempack / out_elempack;
658 top_blob.cstep = w * elempack / out_elempack;
659 top_blob.elemsize = elemsize / elempack * out_elempack;
660 top_blob.elempack = out_elempack;
661 return 0;
662 }
663
664 if (dims == 2)
665 {
666 int outh = h * elempack / out_elempack;
667 size_t out_elemsize = elemsize / elempack * out_elempack;
668
669 top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
670 if (top_blob.empty())
671 return -100;
672
673 if (pack1to8)
674 {
675 #pragma omp parallel for num_threads(opt.num_threads)
676 for (int i = 0; i < outh; i++)
677 {
678 const signed char* r0 = bottom_blob.row<const signed char>(i * 8);
679 const signed char* r1 = bottom_blob.row<const signed char>(i * 8 + 1);
680 const signed char* r2 = bottom_blob.row<const signed char>(i * 8 + 2);
681 const signed char* r3 = bottom_blob.row<const signed char>(i * 8 + 3);
682 const signed char* r4 = bottom_blob.row<const signed char>(i * 8 + 4);
683 const signed char* r5 = bottom_blob.row<const signed char>(i * 8 + 5);
684 const signed char* r6 = bottom_blob.row<const signed char>(i * 8 + 6);
685 const signed char* r7 = bottom_blob.row<const signed char>(i * 8 + 7);
686
687 signed char* outptr = top_blob.row<signed char>(i);
688
689 int j = 0;
690 for (; j < w; j++)
691 {
692 outptr[0] = *r0++;
693 outptr[1] = *r1++;
694 outptr[2] = *r2++;
695 outptr[3] = *r3++;
696 outptr[4] = *r4++;
697 outptr[5] = *r5++;
698 outptr[6] = *r6++;
699 outptr[7] = *r7++;
700
701 outptr += 8;
702 }
703 }
704 }
705 if (pack8to1)
706 {
707 #pragma omp parallel for num_threads(opt.num_threads)
708 for (int i = 0; i < h; i++)
709 {
710 const signed char* r0 = bottom_blob.row<const signed char>(i);
711
712 signed char* outptr0 = top_blob.row<signed char>(i * 8);
713 signed char* outptr1 = top_blob.row<signed char>(i * 8 + 1);
714 signed char* outptr2 = top_blob.row<signed char>(i * 8 + 2);
715 signed char* outptr3 = top_blob.row<signed char>(i * 8 + 3);
716 signed char* outptr4 = top_blob.row<signed char>(i * 8 + 4);
717 signed char* outptr5 = top_blob.row<signed char>(i * 8 + 5);
718 signed char* outptr6 = top_blob.row<signed char>(i * 8 + 6);
719 signed char* outptr7 = top_blob.row<signed char>(i * 8 + 7);
720
721 int j = 0;
722 for (; j < w; j++)
723 {
724 *outptr0++ = r0[0];
725 *outptr1++ = r0[1];
726 *outptr2++ = r0[2];
727 *outptr3++ = r0[3];
728 *outptr4++ = r0[4];
729 *outptr5++ = r0[5];
730 *outptr6++ = r0[6];
731 *outptr7++ = r0[7];
732
733 r0 += 8;
734 }
735 }
736 }
737
738 return 0;
739 }
740
741 if (dims == 3)
742 {
743 int size = w * h;
744 int outc = channels * elempack / out_elempack;
745 size_t out_elemsize = elemsize / elempack * out_elempack;
746
747 top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
748 if (top_blob.empty())
749 return -100;
750
751 if (pack1to8)
752 {
753 #pragma omp parallel for num_threads(opt.num_threads)
754 for (int q = 0; q < outc; q++)
755 {
756 const signed char* r0 = bottom_blob.channel(q * 8);
757 const signed char* r1 = bottom_blob.channel(q * 8 + 1);
758 const signed char* r2 = bottom_blob.channel(q * 8 + 2);
759 const signed char* r3 = bottom_blob.channel(q * 8 + 3);
760 const signed char* r4 = bottom_blob.channel(q * 8 + 4);
761 const signed char* r5 = bottom_blob.channel(q * 8 + 5);
762 const signed char* r6 = bottom_blob.channel(q * 8 + 6);
763 const signed char* r7 = bottom_blob.channel(q * 8 + 7);
764
765 signed char* outptr = top_blob.channel(q);
766
767 int i = 0;
768 for (; i < size; i++)
769 {
770 outptr[0] = *r0++;
771 outptr[1] = *r1++;
772 outptr[2] = *r2++;
773 outptr[3] = *r3++;
774 outptr[4] = *r4++;
775 outptr[5] = *r5++;
776 outptr[6] = *r6++;
777 outptr[7] = *r7++;
778
779 outptr += 8;
780 }
781 }
782 }
783 if (pack8to1)
784 {
785 #pragma omp parallel for num_threads(opt.num_threads)
786 for (int q = 0; q < channels; q++)
787 {
788 const signed char* r0 = bottom_blob.channel(q);
789
790 signed char* outptr0 = top_blob.channel(q * 8);
791 signed char* outptr1 = top_blob.channel(q * 8 + 1);
792 signed char* outptr2 = top_blob.channel(q * 8 + 2);
793 signed char* outptr3 = top_blob.channel(q * 8 + 3);
794 signed char* outptr4 = top_blob.channel(q * 8 + 4);
795 signed char* outptr5 = top_blob.channel(q * 8 + 5);
796 signed char* outptr6 = top_blob.channel(q * 8 + 6);
797 signed char* outptr7 = top_blob.channel(q * 8 + 7);
798
799 int i = 0;
800 for (; i < size; i++)
801 {
802 *outptr0++ = r0[0];
803 *outptr1++ = r0[1];
804 *outptr2++ = r0[2];
805 *outptr3++ = r0[3];
806 *outptr4++ = r0[4];
807 *outptr5++ = r0[5];
808 *outptr6++ = r0[6];
809 *outptr7++ = r0[7];
810
811 r0 += 8;
812 }
813 }
814 }
815
816 return 0;
817 }
818
819 return 0;
820 }
821
822 } // namespace ncnn
823