1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "packing_x86.h"
16 
17 #include "x86_usability.h"
18 
19 namespace ncnn {
20 
Packing_x86()21 Packing_x86::Packing_x86()
22 {
23     support_packing = true;
24 }
25 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const26 int Packing_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
27 {
28     int elembits = bottom_blob.elembits();
29 
30     if (elembits == 8)
31         return forward_int8(bottom_blob, top_blob, opt);
32 
33     if (use_padding)
34     {
35         return Packing::forward(bottom_blob, top_blob, opt);
36     }
37 
38     if (elembits != 32)
39     {
40         // non-fp32 type
41         return Packing::forward(bottom_blob, top_blob, opt);
42     }
43 
44     size_t elemsize = bottom_blob.elemsize;
45     int elempack = bottom_blob.elempack;
46 
47     if (elempack == out_elempack)
48     {
49         top_blob = bottom_blob;
50         return 0;
51     }
52 
53     bool pack1to4 = elempack == 1 && out_elempack == 4;
54     bool pack4to1 = elempack == 4 && out_elempack == 1;
55     bool pack1to8 = elempack == 1 && out_elempack == 8;
56     bool pack8to1 = elempack == 8 && out_elempack == 1;
57     bool pack4to8 = elempack == 4 && out_elempack == 8;
58     bool pack8to4 = elempack == 8 && out_elempack == 4;
59 
60     if (!pack1to4 && !pack4to1 && !pack1to8 && !pack8to1 && !pack4to8 && !pack8to4)
61     {
62         return Packing::forward(bottom_blob, top_blob, opt);
63     }
64 
65     int w = bottom_blob.w;
66     int h = bottom_blob.h;
67     int channels = bottom_blob.c;
68     int dims = bottom_blob.dims;
69 
70     if (!use_padding)
71     {
72         // identity if use_padding not allowed
73         if (dims == 1 && w * elempack % out_elempack != 0)
74         {
75             top_blob = bottom_blob;
76             return 0;
77         }
78         if (dims == 2 && h * elempack % out_elempack != 0)
79         {
80             top_blob = bottom_blob;
81             return 0;
82         }
83         if (dims == 3 && channels * elempack % out_elempack != 0)
84         {
85             top_blob = bottom_blob;
86             return 0;
87         }
88     }
89 
90     if (dims == 1)
91     {
92         top_blob = bottom_blob;
93         top_blob.w = w * elempack / out_elempack;
94         top_blob.cstep = w * elempack / out_elempack;
95         top_blob.elemsize = elemsize / elempack * out_elempack;
96         top_blob.elempack = out_elempack;
97         return 0;
98     }
99 
100     if (dims == 2)
101     {
102         int outh = h * elempack / out_elempack;
103         size_t out_elemsize = elemsize / elempack * out_elempack;
104 
105         top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
106         if (top_blob.empty())
107             return -100;
108 
109         if (pack1to4)
110         {
111             #pragma omp parallel for num_threads(opt.num_threads)
112             for (int i = 0; i < outh; i++)
113             {
114                 const float* r0 = bottom_blob.row(i * 4);
115                 const float* r1 = bottom_blob.row(i * 4 + 1);
116                 const float* r2 = bottom_blob.row(i * 4 + 2);
117                 const float* r3 = bottom_blob.row(i * 4 + 3);
118 
119                 float* outptr = top_blob.row(i);
120 
121                 for (int j = 0; j < w; j++)
122                 {
123                     outptr[0] = *r0++;
124                     outptr[1] = *r1++;
125                     outptr[2] = *r2++;
126                     outptr[3] = *r3++;
127 
128                     outptr += 4;
129                 }
130             }
131         }
132         if (pack4to1)
133         {
134             #pragma omp parallel for num_threads(opt.num_threads)
135             for (int i = 0; i < h; i++)
136             {
137                 const float* r0 = bottom_blob.row(i);
138 
139                 float* outptr0 = top_blob.row(i * 4);
140                 float* outptr1 = top_blob.row(i * 4 + 1);
141                 float* outptr2 = top_blob.row(i * 4 + 2);
142                 float* outptr3 = top_blob.row(i * 4 + 3);
143 
144                 for (int j = 0; j < w; j++)
145                 {
146                     *outptr0++ = r0[0];
147                     *outptr1++ = r0[1];
148                     *outptr2++ = r0[2];
149                     *outptr3++ = r0[3];
150 
151                     r0 += 4;
152                 }
153             }
154         }
155         if (pack1to8)
156         {
157             #pragma omp parallel for num_threads(opt.num_threads)
158             for (int i = 0; i < outh; i++)
159             {
160                 const float* r0 = bottom_blob.row(i * 8);
161                 const float* r1 = bottom_blob.row(i * 8 + 1);
162                 const float* r2 = bottom_blob.row(i * 8 + 2);
163                 const float* r3 = bottom_blob.row(i * 8 + 3);
164                 const float* r4 = bottom_blob.row(i * 8 + 4);
165                 const float* r5 = bottom_blob.row(i * 8 + 5);
166                 const float* r6 = bottom_blob.row(i * 8 + 6);
167                 const float* r7 = bottom_blob.row(i * 8 + 7);
168 
169                 float* outptr = top_blob.row(i);
170 
171 #if __AVX__
172                 int nn = w >> 3;
173                 int remain = w & 7;
174                 for (; nn > 0; nn--)
175                 {
176                     __m256 _row0 = _mm256_loadu_ps(r0);
177                     __m256 _row1 = _mm256_loadu_ps(r1);
178                     __m256 _row2 = _mm256_loadu_ps(r2);
179                     __m256 _row3 = _mm256_loadu_ps(r3);
180                     __m256 _row4 = _mm256_loadu_ps(r4);
181                     __m256 _row5 = _mm256_loadu_ps(r5);
182                     __m256 _row6 = _mm256_loadu_ps(r6);
183                     __m256 _row7 = _mm256_loadu_ps(r7);
184                     transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
185                     _mm256_storeu_ps(outptr, _row0);
186                     _mm256_storeu_ps(outptr + 8, _row1);
187                     _mm256_storeu_ps(outptr + 16, _row2);
188                     _mm256_storeu_ps(outptr + 24, _row3);
189                     _mm256_storeu_ps(outptr + 32, _row4);
190                     _mm256_storeu_ps(outptr + 40, _row5);
191                     _mm256_storeu_ps(outptr + 48, _row6);
192                     _mm256_storeu_ps(outptr + 56, _row7);
193                     r0 += 8;
194                     r1 += 8;
195                     r2 += 8;
196                     r3 += 8;
197                     r4 += 8;
198                     r5 += 8;
199                     r6 += 8;
200                     r7 += 8;
201                     outptr += 64;
202                 }
203 #else
204                 int remain = w;
205 #endif
206 
207                 for (; remain > 0; remain--)
208                 {
209                     outptr[0] = *r0++;
210                     outptr[1] = *r1++;
211                     outptr[2] = *r2++;
212                     outptr[3] = *r3++;
213                     outptr[4] = *r4++;
214                     outptr[5] = *r5++;
215                     outptr[6] = *r6++;
216                     outptr[7] = *r7++;
217 
218                     outptr += 8;
219                 }
220             }
221         }
222         if (pack8to1)
223         {
224             #pragma omp parallel for num_threads(opt.num_threads)
225             for (int i = 0; i < h; i++)
226             {
227                 const float* r0 = bottom_blob.row(i);
228 
229                 float* outptr0 = top_blob.row(i * 8);
230                 float* outptr1 = top_blob.row(i * 8 + 1);
231                 float* outptr2 = top_blob.row(i * 8 + 2);
232                 float* outptr3 = top_blob.row(i * 8 + 3);
233                 float* outptr4 = top_blob.row(i * 8 + 4);
234                 float* outptr5 = top_blob.row(i * 8 + 5);
235                 float* outptr6 = top_blob.row(i * 8 + 6);
236                 float* outptr7 = top_blob.row(i * 8 + 7);
237 #if __AVX__
238                 int nn = w >> 3;
239                 int remain = w & 7;
240 #else
241                 int remain = w;
242 #endif
243 
244 #if __AVX__
245                 for (; nn > 0; nn--)
246                 {
247                     __m256 _row0 = _mm256_loadu_ps(r0);
248                     __m256 _row1 = _mm256_loadu_ps(r0 + 8);
249                     __m256 _row2 = _mm256_loadu_ps(r0 + 16);
250                     __m256 _row3 = _mm256_loadu_ps(r0 + 24);
251                     __m256 _row4 = _mm256_loadu_ps(r0 + 32);
252                     __m256 _row5 = _mm256_loadu_ps(r0 + 40);
253                     __m256 _row6 = _mm256_loadu_ps(r0 + 48);
254                     __m256 _row7 = _mm256_loadu_ps(r0 + 56);
255                     transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
256                     _mm256_storeu_ps(outptr0, _row0);
257                     _mm256_storeu_ps(outptr1, _row1);
258                     _mm256_storeu_ps(outptr2, _row2);
259                     _mm256_storeu_ps(outptr3, _row3);
260                     _mm256_storeu_ps(outptr4, _row4);
261                     _mm256_storeu_ps(outptr5, _row5);
262                     _mm256_storeu_ps(outptr6, _row6);
263                     _mm256_storeu_ps(outptr7, _row7);
264 
265                     r0 += 64;
266                     outptr0 += 8;
267                     outptr1 += 8;
268                     outptr2 += 8;
269                     outptr3 += 8;
270                     outptr4 += 8;
271                     outptr5 += 8;
272                     outptr6 += 8;
273                     outptr7 += 8;
274                 }
275 #endif
276                 for (; remain > 0; remain--)
277                 {
278                     *outptr0++ = r0[0];
279                     *outptr1++ = r0[1];
280                     *outptr2++ = r0[2];
281                     *outptr3++ = r0[3];
282                     *outptr4++ = r0[4];
283                     *outptr5++ = r0[5];
284                     *outptr6++ = r0[6];
285                     *outptr7++ = r0[7];
286 
287                     r0 += 8;
288                 }
289             }
290         }
291         if (pack4to8)
292         {
293             #pragma omp parallel for num_threads(opt.num_threads)
294             for (int i = 0; i < outh; i++)
295             {
296                 const float* r0 = bottom_blob.row(i * 2);
297                 const float* r1 = bottom_blob.row(i * 2 + 1);
298 
299                 float* outptr = top_blob.row(i);
300 
301                 for (int j = 0; j < w; j++)
302                 {
303                     outptr[0] = r0[0];
304                     outptr[1] = r0[1];
305                     outptr[2] = r0[2];
306                     outptr[3] = r0[3];
307                     outptr[4] = r1[0];
308                     outptr[5] = r1[1];
309                     outptr[6] = r1[2];
310                     outptr[7] = r1[3];
311 
312                     r0 += 4;
313                     r1 += 4;
314                     outptr += 8;
315                 }
316             }
317         }
318         if (pack8to4)
319         {
320             #pragma omp parallel for num_threads(opt.num_threads)
321             for (int i = 0; i < h; i++)
322             {
323                 const float* r0 = bottom_blob.row(i);
324 
325                 float* outptr0 = top_blob.row(i * 2);
326                 float* outptr1 = top_blob.row(i * 2 + 1);
327 
328                 for (int j = 0; j < w; j++)
329                 {
330                     outptr0[0] = r0[0];
331                     outptr0[1] = r0[1];
332                     outptr0[2] = r0[2];
333                     outptr0[3] = r0[3];
334                     outptr1[0] = r0[4];
335                     outptr1[1] = r0[5];
336                     outptr1[2] = r0[6];
337                     outptr1[3] = r0[7];
338 
339                     r0 += 8;
340                     outptr0 += 4;
341                     outptr1 += 4;
342                 }
343             }
344         }
345 
346         return 0;
347     }
348 
349     if (dims == 3)
350     {
351         int size = w * h;
352         int outc = channels * elempack / out_elempack;
353         size_t out_elemsize = elemsize / elempack * out_elempack;
354 
355         top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
356         if (top_blob.empty())
357             return -100;
358 
359         if (pack1to4)
360         {
361             #pragma omp parallel for num_threads(opt.num_threads)
362             for (int q = 0; q < outc; q++)
363             {
364                 const float* r0 = bottom_blob.channel(q * 4);
365                 const float* r1 = bottom_blob.channel(q * 4 + 1);
366                 const float* r2 = bottom_blob.channel(q * 4 + 2);
367                 const float* r3 = bottom_blob.channel(q * 4 + 3);
368 
369                 float* outptr = top_blob.channel(q);
370 
371                 for (int i = 0; i < size; i++)
372                 {
373                     outptr[0] = *r0++;
374                     outptr[1] = *r1++;
375                     outptr[2] = *r2++;
376                     outptr[3] = *r3++;
377 
378                     outptr += 4;
379                 }
380             }
381         }
382         if (pack4to1)
383         {
384             #pragma omp parallel for num_threads(opt.num_threads)
385             for (int q = 0; q < channels; q++)
386             {
387                 const float* r0 = bottom_blob.channel(q);
388 
389                 float* outptr0 = top_blob.channel(q * 4);
390                 float* outptr1 = top_blob.channel(q * 4 + 1);
391                 float* outptr2 = top_blob.channel(q * 4 + 2);
392                 float* outptr3 = top_blob.channel(q * 4 + 3);
393 
394                 for (int i = 0; i < size; i++)
395                 {
396                     *outptr0++ = r0[0];
397                     *outptr1++ = r0[1];
398                     *outptr2++ = r0[2];
399                     *outptr3++ = r0[3];
400 
401                     r0 += 4;
402                 }
403             }
404         }
405         if (pack1to8)
406         {
407             #pragma omp parallel for num_threads(opt.num_threads)
408             for (int q = 0; q < outc; q++)
409             {
410                 const float* r0 = bottom_blob.channel(q * 8);
411                 const float* r1 = bottom_blob.channel(q * 8 + 1);
412                 const float* r2 = bottom_blob.channel(q * 8 + 2);
413                 const float* r3 = bottom_blob.channel(q * 8 + 3);
414                 const float* r4 = bottom_blob.channel(q * 8 + 4);
415                 const float* r5 = bottom_blob.channel(q * 8 + 5);
416                 const float* r6 = bottom_blob.channel(q * 8 + 6);
417                 const float* r7 = bottom_blob.channel(q * 8 + 7);
418 
419                 float* outptr = top_blob.channel(q);
420 
421 #if __AVX__
422                 int nn = size >> 3;
423                 int remain = size & 7;
424 #else
425                 int remain = size;
426 #endif
427 
428 #if __AVX__
429                 for (; nn > 0; nn--)
430                 {
431                     __m256 _row0 = _mm256_loadu_ps(r0);
432                     __m256 _row1 = _mm256_loadu_ps(r1);
433                     __m256 _row2 = _mm256_loadu_ps(r2);
434                     __m256 _row3 = _mm256_loadu_ps(r3);
435                     __m256 _row4 = _mm256_loadu_ps(r4);
436                     __m256 _row5 = _mm256_loadu_ps(r5);
437                     __m256 _row6 = _mm256_loadu_ps(r6);
438                     __m256 _row7 = _mm256_loadu_ps(r7);
439                     transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
440                     _mm256_storeu_ps(outptr, _row0);
441                     _mm256_storeu_ps(outptr + 8, _row1);
442                     _mm256_storeu_ps(outptr + 16, _row2);
443                     _mm256_storeu_ps(outptr + 24, _row3);
444                     _mm256_storeu_ps(outptr + 32, _row4);
445                     _mm256_storeu_ps(outptr + 40, _row5);
446                     _mm256_storeu_ps(outptr + 48, _row6);
447                     _mm256_storeu_ps(outptr + 56, _row7);
448                     r0 += 8;
449                     r1 += 8;
450                     r2 += 8;
451                     r3 += 8;
452                     r4 += 8;
453                     r5 += 8;
454                     r6 += 8;
455                     r7 += 8;
456                     outptr += 64;
457                 }
458 #endif
459                 for (; remain > 0; remain--)
460                 {
461                     outptr[0] = *r0++;
462                     outptr[1] = *r1++;
463                     outptr[2] = *r2++;
464                     outptr[3] = *r3++;
465                     outptr[4] = *r4++;
466                     outptr[5] = *r5++;
467                     outptr[6] = *r6++;
468                     outptr[7] = *r7++;
469 
470                     outptr += 8;
471                 }
472             }
473         }
474         if (pack8to1)
475         {
476             #pragma omp parallel for num_threads(opt.num_threads)
477             for (int q = 0; q < channels; q++)
478             {
479                 const float* r0 = bottom_blob.channel(q);
480 
481                 float* outptr0 = top_blob.channel(q * 8);
482                 float* outptr1 = top_blob.channel(q * 8 + 1);
483                 float* outptr2 = top_blob.channel(q * 8 + 2);
484                 float* outptr3 = top_blob.channel(q * 8 + 3);
485                 float* outptr4 = top_blob.channel(q * 8 + 4);
486                 float* outptr5 = top_blob.channel(q * 8 + 5);
487                 float* outptr6 = top_blob.channel(q * 8 + 6);
488                 float* outptr7 = top_blob.channel(q * 8 + 7);
489 #if __AVX__
490                 int nn = size >> 3;
491                 int remain = size & 7;
492 #else
493                 int remain = size;
494 #endif
495 
496 #if __AVX__
497                 for (; nn > 0; nn--)
498                 {
499                     __m256 _row0 = _mm256_loadu_ps(r0);
500                     __m256 _row1 = _mm256_loadu_ps(r0 + 8);
501                     __m256 _row2 = _mm256_loadu_ps(r0 + 16);
502                     __m256 _row3 = _mm256_loadu_ps(r0 + 24);
503                     __m256 _row4 = _mm256_loadu_ps(r0 + 32);
504                     __m256 _row5 = _mm256_loadu_ps(r0 + 40);
505                     __m256 _row6 = _mm256_loadu_ps(r0 + 48);
506                     __m256 _row7 = _mm256_loadu_ps(r0 + 56);
507                     transpose8_ps(_row0, _row1, _row2, _row3, _row4, _row5, _row6, _row7);
508                     _mm256_storeu_ps(outptr0, _row0);
509                     _mm256_storeu_ps(outptr1, _row1);
510                     _mm256_storeu_ps(outptr2, _row2);
511                     _mm256_storeu_ps(outptr3, _row3);
512                     _mm256_storeu_ps(outptr4, _row4);
513                     _mm256_storeu_ps(outptr5, _row5);
514                     _mm256_storeu_ps(outptr6, _row6);
515                     _mm256_storeu_ps(outptr7, _row7);
516 
517                     r0 += 64;
518                     outptr0 += 8;
519                     outptr1 += 8;
520                     outptr2 += 8;
521                     outptr3 += 8;
522                     outptr4 += 8;
523                     outptr5 += 8;
524                     outptr6 += 8;
525                     outptr7 += 8;
526                 }
527 #endif
528 
529                 for (; remain > 0; remain--)
530                 {
531                     *outptr0++ = r0[0];
532                     *outptr1++ = r0[1];
533                     *outptr2++ = r0[2];
534                     *outptr3++ = r0[3];
535                     *outptr4++ = r0[4];
536                     *outptr5++ = r0[5];
537                     *outptr6++ = r0[6];
538                     *outptr7++ = r0[7];
539 
540                     r0 += 8;
541                 }
542             }
543         }
544         if (pack4to8)
545         {
546             #pragma omp parallel for num_threads(opt.num_threads)
547             for (int q = 0; q < outc; q++)
548             {
549                 const float* r0 = bottom_blob.channel(q * 2);
550                 const float* r1 = bottom_blob.channel(q * 2 + 1);
551 
552                 float* outptr = top_blob.channel(q);
553 
554                 for (int i = 0; i < size; i++)
555                 {
556                     outptr[0] = r0[0];
557                     outptr[1] = r0[1];
558                     outptr[2] = r0[2];
559                     outptr[3] = r0[3];
560                     outptr[4] = r1[0];
561                     outptr[5] = r1[1];
562                     outptr[6] = r1[2];
563                     outptr[7] = r1[3];
564 
565                     r0 += 4;
566                     r1 += 4;
567                     outptr += 8;
568                 }
569             }
570         }
571         if (pack8to4)
572         {
573             #pragma omp parallel for num_threads(opt.num_threads)
574             for (int q = 0; q < channels; q++)
575             {
576                 const float* r0 = bottom_blob.channel(q);
577 
578                 float* outptr0 = top_blob.channel(q * 2);
579                 float* outptr1 = top_blob.channel(q * 2 + 1);
580 
581                 for (int i = 0; i < size; i++)
582                 {
583                     outptr0[0] = r0[0];
584                     outptr0[1] = r0[1];
585                     outptr0[2] = r0[2];
586                     outptr0[3] = r0[3];
587                     outptr1[0] = r0[4];
588                     outptr1[1] = r0[5];
589                     outptr1[2] = r0[6];
590                     outptr1[3] = r0[7];
591 
592                     r0 += 8;
593                     outptr0 += 4;
594                     outptr1 += 4;
595                 }
596             }
597         }
598 
599         return 0;
600     }
601 
602     return 0;
603 }
604 
forward_int8(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const605 int Packing_x86::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
606 {
607     if (use_padding)
608     {
609         return Packing::forward(bottom_blob, top_blob, opt);
610     }
611 
612     size_t elemsize = bottom_blob.elemsize;
613     int elempack = bottom_blob.elempack;
614 
615     if (elempack == out_elempack)
616     {
617         top_blob = bottom_blob;
618         return 0;
619     }
620 
621     bool pack1to8 = elempack == 1 && out_elempack == 8;
622     bool pack8to1 = elempack == 8 && out_elempack == 1;
623 
624     if (!pack1to8 && !pack8to1)
625     {
626         return Packing::forward(bottom_blob, top_blob, opt);
627     }
628 
629     int w = bottom_blob.w;
630     int h = bottom_blob.h;
631     int channels = bottom_blob.c;
632     int dims = bottom_blob.dims;
633 
634     if (!use_padding)
635     {
636         // identity if use_padding not allowed
637         if (dims == 1 && w * elempack % out_elempack != 0)
638         {
639             top_blob = bottom_blob;
640             return 0;
641         }
642         if (dims == 2 && h * elempack % out_elempack != 0)
643         {
644             top_blob = bottom_blob;
645             return 0;
646         }
647         if (dims == 3 && channels * elempack % out_elempack != 0)
648         {
649             top_blob = bottom_blob;
650             return 0;
651         }
652     }
653 
654     if (dims == 1)
655     {
656         top_blob = bottom_blob;
657         top_blob.w = w * elempack / out_elempack;
658         top_blob.cstep = w * elempack / out_elempack;
659         top_blob.elemsize = elemsize / elempack * out_elempack;
660         top_blob.elempack = out_elempack;
661         return 0;
662     }
663 
664     if (dims == 2)
665     {
666         int outh = h * elempack / out_elempack;
667         size_t out_elemsize = elemsize / elempack * out_elempack;
668 
669         top_blob.create(w, outh, out_elemsize, out_elempack, opt.blob_allocator);
670         if (top_blob.empty())
671             return -100;
672 
673         if (pack1to8)
674         {
675             #pragma omp parallel for num_threads(opt.num_threads)
676             for (int i = 0; i < outh; i++)
677             {
678                 const signed char* r0 = bottom_blob.row<const signed char>(i * 8);
679                 const signed char* r1 = bottom_blob.row<const signed char>(i * 8 + 1);
680                 const signed char* r2 = bottom_blob.row<const signed char>(i * 8 + 2);
681                 const signed char* r3 = bottom_blob.row<const signed char>(i * 8 + 3);
682                 const signed char* r4 = bottom_blob.row<const signed char>(i * 8 + 4);
683                 const signed char* r5 = bottom_blob.row<const signed char>(i * 8 + 5);
684                 const signed char* r6 = bottom_blob.row<const signed char>(i * 8 + 6);
685                 const signed char* r7 = bottom_blob.row<const signed char>(i * 8 + 7);
686 
687                 signed char* outptr = top_blob.row<signed char>(i);
688 
689                 int j = 0;
690                 for (; j < w; j++)
691                 {
692                     outptr[0] = *r0++;
693                     outptr[1] = *r1++;
694                     outptr[2] = *r2++;
695                     outptr[3] = *r3++;
696                     outptr[4] = *r4++;
697                     outptr[5] = *r5++;
698                     outptr[6] = *r6++;
699                     outptr[7] = *r7++;
700 
701                     outptr += 8;
702                 }
703             }
704         }
705         if (pack8to1)
706         {
707             #pragma omp parallel for num_threads(opt.num_threads)
708             for (int i = 0; i < h; i++)
709             {
710                 const signed char* r0 = bottom_blob.row<const signed char>(i);
711 
712                 signed char* outptr0 = top_blob.row<signed char>(i * 8);
713                 signed char* outptr1 = top_blob.row<signed char>(i * 8 + 1);
714                 signed char* outptr2 = top_blob.row<signed char>(i * 8 + 2);
715                 signed char* outptr3 = top_blob.row<signed char>(i * 8 + 3);
716                 signed char* outptr4 = top_blob.row<signed char>(i * 8 + 4);
717                 signed char* outptr5 = top_blob.row<signed char>(i * 8 + 5);
718                 signed char* outptr6 = top_blob.row<signed char>(i * 8 + 6);
719                 signed char* outptr7 = top_blob.row<signed char>(i * 8 + 7);
720 
721                 int j = 0;
722                 for (; j < w; j++)
723                 {
724                     *outptr0++ = r0[0];
725                     *outptr1++ = r0[1];
726                     *outptr2++ = r0[2];
727                     *outptr3++ = r0[3];
728                     *outptr4++ = r0[4];
729                     *outptr5++ = r0[5];
730                     *outptr6++ = r0[6];
731                     *outptr7++ = r0[7];
732 
733                     r0 += 8;
734                 }
735             }
736         }
737 
738         return 0;
739     }
740 
741     if (dims == 3)
742     {
743         int size = w * h;
744         int outc = channels * elempack / out_elempack;
745         size_t out_elemsize = elemsize / elempack * out_elempack;
746 
747         top_blob.create(w, h, outc, out_elemsize, out_elempack, opt.blob_allocator);
748         if (top_blob.empty())
749             return -100;
750 
751         if (pack1to8)
752         {
753             #pragma omp parallel for num_threads(opt.num_threads)
754             for (int q = 0; q < outc; q++)
755             {
756                 const signed char* r0 = bottom_blob.channel(q * 8);
757                 const signed char* r1 = bottom_blob.channel(q * 8 + 1);
758                 const signed char* r2 = bottom_blob.channel(q * 8 + 2);
759                 const signed char* r3 = bottom_blob.channel(q * 8 + 3);
760                 const signed char* r4 = bottom_blob.channel(q * 8 + 4);
761                 const signed char* r5 = bottom_blob.channel(q * 8 + 5);
762                 const signed char* r6 = bottom_blob.channel(q * 8 + 6);
763                 const signed char* r7 = bottom_blob.channel(q * 8 + 7);
764 
765                 signed char* outptr = top_blob.channel(q);
766 
767                 int i = 0;
768                 for (; i < size; i++)
769                 {
770                     outptr[0] = *r0++;
771                     outptr[1] = *r1++;
772                     outptr[2] = *r2++;
773                     outptr[3] = *r3++;
774                     outptr[4] = *r4++;
775                     outptr[5] = *r5++;
776                     outptr[6] = *r6++;
777                     outptr[7] = *r7++;
778 
779                     outptr += 8;
780                 }
781             }
782         }
783         if (pack8to1)
784         {
785             #pragma omp parallel for num_threads(opt.num_threads)
786             for (int q = 0; q < channels; q++)
787             {
788                 const signed char* r0 = bottom_blob.channel(q);
789 
790                 signed char* outptr0 = top_blob.channel(q * 8);
791                 signed char* outptr1 = top_blob.channel(q * 8 + 1);
792                 signed char* outptr2 = top_blob.channel(q * 8 + 2);
793                 signed char* outptr3 = top_blob.channel(q * 8 + 3);
794                 signed char* outptr4 = top_blob.channel(q * 8 + 4);
795                 signed char* outptr5 = top_blob.channel(q * 8 + 5);
796                 signed char* outptr6 = top_blob.channel(q * 8 + 6);
797                 signed char* outptr7 = top_blob.channel(q * 8 + 7);
798 
799                 int i = 0;
800                 for (; i < size; i++)
801                 {
802                     *outptr0++ = r0[0];
803                     *outptr1++ = r0[1];
804                     *outptr2++ = r0[2];
805                     *outptr3++ = r0[3];
806                     *outptr4++ = r0[4];
807                     *outptr5++ = r0[5];
808                     *outptr6++ = r0[6];
809                     *outptr7++ = r0[7];
810 
811                     r0 += 8;
812                 }
813             }
814         }
815 
816         return 0;
817     }
818 
819     return 0;
820 }
821 
822 } // namespace ncnn
823