// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif
#endif // __SSE2__

#include "x86_activation.h"
#include "x86_usability.h"

#include "layer_type.h"

namespace ncnn {

#if __SSE2__
#if __AVX__
#include "convolutiondepthwise_3x3_pack8_fp16.h"
#include "convolutiondepthwise_3x3_pack8.h"
#include "convolutiondepthwise_5x5_pack8.h"
#endif
#endif // __SSE2__
#include "convolutiondepthwise_3x3.h"
#if NCNN_INT8
#include "convolutiondepthwise_3x3_int8.h"
#endif // NCNN_INT8

ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
{
#if __SSE2__
    support_packing = true;
#if __AVX__
    support_weight_fp16_storage = true;
#endif
#endif // __SSE2__
    activation = 0;
}

int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
{
    if (activation_type == 1)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    else if (activation_type == 2)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // slope
        activation->load_param(pd);
    }
    else if (activation_type == 3)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Clip);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // min
        pd.set(1, activation_params[1]); // max

        activation->load_param(pd);
    }
    else if (activation_type == 4)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    else if (activation_type == 5)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Mish);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    if (activation)
    {
        activation->create_pipeline(opt);
    }

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_x86(opt);
    }
#endif

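    // weight_data_size = maxk * channels_g * num_output_g * group, so the
    // input channel count can be recovered from the flat weight buffer below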
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
#if __AVX__
            elempack = channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
#else
            elempack = channels % 4 == 0 ? 4 : 1;
#endif
        }
#endif // __SSE2__

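        // pack8: repack the per-group kernels so 8 consecutive groups are interleaved
        // element-wise, matching the AVX kernels used in forward(). With
        // use_weight_fp16_storage the packed weights are additionally cast to fp16 to
        // halve weight memory; the *_fp16_pack8_avx kernels are expected to widen them
        // back to fp32 while computing.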
#if __SSE2__
#if __AVX__
        // pack8
        if (elempack == 8)
        {
            if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                Mat weight_data_r2 = weight_data.reshape(maxk, group);
                Mat weight_data_tmp;
                convert_packing(weight_data_r2, weight_data_tmp, 8);
                ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_packed, opt);
                return 0;
            }
            if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                Mat weight_data_r2 = weight_data.reshape(maxk, group);
                Mat weight_data_tmp;
                convert_packing(weight_data_r2, weight_data_tmp, 8);
                ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_packed, opt);
                return 0;
            }

            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_packed, 8);

            return 0;
        }
#endif // __AVX__

        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_packed, 4);

            return 0;
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            // depth-wise specific
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                return 0;
            }
        }
    }

    // group convolution
    create_group_ops(opt);

    return 0;
}

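// Non-depthwise grouped convolution: each group is handled by its own
// ncnn::Convolution sub-layer created here and run per group in forward().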
int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
{
    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(4, 0);  // pad_w
        pd.set(14, 0); // pad_h
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(8, int8_scale_term);
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);

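        // the Mat array below follows the order the Convolution layer reads weights
        // back: weight data, optional bias, then the int8 scale blobs when
        // int8_scale_term is set (weight scale, bottom blob scale, and the top blob
        // scale when int8_scale_term > 100)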
        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[5];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[2] = weight_data_int8_scales_g;
                weights[3] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[1] = weight_data_int8_scales_g;
                weights[2] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_x86(bottom_blob, top_blob, opt);
    }
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
#if __AVX__
        out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
#else
        out_elempack = num_output % 4 == 0 ? 4 : 1;
#endif
    }
#endif // __SSE2__
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // fprintf(stderr, "Depthwise kernel %d x %d elempack=%d group=%d channels = %d stride = %d x %d  \n",kernel_w,kernel_h,elempack,group,channels,stride_w,stride_h );

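    // with a packed layout the effective channel count is channels * elempack,
    // hence the comparison against group below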
    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __SSE2__
#if __AVX__
        if (elempack == 8)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                if (opt.use_weight_fp16_storage)
                {
                    convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }
                else
                {
                    convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                if (opt.use_weight_fp16_storage)
                {
                    convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }
                else
                {
                    convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

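                // space_ofs[k] is the element offset of the k-th kernel tap from the
                // top-left of the sliding window, accounting for dilation and the
                // padded row width w (the same table is rebuilt in the other generic
                // paths below)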
                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_packed + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m256 _sum = _mm256_set1_ps(0.f);

                            if (bias_term)
                            {
                                _sum = _mm256_loadu_ps(((const float*)bias_data) + g * 8);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                                __m256 _w = _mm256_loadu_ps(kptr + k * 8);
                                _sum = _mm256_fmadd_ps(_val, _w, _sum);
                            }

                            _sum = activation_avx(_sum, activation_type, activation_params);

                            _mm256_storeu_ps(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }

                return 0;
            }
        }
#endif // __AVX__

        if (elempack == 4)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_packed + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128 _sum = _mm_set1_ps(0.f);

                            if (bias_term)
                            {
                                _sum = _mm_loadu_ps(((const float*)bias_data) + g * 4);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128 _val = _mm_loadu_ps(sptr + space_ofs[k] * 4);
                                __m128 _w = _mm_loadu_ps(kptr + k * 4);
                                _sum = _mm_add_ps(_mm_mul_ps(_val, _w), _sum);
                            }

                            _sum = activation_sse(_sum, activation_type, activation_params);

                            _mm_storeu_ps(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }

                return 0;
            }
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
        }
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
#if __AVX__
        g_elempack = channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
#else
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
#endif
    }
#endif // __SSE2__

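    // if the input is packed wider than a single group can consume, repack it down to
    // g_elempack so channel_range() can slice out each group's channels; the per-group
    // outputs are gathered into top_blob_unpacked and repacked at the end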
    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

#if NCNN_INT8
int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
            elempack = channels % 8 == 0 ? 8 : 1;
        }
#endif // __SSE2__

        if (elempack == 8)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_int8, 8, opt);
        }

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    return 0;
}

int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int elembits = bottom_blob.elembits();

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

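    // quantize fp32 input to int8 on the fly, broadcasting each group's
    // bottom_blob_int8_scales value across all channels of that group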
    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        const int channels_g = channels * elempack / group;

        Mat scales(channels * elempack);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    channels = bottom_blob_bordered.c;
    elempack = bottom_blob_bordered.elempack;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        int out_elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif // __SSE2__
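        // int8_scale_term > 100 means a top blob scale was provided, so the output is
        // requantized back to int8; otherwise it is dequantized to fp32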
        bool use_int8_requantize = int8_scale_term > 100;
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __SSE2__
        if (elempack == 8)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_int8 + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128i _sum0 = _mm_setzero_si128();
                            __m128i _sum1 = _mm_setzero_si128();

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

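                            // widen int8 to int16 by unpacking against a sign mask
                            // (_mm_cmpgt_epi8(0, x)), then combine the mullo/mulhi
                            // halves into two int32x4 accumulators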
                            for (int k = 0; k < maxk; k++)
                            {
                                // TODO use _mm_cvtepi8_epi16 on sse4.1
                                __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                                _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                                __m128i _w = _mm_loadl_epi64((const __m128i*)(kptr + k * 8));
                                _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                                __m128i _sl = _mm_mullo_epi16(_val, _w);
                                __m128i _sh = _mm_mulhi_epi16(_val, _w);
                                __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
                                __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);

                                _sum0 = _mm_add_epi32(_sum0, _s0);
                                _sum1 = _mm_add_epi32(_sum1, _s1);
                            }

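                            // scale_in = 1 / (bottom_scale * weight_scale), computed with
                            // the approximate _mm_rcp_ps and masked to zero where the
                            // weight scale is zero to avoid propagating infinities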
                            __m128 _scale_in0;
                            __m128 _scale_in1;
                            {
                                __m128 _bottom_blob_int8_scales0 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8);
                                __m128 _bottom_blob_int8_scales1 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8 + 4);
                                __m128 _weight_data_int8_scales0 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8);
                                __m128 _weight_data_int8_scales1 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8 + 4);
                                _scale_in0 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
                                _scale_in1 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales1, _weight_data_int8_scales1));

                                __m128 _m0 = _mm_cmpneq_ps(_weight_data_int8_scales0, _mm_setzero_ps());
                                __m128 _m1 = _mm_cmpneq_ps(_weight_data_int8_scales1, _mm_setzero_ps());
                                _scale_in0 = _mm_and_ps(_scale_in0, _m0);
                                _scale_in1 = _mm_and_ps(_scale_in1, _m1);
                            }

                            __m128 _sumfp32_0 = _mm_mul_ps(_mm_cvtepi32_ps(_sum0), _scale_in0);
                            __m128 _sumfp32_1 = _mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale_in1);

                            if (bias_term)
                            {
                                __m128 _bias0 = _mm_loadu_ps((const float*)bias_data + g * 8);
                                __m128 _bias1 = _mm_loadu_ps((const float*)bias_data + g * 8 + 4);
                                _sumfp32_0 = _mm_add_ps(_sumfp32_0, _bias0);
                                _sumfp32_1 = _mm_add_ps(_sumfp32_1, _bias1);
                            }

                            _sumfp32_0 = activation_sse(_sumfp32_0, activation_type, activation_params);
                            _sumfp32_1 = activation_sse(_sumfp32_1, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize and relu
                                __m128 _scale_out0 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8);
                                __m128 _scale_out1 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8 + 4);
                                _sumfp32_0 = _mm_mul_ps(_sumfp32_0, _scale_out0);
                                _sumfp32_1 = _mm_mul_ps(_sumfp32_1, _scale_out1);
                                int64_t _sum8 = float2int8_sse(_sumfp32_0, _sumfp32_1);

                                *(int64_t*)outptr_s8 = _sum8;
                                outptr_s8 += 8;
                            }
                            else
                            {
                                // dequantize and relu
                                _mm_storeu_ps(outptr_f32, _sumfp32_0);
                                _mm_storeu_ps(outptr_f32 + 4, _sumfp32_1);
                                outptr_f32 += 8;
                            }
                        }
                    }
                }
            }
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
                }
                else
                {
                    std::vector<float> dequantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        dequantize_scales.push_back(top_rescale);
                    }

                    convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
                }
                else
                {
                    std::vector<float> dequantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        dequantize_scales.push_back(top_rescale);
                    }

                    convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            int sum = 0;

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            float scale_in;
                            if (weight_data_int8_scales[g] == 0)
                                scale_in = 0;
                            else
                                scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                            float sumfp32 = sum * scale_in;

                            if (bias_term)
                                sumfp32 += bias_data[g];

                            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize
                                float scale_out = top_blob_int8_scales[g];
                                signed char sums8 = float2int8(sumfp32 * scale_out);
                                outptr_s8[0] = sums8;
                                outptr_s8 += 1;
                            }
                            else
                            {
                                // dequantize
                                outptr_f32[0] = sumfp32;
                                outptr_f32 += 1;
                            }
                        }
                    }
                }
            }
        }

        return 0;
    }

    int out_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__

    // unpacking
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn