// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise_x86.h"

#if __SSE2__
#include <emmintrin.h>
#if __AVX__
#include <immintrin.h>
#endif
#endif // __SSE2__

#include "x86_activation.h"
#include "x86_usability.h"

#include "layer_type.h"

namespace ncnn {

#if __SSE2__
#if __AVX__
#include "convolutiondepthwise_3x3_pack8_fp16.h"
#include "convolutiondepthwise_3x3_pack8.h"
#include "convolutiondepthwise_5x5_pack8.h"
#endif
#endif // __SSE2__
#include "convolutiondepthwise_3x3.h"
#if NCNN_INT8
#include "convolutiondepthwise_3x3_int8.h"
#endif // NCNN_INT8

ConvolutionDepthWise_x86::ConvolutionDepthWise_x86()
{
#if __SSE2__
    support_packing = true;
#if __AVX__
    support_weight_fp16_storage = true;
#endif
#endif // __SSE2__
    activation = 0;
}

int ConvolutionDepthWise_x86::create_pipeline(const Option& opt)
{
    if (activation_type == 1)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    else if (activation_type == 2)
    {
        activation = ncnn::create_layer(ncnn::LayerType::ReLU);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // slope
        activation->load_param(pd);
    }
    else if (activation_type == 3)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Clip);

        ncnn::ParamDict pd;
        pd.set(0, activation_params[0]); // min
        pd.set(1, activation_params[1]); // max

        activation->load_param(pd);
    }
    else if (activation_type == 4)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    else if (activation_type == 5)
    {
        activation = ncnn::create_layer(ncnn::LayerType::Mish);

        ncnn::ParamDict pd;
        activation->load_param(pd);
    }
    if (activation)
    {
        activation->create_pipeline(opt);
    }

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return create_pipeline_int8_x86(opt);
    }
#endif

    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;
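    // weight_data_size == maxk * (channels / group) * (num_output / group) * group,
    // so dividing it back out recovers the total input channel count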

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
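            // pick the widest packing the channel count divides evenly:
            // 8 floats per __m256 with AVX, else 4 floats per __m128, else scalar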
#if __AVX__
            elempack = channels % 8 == 0 ? 8 : channels % 4 == 0 ? 4 : 1;
#else
            elempack = channels % 4 == 0 ? 4 : 1;
#endif
        }
#endif // __SSE2__

#if __SSE2__
#if __AVX__
        // pack8
        if (elempack == 8)
        {
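            // with use_weight_fp16_storage the packed 3x3 weights are stored as fp16,
            // halving weight memory traffic; the fp16 pack8 kernels presumably widen
            // them back to fp32 during compute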
            if (opt.use_weight_fp16_storage && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && ((stride_w == 1 && stride_h == 1) || (stride_w == 2 && stride_h == 2)))
            {
                Mat weight_data_r2 = weight_data.reshape(maxk, group);
                Mat weight_data_tmp;
                convert_packing(weight_data_r2, weight_data_tmp, 8);
                ncnn::cast_float32_to_float16(weight_data_tmp, weight_data_packed, opt);
                return 0;
            }

            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_packed, 8);

            return 0;
        }
#endif // __AVX__

        // pack4
        if (elempack == 4)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_packed, 4);

            return 0;
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            // depth-wise specific
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                return 0;
            }
        }
    }

    // group convolution
    create_group_ops(opt);

    return 0;
}

int ConvolutionDepthWise_x86::create_group_ops(const Option& opt)
{
    // create Convolution op for each group
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    for (int i = 0; i < (int)group_ops.size(); i++)
        delete group_ops[i];

    group_ops.clear();

    const int channels_g = channels / group;
    const int num_output_g = num_output / group;

    group_ops.resize(group);

    for (int g = 0; g < group; g++)
    {
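        // slice this group's filters and bias out of the flat arrays;
        // each group owns maxk * channels_g * num_output_g weights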
        Mat weight_data_g = weight_data.range(maxk * channels_g * num_output_g * g, maxk * channels_g * num_output_g);
        Mat bias_data_g;
        if (bias_term)
            bias_data_g = bias_data.range(num_output_g * g, num_output_g);

        ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::Convolution);

        // set param
        ncnn::ParamDict pd;
        pd.set(0, num_output_g); // num_output
        pd.set(1, kernel_w);
        pd.set(11, kernel_h);
        pd.set(2, dilation_w);
        pd.set(12, dilation_h);
        pd.set(4, 0);  // pad_w, the input is already padded by make_padding in forward
        pd.set(14, 0); // pad_h
        pd.set(3, stride_w);
        pd.set(13, stride_h);
        pd.set(5, bias_term);
        pd.set(6, maxk * channels_g * num_output_g); // weight_data_size
        pd.set(8, int8_scale_term);
        pd.set(9, activation_type);
        pd.set(10, activation_params);

        op->load_param(pd);

        // set weights
        if (bias_term)
        {
            ncnn::Mat weights[5];
            weights[0] = weight_data_g;
            weights[1] = bias_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[2] = weight_data_int8_scales_g;
                weights[3] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[4] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }
        else
        {
            ncnn::Mat weights[4];
            weights[0] = weight_data_g;

#if NCNN_INT8
            if (int8_scale_term)
            {
                Mat weight_data_int8_scales_g(num_output_g);
                weight_data_int8_scales_g.fill(weight_data_int8_scales[g]);
                weights[1] = weight_data_int8_scales_g;
                weights[2] = bottom_blob_int8_scales.range(g, 1);
            }
            if (int8_scale_term > 100)
            {
                weights[3] = top_blob_int8_scales.range(g, 1);
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));
        }

        op->create_pipeline(opt);

        group_ops[g] = op;
    }

    return 0;
}

int ConvolutionDepthWise_x86::destroy_pipeline(const Option& opt)
{
    if (activation)
    {
        activation->destroy_pipeline(opt);
        delete activation;
        activation = 0;
    }

    for (int i = 0; i < (int)group_ops.size(); i++)
    {
        group_ops[i]->destroy_pipeline(opt);
        delete group_ops[i];
    }
    group_ops.clear();

    return 0;
}

int ConvolutionDepthWise_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8_x86(bottom_blob, top_blob, opt);
    }
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;
    int out_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
#if __AVX__
        out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
#else
        out_elempack = num_output % 4 == 0 ? 4 : 1;
#endif
    }
#endif // __SSE2__
    size_t out_elemsize = elemsize / elempack * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // fprintf(stderr, "Depthwise kernel %d x %d elempack=%d group=%d channels = %d stride = %d x %d \n", kernel_w, kernel_h, elempack, group, channels, stride_w, stride_h);

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
#if __SSE2__
#if __AVX__
        if (elempack == 8)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                if (opt.use_weight_fp16_storage)
                {
                    convdw3x3s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }
                else
                {
                    convdw3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                if (opt.use_weight_fp16_storage)
                {
                    convdw3x3s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }
                else
                {
                    convdw3x3s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw5x5s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 5 && kernel_h == 5 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw5x5s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
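                // space_ofs[k] is the flat element offset of kernel tap k inside the
                // padded input plane: step dilation_w between columns and jump
                // w * dilation_h - kernel_w * dilation_w between rows, so the inner
                // loop can gather taps with a single indexed load even when dilated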
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_packed + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m256 _sum = _mm256_set1_ps(0.f);

                            if (bias_term)
                            {
                                _sum = _mm256_loadu_ps(((const float*)bias_data) + g * 8);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m256 _val = _mm256_loadu_ps(sptr + space_ofs[k] * 8);
                                __m256 _w = _mm256_loadu_ps(kptr + k * 8);
                                _sum = _mm256_fmadd_ps(_val, _w, _sum);
                            }

                            _sum = activation_avx(_sum, activation_type, activation_params);

                            _mm256_storeu_ps(outptr + j * 8, _sum);
                        }

                        outptr += outw * 8;
                    }
                }

                return 0;
            }
        }
#endif // __AVX__

        if (elempack == 4)
        {
            {
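                // generic pack4 path: same offset-table gather as the pack8 loop, on
                // 128-bit lanes; plain SSE2 has no fused multiply-add, hence mul + add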
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    float* outptr = top_blob.channel(g);
                    const float* kptr = (const float*)weight_data_packed + maxk * g * 4;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128 _sum = _mm_set1_ps(0.f);

                            if (bias_term)
                            {
                                _sum = _mm_loadu_ps(((const float*)bias_data) + g * 4);
                            }

                            const float* sptr = m.row(i * stride_h) + j * stride_w * 4;

                            for (int k = 0; k < maxk; k++)
                            {
                                __m128 _val = _mm_loadu_ps(sptr + space_ofs[k] * 4);
                                __m128 _w = _mm_loadu_ps(kptr + k * 4);
                                _sum = _mm_add_ps(_mm_mul_ps(_val, _w), _sum);
                            }

                            _sum = activation_sse(_sum, activation_type, activation_params);

                            _mm_storeu_ps(outptr + j * 4, _sum);
                        }

                        outptr += outw * 4;
                    }
                }

                return 0;
            }
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
            {
                convdw3x3s1_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
            if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
            {
                convdw3x3s2_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }

                return 0;
            }
        }
    }

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
#if __AVX__
        g_elempack = channels_g % 8 == 0 ? 8 : channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 8 == 0 ? 8 : num_output_g % 4 == 0 ? 4 : 1;
#else
        g_elempack = channels_g % 4 == 0 ? 4 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
#endif
    }
#endif // __SSE2__

    // unpacking
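    // a group boundary may fall inside a packed element, so repack the input down
    // to the widest packing that still divides channels_g; the per-group ops then
    // see whole groups, and the result is repacked to out_elempack afterwards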
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}

#if NCNN_INT8
int ConvolutionDepthWise_x86::create_pipeline_int8_x86(const Option& opt)
{
    const int maxk = kernel_w * kernel_h;
    int channels = (weight_data_size / group) / maxk / (num_output / group) * group;

    // depth-wise
    if (channels == group && group == num_output)
    {
        int elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
            elempack = channels % 8 == 0 ? 8 : 1;
        }
#endif // __SSE2__

        if (elempack == 8)
        {
            Mat weight_data_r2 = weight_data.reshape(maxk, group);
            convert_packing(weight_data_r2, weight_data_int8, 8, opt);
        }

        return 0;
    }

    // group convolution
    create_group_ops(opt);

    return 0;
}

int ConvolutionDepthWise_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;
    int elempack = bottom_blob.elempack;

    int elembits = bottom_blob.elembits();

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        const int channels_g = channels * elempack / group;

        Mat scales(channels * elempack);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;
    channels = bottom_blob_bordered.c;
    elempack = bottom_blob_bordered.elempack;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    // depth-wise
    if (channels * elempack == group && group == num_output)
    {
        int out_elempack = 1;
#if __SSE2__
        if (opt.use_packing_layout)
        {
            out_elempack = num_output % 8 == 0 ? 8 : 1;
        }
#endif // __SSE2__
        bool use_int8_requantize = int8_scale_term > 100;
        size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;
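        // requantized output stays int8 (1 byte per lane); otherwise the sums are
        // dequantized to fp32 (4 bytes per lane)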

        top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
        if (top_blob.empty())
            return -100;

#if __SSE2__
        if (elempack == 8)
        {
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < channels; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data_int8 + maxk * g * 8;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            __m128i _sum0 = _mm_setzero_si128();
                            __m128i _sum1 = _mm_setzero_si128();

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w * 8;

                            for (int k = 0; k < maxk; k++)
                            {
                                // TODO use _mm_cvtepi8_epi16 on sse4.1
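                                // SSE2 sign extension: unpacking each int8 with a mask of
                                // its sign bits (0xff where negative) widens the 8 lanes to
                                // int16; mullo/mulhi then interleave to full int32 products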
                                __m128i _val = _mm_loadl_epi64((const __m128i*)(sptr + space_ofs[k] * 8));
                                _val = _mm_unpacklo_epi8(_val, _mm_cmpgt_epi8(_mm_setzero_si128(), _val));

                                __m128i _w = _mm_loadl_epi64((const __m128i*)(kptr + k * 8));
                                _w = _mm_unpacklo_epi8(_w, _mm_cmpgt_epi8(_mm_setzero_si128(), _w));

                                __m128i _sl = _mm_mullo_epi16(_val, _w);
                                __m128i _sh = _mm_mulhi_epi16(_val, _w);
                                __m128i _s0 = _mm_unpacklo_epi16(_sl, _sh);
                                __m128i _s1 = _mm_unpackhi_epi16(_sl, _sh);

                                _sum0 = _mm_add_epi32(_sum0, _s0);
                                _sum1 = _mm_add_epi32(_sum1, _s1);
                            }

                            __m128 _scale_in0;
                            __m128 _scale_in1;
                            {
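                                // scale_in = 1 / (input_scale * weight_scale), computed with
                                // the approximate _mm_rcp_ps; lanes whose weight scale is zero
                                // are masked to zero to match the scalar scale_in = 0 rule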
                                __m128 _bottom_blob_int8_scales0 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8);
                                __m128 _bottom_blob_int8_scales1 = _mm_loadu_ps((const float*)bottom_blob_int8_scales + g * 8 + 4);
                                __m128 _weight_data_int8_scales0 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8);
                                __m128 _weight_data_int8_scales1 = _mm_loadu_ps((const float*)weight_data_int8_scales + g * 8 + 4);
                                _scale_in0 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales0, _weight_data_int8_scales0));
                                _scale_in1 = _mm_rcp_ps(_mm_mul_ps(_bottom_blob_int8_scales1, _weight_data_int8_scales1));

                                __m128 _m0 = _mm_cmpneq_ps(_weight_data_int8_scales0, _mm_setzero_ps());
                                __m128 _m1 = _mm_cmpneq_ps(_weight_data_int8_scales1, _mm_setzero_ps());
                                _scale_in0 = _mm_and_ps(_scale_in0, _m0);
                                _scale_in1 = _mm_and_ps(_scale_in1, _m1);
                            }

                            __m128 _sumfp32_0 = _mm_mul_ps(_mm_cvtepi32_ps(_sum0), _scale_in0);
                            __m128 _sumfp32_1 = _mm_mul_ps(_mm_cvtepi32_ps(_sum1), _scale_in1);

                            if (bias_term)
                            {
                                __m128 _bias0 = _mm_loadu_ps((const float*)bias_data + g * 8);
                                __m128 _bias1 = _mm_loadu_ps((const float*)bias_data + g * 8 + 4);
                                _sumfp32_0 = _mm_add_ps(_sumfp32_0, _bias0);
                                _sumfp32_1 = _mm_add_ps(_sumfp32_1, _bias1);
                            }

                            _sumfp32_0 = activation_sse(_sumfp32_0, activation_type, activation_params);
                            _sumfp32_1 = activation_sse(_sumfp32_1, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize and relu
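                                // float2int8_sse saturates the eight fp32 lanes to int8 and
                                // packs them into a single 64-bit store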
                                __m128 _scale_out0 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8);
                                __m128 _scale_out1 = _mm_loadu_ps((const float*)top_blob_int8_scales + g * 8 + 4);
                                _sumfp32_0 = _mm_mul_ps(_sumfp32_0, _scale_out0);
                                _sumfp32_1 = _mm_mul_ps(_sumfp32_1, _scale_out1);
                                int64_t _sum8 = float2int8_sse(_sumfp32_0, _sumfp32_1);

                                *(int64_t*)outptr_s8 = _sum8;
                                outptr_s8 += 8;
                            }
                            else
                            {
                                // dequantize and relu
                                _mm_storeu_ps(outptr_f32, _sumfp32_0);
                                _mm_storeu_ps(outptr_f32 + 4, _sumfp32_1);
                                outptr_f32 += 8;
                            }
                        }
                    }
                }
            }
        }
#endif // __SSE2__

        if (elempack == 1)
        {
            if (kernel_w == 3 && kernel_h == 3 && stride_w == 1 && stride_h == 1 && dilation_w == 1 && dilation_h == 1 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
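                    // the requant kernel consumes per-group (scale_in, scale_out) pairs;
                    // scale_in folds the input and weight quantization scales, with a
                    // zero weight scale mapped to scale_in = 0 to avoid dividing by zero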
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s1_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
                }
                else
                {
                    std::vector<float> dequantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        dequantize_scales.push_back(top_rescale);
                    }

                    convdw3x3s1_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2 && (activation_type == 0 || activation_type == 1))
            {
                if (use_int8_requantize)
                {
                    std::vector<float> requantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float scale_out = top_blob_int8_scales[g];

                        requantize_scales.push_back(scale_in);
                        requantize_scales.push_back(scale_out);
                    }

                    convdw3x3s2_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, requantize_scales, opt);
                }
                else
                {
                    std::vector<float> dequantize_scales;
                    for (int g = 0; g < group; g++)
                    {
                        float top_rescale = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        dequantize_scales.push_back(top_rescale);
                    }

                    convdw3x3s2_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, bias_data, dequantize_scales, opt);
                }

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
            else
            {
                const int maxk = kernel_w * kernel_h;

                // kernel offsets
                std::vector<int> _space_ofs(maxk);
                int* space_ofs = &_space_ofs[0];
                {
                    int p1 = 0;
                    int p2 = 0;
                    int gap = w * dilation_h - kernel_w * dilation_w;
                    for (int i = 0; i < kernel_h; i++)
                    {
                        for (int j = 0; j < kernel_w; j++)
                        {
                            space_ofs[p1] = p2;
                            p1++;
                            p2 += dilation_w;
                        }
                        p2 += gap;
                    }
                }

                #pragma omp parallel for num_threads(opt.num_threads)
                for (int g = 0; g < group; g++)
                {
                    signed char* outptr_s8 = top_blob.channel(g);
                    float* outptr_f32 = top_blob.channel(g);
                    const signed char* kptr = (const signed char*)weight_data + maxk * g;
                    const Mat m = bottom_blob_bordered.channel(g);

                    for (int i = 0; i < outh; i++)
                    {
                        for (int j = 0; j < outw; j++)
                        {
                            int sum = 0;

                            const signed char* sptr = m.row<const signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            float scale_in;
                            if (weight_data_int8_scales[g] == 0)
                                scale_in = 0;
                            else
                                scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                            float sumfp32 = sum * scale_in;

                            if (bias_term)
                                sumfp32 += bias_data[g];

                            sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                            if (use_int8_requantize)
                            {
                                // requantize
                                float scale_out = top_blob_int8_scales[g];
                                signed char sums8 = float2int8(sumfp32 * scale_out);
                                outptr_s8[0] = sums8;
                                outptr_s8 += 1;
                            }
                            else
                            {
                                // dequantize
                                outptr_f32[0] = sumfp32;
                                outptr_f32 += 1;
                            }
                        }
                    }
                }
            }
        }

        return 0;
    }

    int out_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // group convolution
    const int channels_g = channels * elempack / group;
    const int num_output_g = num_output / group;

    int g_elempack = 1;
    int out_g_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
        g_elempack = channels_g % 8 == 0 ? 8 : 1;
        out_g_elempack = num_output_g % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__

    // unpacking
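    // same repacking dance as the fp32 group path: int8 input blobs pack up to
    // 8 channels per element, while the outputs pack at most 4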
    Mat bottom_blob_bordered_unpacked = bottom_blob_bordered;
    if (elempack > g_elempack)
    {
        Option opt_p = opt;
        opt_p.blob_allocator = opt.workspace_allocator;
        convert_packing(bottom_blob_bordered, bottom_blob_bordered_unpacked, g_elempack, opt_p);
    }

    Mat top_blob_unpacked = top_blob;
    if (out_g_elempack < out_elempack)
    {
        top_blob_unpacked.create(outw, outh, num_output / out_g_elempack, out_elemsize / out_elempack * out_g_elempack, out_g_elempack, opt.workspace_allocator);
        if (top_blob_unpacked.empty())
            return -100;
    }

    #pragma omp parallel for num_threads(opt.num_threads)
    for (int g = 0; g < group; g++)
    {
        const Mat bottom_blob_bordered_g = bottom_blob_bordered_unpacked.channel_range(channels_g * g / g_elempack, channels_g / g_elempack);
        Mat top_blob_g = top_blob_unpacked.channel_range(num_output_g * g / out_g_elempack, num_output_g / out_g_elempack);

        const ncnn::Layer* op = group_ops[g];

        Option opt_g = opt;
        opt_g.blob_allocator = top_blob_unpacked.allocator;

        // forward
        op->forward(bottom_blob_bordered_g, top_blob_g, opt_g);
    }

    // packing
    if (out_g_elempack < out_elempack)
    {
        convert_packing(top_blob_unpacked, top_blob, out_elempack, opt);
    }
    else
    {
        top_blob = top_blob_unpacked;
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn
