1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "convolution_x86.h"
16 
17 #if __SSE2__
18 #include <emmintrin.h>
19 #if __AVX__
20 #include <immintrin.h>
21 #endif
22 #endif // __SSE2__
23 
24 #include "x86_activation.h"
25 #include "x86_usability.h"
26 
27 #include "benchmark.h"
28 #include "layer_type.h"
29 
30 namespace ncnn {
31 
32 #include "convolution_sgemm.h"
33 #include "convolution_1x1.h"
34 #include "convolution_3x3.h"
35 #include "convolution_5x5.h"
36 #include "convolution_7x7.h"
37 
38 #if NCNN_INT8
39 #include "convolution_sgemm_int8.h"
40 #include "convolution_1x1_int8.h"
41 #include "convolution_3x3_int8.h"
42 #include "convolution_int8.h"
43 #endif // NCNN_INT8
44 
45 #if __SSE2__
46 #include "convolution_1x1_pack4.h"
47 
48 #if NCNN_INT8
49 #include "convolution_pack8to4_int8.h"
50 #include "convolution_pack1to4_int8.h"
51 #include "convolution_pack8to1_int8.h"
52 #include "convolution_sgemm_pack8to4_int8.h"
53 #include "convolution_1x1_pack8to4_int8.h"
54 #endif // NCNN_INT8
55 
56 #if __AVX__
57 #include "convolution_3x3_pack1to8.h"
58 #include "convolution_3x3_pack8to1.h"
59 #include "convolution_3x3_pack8.h"
60 #include "convolution_2x2_pack8.h"
61 #include "convolution_2x2_pack8_fp16.h"
62 #include "convolution_1x1_pack8.h"
63 #include "convolution_1x1_pack8_fp16.h"
64 #endif
65 #endif // __SSE2__
66 
Convolution_x86()67 Convolution_x86::Convolution_x86()
68 {
69 #if __SSE2__
70     support_packing = true;
71 #if __AVX__
72     support_weight_fp16_storage = true;
73 #endif
74 #endif // __SSE2__
75 
76     activation = 0;
77     convolution_dilation1 = 0;
78 }
79 
create_pipeline(const Option & opt)80 int Convolution_x86::create_pipeline(const Option& opt)
81 {
82     if (activation_type == 1)
83     {
84         activation = ncnn::create_layer(ncnn::LayerType::ReLU);
85 
86         ncnn::ParamDict pd;
87         activation->load_param(pd);
88     }
89     else if (activation_type == 2)
90     {
91         activation = ncnn::create_layer(ncnn::LayerType::ReLU);
92 
93         ncnn::ParamDict pd;
94         pd.set(0, activation_params[0]); // slope
95         activation->load_param(pd);
96     }
97     else if (activation_type == 3)
98     {
99         activation = ncnn::create_layer(ncnn::LayerType::Clip);
100 
101         ncnn::ParamDict pd;
102         pd.set(0, activation_params[0]); // min
103         pd.set(1, activation_params[1]); // max
104 
105         activation->load_param(pd);
106     }
107     else if (activation_type == 4)
108     {
109         activation = ncnn::create_layer(ncnn::LayerType::Sigmoid);
110 
111         ncnn::ParamDict pd;
112         activation->load_param(pd);
113     }
114     else if (activation_type == 5)
115     {
116         activation = ncnn::create_layer(ncnn::LayerType::Mish);
117 
118         ncnn::ParamDict pd;
119         activation->load_param(pd);
120     }
121 
122     if (activation)
123     {
124         activation->create_pipeline(opt);
125     }
126 
127 #if NCNN_INT8
128     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
129     {
130         return create_pipeline_int8_x86(opt);
131     }
132 #endif
133 
134     int kernel_size = kernel_w * kernel_h;
135     int num_input = weight_data_size / kernel_size / num_output;
136 
137     if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
138     {
139         convolution_dilation1 = ncnn::create_layer(ncnn::LayerType::Convolution);
140 
141         // set param
142         ncnn::ParamDict pd;
143         pd.set(0, num_output); // num_output
144         pd.set(1, kernel_w);
145         pd.set(11, kernel_h);
146         pd.set(2, 1);
147         pd.set(12, 1);
148         pd.set(3, 1);  // stride_w
149         pd.set(13, 1); // stride_h
150         pd.set(4, 0);  // pad_w
151         pd.set(14, 0); // pad_h
152         pd.set(5, bias_term);
153         pd.set(6, weight_data_size);
154 
155         convolution_dilation1->load_param(pd);
156 
157         // set weights
158         if (bias_term)
159         {
160             ncnn::Mat weights[2];
161             weights[0] = weight_data;
162             weights[1] = bias_data;
163 
164             convolution_dilation1->load_model(ModelBinFromMatArray(weights));
165         }
166         else
167         {
168             ncnn::Mat weights[1];
169             weights[0] = weight_data;
170 
171             convolution_dilation1->load_model(ModelBinFromMatArray(weights));
172         }
173 
174         convolution_dilation1->create_pipeline(opt);
175 
176         return 0;
177     }
178 
179     int elempack = 1;
180     int out_elempack = 1;
181 
182 #if __SSE2__
183     if (opt.use_packing_layout)
184     {
185 #if __AVX__
186         elempack = num_input % 8 == 0 ? 8 : num_input % 4 == 0 ? 4 : 1;
187         out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
188 #else
189         elempack = num_input % 4 == 0 ? 4 : 1;
190         out_elempack = num_output % 4 == 0 ? 4 : 1;
191 #endif
192     }
193 #endif // __SSE2__
194 
195     // pack1
196     if (elempack == 1 && out_elempack == 1)
197     {
198         if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
199         {
200             conv3x3s1_winograd23_transform_kernel_sse(weight_data, weight_3x3_winograd23_data, num_input, num_output);
201             // conv3x3s1_winograd43_transform_kernel_sse(weight_data, weight_3x3_winograd43_data, num_input, num_output);
202 
203             // for small size
204             conv_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_size);
205         }
206         else
207         {
208             conv_im2col_sgemm_transform_kernel_sse(weight_data, weight_sgemm_data, num_input, num_output, kernel_size);
209         }
210 
211         return 0;
212     }
213 
214     const int maxk = kernel_w * kernel_h;
215 
216     // src = kw-kh-inch-outch
217     // dst = pb-pa-kw-kh-inch/pa-outch/pb
218     {
219         Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);
220 
221         weight_data_packed.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)4u * elempack * out_elempack, elempack * out_elempack);
222 
223         for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
224         {
225             Mat g0 = weight_data_packed.channel(q / out_elempack);
226 
227             for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
228             {
229                 float* g00 = g0.row(p / elempack);
230 
231                 for (int k = 0; k < maxk; k++)
232                 {
233                     for (int i = 0; i < elempack; i++)
234                     {
235                         for (int j = 0; j < out_elempack; j++)
236                         {
237                             const float* k00 = weight_data_r2.channel(q + j).row(p + i);
238 
239                             g00[0] = k00[k];
240 
241                             g00++;
242                         }
243                     }
244                 }
245             }
246         }
247     }
248 
249 #if __SSE2__
250     // pack4
251     if (elempack == 4 && out_elempack == 4)
252     {
253         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
254         {
255             conv1x1s1_sgemm_transform_kernel_pack4_sse(weight_data, weight_data_packed, num_input, num_output);
256         }
257         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
258         {
259             conv1x1s1_sgemm_transform_kernel_pack4_sse(weight_data, weight_data_packed, num_input, num_output);
260         }
261     }
262 
263 #if __AVX__
264     // pack8
265     if (elempack == 8 && out_elempack == 8)
266     {
267         if (opt.use_weight_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
268         {
269             conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
270         }
271         else if (opt.use_weight_fp16_storage && kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
272         {
273             conv1x1s1_sgemm_transform_kernel_fp16_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
274         }
275         else if (opt.use_weight_fp16_storage && kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
276         {
277             conv2x2s1_weight_fp16_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
278         }
279         else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
280         {
281             conv3x3s1_winograd64_transform_kernel_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
282         }
283         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
284         {
285             conv1x1s1_sgemm_transform_kernel_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
286         }
287         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
288         {
289             conv1x1s1_sgemm_transform_kernel_pack8_avx(weight_data, weight_data_packed, num_input, num_output);
290         }
291     }
292 #endif
293 #endif
294 
295     return 0;
296 }
297 
destroy_pipeline(const Option & opt)298 int Convolution_x86::destroy_pipeline(const Option& opt)
299 {
300     if (activation)
301     {
302         activation->destroy_pipeline(opt);
303         delete activation;
304         activation = 0;
305     }
306 
307     if (convolution_dilation1)
308     {
309         convolution_dilation1->destroy_pipeline(opt);
310         delete convolution_dilation1;
311         convolution_dilation1 = 0;
312     }
313 
314     return 0;
315 }
316 
forward(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const317 int Convolution_x86::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
318 {
319     // convolv with NxN kernel
320     // value = value + bias
321 
322 #if NCNN_INT8
323     if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
324     {
325         return forward_int8_x86(bottom_blob, top_blob, opt);
326     }
327 #endif
328 
329     if (bottom_blob.dims != 3)
330     {
331         return Convolution::forward(bottom_blob, top_blob, opt);
332     }
333 
334     if (!opt.use_packing_layout && (dilation_w > 1 || dilation_h > 1) && (stride_w > 1 || stride_h > 1))
335     {
336         return Convolution::forward(bottom_blob, top_blob, opt);
337     }
338 
339     if (!opt.use_packing_layout && (dilation_w > 1 || dilation_h > 1) && dilation_w != dilation_h)
340     {
341         return Convolution::forward(bottom_blob, top_blob, opt);
342     }
343 
344     int w = bottom_blob.w;
345     int h = bottom_blob.h;
346     int channels = bottom_blob.c;
347     size_t elemsize = bottom_blob.elemsize;
348     int elempack = bottom_blob.elempack;
349 
350     const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
351     const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
352 
353     Mat bottom_blob_bordered;
354     make_padding(bottom_blob, bottom_blob_bordered, opt);
355     if (bottom_blob_bordered.empty())
356         return -100;
357 
358     w = bottom_blob_bordered.w;
359     h = bottom_blob_bordered.h;
360 
361     int outw = (w - kernel_extent_w) / stride_w + 1;
362     int outh = (h - kernel_extent_h) / stride_h + 1;
363     int out_elempack = 1;
364 #if __SSE2__
365     if (opt.use_packing_layout)
366     {
367 #if __AVX__
368         out_elempack = num_output % 8 == 0 ? 8 : num_output % 4 == 0 ? 4 : 1;
369 #else
370         out_elempack = num_output % 4 == 0 ? 4 : 1;
371 #endif
372     }
373 #endif // __SSE2__
374     size_t out_elemsize = elemsize / elempack * out_elempack;
375 
376     top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
377     if (top_blob.empty())
378         return -100;
379 
380     if (!opt.use_packing_layout && kernel_w == kernel_h && dilation_w != 1 && dilation_h == dilation_w && stride_w == 1 && stride_h == 1)
381     {
382         if (outw >= dilation_w && outh >= dilation_h)
383         {
384             return forwardDilation_x86(bottom_blob_bordered, top_blob, opt);
385         }
386     }
387 
388     const int num_input = channels * elempack;
389 
390     const int maxk = kernel_w * kernel_h;
391 
392     // kernel offsets
393     std::vector<int> _space_ofs(maxk);
394     int* space_ofs = &_space_ofs[0];
395     {
396         int p1 = 0;
397         int p2 = 0;
398         int gap = w * dilation_h - kernel_w * dilation_w;
399         for (int i = 0; i < kernel_h; i++)
400         {
401             for (int j = 0; j < kernel_w; j++)
402             {
403                 space_ofs[p1] = p2;
404                 p1++;
405                 p2 += dilation_w;
406             }
407             p2 += gap;
408         }
409     }
410 
411 #if __SSE2__
412 #if __AVX__
413     if (elempack == 8 && out_elempack == 8)
414     {
415         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
416         {
417             if (opt.use_weight_fp16_storage)
418             {
419                 conv1x1s1_sgemm_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
420             }
421             else
422             {
423                 conv1x1s1_sgemm_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
424             }
425 
426             if (activation)
427             {
428                 activation->forward_inplace(top_blob, opt);
429             }
430         }
431         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
432         {
433             if (opt.use_weight_fp16_storage)
434             {
435                 conv1x1s2_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
436             }
437             else
438             {
439                 conv1x1s2_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
440             }
441             if (activation)
442             {
443                 activation->forward_inplace(top_blob, opt);
444             }
445         }
446         else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
447         {
448             if (num_input >= 16 && num_output >= 16)
449             {
450                 conv3x3s1_winograd64_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
451             }
452             else
453             {
454                 conv3x3s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
455             }
456 
457             if (activation)
458             {
459                 activation->forward_inplace(top_blob, opt);
460             }
461         }
462         else if (kernel_w == 2 && kernel_h == 2 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
463         {
464             if (opt.use_weight_fp16_storage)
465             {
466                 conv2x2s1_fp16_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
467             }
468             else
469             {
470                 conv2x2s1_pack8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
471             }
472 
473             if (activation)
474             {
475                 activation->forward_inplace(top_blob, opt);
476             }
477         }
478         else
479         {
480             // num_output
481             #pragma omp parallel for num_threads(opt.num_threads)
482             for (int p = 0; p < num_output / out_elempack; p++)
483             {
484                 float* outptr = top_blob.channel(p);
485 
486                 for (int i = 0; i < outh; i++)
487                 {
488                     for (int j = 0; j < outw; j++)
489                     {
490                         __m256 _sum = _mm256_set1_ps(0.f);
491 
492                         if (bias_term)
493                         {
494                             _sum = _mm256_loadu_ps(((const float*)bias_data) + p * 8);
495                         }
496 
497                         const float* kptr = (const float*)weight_data_packed + maxk * channels * p * 64;
498 
499                         // channels
500                         for (int q = 0; q < channels; q++)
501                         {
502                             const Mat m = bottom_blob_bordered.channel(q);
503                             const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
504 
505                             for (int k = 0; k < maxk; k++)
506                             {
507                                 __m256 _val0 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8));
508                                 __m256 _val1 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 1);
509                                 __m256 _val2 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 2);
510                                 __m256 _val3 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 3);
511                                 __m256 _val4 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 4);
512                                 __m256 _val5 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 5);
513                                 __m256 _val6 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 6);
514                                 __m256 _val7 = _mm256_broadcast_ss((sptr + space_ofs[k] * 8) + 7);
515 
516                                 __m256 _w0 = _mm256_loadu_ps(kptr);
517                                 __m256 _mul0 = _mm256_mul_ps(_val0, _w0);
518                                 __m256 _w1 = _mm256_loadu_ps(kptr + 8);
519                                 __m256 _mul1 = _mm256_mul_ps(_val1, _w1);
520                                 __m256 _w2 = _mm256_loadu_ps(kptr + 16);
521                                 __m256 _mul2 = _mm256_mul_ps(_val2, _w2);
522                                 __m256 _w3 = _mm256_loadu_ps(kptr + 24);
523                                 __m256 _mul3 = _mm256_mul_ps(_val3, _w3);
524                                 __m256 _w4 = _mm256_loadu_ps(kptr + 32);
525                                 __m256 _mul4 = _mm256_mul_ps(_val4, _w4);
526                                 __m256 _w5 = _mm256_loadu_ps(kptr + 40);
527                                 __m256 _mul5 = _mm256_mul_ps(_val5, _w5);
528                                 __m256 _w6 = _mm256_loadu_ps(kptr + 48);
529                                 __m256 _mul6 = _mm256_mul_ps(_val6, _w6);
530                                 __m256 _w7 = _mm256_loadu_ps(kptr + 56);
531                                 __m256 _mul7 = _mm256_mul_ps(_val7, _w7);
532                                 __m256 _sum01 = _mm256_add_ps(_mul0, _mul1);
533                                 __m256 _sum23 = _mm256_add_ps(_mul2, _mul3);
534                                 __m256 _sum45 = _mm256_add_ps(_mul4, _mul5);
535                                 __m256 _sum67 = _mm256_add_ps(_mul6, _mul7);
536                                 __m256 _sum_lo = _mm256_add_ps(_sum01, _sum23);
537                                 __m256 _sum_hi = _mm256_add_ps(_sum45, _sum67);
538                                 __m256 _sum_all = _mm256_add_ps(_sum_lo, _sum_hi);
539                                 _sum = _mm256_add_ps(_sum_all, _sum);
540 
541                                 kptr += 64;
542                             }
543                         }
544 
545                         _sum = activation_avx(_sum, activation_type, activation_params);
546 
547                         _mm256_storeu_ps(outptr + j * 8, _sum);
548                     }
549 
550                     outptr += outw * 8;
551                 }
552             }
553         }
554     }
555 
556     if (elempack == 1 && out_elempack == 8)
557     {
558         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
559         {
560             conv3x3s1_pack1to8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
561 
562             if (activation)
563             {
564                 activation->forward_inplace(top_blob, opt);
565             }
566         }
567         else if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
568         {
569             conv3x3s2_pack1to8_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
570 
571             if (activation)
572             {
573                 activation->forward_inplace(top_blob, opt);
574             }
575         }
576         else
577         {
578             // num_output
579             #pragma omp parallel for num_threads(opt.num_threads)
580             for (int p = 0; p < num_output / out_elempack; p++)
581             {
582                 float* outptr = top_blob.channel(p);
583 
584                 for (int i = 0; i < outh; i++)
585                 {
586                     for (int j = 0; j < outw; j++)
587                     {
588                         __m256 _sum = _mm256_set1_ps(0.f);
589 
590                         if (bias_term)
591                         {
592                             _sum = _mm256_loadu_ps(((const float*)bias_data) + p * 8);
593                         }
594 
595                         const float* kptr = (const float*)weight_data_packed + maxk * channels * p * 8;
596 
597                         // channels
598                         for (int q = 0; q < channels; q++)
599                         {
600                             const Mat m = bottom_blob_bordered.channel(q);
601                             const float* sptr = m.row(i * stride_h) + j * stride_w;
602 
603                             for (int k = 0; k < maxk; k++) // 29.23
604                             {
605                                 __m256 _val = _mm256_set1_ps(sptr[space_ofs[k]]);
606                                 __m256 _w = _mm256_loadu_ps(kptr);
607                                 _sum = _mm256_fmadd_ps(_val, _w, _sum);
608 
609                                 kptr += 8;
610                             }
611                         }
612 
613                         _sum = activation_avx(_sum, activation_type, activation_params);
614 
615                         _mm256_storeu_ps(outptr + j * 8, _sum);
616                     }
617 
618                     outptr += outw * 8;
619                 }
620             }
621         }
622     }
623 
624     if (elempack == 4 && out_elempack == 8)
625     {
626         {
627             // num_output
628             #pragma omp parallel for num_threads(opt.num_threads)
629             for (int p = 0; p < num_output / out_elempack; p++)
630             {
631                 float* outptr = top_blob.channel(p);
632 
633                 for (int i = 0; i < outh; i++)
634                 {
635                     for (int j = 0; j < outw; j++)
636                     {
637                         __m256 _sum = _mm256_set1_ps(0.f);
638 
639                         if (bias_term)
640                         {
641                             _sum = _mm256_loadu_ps((const float*)bias_data + p * 8);
642                         }
643 
644                         const float* kptr = weight_data_packed.channel(p);
645 
646                         // channels
647                         for (int q = 0; q < channels; q++)
648                         {
649                             const Mat m = bottom_blob_bordered.channel(q);
650                             const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
651 
652                             for (int k = 0; k < maxk; k++)
653                             {
654                                 __m256 _val0 = _mm256_broadcast_ss((sptr + space_ofs[k] * 4));
655                                 __m256 _val1 = _mm256_broadcast_ss((sptr + space_ofs[k] * 4) + 1);
656                                 __m256 _val2 = _mm256_broadcast_ss((sptr + space_ofs[k] * 4) + 2);
657                                 __m256 _val3 = _mm256_broadcast_ss((sptr + space_ofs[k] * 4) + 3);
658 
659                                 __m256 _w0 = _mm256_loadu_ps(kptr);
660                                 _sum = _mm256_fmadd_ps(_val0, _w0, _sum);
661                                 __m256 _w1 = _mm256_loadu_ps(kptr + 8);
662                                 _sum = _mm256_fmadd_ps(_val1, _w1, _sum);
663                                 __m256 _w2 = _mm256_loadu_ps(kptr + 16);
664                                 _sum = _mm256_fmadd_ps(_val2, _w2, _sum);
665                                 __m256 _w3 = _mm256_loadu_ps(kptr + 24);
666                                 _sum = _mm256_fmadd_ps(_val3, _w3, _sum);
667 
668                                 kptr += 32;
669                             }
670                         }
671 
672                         _sum = activation_avx(_sum, activation_type, activation_params);
673 
674                         _mm256_storeu_ps(outptr + j * 8, _sum);
675                     }
676 
677                     outptr += outw * 8;
678                 }
679             }
680         }
681     }
682 
683     if (elempack == 8 && out_elempack == 1)
684     {
685         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
686         {
687             conv3x3s1_pack8to1_avx(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
688 
689             if (activation)
690             {
691                 activation->forward_inplace(top_blob, opt);
692             }
693         }
694         else
695         {
696             // num_output
697             #pragma omp parallel for num_threads(opt.num_threads)
698             for (int p = 0; p < num_output; p++)
699             {
700                 float* outptr = top_blob.channel(p);
701 
702                 for (int i = 0; i < outh; i++)
703                 {
704                     for (int j = 0; j < outw; j++)
705                     {
706                         float sum = 0.f;
707 
708                         if (bias_term)
709                         {
710                             sum = bias_data[p];
711                         }
712 
713                         const float* kptr = (const float*)weight_data_packed + maxk * channels * p * 8;
714                         __m256 _sum8 = _mm256_set1_ps(0);
715 
716                         // channels
717                         for (int q = 0; q < channels; q++)
718                         {
719                             const Mat m = bottom_blob_bordered.channel(q);
720                             const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
721 
722                             for (int k = 0; k < maxk; k++) // 29.23
723                             {
724                                 __m256 _val = _mm256_loadu_ps(sptr + (space_ofs[k] * 8));
725                                 __m256 _w = _mm256_loadu_ps(kptr);
726                                 __m256 _s8 = _mm256_mul_ps(_val, _w);
727                                 _sum8 = _mm256_add_ps(_sum8, _s8);
728                                 kptr += 8;
729                             }
730                         }
731                         sum += _mm256_reduce_add_ps(_sum8); // dot
732                         sum = activation_ss(sum, activation_type, activation_params);
733 
734                         outptr[j] = sum;
735                     }
736 
737                     outptr += outw;
738                 }
739             }
740         }
741     }
742 
743     if (elempack == 8 && out_elempack == 4)
744     {
745         {
746             // num_output
747             #pragma omp parallel for num_threads(opt.num_threads)
748             for (int p = 0; p < num_output / out_elempack; p++)
749             {
750                 float* outptr = top_blob.channel(p);
751 
752                 for (int i = 0; i < outh; i++)
753                 {
754                     for (int j = 0; j < outw; j++)
755                     {
756                         __m128 _sum = _mm_set1_ps(0.f);
757 
758                         if (bias_term)
759                         {
760                             _sum = _mm_loadu_ps((const float*)bias_data + p * 4);
761                         }
762 
763                         const float* kptr = weight_data_packed.channel(p);
764 
765                         // channels
766                         for (int q = 0; q < channels; q++)
767                         {
768                             const Mat m = bottom_blob_bordered.channel(q);
769                             const float* sptr = m.row(i * stride_h) + j * stride_w * 8;
770 
771                             for (int k = 0; k < maxk; k++)
772                             {
773                                 __m128 _val0 = _mm_broadcast_ss((sptr + space_ofs[k] * 8));
774                                 __m128 _val1 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 1);
775                                 __m128 _val2 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 2);
776                                 __m128 _val3 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 3);
777                                 __m128 _val4 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 4);
778                                 __m128 _val5 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 5);
779                                 __m128 _val6 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 6);
780                                 __m128 _val7 = _mm_broadcast_ss((sptr + space_ofs[k] * 8) + 7);
781 
782                                 __m128 _w0 = _mm_loadu_ps(kptr);
783                                 _sum = _mm_fmadd_ps(_val0, _w0, _sum);
784                                 __m128 _w1 = _mm_loadu_ps(kptr + 4);
785                                 _sum = _mm_fmadd_ps(_val1, _w1, _sum);
786                                 __m128 _w2 = _mm_loadu_ps(kptr + 8);
787                                 _sum = _mm_fmadd_ps(_val2, _w2, _sum);
788                                 __m128 _w3 = _mm_loadu_ps(kptr + 12);
789                                 _sum = _mm_fmadd_ps(_val3, _w3, _sum);
790                                 __m128 _w4 = _mm_loadu_ps(kptr + 16);
791                                 _sum = _mm_fmadd_ps(_val4, _w4, _sum);
792                                 __m128 _w5 = _mm_loadu_ps(kptr + 20);
793                                 _sum = _mm_fmadd_ps(_val5, _w5, _sum);
794                                 __m128 _w6 = _mm_loadu_ps(kptr + 24);
795                                 _sum = _mm_fmadd_ps(_val6, _w6, _sum);
796                                 __m128 _w7 = _mm_loadu_ps(kptr + 28);
797                                 _sum = _mm_fmadd_ps(_val7, _w7, _sum);
798 
799                                 kptr += 32;
800                             }
801                         }
802 
803                         _sum = activation_sse(_sum, activation_type, activation_params);
804 
805                         _mm_storeu_ps(outptr + j * 4, _sum);
806                     }
807 
808                     outptr += outw * 4;
809                 }
810             }
811         }
812     }
813 #endif
814 
815     if (elempack == 4 && out_elempack == 4)
816     {
817         if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
818         {
819             conv1x1s1_sgemm_pack4_sse(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
820 
821             if (activation)
822             {
823                 activation->forward_inplace(top_blob, opt);
824             }
825         }
826         else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
827         {
828             conv1x1s2_pack4_sse(bottom_blob_bordered, top_blob, weight_data_packed, bias_data, opt);
829 
830             if (activation)
831             {
832                 activation->forward_inplace(top_blob, opt);
833             }
834         }
835         else
836         {
837             // num_output
838             #pragma omp parallel for num_threads(opt.num_threads)
839             for (int p = 0; p < num_output / out_elempack; p++)
840             {
841                 float* outptr = top_blob.channel(p);
842 
843                 for (int i = 0; i < outh; i++)
844                 {
845                     for (int j = 0; j < outw; j++)
846                     {
847                         __m128 _sum = _mm_set1_ps(0.f);
848 
849                         if (bias_term)
850                         {
851                             _sum = _mm_loadu_ps((const float*)bias_data + p * 4);
852                         }
853 
854                         const float* kptr = weight_data_packed.channel(p);
855 
856                         // channels
857                         for (int q = 0; q < channels; q++)
858                         {
859                             const Mat m = bottom_blob_bordered.channel(q);
860                             const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
861 
862                             for (int k = 0; k < maxk; k++)
863                             {
864                                 __m128 _val0 = _mm_set1_ps(sptr[space_ofs[k] * 4]);
865                                 __m128 _val1 = _mm_set1_ps(sptr[space_ofs[k] * 4 + 1]);
866                                 __m128 _val2 = _mm_set1_ps(sptr[space_ofs[k] * 4 + 2]);
867                                 __m128 _val3 = _mm_set1_ps(sptr[space_ofs[k] * 4 + 3]);
868 
869                                 __m128 _w0 = _mm_loadu_ps(kptr);
870                                 _sum = _mm_add_ps(_mm_mul_ps(_val0, _w0), _sum);
871                                 __m128 _w1 = _mm_loadu_ps(kptr + 4);
872                                 _sum = _mm_add_ps(_mm_mul_ps(_val1, _w1), _sum);
873                                 __m128 _w2 = _mm_loadu_ps(kptr + 8);
874                                 _sum = _mm_add_ps(_mm_mul_ps(_val2, _w2), _sum);
875                                 __m128 _w3 = _mm_loadu_ps(kptr + 12);
876                                 _sum = _mm_add_ps(_mm_mul_ps(_val3, _w3), _sum);
877 
878                                 kptr += 16;
879                             }
880                         }
881 
882                         _sum = activation_sse(_sum, activation_type, activation_params);
883 
884                         _mm_storeu_ps(outptr + j * 4, _sum);
885                     }
886 
887                     outptr += outw * 4;
888                 }
889             }
890         }
891     }
892 
893     if (elempack == 1 && out_elempack == 4)
894     {
895         {
896             // num_output
897             #pragma omp parallel for num_threads(opt.num_threads)
898             for (int p = 0; p < num_output / out_elempack; p++)
899             {
900                 float* outptr = top_blob.channel(p);
901 
902                 for (int i = 0; i < outh; i++)
903                 {
904                     for (int j = 0; j < outw; j++)
905                     {
906                         __m128 _sum = _mm_set1_ps(0.f);
907 
908                         if (bias_term)
909                         {
910                             _sum = _mm_loadu_ps((const float*)bias_data + p * 4);
911                         }
912 
913                         const float* kptr = weight_data_packed.channel(p);
914 
915                         // channels
916                         for (int q = 0; q < channels; q++)
917                         {
918                             const Mat m = bottom_blob_bordered.channel(q);
919                             const float* sptr = m.row(i * stride_h) + j * stride_w;
920 
921                             for (int k = 0; k < maxk; k++)
922                             {
923                                 __m128 _val = _mm_set1_ps(sptr[space_ofs[k]]);
924                                 __m128 _w = _mm_loadu_ps(kptr);
925                                 _sum = _mm_add_ps(_mm_mul_ps(_val, _w), _sum);
926 
927                                 kptr += 4;
928                             }
929                         }
930 
931                         _sum = activation_sse(_sum, activation_type, activation_params);
932 
933                         _mm_storeu_ps(outptr + j * 4, _sum);
934                     }
935 
936                     outptr += outw * 4;
937                 }
938             }
939         }
940     }
941 
942     if (elempack == 4 && out_elempack == 1)
943     {
944         {
945             // num_output
946             #pragma omp parallel for num_threads(opt.num_threads)
947             for (int p = 0; p < num_output; p++)
948             {
949                 float* outptr = top_blob.channel(p);
950 
951                 for (int i = 0; i < outh; i++)
952                 {
953                     for (int j = 0; j < outw; j++)
954                     {
955                         float sum = 0.f;
956 
957                         if (bias_term)
958                         {
959                             sum = bias_data[p];
960                         }
961 
962                         const float* kptr = weight_data_packed.channel(p);
963 
964                         // channels
965                         for (int q = 0; q < channels; q++)
966                         {
967                             const Mat m = bottom_blob_bordered.channel(q);
968                             const float* sptr = m.row(i * stride_h) + j * stride_w * 4;
969 
970                             for (int k = 0; k < maxk; k++)
971                             {
972                                 __m128 _val = _mm_loadu_ps(sptr + space_ofs[k] * 4);
973                                 __m128 _w = _mm_loadu_ps(kptr);
974                                 __m128 _s4 = _mm_mul_ps(_val, _w);
975                                 sum += _mm_reduce_add_ps(_s4); // dot
976 
977                                 kptr += 4;
978                             }
979                         }
980 
981                         sum = activation_ss(sum, activation_type, activation_params);
982 
983                         outptr[j] = sum;
984                     }
985 
986                     outptr += outw;
987                 }
988             }
989         }
990     }
991 #endif // __SSE2__
992 
993     if (elempack == 1 && out_elempack == 1)
994     {
995         if (kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
996         {
997             if (opt.use_winograd_convolution && num_input >= 16 && num_output >= 16 && outw >= 8 && outh >= 8)
998             {
999                 conv3x3s1_winograd23_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd23_data, bias_data, opt);
1000                 //             conv3x3s1_winograd43_sse(bottom_blob_bordered, top_blob, weight_3x3_winograd43_data, bias_data, opt);
1001             }
1002             else
1003             {
1004                 conv_im2col_sgemm_sse(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
1005             }
1006 
1007             if (activation)
1008             {
1009                 activation->forward_inplace(top_blob, opt);
1010             }
1011         }
1012         else if (dilation_w == 1 && dilation_h == 1)
1013         {
1014             conv_im2col_sgemm_sse(bottom_blob_bordered, top_blob, weight_sgemm_data, bias_data, kernel_w, kernel_h, stride_w, stride_h, opt);
1015             if (activation)
1016             {
1017                 activation->forward_inplace(top_blob, opt);
1018             }
1019         }
1020         else
1021         {
1022             // num_output
1023             #pragma omp parallel for num_threads(opt.num_threads)
1024             for (int p = 0; p < num_output; p++)
1025             {
1026                 float* outptr = top_blob.channel(p);
1027 
1028                 for (int i = 0; i < outh; i++)
1029                 {
1030                     for (int j = 0; j < outw; j++)
1031                     {
1032                         float sum = 0.f;
1033 
1034                         if (bias_term)
1035                         {
1036                             sum = bias_data[p];
1037                         }
1038 
1039                         const float* kptr = (const float*)weight_data + maxk * channels * p;
1040 
1041                         // channels
1042                         for (int q = 0; q < channels; q++)
1043                         {
1044                             const Mat m = bottom_blob_bordered.channel(q);
1045                             const float* sptr = m.row(i * stride_h) + j * stride_w;
1046 
1047                             for (int k = 0; k < maxk; k++)
1048                             {
1049                                 float val = sptr[space_ofs[k]];
1050                                 float wt = kptr[k];
1051                                 sum += val * wt;
1052                             }
1053 
1054                             kptr += maxk;
1055                         }
1056 
1057                         sum = activation_ss(sum, activation_type, activation_params);
1058 
1059                         outptr[j] = sum;
1060                     }
1061 
1062                     outptr += outw;
1063                 }
1064             }
1065         }
1066     }
1067 
1068     return 0;
1069 }
1070 
1071 #if NCNN_INT8
int Convolution_x86::create_pipeline_int8_x86(const Option& opt)
{
    // Prepare int8 weights for forward_int8_x86: choose the element packing
    // for this layer and pre-arrange weight_data into the layout the chosen
    // kernel expects.
    const int maxk = kernel_w * kernel_h;
    const int num_input = weight_data_size / maxk / num_output;

    int elempack = 1;
    int out_elempack = 1;

#if __SSE2__
    if (opt.use_packing_layout)
    {
        // int8 SSE2 kernels consume 8 input channels and produce 4 output
        // channels per packed element when the channel counts allow it
        elempack = num_input % 8 == 0 ? 8 : 1;
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__

    if (elempack == 1 && out_elempack == 1)
    {
        // unpacked path: only the 3x3s1 winograd kernel needs an offline
        // weight transform; everything else consumes weight_data as-is
        if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
        {
            conv3x3s1_winograd23_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
            //         conv3x3s1_winograd43_transform_kernel_int8_sse(weight_data, weight_3x3_winograd23_data_int8, num_input, num_output);
        }
        else
        {
            // TODO offline transform weight
        }

        return 0;
    }

    // src = kw-kh-inch-outch
    // dst = pa-pb-kw-kh-inch/pa-outch/pb
    {
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        weight_data_int8.create(maxk, num_input / elempack, num_output / out_elempack, (size_t)elempack * out_elempack, elempack * out_elempack);

        // interleave elempack input channels x out_elempack output channels
        // per kernel tap; g00 marches linearly, so the q/p/k/j/i nesting
        // order defines the packed layout — presumably it must match the
        // read order of the packNtoM int8 kernels, so do not reorder
        for (int q = 0; q + (out_elempack - 1) < num_output; q += out_elempack)
        {
            Mat g0 = weight_data_int8.channel(q / out_elempack);

            for (int p = 0; p + (elempack - 1) < num_input; p += elempack)
            {
                signed char* g00 = g0.row<signed char>(p / elempack);

                for (int k = 0; k < maxk; k++)
                {
                    for (int j = 0; j < out_elempack; j++)
                    {
                        for (int i = 0; i < elempack; i++)
                        {
                            const signed char* k00 = weight_data_r2.channel(q + j).row<const signed char>(p + i);

                            g00[0] = k00[k];

                            g00++;
                        }
                    }
                }
            }
        }
    }

#if __SSE2__
    if (elempack == 8 && out_elempack == 4)
    {
        // sgemm-based pack8to4 kernels use their own im2col+gemm weight
        // arrangement and overwrite weight_data_int8 built above
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_transform_kernel_pack8to4_int8_sse(weight_data, weight_data_int8, num_input, num_output, kernel_w, kernel_h);
        }
    }
#endif // __SSE2__

    return 0;
}
1156 
int Convolution_x86::forward_int8_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // int8 inference path: quantize the input if needed, pad it, convolve
    // into an int32 accumulator blob, then either requantize back to int8
    // (int8_scale_term > 100) or dequantize to fp32.
    int elembits = bottom_blob.elembits();

    Mat bottom_blob_int8 = bottom_blob;
    if (elembits != 8)
    {
        // input is not yet int8 - quantize it with the per-blob input scale
        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, bottom_blob_int8_scales, opt_q);
    }

    //     NCNN_LOGE("Convolution_arm input %d x %d  ksize=%d %d  stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    int w = bottom_blob_bordered.w;
    int h = bottom_blob_bordered.h;
    int channels = bottom_blob_bordered.c;
    int elempack = bottom_blob_bordered.elempack;
    size_t elemsize = bottom_blob_bordered.elemsize;

    // effective kernel footprint after dilation
    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    int out_elempack = 1;
#if __SSE2__
    if (opt.use_packing_layout)
    {
        out_elempack = num_output % 4 == 0 ? 4 : 1;
    }
#endif // __SSE2__
    // requantized output is int8 (1 byte/lane), dequantized output is fp32
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u * out_elempack : 4u * out_elempack;

    //     NCNN_LOGE("forward_int8_arm %d %d %d    %d %d", w, h, bottom_blob_bordered.c, elempack, out_elempack);

    top_blob.create(outw, outh, num_output / out_elempack, out_elemsize, out_elempack, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    const int num_input = channels * elempack;

    // intermediate int32 accumulator written by the convolution kernels,
    // then converted into top_blob below
    Mat top_blob_int32;
    top_blob_int32.create(outw, outh, num_output / out_elempack, (size_t)(4u * out_elempack), out_elempack, opt.workspace_allocator);
    if (top_blob_int32.empty())
        return -100;

#if __SSE2__
    if (elempack == 8 && out_elempack == 4)
    {
        // pick the fastest matching pack8to4 kernel
        if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1)
        {
            conv1x1s1_sgemm_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (kernel_w == 1 && kernel_h == 1 && dilation_w == 1 && dilation_h == 1 && stride_w == 2 && stride_h == 2)
        {
            conv1x1s2_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, opt);
        }
        else if (opt.use_sgemm_convolution)
        {
            convolution_im2col_sgemm_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }
        else
        {
            convolution_pack8to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
        }

        // per-output-channel scale that converts int32 accumulators back to
        // real values: 1 / (input_scale * weight_scale); 0 guards a zero
        // weight scale (dead channel)
        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            // fused dequantize + activation + requantize to int8
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }

    if (elempack == 1 && out_elempack == 4)
    {
        convolution_pack1to4_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

        // per-output-channel int32 -> real scale (see pack8to4 branch)
        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }

    if (elempack == 8 && out_elempack == 1)
    {
        convolution_pack8to1_int8_sse(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

        // per-output-channel int32 -> real scale (see pack8to4 branch)
        Mat scale_in_data(num_output);
        for (int p = 0; p < num_output; p++)
        {
            // requantize and relu
            float scale_in;
            if (weight_data_int8_scales[p] == 0)
                scale_in = 0;
            else
                scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

            scale_in_data[p] = scale_in;
        }

        if (use_int8_requantize)
        {
            requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
        }
        else
        {
            dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
    }
#endif // __SSE2__

    if (elempack == 1 && out_elempack == 1)
    {
        if (opt.use_winograd_convolution && kernel_w == 3 && kernel_h == 3 && dilation_w == 1 && dilation_h == 1 && stride_w == 1 && stride_h == 1 && num_input >= 16 && num_output >= 16)
        {
            // winograd F(2,3) kernel transformed in create_pipeline_int8_x86
            conv3x3s1_winograd23_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd23_data_int8, opt);
            //             conv3x3s1_winograd43_int8_sse(bottom_blob_bordered, top_blob_int32, weight_3x3_winograd23_data_int8, opt);

            // per-output-channel int32 -> real scale (see pack8to4 branch)
            Mat scale_in_data(num_output);
            for (int p = 0; p < num_output; p++)
            {
                // requantize and relu
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                scale_in_data[p] = scale_in;
            }

            if (use_int8_requantize)
            {
                requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
            }
            else
            {
                dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
        }
        else if (opt.use_sgemm_convolution && dilation_w == 1 && dilation_h == 1 && (activation_type == 0 || activation_type == 1))
        {
            // im2col+sgemm with the scale conversion fused into the gemm
            // epilogue; restricted to no/relu activation
            if (use_int8_requantize)
            {
                // interleave (scale_in, scale_out) pairs per output channel
                std::vector<float> requantize_scales;
                for (int p = 0; p < num_output; p++)
                {
                    float scale_in;
                    if (weight_data_int8_scales[p] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                    float scale_out = top_blob_int8_scales[0];

                    requantize_scales.push_back(scale_in);
                    requantize_scales.push_back(scale_out);
                }

                conv_im2col_sgemm_int8_requant_sse(bottom_blob_bordered, top_blob, weight_data, kernel_w, kernel_h, stride_w, stride_h, bias_data, requantize_scales, opt);
            }
            else
            {
                std::vector<float> dequantize_scales;
                for (int p = 0; p < num_output; p++)
                {
                    float scale_in;
                    if (weight_data_int8_scales[p] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                    dequantize_scales.push_back(scale_in);
                }

                conv_im2col_sgemm_int8_dequant_sse(bottom_blob_bordered, top_blob, weight_data, kernel_w, kernel_h, stride_w, stride_h, bias_data, dequantize_scales, opt);
            }

            if (activation)
            {
                activation->forward_inplace(top_blob, opt);
            }
        }
        else
        {
            // generic naive int8 convolution fallback
            //         convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data_int8, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);
            convolution_int8(bottom_blob_bordered, top_blob_int32, weight_data, kernel_w, kernel_h, dilation_w, dilation_h, stride_w, stride_h, opt);

            // per-output-channel int32 -> real scale (see pack8to4 branch)
            Mat scale_in_data(num_output);
            for (int p = 0; p < num_output; p++)
            {
                // requantize and relu
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                scale_in_data[p] = scale_in;
            }

            if (use_int8_requantize)
            {
                requantize_from_int32_to_int8(top_blob_int32, top_blob, scale_in_data, top_blob_int8_scales, bias_data, activation_type, activation_params, opt);
            }
            else
            {
                dequantize_from_int32(top_blob_int32, top_blob, scale_in_data, bias_data, opt);

                if (activation)
                {
                    activation->forward_inplace(top_blob, opt);
                }
            }
        }
    }

    return 0;
}
1437 #endif // NCNN_INT8
1438 
forwardDilation_x86(const Mat & bottom_blob,Mat & top_blob,const Option & opt) const1439 int Convolution_x86::forwardDilation_x86(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
1440 {
1441     int w = bottom_blob.w;
1442     int h = bottom_blob.h;
1443     size_t elemsize = bottom_blob.elemsize;
1444 
1445     const int kernel_size = kernel_w;
1446     const int stride = stride_w;
1447     const int dilation = dilation_w;
1448     const int kernel_extent = dilation * (kernel_size - 1) + 1;
1449 
1450     int outw = (w - kernel_extent) / stride + 1;
1451     int outh = (h - kernel_extent) / stride + 1;
1452 
1453     top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
1454     if (top_blob.empty())
1455         return -100;
1456 
1457     // Make (dilation * dilation) batches
1458     Mat inner_bottom_blob;
1459     Mat inner_top_blob;
1460     for (int x = 0; x < dilation; x++)
1461     {
1462         for (int y = 0; y < dilation; y++)
1463         {
1464             int inner_w = (w - y + dilation - 1) / dilation;
1465             int inner_h = (h - x + dilation - 1) / dilation;
1466 
1467             int inner_outw = (inner_w - kernel_size) / stride + 1;
1468             int inner_outh = (inner_h - kernel_size) / stride + 1;
1469 
1470             inner_bottom_blob.create(inner_w, inner_h, bottom_blob.c, elemsize, opt.workspace_allocator);
1471             if (inner_bottom_blob.empty())
1472                 return -100;
1473 
1474             inner_top_blob.create(inner_outw, inner_outh, num_output, elemsize, opt.workspace_allocator);
1475             if (inner_top_blob.empty())
1476                 return -100;
1477 
1478             #pragma omp parallel for num_threads(opt.num_threads)
1479             for (int c = 0; c < bottom_blob.c; c++)
1480             {
1481                 float* outptr = inner_bottom_blob.channel(c);
1482 
1483                 for (int i = 0; i < inner_h; i++)
1484                 {
1485                     const float* ptr = (const float*)bottom_blob.channel(c) + dilation * i * w + x * w + y;
1486                     for (int j = 0; j < inner_w; j++)
1487                     {
1488                         outptr[j] = ptr[j * dilation];
1489                     }
1490                     outptr += inner_w;
1491                 }
1492             }
1493 
1494             Option opt_g = opt;
1495             opt_g.blob_allocator = inner_top_blob.allocator;
1496             convolution_dilation1->forward(inner_bottom_blob, inner_top_blob, opt_g);
1497 
1498             #pragma omp parallel for num_threads(opt.num_threads)
1499             for (int c = 0; c < num_output; c++)
1500             {
1501                 float* outptr = (float*)top_blob.channel(c) + x * outw + y;
1502                 for (int i = 0; i < inner_outh; i++)
1503                 {
1504                     const float* ptr = (const float*)inner_top_blob.channel(c) + i * inner_outw;
1505                     for (int j = 0; j < inner_outw; j++)
1506                     {
1507                         outptr[j * dilation] = ptr[j];
1508                     }
1509                     outptr += dilation * outw;
1510                 }
1511             }
1512         }
1513     }
1514 
1515     if (activation)
1516     {
1517         activation->forward_inplace(top_blob, opt);
1518     }
1519 
1520     return 0;
1521 }
1522 
1523 } // namespace ncnn
1524