// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolutiondepthwise.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

ConvolutionDepthWise::ConvolutionDepthWise()
{
    one_blob_only = true;
    support_inplace = false;
}

int ConvolutionDepthWise::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
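    // padding is specified per side; the negative sentinels -233 (tensorflow SAME / onnx SAME_UPPER)
    // and -234 (onnx SAME_LOWER) are resolved later in make_padding()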
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    group = pd.get(7, 1);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());

    if (num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}

int ConvolutionDepthWise::load_model(const ModelBin& mb)
{
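    // ModelBin::load(count, type): in ncnn's convention, type 0 lets the storage tag in the
    // model file decide the element type, while type 1 loads raw float32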
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
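    // int8_scale_term layout: 1/101 store one weight scale per group, 2/102 store a single
    // weight scale that is broadcast to all groups; values above 100 additionally provide a
    // top blob scale used for requantized int8 output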
    if (int8_scale_term == 1 || int8_scale_term == 101)
    {
        weight_data_int8_scales = mb.load(group, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }
    else if (int8_scale_term == 2 || int8_scale_term == 102)
    {
        weight_data_int8_scales = mb.load(1, 1);
        bottom_blob_int8_scales = mb.load(1, 1);

        // extend group if only one provided
        float weight_data_int8_scale = weight_data_int8_scales[0];
        weight_data_int8_scales = Mat(group);
        weight_data_int8_scales.fill(weight_data_int8_scale);

        float bottom_blob_int8_scale = bottom_blob_int8_scales[0];
        bottom_blob_int8_scales = Mat(group);
        bottom_blob_int8_scales.fill(bottom_blob_int8_scale);
    }

    if (int8_scale_term > 100)
    {
        top_blob_int8_scales = mb.load(1, 1);

        float top_blob_int8_scale = top_blob_int8_scales[0];
        top_blob_int8_scales = Mat(group);
        top_blob_int8_scales.fill(top_blob_int8_scale);
    }
#endif // NCNN_INT8

    return 0;
}

int ConvolutionDepthWise::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        Mat int8_weight_data(weight_data_size, (size_t)1u);
        if (int8_weight_data.empty())
            return -100;

        const int weight_data_size_g = weight_data_size / group;
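        // quantize each group's slice of the fp32 weights with that group's own scale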

        for (int g = 0; g < group; g++)
        {
            Option opt_q = opt;
            opt_q.blob_allocator = int8_weight_data.allocator;
            opt_q.use_packing_layout = false;

            const Mat weight_data_g = weight_data.range(weight_data_size_g * g, weight_data_size_g);
            Mat int8_weight_data_g = int8_weight_data.range(weight_data_size_g * g, weight_data_size_g);
            const Mat weight_data_int8_scales_g = weight_data_int8_scales.range(g, 1);
            quantize_to_int8(weight_data_g, int8_weight_data_g, weight_data_int8_scales_g, opt_q);
        }

        weight_data = int8_weight_data;
    }
#endif // NCNN_INT8

    return 0;
}

int ConvolutionDepthWise::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolve with NxN kernel
    // value = value + bias

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    //     NCNN_LOGE("ConvolutionDepthWise input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }
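    // space_ofs[k] is now the element offset of the k-th kernel tap from the window's
    // top-left input sample, with dilation folded in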

    // float32
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
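    // true depth-wise case: every input channel maps to exactly one output channel,
    // so each output pixel needs only maxk multiply-accumulates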
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            float* outptr = top_blob.channel(g);
            const float* kptr = (const float*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    float sum = 0.f;

                    if (bias_term)
                        sum = bias_data[g];

                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        float val = sptr[space_ofs[k]];
                        float w = kptr[k];
                        sum += val * w;
                    }

                    outptr[j] = activation_ss(sum, activation_type, activation_params);
                }

                outptr += outw;
            }
        }
    }
    else
    {
        // group convolution
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;
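        // weight layout is [group][num_output_g][channels_g][maxk], so each group's block
        // starts at maxk * channels_g * num_output_g * g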

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < num_output_g; p++)
            {
                float* outptr = top_blob.channel(g * num_output_g + p);
                const float* weight_data_ptr = (const float*)weight_data + maxk * channels_g * num_output_g * g;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        float sum = 0.f;

                        if (bias_term)
                            sum = bias_data[num_output_g * g + p];

                        const float* kptr = weight_data_ptr + maxk * channels_g * p;

                        // channels_g
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
                            const float* sptr = m.row(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                float val = sptr[space_ofs[k]];
                                float w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        outptr[j] = activation_ss(sum, activation_type, activation_params);
                    }

                    outptr += outw;
                }
            }
        }
    }

    return 0;
}

void ConvolutionDepthWise::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
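        // wpad/hpad is the total border needed so that outw = ceil(w / stride_w) and
        // outh = ceil(h / stride_h), matching SAME semantics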
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

#if NCNN_INT8
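// round to nearest and saturate to the symmetric int8 range [-127, 127]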
static inline signed char float2int8(float v)
{
    int int32 = static_cast<int>(round(v));
    if (int32 > 127) return 127;
    if (int32 < -127) return -127;
    return (signed char)int32;
}

int ConvolutionDepthWise::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolve with NxN kernel
    // value = value + bias

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    if (channels % group != 0 || num_output % group != 0)
    {
        // reject invalid group
        return -100;
    }

    //     NCNN_LOGE("ConvolutionDepthWise input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_int8 = bottom_blob;
    if (elemsize != 1)
    {
        const int channels_g = channels / group;

        Mat scales(channels);
        {
            float* ps = scales;
            for (int g = 0; g < group; g++)
            {
                float scale = bottom_blob_int8_scales[g];
                for (int q = 0; q < channels_g; q++)
                {
                    *ps++ = scale;
                }
            }
        }

        Option opt_q = opt;
        opt_q.blob_allocator = opt.workspace_allocator;
        quantize_to_int8(bottom_blob, bottom_blob_int8, scales, opt_q);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_int8, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // int8
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;
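    // with a top blob scale available the output stays int8 (1 byte per element),
    // otherwise it is dequantized to fp32 (4 bytes per element)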

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // depth-wise
    if (channels == group && group == num_output)
    {
        #pragma omp parallel for num_threads(opt.num_threads)
        for (int g = 0; g < group; g++)
        {
            signed char* outptr = top_blob.channel(g);
            const signed char* kptr = (const signed char*)weight_data + maxk * g;
            const Mat m = bottom_blob_bordered.channel(g);

            for (int i = 0; i < outh; i++)
            {
                for (int j = 0; j < outw; j++)
                {
                    int sum = 0;

                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        signed char val = sptr[space_ofs[k]];
                        signed char w = kptr[k];
                        sum += val * w;
                    }

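                    // dequantize the int32 accumulator: scale_in = 1 / (input_scale * weight_scale);
                    // a zero weight scale is guarded to avoid dividing by zero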
                    float scale_in;
                    if (weight_data_int8_scales[g] == 0)
                        scale_in = 0;
                    else
                        scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                    float sumfp32 = sum * scale_in;

                    if (bias_term)
                        sumfp32 += bias_data[g];

                    sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                    if (use_int8_requantize)
                    {
                        // requantize
                        float scale_out = top_blob_int8_scales[g];
                        signed char sums8 = float2int8(sumfp32 * scale_out);
                        outptr[0] = sums8;
                        outptr += 1;
                    }
                    else
                    {
                        // dequantize
                        ((float*)outptr)[0] = sumfp32;
                        outptr += 4;
                    }
                }
            }
        }
    }
    else
    {
        // group convolution
        const int channels_g = channels / group;
        const int num_output_g = num_output / group;

#ifdef _WIN32
        #pragma omp parallel for num_threads(opt.num_threads)
#else // _WIN32
        #pragma omp parallel for collapse(2) num_threads(opt.num_threads)
#endif // _WIN32
        for (int g = 0; g < group; g++)
        {
            for (int p = 0; p < num_output_g; p++)
            {
                signed char* outptr = top_blob.channel(g * num_output_g + p);
                const signed char* weight_data_ptr = (const signed char*)weight_data + maxk * channels_g * num_output_g * g;

                for (int i = 0; i < outh; i++)
                {
                    for (int j = 0; j < outw; j++)
                    {
                        int sum = 0;

                        const signed char* kptr = weight_data_ptr + maxk * channels_g * p;

                        // channels_g
                        for (int q = 0; q < channels_g; q++)
                        {
                            const Mat m = bottom_blob_bordered.channel(channels_g * g + q);
                            const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                            for (int k = 0; k < maxk; k++)
                            {
                                signed char val = sptr[space_ofs[k]];
                                signed char w = kptr[k];
                                sum += val * w;
                            }

                            kptr += maxk;
                        }

                        float scale_in;
                        if (weight_data_int8_scales[g] == 0)
                            scale_in = 0;
                        else
                            scale_in = 1.f / (bottom_blob_int8_scales[g] * weight_data_int8_scales[g]);

                        float sumfp32 = sum * scale_in;

                        if (bias_term)
                            sumfp32 += bias_data[g * num_output_g + p];

                        sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                        if (use_int8_requantize)
                        {
                            // requantize
                            float scale_out = top_blob_int8_scales[g];
                            signed char sums8 = float2int8(sumfp32 * scale_out);
                            outptr[0] = sums8;
                            outptr += 1;
                        }
                        else
                        {
                            // dequantize
                            ((float*)outptr)[0] = sumfp32;
                            outptr += 4;
                        }
                    }
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn