// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolution.h"

#include "layer_type.h"

namespace ncnn {

Convolution::Convolution()
{
    one_blob_only = true;
    support_inplace = false;
}

int Convolution::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    int8_scale_term = pd.get(8, 0);
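    // activation_type: 0=none 1=ReLU 2=LeakyReLU 3=Clip 4=Sigmoid 5=Mish
    // activation_params: [slope] for LeakyReLU, [min, max] for Clip (see forward())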
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());
    impl_type = pd.get(17, 0);

    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}

int Convolution::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
    }

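    // int8_scale_term > 100 indicates that an output (top blob) scale is also stored,
    // which enables int8 requantization of the result in forward_int8()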
    if (int8_scale_term > 100)
    {
        top_blob_int8_scales = mb.load(1, 1);
    }
#endif // NCNN_INT8

    return 0;
}

int Convolution::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int maxk = kernel_w * kernel_h;
        const int num_input = weight_data_size / num_output / maxk;
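        // reshape the fp32 weights to (maxk, num_input, num_output) so that
        // quantize_to_int8 can apply the per-output-channel scales in
        // weight_data_int8_scales (num_output entries)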
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        Mat weight_data_int8;

        Option opt_q = opt;
        opt_q.blob_allocator = weight_data.allocator;
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}

int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolve with NxN kernel
    // value = value + bias

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        int num_input = weight_data_size / num_output;
        if (bottom_blob.w * bottom_blob.elempack == num_input)
        {
            // call InnerProduct
            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output);
            pd.set(1, bias_term);
            pd.set(2, weight_data_size);
            pd.set(8, int8_scale_term);
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[4];
            weights[0] = weight_data;
            weights[1] = bias_data;

#if NCNN_INT8
            if (int8_scale_term)
            {
                weights[2] = weight_data_int8_scales;
                weights[3] = bottom_blob_int8_scales;
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));

            op->create_pipeline(opt);

            // forward
            op->forward(bottom_blob, top_blob, opt);

            op->destroy_pipeline(opt);

            delete op;

            return 0;
        }
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

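    // space_ofs[k] is the element offset of the k-th kernel tap from the top-left tap,
    // measured on the bordered input row width w and taking dilation into account;
    // gap advances from the end of one kernel row to the start of the next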
    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // float32
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

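    // naive direct convolution: for each output channel, slide the kernel over
    // every input channel and accumulate into a single fp32 sum per output pixel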
    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[p];

                const float* kptr = (const float*)weight_data + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        float val = sptr[space_ofs[k]]; // 20.72
                        float wt = kptr[k];
                        sum += val * wt; // 41.45
                    }

                    kptr += maxk;
                }

                if (activation_type == 1)
                {
                    sum = std::max(sum, 0.f);
                }
                else if (activation_type == 2)
                {
                    float slope = activation_params[0];
                    sum = sum > 0.f ? sum : sum * slope;
                }
                else if (activation_type == 3)
                {
                    float min = activation_params[0];
                    float max = activation_params[1];
                    if (sum < min)
                        sum = min;
                    if (sum > max)
                        sum = max;
                }
                else if (activation_type == 4)
                {
                    sum = static_cast<float>(1.f / (1.f + exp(-sum)));
                }
                else if (activation_type == 5)
                {
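                    // Mish: x * tanh(softplus(x)); softplus is approximated outside
                    // [-MISH_THRESHOLD, MISH_THRESHOLD] to keep expf() well-behaved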
                    const float MISH_THRESHOLD = 20;
                    float x = sum, y;
                    if (x > MISH_THRESHOLD)
                        y = x;
                    else if (x < -MISH_THRESHOLD)
                        y = expf(x);
                    else
                        y = logf(expf(x) + 1);
                    sum = static_cast<float>(x * tanh(y));
                }

                outptr[j] = sum;
            }

            outptr += outw;
        }
    }

    return 0;
}

void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
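        // the total padding makes (w + wpad - kernel_extent_w) / stride_w + 1
        // equal to (w - 1) / stride_w + 1, i.e. the output size is ceil(w / stride_w)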
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

#if NCNN_INT8
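// round to nearest and clamp symmetrically to [-127, 127] (-128 is never produced)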
static inline signed char float2int8(float v)
{
    int int32 = static_cast<int>(round(v));
    if (int32 > 127) return 127;
    if (int32 < -127) return -127;
    return (signed char)int32;
}

int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Convolution input %d x %d  ksize=%d %d  stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

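    // quantize the fp32 input to int8 first (if it is not already int8),
    // then apply padding in the int8 domain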
    Mat bottom_blob_unbordered = bottom_blob;
    if (elemsize != 1)
    {
        Option opt_g = opt;
        opt_g.blob_allocator = opt.workspace_allocator;

        quantize_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scales, opt_g);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

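    // with a stored output scale (int8_scale_term > 100) the result is requantized and
    // stays int8 (1 byte per element); otherwise it is dequantized to fp32 (4 bytes)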
    // int8
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        signed char* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        int val = sptr[space_ofs[k]];
                        int wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

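                // dequantize the int32 accumulator: the input and weights were scaled by
                // bottom_blob_int8_scales[0] and weight_data_int8_scales[p] respectively,
                // so dividing by their product recovers the fp32 value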
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                float sumfp32 = sum * scale_in;

                if (bias_term)
                    sumfp32 += bias_data[p];

                if (activation_type == 1)
                {
                    sumfp32 = std::max(sumfp32, 0.f);
                }
                else if (activation_type == 2)
                {
                    float slope = activation_params[0];
                    sumfp32 = sumfp32 > 0.f ? sumfp32 : sumfp32 * slope;
                }
                else if (activation_type == 3)
                {
                    float min = activation_params[0];
                    float max = activation_params[1];
                    if (sumfp32 < min)
                        sumfp32 = min;
                    if (sumfp32 > max)
                        sumfp32 = max;
                }
                else if (activation_type == 4)
                {
                    sumfp32 = static_cast<float>(1.f / (1.f + exp(-sumfp32)));
                }
                else if (activation_type == 5)
                {
                    const float MISH_THRESHOLD = 20;
                    float x = sumfp32, y;
                    if (x > MISH_THRESHOLD)
                        y = x;
                    else if (x < -MISH_THRESHOLD)
                        y = expf(x);
                    else
                        y = logf(expf(x) + 1);
                    sumfp32 = static_cast<float>(x * tanh(y));
                }

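                // requantize back to int8 with the stored output scale, or store the
                // dequantized fp32 result directly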
                if (use_int8_requantize)
                {
                    // requantize
                    float scale_out = top_blob_int8_scales[0];
                    signed char sums8 = float2int8(sumfp32 * scale_out);
                    outptr[0] = sums8;
                    outptr += 1;
                }
                else
                {
                    // dequantize
                    ((float*)outptr)[0] = sumfp32;
                    outptr += 4;
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn