// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2017 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#include "convolution.h"

#include "layer_type.h"

#include "fused_activation.h"

namespace ncnn {

Convolution::Convolution()
{
    one_blob_only = true;
    support_inplace = false;
}

int Convolution::load_param(const ParamDict& pd)
{
    num_output = pd.get(0, 0);
    kernel_w = pd.get(1, 0);
    kernel_h = pd.get(11, kernel_w);
    dilation_w = pd.get(2, 1);
    dilation_h = pd.get(12, dilation_w);
    stride_w = pd.get(3, 1);
    stride_h = pd.get(13, stride_w);
    pad_left = pd.get(4, 0);
    pad_right = pd.get(15, pad_left);
    pad_top = pd.get(14, pad_left);
    pad_bottom = pd.get(16, pad_top);
    pad_value = pd.get(18, 0.f);
    bias_term = pd.get(5, 0);
    weight_data_size = pd.get(6, 0);
    int8_scale_term = pd.get(8, 0);
    activation_type = pd.get(9, 0);
    activation_params = pd.get(10, Mat());
    impl_type = pd.get(17, 0);

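    // a non-zero int8_scale_term requests int8 inference;
    // values above 100 additionally provide an output scale so the result can be
    // requantized back to int8 (see load_model and forward_int8 below)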
    if (int8_scale_term)
    {
#if NCNN_INT8
        support_int8_storage = true;
#else
        NCNN_LOGE("please build ncnn with NCNN_INT8 enabled for int8 inference");
        return -1;
#endif
    }

    return 0;
}

int Convolution::load_model(const ModelBin& mb)
{
    weight_data = mb.load(weight_data_size, 0);
    if (weight_data.empty())
        return -100;

    if (bias_term)
    {
        bias_data = mb.load(num_output, 1);
        if (bias_data.empty())
            return -100;
    }

#if NCNN_INT8
    if (int8_scale_term)
    {
        weight_data_int8_scales = mb.load(num_output, 1);
        bottom_blob_int8_scales = mb.load(1, 1);
    }

    if (int8_scale_term > 100)
    {
        top_blob_int8_scales = mb.load(1, 1);
    }
#endif // NCNN_INT8

    return 0;
}

int Convolution::create_pipeline(const Option& opt)
{
#if NCNN_INT8
    // runtime quantize the weight data
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)4u && int8_scale_term)
    {
        const int maxk = kernel_w * kernel_h;
        const int num_input = weight_data_size / num_output / maxk;

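        // reshape to (maxk, num_input, num_output) so each output channel occupies its own
        // channel of the Mat and quantize_to_int8 can apply one scale per output channel
        // from weight_data_int8_scales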
        Mat weight_data_r2 = weight_data.reshape(maxk, num_input, num_output);

        Mat weight_data_int8;

        Option opt_q = opt;
        opt_q.blob_allocator = weight_data.allocator;
        opt_q.use_packing_layout = false;
        quantize_to_int8(weight_data_r2, weight_data_int8, weight_data_int8_scales, opt_q);
        if (weight_data_int8.empty())
            return -100;

        weight_data = weight_data_int8.reshape(weight_data_size);
    }
#endif // NCNN_INT8

    return 0;
}

int Convolution::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    // convolve with NxN kernel
    // value = value + bias

#if NCNN_INT8
    if (opt.use_int8_inference && weight_data.elemsize == (size_t)1u)
    {
        return forward_int8(bottom_blob, top_blob, opt);
    }
#endif

    // flattened blob, implement as InnerProduct
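    // a 1D input with a 1x1 kernel is just a matrix-vector product over num_input elements,
    // so delegate to the InnerProduct layer with the same weights, bias and activation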
    if (bottom_blob.dims == 1 && kernel_w == 1 && kernel_h == 1)
    {
        int num_input = weight_data_size / num_output;
        if (bottom_blob.w * bottom_blob.elempack == num_input)
        {
            // call InnerProduct
            ncnn::Layer* op = ncnn::create_layer(ncnn::LayerType::InnerProduct);

            // set param
            ncnn::ParamDict pd;
            pd.set(0, num_output);
            pd.set(1, bias_term);
            pd.set(2, weight_data_size);
            pd.set(8, int8_scale_term);
            pd.set(9, activation_type);
            pd.set(10, activation_params);

            op->load_param(pd);

            // set weights
            ncnn::Mat weights[4];
            weights[0] = weight_data;
            weights[1] = bias_data;

#if NCNN_INT8
            if (int8_scale_term)
            {
                weights[2] = weight_data_int8_scales;
                weights[3] = bottom_blob_int8_scales;
            }
#endif

            op->load_model(ModelBinFromMatArray(weights));

            op->create_pipeline(opt);

            // forward
            op->forward(bottom_blob, top_blob, opt);

            op->destroy_pipeline(opt);

            delete op;

            return 0;
        }
    }

    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Convolution input %d x %d  pad = %d %d  ksize=%d %d  stride=%d %d", w, h, pad_w, pad_h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_bordered;
    make_padding(bottom_blob, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
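    // space_ofs[k] is the element offset of the k-th kernel tap inside a row-major channel
    // of the bordered input, with dilation applied.
    // e.g. for a 3x3 kernel with dilation 1 on a bordered input of width w:
    //   space_ofs = { 0, 1, 2,  w, w+1, w+2,  2*w, 2*w+1, 2*w+2 }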
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // float32
    top_blob.create(outw, outh, num_output, elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
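    // naive direct convolution: for each output channel p and output pixel (i, j),
    // accumulate bias[p] plus the weighted sum over every input channel q and kernel tap k,
    // then apply the fused activation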
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        float* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                float sum = 0.f;

                if (bias_term)
                    sum = bias_data[p];

                const float* kptr = (const float*)weight_data + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const float* sptr = m.row(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++) // 29.23
                    {
                        float val = sptr[space_ofs[k]]; // 20.72
                        float wt = kptr[k];
                        sum += val * wt; // 41.45
                    }

                    kptr += maxk;
                }

                outptr[j] = activation_ss(sum, activation_type, activation_params);
            }

            outptr += outw;
        }
    }

    return 0;
}

void Convolution::make_padding(const Mat& bottom_blob, Mat& bottom_blob_bordered, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    bottom_blob_bordered = bottom_blob;
    if (pad_left > 0 || pad_right > 0 || pad_top > 0 || pad_bottom > 0)
    {
        Option opt_b = opt;
        opt_b.blob_allocator = opt.workspace_allocator;
        copy_make_border(bottom_blob, bottom_blob_bordered, pad_top, pad_bottom, pad_left, pad_right, BORDER_CONSTANT, pad_value, opt_b);
    }
    else if (pad_left == -233 && pad_right == -233 && pad_top == -233 && pad_bottom == -233)
    {
        // tensorflow padding=SAME or onnx padding=SAME_UPPER
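        // pad just enough that the output covers ceil(w / stride_w) positions:
        //   wpad = kernel_extent_w + floor((w - 1) / stride_w) * stride_w - w
        // e.g. w = 7, stride 2, 3x3 kernel (extent 3): wpad = 3 + 6 - 7 = 2, split 1 left / 1 right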
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad / 2, hpad - hpad / 2, wpad / 2, wpad - wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
    else if (pad_left == -234 && pad_right == -234 && pad_top == -234 && pad_bottom == -234)
    {
        // onnx padding=SAME_LOWER
        int wpad = kernel_extent_w + (w - 1) / stride_w * stride_w - w;
        int hpad = kernel_extent_h + (h - 1) / stride_h * stride_h - h;
        if (wpad > 0 || hpad > 0)
        {
            Option opt_b = opt;
            opt_b.blob_allocator = opt.workspace_allocator;
            copy_make_border(bottom_blob, bottom_blob_bordered, hpad - hpad / 2, hpad / 2, wpad - wpad / 2, wpad / 2, BORDER_CONSTANT, pad_value, opt_b);
        }
    }
}

#if NCNN_INT8
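// round to nearest and saturate symmetrically to [-127, 127] (-128 is never produced)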
static inline signed char float2int8(float v)
{
    int int32 = static_cast<int>(round(v));
    if (int32 > 127) return 127;
    if (int32 < -127) return -127;
    return (signed char)int32;
}

int Convolution::forward_int8(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
{
    int w = bottom_blob.w;
    int h = bottom_blob.h;
    int channels = bottom_blob.c;
    size_t elemsize = bottom_blob.elemsize;

    //     NCNN_LOGE("Convolution input %d x %d  ksize=%d %d  stride=%d %d", w, h, kernel_w, kernel_h, stride_w, stride_h);

    const int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
    const int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;

    Mat bottom_blob_unbordered = bottom_blob;
    if (elemsize != 1)
    {
        Option opt_g = opt;
        opt_g.blob_allocator = opt.workspace_allocator;

        quantize_to_int8(bottom_blob, bottom_blob_unbordered, bottom_blob_int8_scales, opt_g);
    }

    Mat bottom_blob_bordered;
    make_padding(bottom_blob_unbordered, bottom_blob_bordered, opt);
    if (bottom_blob_bordered.empty())
        return -100;

    w = bottom_blob_bordered.w;
    h = bottom_blob_bordered.h;

    int outw = (w - kernel_extent_w) / stride_w + 1;
    int outh = (h - kernel_extent_h) / stride_h + 1;

    const int maxk = kernel_w * kernel_h;

    // kernel offsets
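    // same tap-offset layout as in forward() above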
    std::vector<int> _space_ofs(maxk);
    int* space_ofs = &_space_ofs[0];
    {
        int p1 = 0;
        int p2 = 0;
        int gap = w * dilation_h - kernel_w * dilation_w;
        for (int i = 0; i < kernel_h; i++)
        {
            for (int j = 0; j < kernel_w; j++)
            {
                space_ofs[p1] = p2;
                p1++;
                p2 += dilation_w;
            }
            p2 += gap;
        }
    }

    // int8
    bool use_int8_requantize = int8_scale_term > 100;
    size_t out_elemsize = use_int8_requantize ? 1u : 4u;

    top_blob.create(outw, outh, num_output, out_elemsize, opt.blob_allocator);
    if (top_blob.empty())
        return -100;

    // num_output
    #pragma omp parallel for num_threads(opt.num_threads)
    for (int p = 0; p < num_output; p++)
    {
        signed char* outptr = top_blob.channel(p);

        for (int i = 0; i < outh; i++)
        {
            for (int j = 0; j < outw; j++)
            {
                int sum = 0;

                const signed char* kptr = (const signed char*)weight_data + maxk * channels * p;

                // channels
                for (int q = 0; q < channels; q++)
                {
                    const Mat m = bottom_blob_bordered.channel(q);
                    const signed char* sptr = m.row<signed char>(i * stride_h) + j * stride_w;

                    for (int k = 0; k < maxk; k++)
                    {
                        int val = sptr[space_ofs[k]];
                        int wt = kptr[k];
                        sum += val * wt;
                    }

                    kptr += maxk;
                }

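                // the int32 accumulator holds (x * scale_x) * (w * scale_w), so dividing by
                // bottom_blob_int8_scales[0] * weight_data_int8_scales[p] recovers the fp32 value;
                // bias and activation are applied in fp32, then the result is either stored as
                // fp32 (dequantize) or rescaled by top_blob_int8_scales[0] back to int8 (requantize)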
                float scale_in;
                if (weight_data_int8_scales[p] == 0)
                    scale_in = 0;
                else
                    scale_in = 1.f / (bottom_blob_int8_scales[0] * weight_data_int8_scales[p]);

                float sumfp32 = sum * scale_in;

                if (bias_term)
                    sumfp32 += bias_data[p];

                sumfp32 = activation_ss(sumfp32, activation_type, activation_params);

                if (use_int8_requantize)
                {
                    // requantize
                    float scale_out = top_blob_int8_scales[0];
                    signed char sums8 = float2int8(sumfp32 * scale_out);
                    outptr[0] = sums8;
                    outptr += 1;
                }
                else
                {
                    // dequantize
                    ((float*)outptr)[0] = sumfp32;
                    outptr += 4;
                }
            }
        }
    }

    return 0;
}
#endif // NCNN_INT8

} // namespace ncnn