// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifdef _MSC_VER
#define _CRT_SECURE_NO_DEPRECATE
#endif

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <set>
#include <vector>

// ncnn public header
#include "datareader.h"
#include "layer.h"
#include "layer_type.h"
#include "net.h"

// ncnn private header
#include "modelwriter.h"

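// DataReaderFromEmpty stands in for the real weight file: scan() is a no-op
// and read() hands back zero-filled buffers, so a network can be loaded for
// graph-level optimization from a param file alone.
//
// A minimal sketch of how it is meant to be used (file names are placeholders):
//
//     ncnn::Net net;
//     net.load_param("model.param");
//     DataReaderFromEmpty dr;
//     net.load_model(dr); // weights come back as zeros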
class DataReaderFromEmpty : public ncnn::DataReader
{
public:
    virtual int scan(const char* format, void* p) const
    {
        return 0;
    }
    virtual size_t read(void* buf, size_t size) const
    {
        memset(buf, 0, size);
        return size;
    }
};

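// NetOptimize gathers the graph rewrite passes of the optimizer:
//   fuse_*      fold an adjacent BatchNorm / Scale / bias-like BinaryOp /
//               Dropout / activation layer into the preceding weighted layer
//   eliminate_* drop layers that are identities at inference time
//   replace_*   swap a layer for a cheaper equivalent one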
class NetOptimize : public ModelWriter
{
public:
    NetOptimize();

public:
    int fuse_batchnorm_scale();
    int fuse_convolution_batchnorm();
    int fuse_convolution_mul();
    int fuse_convolution_add();
    int fuse_convolutiondepthwise_batchnorm();
    int fuse_convolutiondepthwise_mul();
    int fuse_convolutiondepthwise_add();
    int fuse_deconvolution_batchnorm();
    int fuse_deconvolution_mul();
    int fuse_deconvolution_add();
    int fuse_deconvolutiondepthwise_batchnorm();
    int fuse_innerproduct_batchnorm();
    int fuse_innerproduct_add();
    int fuse_innerproduct_dropout();
    int fuse_convolution_activation();
    int fuse_convolutiondepthwise_activation();
    int fuse_deconvolution_activation();
    int fuse_deconvolutiondepthwise_activation();
    int fuse_innerproduct_activation();
    int fuse_memorydata_binaryop();
    int fuse_binaryop_eltwise();

    int eliminate_dropout();
    int eliminate_pooling1x1();
    int eliminate_noop();
    int eliminate_split();
    int eliminate_orphaned_memorydata();
    int eliminate_flatten_after_global_pooling();
    int eliminate_reshape_after_global_pooling();
    int eliminate_flatten_after_innerproduct();
    int eliminate_reshape_before_binaryop();

    int replace_reduction_with_global_pooling();
    int replace_prelu_with_leaky_relu();
    int replace_convolution_with_innerproduct_after_global_pooling();
    int replace_convolution_with_innerproduct_after_innerproduct();
};
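// A rough driver sketch; the real tool's main() (not part of this excerpt)
// chooses which passes run and in what order, so take this as an assumption:
//
//     NetOptimize optimizer;
//     optimizer.load_param("in.param");  // paths are placeholders
//     optimizer.load_model("in.bin");    // or a DataReaderFromEmpty when no bin is given
//     optimizer.fuse_batchnorm_scale();
//     optimizer.fuse_convolution_batchnorm();
//     optimizer.fuse_convolution_activation();
//     optimizer.eliminate_dropout();
//     optimizer.save("out.param", "out.bin");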

NetOptimize::NetOptimize()
    : ModelWriter()
{
}

int NetOptimize::fuse_batchnorm_scale()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "BatchNorm")
            continue;

        // BatchNorm - Scale
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Scale")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse BatchNorm - Scale to BatchNorm
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[i];
        ncnn::Scale* scale = (ncnn::Scale*)layers[j];

        fprintf(stderr, "fuse_batchnorm_scale %s %s\n", batchnorm->name.c_str(), scale->name.c_str());

        {
            //             v = ((v - mean) / sqrt(var + eps) * slope + bias) * s + b
            //               =  (v - mean) / sqrt(var + eps) * (slope * s) + (bias * s + b)

            int channels = batchnorm->channels;

            float* slope = batchnorm->slope_data;
            float* bias = batchnorm->bias_data;

            for (int q = 0; q < channels; q++)
            {
                slope[q] = slope[q] * scale->scale_data[q];
                if (scale->bias_term)
                    bias[q] = bias[q] * scale->scale_data[q] + scale->bias_data[q];
                else
                    bias[q] = bias[q] * scale->scale_data[q];
            }
        }

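        // detach the Scale layer: route the BatchNorm output to Scale's output
        // blob, record this layer as that blob's producer, and retag Scale as
        // "ncnnfused" so the model writer skips it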
        int top_blob_index_final = scale->tops[0];
        batchnorm->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        scale->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolution_batchnorm()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution")
            continue;

        // Convolution - BatchNorm
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BatchNorm")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution - BatchNorm to Convolution
        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];

        fprintf(stderr, "fuse_convolution_batchnorm %s %s\n", convolution->name.c_str(), batchnorm->name.c_str());

        {
            int channels = batchnorm->channels;
            float eps = batchnorm->eps;

            // a = bias - slope * mean / sqrt(var + eps)
            // b = slope / sqrt(var + eps)
            // value = value * b + a
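            // folded into the convolution this gives, per output channel i:
            //   weight'[i][k] = weight[i][k] * b[i]
            //   bias'[i]      = bias[i] * b[i] + a[i]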

            std::vector<float> a(channels);
            std::vector<float> b(channels);
            for (int i = 0; i < channels; i++)
            {
                float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
                a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
                b[i] = batchnorm->slope_data[i] / sqrt_var;
            }

            if (convolution->bias_term == 0)
            {
                // init bias as zero
                convolution->bias_term = 1;
                convolution->bias_data = ncnn::Mat(channels);
                convolution->bias_data.fill(0.f);
            }

            const int weight_per_outch = convolution->weight_data_size / channels;

            float* weight = convolution->weight_data;
            float* bias = convolution->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= b[i];
                }

                bias[i] = bias[i] * b[i] + a[i];
            }
        }

        int top_blob_index_final = batchnorm->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        batchnorm->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolution_mul()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution")
            continue;

        // Convolution - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution - BinaryOp to Convolution
        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 2 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = convolution->num_output;

        if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_convolution_mul %s %s\n", convolution->name.c_str(), binaryop->name.c_str());

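        // the MemoryData blob carries one factor per output channel, so the
        // elementwise multiply is equivalent to scaling that channel's weights
        // (and bias, if present)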
        {
            const int weight_per_outch = convolution->weight_data_size / channels;

            float* weight = convolution->weight_data;
            float* bias = convolution->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= memorydata->data[i];
                }

                if (bias)
                {
                    bias[i] = bias[i] * memorydata->data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolution_add()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution")
            continue;

        // Convolution - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution - BinaryOp to Convolution
        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 0 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = convolution->num_output;

        bool broadcasting_type_ok = false;
        if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
            broadcasting_type_ok = true;
        if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
            broadcasting_type_ok = true;

        if (!broadcasting_type_ok)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_convolution_add %s %s\n", convolution->name.c_str(), binaryop->name.c_str());

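        // the added constant is bias-like (one value per output channel), so it
        // is reshaped to [channels] and merged into the convolution bias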
        ncnn::Mat bias_data = memorydata->data.reshape(channels);
        {
            if (convolution->bias_term == 0)
            {
                // init bias
                convolution->bias_term = 1;
                convolution->bias_data = bias_data;
            }
            else
            {
                float* bias = convolution->bias_data;
                for (int i = 0; i < channels; i++)
                {
                    bias[i] = bias[i] + bias_data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolutiondepthwise_batchnorm()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "ConvolutionDepthWise")
            continue;

        // ConvolutionDepthWise - BatchNorm
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BatchNorm")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse ConvolutionDepthWise - BatchNorm to ConvolutionDepthWise
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];

        fprintf(stderr, "fuse_convolutiondepthwise_batchnorm %s %s\n", convolutiondepthwise->name.c_str(), batchnorm->name.c_str());

        {
            int channels = batchnorm->channels;
            float eps = batchnorm->eps;

            // a = bias - slope * mean / sqrt(var + eps)
            // b = slope / sqrt(var + eps)
            // value = value * b + a

            std::vector<float> a(channels);
            std::vector<float> b(channels);
            for (int i = 0; i < channels; i++)
            {
                float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
                a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
                b[i] = batchnorm->slope_data[i] / sqrt_var;
            }

            if (convolutiondepthwise->bias_term == 0)
            {
                // init bias as zero
                convolutiondepthwise->bias_term = 1;
                convolutiondepthwise->bias_data = ncnn::Mat(channels);
                convolutiondepthwise->bias_data.fill(0.f);
            }

            const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;

            float* weight = convolutiondepthwise->weight_data;
            float* bias = convolutiondepthwise->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= b[i];
                }

                bias[i] = bias[i] * b[i] + a[i];
            }
        }

        int top_blob_index_final = batchnorm->tops[0];
        convolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        batchnorm->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolutiondepthwise_mul()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "ConvolutionDepthWise")
            continue;

        // ConvolutionDepthWise - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 2 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = convolutiondepthwise->num_output;

        if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_convolutiondepthwise_mul %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());

        {
            const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;

            float* weight = convolutiondepthwise->weight_data;
            float* bias = convolutiondepthwise->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= memorydata->data[i];
                }

                if (bias)
                {
                    bias[i] = bias[i] * memorydata->data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        convolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolutiondepthwise_add()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "ConvolutionDepthWise")
            continue;

        // ConvolutionDepthWise - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 0 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = convolutiondepthwise->num_output;

        bool broadcasting_type_ok = false;
        if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
            broadcasting_type_ok = true;
        if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
            broadcasting_type_ok = true;

        if (!broadcasting_type_ok)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_convolutiondepthwise_add %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());

        ncnn::Mat bias_data = memorydata->data.reshape(channels);
        {
            if (convolutiondepthwise->bias_term == 0)
            {
                // init bias
                convolutiondepthwise->bias_term = 1;
                convolutiondepthwise->bias_data = bias_data;
            }
            else
            {
                float* bias = convolutiondepthwise->bias_data;
                for (int i = 0; i < channels; i++)
                {
                    bias[i] = bias[i] + bias_data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        convolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolution_batchnorm()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Deconvolution")
            continue;

        // Deconvolution - BatchNorm
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BatchNorm")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Deconvolution - BatchNorm to Deconvolution
        ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];

        fprintf(stderr, "fuse_deconvolution_batchnorm %s %s\n", deconvolution->name.c_str(), batchnorm->name.c_str());

        {
            int channels = batchnorm->channels;
            float eps = batchnorm->eps;

            // a = bias - slope * mean / sqrt(var + eps)
            // b = slope / sqrt(var + eps)
            // value = value * b + a

            std::vector<float> a(channels);
            std::vector<float> b(channels);
            for (int i = 0; i < channels; i++)
            {
                float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
                a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
                b[i] = batchnorm->slope_data[i] / sqrt_var;
            }

            if (deconvolution->bias_term == 0)
            {
                // init bias as zero
                deconvolution->bias_term = 1;
                deconvolution->bias_data = ncnn::Mat(channels);
                deconvolution->bias_data.fill(0.f);
            }

            const int weight_per_outch = deconvolution->weight_data_size / channels;

            float* weight = deconvolution->weight_data;
            float* bias = deconvolution->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= b[i];
                }

                bias[i] = bias[i] * b[i] + a[i];
            }
        }

        int top_blob_index_final = batchnorm->tops[0];
        deconvolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        batchnorm->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolution_mul()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Deconvolution")
            continue;

        // Deconvolution - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Deconvolution - BinaryOp to Deconvolution
        ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 2 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = deconvolution->num_output;

        if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_deconvolution_mul %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());

        {
            const int weight_per_outch = deconvolution->weight_data_size / channels;

            float* weight = deconvolution->weight_data;
            float* bias = deconvolution->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= memorydata->data[i];
                }

                if (bias)
                {
                    bias[i] = bias[i] * memorydata->data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        deconvolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolution_add()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Deconvolution")
            continue;

        // Deconvolution - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Deconvolution - BinaryOp to Deconvolution
        ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 0 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = deconvolution->num_output;

        bool broadcasting_type_ok = false;
        if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
            broadcasting_type_ok = true;
        if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
            broadcasting_type_ok = true;

        if (!broadcasting_type_ok)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_deconvolution_add %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());

        ncnn::Mat bias_data = memorydata->data.reshape(channels);
        {
            if (deconvolution->bias_term == 0)
            {
                // init bias
                deconvolution->bias_term = 1;
                deconvolution->bias_data = bias_data;
            }
            else
            {
                float* bias = deconvolution->bias_data;
                for (int i = 0; i < channels; i++)
                {
                    bias[i] = bias[i] + bias_data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        deconvolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolutiondepthwise_batchnorm()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "DeconvolutionDepthWise")
            continue;

        // DeconvolutionDepthWise - BatchNorm
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BatchNorm")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse DeconvolutionDepthWise - BatchNorm to DeconvolutionDepthWise
        ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];

        fprintf(stderr, "fuse_deconvolutiondepthwise_batchnorm %s %s\n", deconvolutiondepthwise->name.c_str(), batchnorm->name.c_str());

        {
            int channels = batchnorm->channels;
            float eps = batchnorm->eps;

            // a = bias - slope * mean / sqrt(var + eps)
            // b = slope / sqrt(var + eps)
            // value = value * b + a

            std::vector<float> a(channels);
            std::vector<float> b(channels);
            for (int i = 0; i < channels; i++)
            {
                float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
                a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
                b[i] = batchnorm->slope_data[i] / sqrt_var;
            }

            if (deconvolutiondepthwise->bias_term == 0)
            {
                // init bias as zero
                deconvolutiondepthwise->bias_term = 1;
                deconvolutiondepthwise->bias_data = ncnn::Mat(channels);
                deconvolutiondepthwise->bias_data.fill(0.f);
            }

            const int weight_per_outch = deconvolutiondepthwise->weight_data_size / channels;

            float* weight = deconvolutiondepthwise->weight_data;
            float* bias = deconvolutiondepthwise->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= b[i];
                }

                bias[i] = bias[i] * b[i] + a[i];
            }
        }

        int top_blob_index_final = batchnorm->tops[0];
        deconvolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        batchnorm->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_innerproduct_batchnorm()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - BatchNorm
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BatchNorm")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - BatchNorm to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];

        fprintf(stderr, "fuse_innerproduct_batchnorm %s %s\n", innerproduct->name.c_str(), batchnorm->name.c_str());

        {
            int channels = batchnorm->channels;
            float eps = batchnorm->eps;

            // a = bias - slope * mean / sqrt(var + eps)
            // b = slope / sqrt(var + eps)
            // value = value * b + a

            std::vector<float> a(channels);
            std::vector<float> b(channels);
            for (int i = 0; i < channels; i++)
            {
                float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
                a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
                b[i] = batchnorm->slope_data[i] / sqrt_var;
            }

            if (innerproduct->bias_term == 0)
            {
                // init bias as zero
                innerproduct->bias_term = 1;
                innerproduct->bias_data = ncnn::Mat(channels);
                innerproduct->bias_data.fill(0.f);
            }

            const int weight_per_outch = innerproduct->weight_data_size / channels;

            float* weight = innerproduct->weight_data;
            float* bias = innerproduct->bias_data;
            for (int i = 0; i < channels; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= b[i];
                }

                bias[i] = bias[i] * b[i] + a[i];
            }
        }

        int top_blob_index_final = batchnorm->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        batchnorm->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_innerproduct_add()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - BinaryOp to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (binaryop->op_type != 0 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = innerproduct->num_output;

        bool broadcasting_type_ok = false;
        if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
            broadcasting_type_ok = true;
        if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
            broadcasting_type_ok = true;

        if (!broadcasting_type_ok)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_innerproduct_add %s %s\n", innerproduct->name.c_str(), binaryop->name.c_str());

        ncnn::Mat bias_data = memorydata->data.reshape(channels);
        {
            if (innerproduct->bias_term == 0)
            {
                // init bias
                innerproduct->bias_term = 1;
                innerproduct->bias_data = bias_data;
            }
            else
            {
                float* bias = innerproduct->bias_data;
                for (int i = 0; i < channels; i++)
                {
                    bias[i] = bias[i] + bias_data[i];
                }
            }
        }

        int top_blob_index_final = binaryop->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_innerproduct_dropout()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - Dropout
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Dropout")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - Dropout to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::Dropout* dropout = (ncnn::Dropout*)layers[j];

        fprintf(stderr, "fuse_innerproduct_dropout %s %s\n", innerproduct->name.c_str(), dropout->name.c_str());

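        // at inference time Dropout only rescales its input by `scale`, so the
        // rescale is folded into the weights and bias; when scale == 1 the
        // layer is simply detached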
        float scale = dropout->scale;
        if (scale != 1.f)
        {
            const int num_output = innerproduct->num_output;
            const int weight_per_outch = innerproduct->weight_data_size / num_output;

            float* weight = innerproduct->weight_data;
            for (int i = 0; i < num_output; i++)
            {
                float* conv_weight_outch = weight + weight_per_outch * i;
                for (int j = 0; j < weight_per_outch; j++)
                {
                    conv_weight_outch[j] *= scale;
                }
            }

            if (innerproduct->bias_term)
            {
                float* bias = innerproduct->bias_data;
                for (int i = 0; i < num_output; i++)
                {
                    bias[i] *= scale;
                }
            }
        }

        int top_blob_index_final = dropout->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        dropout->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolution_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution")
            continue;

        // Convolution - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution - Activation to Convolution
        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolution_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());

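        // map the standalone activation onto the convolution's built-in
        // activation_type: 1 = ReLU, 2 = LeakyReLU(slope), 3 = Clip(min, max),
        // 4 = Sigmoid, 5 = Mish, 6 = HardSwish(alpha, beta); extra parameters
        // travel in activation_params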
        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                convolution->activation_type = 1;
            }
            else
            {
                convolution->activation_type = 2;
                convolution->activation_params = ncnn::Mat(1);
                convolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolution->activation_type = 3;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = clip->min;
            convolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolution->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolution->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            convolution->activation_type = 6;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = hardswish->alpha;
            convolution->activation_params[1] = hardswish->beta;
        }

        int top_blob_index_final = activation->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

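    // repeat the same folding for Convolution1D (HardSwish is not handled here)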
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution1D")
            continue;

        // Convolution1D - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution1D - Activation to Convolution1D
        ncnn::Convolution1D* convolution = (ncnn::Convolution1D*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolution1d_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                convolution->activation_type = 1;
            }
            else
            {
                convolution->activation_type = 2;
                convolution->activation_params = ncnn::Mat(1);
                convolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolution->activation_type = 3;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = clip->min;
            convolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolution->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolution->activation_type = 5;
        }

        int top_blob_index_final = activation->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_convolutiondepthwise_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "ConvolutionDepthWise")
            continue;

        // ConvolutionDepthWise - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse ConvolutionDepthWise - Activation to ConvolutionDepthWise
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolutiondepthwise_activation %s %s\n", convolutiondepthwise->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                convolutiondepthwise->activation_type = 1;
            }
            else
            {
                convolutiondepthwise->activation_type = 2;
                convolutiondepthwise->activation_params = ncnn::Mat(1);
                convolutiondepthwise->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolutiondepthwise->activation_type = 3;
            convolutiondepthwise->activation_params = ncnn::Mat(2);
            convolutiondepthwise->activation_params[0] = clip->min;
            convolutiondepthwise->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolutiondepthwise->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolutiondepthwise->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            convolutiondepthwise->activation_type = 6;
            convolutiondepthwise->activation_params = ncnn::Mat(2);
            convolutiondepthwise->activation_params[0] = hardswish->alpha;
            convolutiondepthwise->activation_params[1] = hardswish->beta;
        }

        int top_blob_index_final = activation->tops[0];
        convolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolution_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Deconvolution")
            continue;

        // Deconvolution - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Deconvolution - Activation to Deconvolution
        ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_deconvolution_activation %s %s\n", deconvolution->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                deconvolution->activation_type = 1;
            }
            else
            {
                deconvolution->activation_type = 2;
                deconvolution->activation_params = ncnn::Mat(1);
                deconvolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            deconvolution->activation_type = 3;
            deconvolution->activation_params = ncnn::Mat(2);
            deconvolution->activation_params[0] = clip->min;
            deconvolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            deconvolution->activation_type = 4;
        }

        int top_blob_index_final = activation->tops[0];
        deconvolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_deconvolutiondepthwise_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "DeconvolutionDepthWise")
            continue;

        // DeconvolutionDepthWise - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse DeconvolutionDepthWise - Activation to DeconvolutionDepthWise
        ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_deconvolutiondepthwise_activation %s %s\n", deconvolutiondepthwise->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                deconvolutiondepthwise->activation_type = 1;
            }
            else
            {
                deconvolutiondepthwise->activation_type = 2;
                deconvolutiondepthwise->activation_params = ncnn::Mat(1);
                deconvolutiondepthwise->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            deconvolutiondepthwise->activation_type = 3;
            deconvolutiondepthwise->activation_params = ncnn::Mat(2);
            deconvolutiondepthwise->activation_params[0] = clip->min;
            deconvolutiondepthwise->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            deconvolutiondepthwise->activation_type = 4;
        }

        int top_blob_index_final = activation->tops[0];
        deconvolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_innerproduct_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - Activation to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_innerproduct_activation %s %s\n", innerproduct->name.c_str(), activation->name.c_str());

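        // activation_type encoding on the fused layer: 1 = ReLU, 2 = leaky ReLU (slope), 3 = Clip (min/max), 4 = Sigmoid, 5 = Mish, 6 = HardSwish (alpha/beta)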
        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                innerproduct->activation_type = 1;
            }
            else
            {
                innerproduct->activation_type = 2;
                innerproduct->activation_params = ncnn::Mat(1);
                innerproduct->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            innerproduct->activation_type = 3;
            innerproduct->activation_params = ncnn::Mat(2);
            innerproduct->activation_params[0] = clip->min;
            innerproduct->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            innerproduct->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            innerproduct->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            innerproduct->activation_type = 6;
            innerproduct->activation_params = ncnn::Mat(2);
            innerproduct->activation_params[0] = hardswish->alpha;
            innerproduct->activation_params[1] = hardswish->beta;
        }

        int top_blob_index_final = activation->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::fuse_memorydata_binaryop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse MemoryData - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        int memorydata_index = 1;

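        // if the MemoryData scalar feeds the first operand, the operand order is effectively swapped;
        // commutative ops (ADD/MUL/MAX/MIN) are kept as-is, while SUB/DIV become their reversed forms RSUB/RDIV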
        if (binaryop->bottoms[0] == top_blob_index)
        {
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        memorydata->type = "ncnnfused";
    }

    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - Split - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j0 = i + 1;
        for (; j0 < layer_count; j0++)
        {
            if (layers[j0]->type != "Split")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (layers[j0]->bottoms[0] == top_blob_index)
                break;
        }

        if (j0 == layer_count)
            continue;

        int split_top_blob_index = -1;

        size_t j1 = j0 + 1;
        for (; j1 < layer_count; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 2)
                continue;

            for (int k = 0; k < (int)layers[j0]->tops.size(); k++)
            {
                if (layers[j1]->bottoms[0] == layers[j0]->tops[k] || layers[j1]->bottoms[1] == layers[j0]->tops[k])
                {
                    split_top_blob_index = k;
                    break;
                }
            }

            if (split_top_blob_index != -1)
                break;
        }

        if (j1 == layer_count)
            continue;

        // fuse MemoryData - Split - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::Split* split = (ncnn::Split*)layers[j0];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j1];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        int memorydata_index = 1;

        if (binaryop->bottoms[0] == split->tops[split_top_blob_index])
        {
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        split->tops.erase(split->tops.begin() + split_top_blob_index);
        if (split->tops.empty())
        {
            split->type = "ncnnfused";
            memorydata->type = "ncnnfused";
        }

        i--;
    }

    return 0;
}

int NetOptimize::fuse_binaryop_eltwise()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "BinaryOp")
            continue;

        if (layers[i]->bottoms.size() != 2)
            continue;

        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[i];

        if (binaryop->op_type != ncnn::BinaryOp::Operation_ADD)
            continue;

        if (binaryop->with_scalar)
            continue;

        // BinaryOp - BinaryOp - BinaryOp
        int bottom_blob_index_0 = binaryop->bottoms[0];
        int bottom_blob_index_1 = binaryop->bottoms[1];

        size_t j0 = 0;
        for (; j0 < i; j0++)
        {
            if (layers[j0]->type != "BinaryOp")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j0])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j0]->tops[0] == bottom_blob_index_0)
                break;
        }

        size_t j1 = 0;
        for (; j1 < i; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j1])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j1]->tops[0] == bottom_blob_index_1)
                break;
        }

        if (j0 == i && j1 == i)
            continue;

        ncnn::BinaryOp* binaryop0 = (ncnn::BinaryOp*)layers[j0];
        ncnn::BinaryOp* binaryop1 = (ncnn::BinaryOp*)layers[j1];

        fprintf(stderr, "fuse_binaryop_eltwise %s %s %s\n", binaryop0->name.c_str(), binaryop1->name.c_str(), binaryop->name.c_str());

        ncnn::Eltwise* eltwise = (ncnn::Eltwise*)ncnn::create_layer("Eltwise");

        eltwise->type = "Eltwise";
        eltwise->name = binaryop->name;
        eltwise->bottoms = binaryop->bottoms;
        eltwise->tops = binaryop->tops;

        ncnn::ParamDict pd;
        eltwise->load_param(pd);

        eltwise->op_type = ncnn::Eltwise::Operation_SUM;

        eltwise->coeffs = ncnn::Mat(2);

        if (j0 != i && j1 != i)
        {
            // fuse BinaryOp - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = binaryop1->b;

            eltwise->bottoms[0] = binaryop0->bottoms[0];
            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop0->type = "ncnnfused";
            binaryop1->type = "ncnnfused";
        }
        if (j0 != i && j1 == i)
        {
            // fuse BinaryOp - X - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = 1.f;

            eltwise->bottoms[0] = binaryop0->bottoms[0];

            binaryop0->type = "ncnnfused";
        }
        if (j0 == i && j1 != i)
        {
            // fuse X - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = 1.f;
            eltwise->coeffs[1] = binaryop1->b;

            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop1->type = "ncnnfused";
        }

        layers[i] = eltwise;
        delete binaryop;
    }

    return 0;
}

int NetOptimize::eliminate_dropout()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Dropout")
            continue;

        ncnn::Dropout* dropout = (ncnn::Dropout*)layers[i];
        if (dropout->scale != 1.f)
            continue;

        // Any - Dropout
        int bottom_blob_index = layers[i]->bottoms[0];

        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            if (layers[j]->tops.size() != 1)
                continue;

            if (layers[j]->tops[0] == bottom_blob_index)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_dropout %s %s\n", any->name.c_str(), dropout->name.c_str());

        int top_blob_index_final = dropout->tops[0];
        any->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        dropout->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_pooling1x1()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->pad_left != 0 || pooling->pad_right != 0 || pooling->pad_top != 0 || pooling->pad_bottom != 0)
            continue;

        if (pooling->kernel_w != 1 || pooling->kernel_h != 1 || pooling->stride_w != 1 || pooling->stride_h != 1)
            continue;

        if (pooling->global_pooling != 0)
            continue;

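        // a non-global 1x1 pooling with stride 1 and no padding is an identity mapping,
        // so the producer can be rewired to the pooling's output blob and the layer dropped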
        // Any - Pooling
        int bottom_blob_index = layers[i]->bottoms[0];

        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_pooling1x1 %s %s\n", any->name.c_str(), pooling->name.c_str());

        int top_blob_index_final = pooling->tops[0];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        pooling->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_noop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Noop")
            continue;

        ncnn::Layer* noop = layers[i];

        if (noop->bottoms.empty())
        {
            // Noop
            fprintf(stderr, "eliminate_noop %s\n", noop->name.c_str());

            size_t top_blob_count = noop->tops.size();
            for (size_t j = 0; j < top_blob_count; j++)
            {
                int top_blob_index_final = noop->tops[j];
                blobs[top_blob_index_final].producer = -1;
            }
            noop->type = "ncnnfused";

            continue;
        }

        // Any - Noop
        int bottom_blob_index = noop->bottoms[0];

        int j = i - 1;
        int any_k = -1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            bool link_noop = false;
            size_t top_blob_count = layers[j]->tops.size();
            for (size_t k = 0; k < top_blob_count; k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    link_noop = true;
                    any_k = k;
                    break;
                }
            }

            if (link_noop)
                break;
        }

        if (j == -1 || any_k == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_noop %s %s\n", any->name.c_str(), noop->name.c_str());

        int top_blob_index_final = noop->tops[0];
        any->tops[any_k] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;

        noop->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_split()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Split")
            continue;

        ncnn::Layer* split = layers[i];

        int real_split_output_count = 0;
        int real_split_top_blob_index = -1;
        size_t top_blob_count = split->tops.size();
        for (size_t j = 0; j < top_blob_count; j++)
        {
            int top_blob_index_final = split->tops[j];
            if (blobs[top_blob_index_final].consumer != -1)
            {
                real_split_output_count += 1;
                real_split_top_blob_index = j;
            }
        }

        if (real_split_output_count > 1)
            continue;

        // Any - Split
        int bottom_blob_index = split->bottoms[0];

        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_split %s %s\n", any->name.c_str(), split->name.c_str());

        int top_blob_index_final = split->tops[real_split_top_blob_index];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        split->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_orphaned_memorydata()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - X
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            bool orphaned = true;
            for (size_t k = 0; k < layers[j]->bottoms.size(); k++)
            {
                if (layers[j]->bottoms[k] == top_blob_index)
                {
                    orphaned = false;
                    break;
                }
            }

            if (!orphaned)
                break;
        }

        if (j < layer_count)
            continue;

        // assert orphaned == true
        fprintf(stderr, "eliminate_orphaned_memorydata %s\n", layers[i]->name.c_str());

        layers[i]->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_reshape_after_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->global_pooling == 0)
            continue;

        // Pooling - Reshape
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Reshape")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

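        // -233 is ncnn's placeholder for an unspecified reshape dimension; only a flatten-style
        // reshape (h and c unset, no permute) directly after global pooling is redundant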
        ncnn::Reshape* reshape = (ncnn::Reshape*)layers[j];
        if (reshape->h != -233 || reshape->c != -233 || reshape->permute != 0)
            continue;

        fprintf(stderr, "eliminate_reshape_after_global_pooling %s %s\n", pooling->name.c_str(), reshape->name.c_str());

        int top_blob_index_final = reshape->tops[0];
        pooling->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        reshape->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_flatten_after_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->global_pooling == 0)
            continue;

        // Pooling - Flatten
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Flatten")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];

        fprintf(stderr, "eliminate_flatten_after_global_pooling %s %s\n", pooling->name.c_str(), flatten->name.c_str());

        int top_blob_index_final = flatten->tops[0];
        pooling->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        flatten->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_flatten_after_innerproduct()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - Flatten
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Flatten")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];

        fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());

        int top_blob_index_final = flatten->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        flatten->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::eliminate_reshape_before_binaryop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Reshape")
            continue;

        ncnn::Reshape* reshape = (ncnn::Reshape*)layers[i];
        if (reshape->w != 1 || reshape->h != 1 || reshape->permute != 0)
            continue;

        // Reshape - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        fprintf(stderr, "eliminate_reshape_before_binaryop %s %s\n", reshape->name.c_str(), binaryop->name.c_str());

        int bottom_blob_index_final = reshape->bottoms[0];
        if (layers[j]->bottoms[0] == top_blob_index)
            binaryop->bottoms[0] = bottom_blob_index_final;
        if (layers[j]->bottoms[1] == top_blob_index)
            binaryop->bottoms[1] = bottom_blob_index_final;
        blobs[bottom_blob_index_final].consumer = j;
        reshape->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::replace_reduction_with_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Reduction")
            continue;

        ncnn::Reduction* reduction1 = (ncnn::Reduction*)layers[i];
        if (reduction1->operation != 3 || reduction1->reduce_all != 0 || reduction1->coeff != 1.f)
            continue;

        if (reduction1->axes.w != 1)
            continue;

        const int* axes_ptr = reduction1->axes;
        if (axes_ptr[0] != 2 && axes_ptr[0] != 3)
            continue;

        // Reduction(2/3) - Reduction(2)
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Reduction")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::Reduction* reduction2 = (ncnn::Reduction*)layers[j];
        if (reduction2->operation != 3 || reduction2->reduce_all != 0 || reduction2->coeff != 1.f)
            continue;

        if (reduction2->axes.w != 1)
            continue;

        const int* axes2_ptr = reduction2->axes;
        if (axes2_ptr[0] != 2)
            continue;

        fprintf(stderr, "replace_reduction_with_global_pooling %s %s\n", reduction1->name.c_str(), reduction2->name.c_str());

        ncnn::Pooling* pooling = (ncnn::Pooling*)ncnn::create_layer("Pooling");

        pooling->type = "Pooling";
        pooling->name = reduction2->name;
        pooling->bottoms = reduction2->bottoms;
        pooling->tops = reduction2->tops;

        ncnn::ParamDict pd;
        pooling->load_param(pd);

        pooling->pooling_type = 1;
        pooling->global_pooling = 1;

        layers[j] = pooling;
        delete reduction2;

        int bottom_blob_index_final = reduction1->bottoms[0];
        pooling->bottoms[0] = bottom_blob_index_final;
        blobs[bottom_blob_index_final].consumer = j;
        reduction1->type = "ncnnfused";
    }

    return 0;
}

int NetOptimize::replace_prelu_with_leaky_relu()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "PReLU")
            continue;

        ncnn::PReLU* prelu = (ncnn::PReLU*)layers[i];
        if (prelu->num_slope != 1)
            continue;

        fprintf(stderr, "replace_prelu_with_leaky_relu %s\n", prelu->name.c_str());

        ncnn::ReLU* relu = (ncnn::ReLU*)ncnn::create_layer("ReLU");

        relu->type = "ReLU";
        relu->name = prelu->name;
        relu->bottoms = prelu->bottoms;
        relu->tops = prelu->tops;

        ncnn::ParamDict pd;
        relu->load_param(pd);

        relu->slope = prelu->slope_data[0];

        layers[i] = relu;
        delete prelu;
    }

    return 0;
}

int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->global_pooling == 0)
            continue;

        // Pooling - Convolution
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Convolution")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];

        fprintf(stderr, "replace_convolution_with_innerproduct_after_global_pooling %s %s\n", pooling->name.c_str(), convolution->name.c_str());

        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");

        innerproduct->type = "InnerProduct";
        innerproduct->name = convolution->name;
        innerproduct->bottoms = convolution->bottoms;
        innerproduct->tops = convolution->tops;

        ncnn::ParamDict pd;
        innerproduct->load_param(pd);

        innerproduct->num_output = convolution->num_output;
        innerproduct->bias_term = convolution->bias_term;
        innerproduct->weight_data_size = convolution->weight_data_size;
        innerproduct->int8_scale_term = convolution->int8_scale_term;

        innerproduct->weight_data = convolution->weight_data;
        innerproduct->bias_data = convolution->bias_data;
#if NCNN_INT8
        innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
        innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
#endif

        innerproduct->activation_type = convolution->activation_type;
        innerproduct->activation_params = convolution->activation_params;

        layers[j] = innerproduct;
        delete convolution;
    }

    return 0;
}

int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
{
    const size_t layer_count = layers.size();
    for (;;)
    {
        bool replaced = false;

        for (size_t i = 0; i < layer_count; i++)
        {
            if (layers[i]->type != "InnerProduct")
                continue;

            // InnerProduct - Convolution
            int top_blob_index = layers[i]->tops[0];

            size_t j = i + 1;
            for (; j < layer_count; j++)
            {
                if (layers[j]->type != "Convolution")
                    continue;

                if (layers[j]->bottoms.size() != 1)
                    continue;

                if (layers[j]->bottoms[0] == top_blob_index)
                    break;
            }

            if (j == layer_count)
                continue;

            ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
            ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];

            fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());

            ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");

            innerproduct2->type = "InnerProduct";
            innerproduct2->name = convolution->name;
            innerproduct2->bottoms = convolution->bottoms;
            innerproduct2->tops = convolution->tops;

            ncnn::ParamDict pd;
            innerproduct2->load_param(pd);

            innerproduct2->num_output = convolution->num_output;
            innerproduct2->bias_term = convolution->bias_term;
            innerproduct2->weight_data_size = convolution->weight_data_size;
            innerproduct2->int8_scale_term = convolution->int8_scale_term;

            innerproduct2->weight_data = convolution->weight_data;
            innerproduct2->bias_data = convolution->bias_data;
#if NCNN_INT8
            innerproduct2->weight_data_int8_scales = convolution->weight_data_int8_scales;
            innerproduct2->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
#endif

            innerproduct2->activation_type = convolution->activation_type;
            innerproduct2->activation_params = convolution->activation_params;

            layers[j] = innerproduct2;
            delete convolution;

            replaced = true;
        }

        if (!replaced)
            break;
    }

    return 0;
}

int main(int argc, char** argv)
{
    if (argc < 6)
    {
        fprintf(stderr, "usage: %s [inparam] [inbin] [outparam] [outbin] [flag] [cutstart] [cutend]\n", argv[0]);
        return -1;
    }

    const char* inparam = argv[1];
    const char* inbin = argv[2];
    const char* outparam = argv[3];
    const char* outbin = argv[4];
    int flag = atoi(argv[5]);
    const char* cutstartname = nullptr;
    const char* cutendname = nullptr;

    if (argc > 6)
    {
        cutstartname = argv[6];
    }

    if (argc > 7)
    {
        cutendname = argv[7];
    }

    NetOptimize optimizer;

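    // flag 65536 (or 1) selects storage_type 1, which writes weights as fp16; any other value keeps fp32 storage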
    if (flag == 65536 || flag == 1)
    {
        optimizer.storage_type = 1;
    }
    else
    {
        optimizer.storage_type = 0;
    }

    optimizer.load_param(inparam);

    if (strcmp(inbin, "null") == 0)
    {
        DataReaderFromEmpty dr;
        optimizer.load_model(dr);
        optimizer.gen_random_weight = true;
    }
    else
        optimizer.load_model(inbin);

    if (optimizer.set_cutparam(cutstartname, cutendname) < 0)
    {
        return -1;
    }

    optimizer.fuse_batchnorm_scale();
    optimizer.fuse_convolution_batchnorm();
    optimizer.fuse_convolution_mul();
    optimizer.fuse_convolution_add();
    optimizer.fuse_convolutiondepthwise_batchnorm();
    optimizer.fuse_convolutiondepthwise_mul();
    optimizer.fuse_convolutiondepthwise_add();
    optimizer.fuse_deconvolution_batchnorm();
    optimizer.fuse_deconvolution_mul();
    optimizer.fuse_deconvolution_add();
    optimizer.fuse_deconvolutiondepthwise_batchnorm();
    optimizer.fuse_innerproduct_batchnorm();
    optimizer.fuse_innerproduct_add();
    optimizer.fuse_innerproduct_dropout();

    optimizer.replace_reduction_with_global_pooling();
    optimizer.replace_prelu_with_leaky_relu();

    optimizer.fuse_convolution_activation();
    optimizer.fuse_convolutiondepthwise_activation();
    optimizer.fuse_deconvolution_activation();
    optimizer.fuse_deconvolutiondepthwise_activation();
    optimizer.fuse_innerproduct_activation();
    optimizer.fuse_memorydata_binaryop();
    optimizer.fuse_binaryop_eltwise();

    optimizer.eliminate_dropout();
    optimizer.eliminate_pooling1x1();
    optimizer.eliminate_noop();
    optimizer.eliminate_split();
    optimizer.eliminate_flatten_after_global_pooling();
    optimizer.eliminate_reshape_after_global_pooling();
    optimizer.eliminate_reshape_before_binaryop();

    optimizer.replace_convolution_with_innerproduct_after_global_pooling();
    optimizer.replace_convolution_with_innerproduct_after_innerproduct();

    optimizer.eliminate_flatten_after_innerproduct();
    optimizer.eliminate_orphaned_memorydata();

    optimizer.shape_inference();

    optimizer.estimate_memory_footprint();

    optimizer.save(outparam, outbin);

    return 0;
}