1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #ifdef _MSC_VER
16 #define _CRT_SECURE_NO_DEPRECATE
17 #endif
18 
19 #include <algorithm>
20 #include <map>
21 #include <set>
22 #include <vector>

// standard headers for functions used directly below (fprintf, memset, sqrt);
// harmless if the ncnn headers already pull them in transitively
#include <cmath>
#include <cstdio>
#include <cstring>
23 
24 // ncnn public header
25 #include "datareader.h"
26 #include "layer.h"
27 #include "layer_type.h"
28 #include "net.h"
29 
30 // ncnn private header
31 #include "modelwriter.h"
32 
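// DataReaderFromEmpty supplies zero-filled weights: scan() parses nothing and
// read() just memsets the destination buffer, so a network described by a
// .param file can be loaded and optimized without the matching .bin weight file.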
33 class DataReaderFromEmpty : public ncnn::DataReader
34 {
35 public:
36     virtual int scan(const char* format, void* p) const
37     {
38         return 0;
39     }
40     virtual size_t read(void* buf, size_t size) const
41     {
42         memset(buf, 0, size);
43         return size;
44     }
45 };
46 
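// NetOptimize applies graph-level rewrites to the loaded network:
//  - fuse_*      fold a following layer's parameters into the preceding layer
//  - eliminate_* drop layers that have no effect at inference time
//  - replace_*   substitute a layer with a cheaper equivalent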
47 class NetOptimize : public ModelWriter
48 {
49 public:
50     NetOptimize();
51 
52 public:
53     int fuse_batchnorm_scale();
54     int fuse_convolution_batchnorm();
55     int fuse_convolution_mul();
56     int fuse_convolution_add();
57     int fuse_convolutiondepthwise_batchnorm();
58     int fuse_convolutiondepthwise_mul();
59     int fuse_convolutiondepthwise_add();
60     int fuse_deconvolution_batchnorm();
61     int fuse_deconvolution_mul();
62     int fuse_deconvolution_add();
63     int fuse_deconvolutiondepthwise_batchnorm();
64     int fuse_innerproduct_batchnorm();
65     int fuse_innerproduct_add();
66     int fuse_innerproduct_dropout();
67     int fuse_convolution_activation();
68     int fuse_convolutiondepthwise_activation();
69     int fuse_deconvolution_activation();
70     int fuse_deconvolutiondepthwise_activation();
71     int fuse_innerproduct_activation();
72     int fuse_memorydata_binaryop();
73     int fuse_binaryop_eltwise();
74 
75     int eliminate_dropout();
76     int eliminate_pooling1x1();
77     int eliminate_noop();
78     int eliminate_split();
79     int eliminate_orphaned_memorydata();
80     int eliminate_flatten_after_global_pooling();
81     int eliminate_reshape_after_global_pooling();
82     int eliminate_flatten_after_innerproduct();
83     int eliminate_reshape_before_binaryop();
84 
85     int replace_reduction_with_global_pooling();
86     int replace_prelu_with_leaky_relu();
87     int replace_convolution_with_innerproduct_after_global_pooling();
88     int replace_convolution_with_innerproduct_after_innerproduct();
89 };
90 
91 NetOptimize::NetOptimize()
92     : ModelWriter()
93 {
94 }
95 
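// Common pattern for the fuse_* passes below: for each layer of the leading
// type, scan forward for a consumer of its top blob with the trailing type,
// fold the consumer's parameters into the leading layer, point the leading
// layer's top at the consumer's output blob (updating that blob's producer
// index), and retype the consumer to "ncnnfused" so it is dropped on write.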
96 int NetOptimize::fuse_batchnorm_scale()
97 {
98     const size_t layer_count = layers.size();
99     for (size_t i = 0; i < layer_count; i++)
100     {
101         if (layers[i]->type != "BatchNorm")
102             continue;
103 
104         // BatchNorm - Scale
105         int top_blob_index = layers[i]->tops[0];
106 
107         size_t j = i + 1;
108         for (; j < layer_count; j++)
109         {
110             if (layers[j]->type != "Scale")
111                 continue;
112 
113             if (layers[j]->bottoms.size() != 1)
114                 continue;
115 
116             if (layers[j]->bottoms[0] == top_blob_index)
117                 break;
118         }
119 
120         if (j == layer_count)
121             continue;
122 
123         // fuse BatchNorm - Scale to BatchNorm
124         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[i];
125         ncnn::Scale* scale = (ncnn::Scale*)layers[j];
126 
127         fprintf(stderr, "fuse_batchnorm_scale %s %s\n", batchnorm->name.c_str(), scale->name.c_str());
128 
129         {
130             //             v = ((v - mean) / sqrt(var + eps) * slope + bias) * s + b
131             //               =  (v - mean) / sqrt(var + eps) * (slope * s) + (bias * s + b)
132 
133             int channels = batchnorm->channels;
134 
135             float* slope = batchnorm->slope_data;
136             float* bias = batchnorm->bias_data;
137 
138             for (int q = 0; q < channels; q++)
139             {
140                 slope[q] = slope[q] * scale->scale_data[q];
141                 if (scale->bias_term)
142                     bias[q] = bias[q] * scale->scale_data[q] + scale->bias_data[q];
143                 else
144                     bias[q] = bias[q] * scale->scale_data[q];
145             }
146         }
147 
148         int top_blob_index_final = scale->tops[0];
149         batchnorm->tops[0] = top_blob_index_final;
150         blobs[top_blob_index_final].producer = i;
151         scale->type = "ncnnfused";
152     }
153 
154     return 0;
155 }
156 
157 int NetOptimize::fuse_convolution_batchnorm()
158 {
159     const size_t layer_count = layers.size();
160     for (size_t i = 0; i < layer_count; i++)
161     {
162         if (layers[i]->type != "Convolution")
163             continue;
164 
165         // Convolution - BatchNorm
166         int top_blob_index = layers[i]->tops[0];
167 
168         size_t j = i + 1;
169         for (; j < layer_count; j++)
170         {
171             if (layers[j]->type != "BatchNorm")
172                 continue;
173 
174             if (layers[j]->bottoms.size() != 1)
175                 continue;
176 
177             if (layers[j]->bottoms[0] == top_blob_index)
178                 break;
179         }
180 
181         if (j == layer_count)
182             continue;
183 
184         // fuse Convolution - BatchNorm to Convolution
185         ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
186         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
187 
188         fprintf(stderr, "fuse_convolution_batchnorm %s %s\n", convolution->name.c_str(), batchnorm->name.c_str());
189 
190         {
191             int channels = batchnorm->channels;
192             float eps = batchnorm->eps;
193 
194             // a = bias - slope * mean / sqrt(var + eps)
195             // b = slope / sqrt(var + eps)
196             // value = value * b + a
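            // (bn(v) = (v - mean) / sqrt(var + eps) * slope + bias = v * b + a,
            //  so each output channel's weights are scaled by b and the bias becomes bias * b + a)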
197 
198             std::vector<float> a(channels);
199             std::vector<float> b(channels);
200             for (int i = 0; i < channels; i++)
201             {
202                 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
203                 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
204                 b[i] = batchnorm->slope_data[i] / sqrt_var;
205             }
206 
207             if (convolution->bias_term == 0)
208             {
209                 // init bias as zero
210                 convolution->bias_term = 1;
211                 convolution->bias_data = ncnn::Mat(channels);
212                 convolution->bias_data.fill(0.f);
213             }
214 
215             const int weight_per_outch = convolution->weight_data_size / channels;
216 
217             float* weight = convolution->weight_data;
218             float* bias = convolution->bias_data;
219             for (int i = 0; i < channels; i++)
220             {
221                 float* conv_weight_outch = weight + weight_per_outch * i;
222                 for (int j = 0; j < weight_per_outch; j++)
223                 {
224                     conv_weight_outch[j] *= b[i];
225                 }
226 
227                 bias[i] = bias[i] * b[i] + a[i];
228             }
229         }
230 
231         int top_blob_index_final = batchnorm->tops[0];
232         convolution->tops[0] = top_blob_index_final;
233         blobs[top_blob_index_final].producer = i;
234         batchnorm->type = "ncnnfused";
235     }
236 
237     return 0;
238 }
239 
240 int NetOptimize::fuse_convolution_mul()
241 {
242     const size_t layer_count = layers.size();
243     for (size_t i = 0; i < layer_count; i++)
244     {
245         if (layers[i]->type != "Convolution")
246             continue;
247 
248         // Convolution - BinaryOp
249         int top_blob_index = layers[i]->tops[0];
250 
251         size_t j = i + 1;
252         for (; j < layer_count; j++)
253         {
254             if (layers[j]->type != "BinaryOp")
255                 continue;
256 
257             if (layers[j]->bottoms.size() != 2)
258                 continue;
259 
260             if (layers[j]->bottoms[0] == top_blob_index)
261                 break;
262         }
263 
264         if (j == layer_count)
265             continue;
266 
267         // fuse Convolution - BinaryOp to Convolution
268         ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
269         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
270 
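        // only fuse an elementwise multiply (op_type 2 == Operation_MUL) with a
        // per-channel constant operand; BinaryOps that already carry a scalar are skipped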
271         if (binaryop->op_type != 2 || binaryop->with_scalar)
272             continue;
273 
274         // MemoryData - ..... - BinaryOp
275         size_t k = 0;
276         for (; k < j; k++)
277         {
278             if (layers[k]->type != "MemoryData")
279                 continue;
280 
281             if (layers[k]->tops[0] == binaryop->bottoms[1])
282                 break;
283         }
284 
285         if (k == j)
286             continue;
287 
288         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
289 
290         int channels = convolution->num_output;
291 
292         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
293         {
294             // not bias-like broadcasting type
295             continue;
296         }
297 
298         fprintf(stderr, "fuse_convolution_mul %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
299 
300         {
301             const int weight_per_outch = convolution->weight_data_size / channels;
302 
303             float* weight = convolution->weight_data;
304             float* bias = convolution->bias_data;
305             for (int i = 0; i < channels; i++)
306             {
307                 float* conv_weight_outch = weight + weight_per_outch * i;
308                 for (int j = 0; j < weight_per_outch; j++)
309                 {
310                     conv_weight_outch[j] *= memorydata->data[i];
311                 }
312 
313                 if (bias)
314                 {
315                     bias[i] = bias[i] * memorydata->data[i];
316                 }
317             }
318         }
319 
320         int top_blob_index_final = binaryop->tops[0];
321         convolution->tops[0] = top_blob_index_final;
322         blobs[top_blob_index_final].producer = i;
323         binaryop->type = "ncnnfused";
324     }
325 
326     return 0;
327 }
328 
329 int NetOptimize::fuse_convolution_add()
330 {
331     const size_t layer_count = layers.size();
332     for (size_t i = 0; i < layer_count; i++)
333     {
334         if (layers[i]->type != "Convolution")
335             continue;
336 
337         // Convolution - BinaryOp
338         int top_blob_index = layers[i]->tops[0];
339 
340         size_t j = i + 1;
341         for (; j < layer_count; j++)
342         {
343             if (layers[j]->type != "BinaryOp")
344                 continue;
345 
346             if (layers[j]->bottoms.size() != 2)
347                 continue;
348 
349             if (layers[j]->bottoms[0] == top_blob_index)
350                 break;
351         }
352 
353         if (j == layer_count)
354             continue;
355 
356         // fuse Convolution - BinaryOp to Convolution
357         ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
358         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
359 
360         if (binaryop->op_type != 0 || binaryop->with_scalar)
361             continue;
362 
363         // MemoryData - ..... - BinaryOp
364         size_t k = 0;
365         for (; k < j; k++)
366         {
367             if (layers[k]->type != "MemoryData")
368                 continue;
369 
370             if (layers[k]->tops[0] == binaryop->bottoms[1])
371                 break;
372         }
373 
374         if (k == j)
375             continue;
376 
377         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
378 
379         int channels = convolution->num_output;
380 
381         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
382         {
383             // not bias-like broadcasting type
384             continue;
385         }
386 
387         fprintf(stderr, "fuse_convolution_add %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
388 
389         {
390             if (convolution->bias_term == 0)
391             {
392                 // init bias
393                 convolution->bias_term = 1;
394                 convolution->bias_data = memorydata->data;
395             }
396             else
397             {
398                 float* bias = convolution->bias_data;
399                 for (int i = 0; i < channels; i++)
400                 {
401                     bias[i] = bias[i] + memorydata->data[i];
402                 }
403             }
404         }
405 
406         int top_blob_index_final = binaryop->tops[0];
407         convolution->tops[0] = top_blob_index_final;
408         blobs[top_blob_index_final].producer = i;
409         binaryop->type = "ncnnfused";
410     }
411 
412     return 0;
413 }
414 
415 int NetOptimize::fuse_convolutiondepthwise_batchnorm()
416 {
417     const size_t layer_count = layers.size();
418     for (size_t i = 0; i < layer_count; i++)
419     {
420         if (layers[i]->type != "ConvolutionDepthWise")
421             continue;
422 
423         // ConvolutionDepthWise - BatchNorm
424         int top_blob_index = layers[i]->tops[0];
425 
426         size_t j = i + 1;
427         for (; j < layer_count; j++)
428         {
429             if (layers[j]->type != "BatchNorm")
430                 continue;
431 
432             if (layers[j]->bottoms.size() != 1)
433                 continue;
434 
435             if (layers[j]->bottoms[0] == top_blob_index)
436                 break;
437         }
438 
439         if (j == layer_count)
440             continue;
441 
442         // fuse ConvolutionDepthWise - BatchNorm to ConvolutionDepthWise
443         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
444         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
445 
446         fprintf(stderr, "fuse_convolutiondepthwise_batchnorm %s %s\n", convolutiondepthwise->name.c_str(), batchnorm->name.c_str());
447 
448         {
449             int channels = batchnorm->channels;
450             float eps = batchnorm->eps;
451 
452             // a = bias - slope * mean / sqrt(var + eps)
453             // b = slope / sqrt(var + eps)
454             // value = value * b + a
455 
456             std::vector<float> a(channels);
457             std::vector<float> b(channels);
458             for (int i = 0; i < channels; i++)
459             {
460                 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
461                 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
462                 b[i] = batchnorm->slope_data[i] / sqrt_var;
463             }
464 
465             if (convolutiondepthwise->bias_term == 0)
466             {
467                 // init bias as zero
468                 convolutiondepthwise->bias_term = 1;
469                 convolutiondepthwise->bias_data = ncnn::Mat(channels);
470                 convolutiondepthwise->bias_data.fill(0.f);
471             }
472 
473             const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
474 
475             float* weight = convolutiondepthwise->weight_data;
476             float* bias = convolutiondepthwise->bias_data;
477             for (int i = 0; i < channels; i++)
478             {
479                 float* conv_weight_outch = weight + weight_per_outch * i;
480                 for (int j = 0; j < weight_per_outch; j++)
481                 {
482                     conv_weight_outch[j] *= b[i];
483                 }
484 
485                 bias[i] = bias[i] * b[i] + a[i];
486             }
487         }
488 
489         int top_blob_index_final = batchnorm->tops[0];
490         convolutiondepthwise->tops[0] = top_blob_index_final;
491         blobs[top_blob_index_final].producer = i;
492         batchnorm->type = "ncnnfused";
493     }
494 
495     return 0;
496 }
497 
498 int NetOptimize::fuse_convolutiondepthwise_mul()
499 {
500     const size_t layer_count = layers.size();
501     for (size_t i = 0; i < layer_count; i++)
502     {
503         if (layers[i]->type != "ConvolutionDepthWise")
504             continue;
505 
506         // ConvolutionDepthWise - BinaryOp
507         int top_blob_index = layers[i]->tops[0];
508 
509         size_t j = i + 1;
510         for (; j < layer_count; j++)
511         {
512             if (layers[j]->type != "BinaryOp")
513                 continue;
514 
515             if (layers[j]->bottoms.size() != 2)
516                 continue;
517 
518             if (layers[j]->bottoms[0] == top_blob_index)
519                 break;
520         }
521 
522         if (j == layer_count)
523             continue;
524 
525         // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
526         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
527         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
528 
529         if (binaryop->op_type != 2 || binaryop->with_scalar)
530             continue;
531 
532         // MemoryData - ..... - BinaryOp
533         size_t k = 0;
534         for (; k < j; k++)
535         {
536             if (layers[k]->type != "MemoryData")
537                 continue;
538 
539             if (layers[k]->tops[0] == binaryop->bottoms[1])
540                 break;
541         }
542 
543         if (k == j)
544             continue;
545 
546         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
547 
548         int channels = convolutiondepthwise->num_output;
549 
550         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
551         {
552             // not bias-like broadcasting type
553             continue;
554         }
555 
556         fprintf(stderr, "fuse_convolutiondepthwise_mul %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
557 
558         {
559             const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
560 
561             float* weight = convolutiondepthwise->weight_data;
562             float* bias = convolutiondepthwise->bias_data;
563             for (int i = 0; i < channels; i++)
564             {
565                 float* conv_weight_outch = weight + weight_per_outch * i;
566                 for (int j = 0; j < weight_per_outch; j++)
567                 {
568                     conv_weight_outch[j] *= memorydata->data[i];
569                 }
570 
571                 if (bias)
572                 {
573                     bias[i] = bias[i] * memorydata->data[i];
574                 }
575             }
576         }
577 
578         int top_blob_index_final = binaryop->tops[0];
579         convolutiondepthwise->tops[0] = top_blob_index_final;
580         blobs[top_blob_index_final].producer = i;
581         binaryop->type = "ncnnfused";
582     }
583 
584     return 0;
585 }
586 
587 int NetOptimize::fuse_convolutiondepthwise_add()
588 {
589     const size_t layer_count = layers.size();
590     for (size_t i = 0; i < layer_count; i++)
591     {
592         if (layers[i]->type != "ConvolutionDepthWise")
593             continue;
594 
595         // ConvolutionDepthWise - BinaryOp
596         int top_blob_index = layers[i]->tops[0];
597 
598         size_t j = i + 1;
599         for (; j < layer_count; j++)
600         {
601             if (layers[j]->type != "BinaryOp")
602                 continue;
603 
604             if (layers[j]->bottoms.size() != 2)
605                 continue;
606 
607             if (layers[j]->bottoms[0] == top_blob_index)
608                 break;
609         }
610 
611         if (j == layer_count)
612             continue;
613 
614         // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
615         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
616         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
617 
618         if (binaryop->op_type != 0 || binaryop->with_scalar)
619             continue;
620 
621         // MemoryData - ..... - BinaryOp
622         size_t k = 0;
623         for (; k < j; k++)
624         {
625             if (layers[k]->type != "MemoryData")
626                 continue;
627 
628             if (layers[k]->tops[0] == binaryop->bottoms[1])
629                 break;
630         }
631 
632         if (k == j)
633             continue;
634 
635         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
636 
637         int channels = convolutiondepthwise->num_output;
638 
639         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
640         {
641             // not bias-like broadcasting type
642             continue;
643         }
644 
645         fprintf(stderr, "fuse_convolutiondepthwise_add %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
646 
647         {
648             if (convolutiondepthwise->bias_term == 0)
649             {
650                 // init bias
651                 convolutiondepthwise->bias_term = 1;
652                 convolutiondepthwise->bias_data = memorydata->data;
653             }
654             else
655             {
656                 float* bias = convolutiondepthwise->bias_data;
657                 for (int i = 0; i < channels; i++)
658                 {
659                     bias[i] = bias[i] + memorydata->data[i];
660                 }
661             }
662         }
663 
664         int top_blob_index_final = binaryop->tops[0];
665         convolutiondepthwise->tops[0] = top_blob_index_final;
666         blobs[top_blob_index_final].producer = i;
667         binaryop->type = "ncnnfused";
668     }
669 
670     return 0;
671 }
672 
673 int NetOptimize::fuse_deconvolution_batchnorm()
674 {
675     const size_t layer_count = layers.size();
676     for (size_t i = 0; i < layer_count; i++)
677     {
678         if (layers[i]->type != "Deconvolution")
679             continue;
680 
681         // Deconvolution - BatchNorm
682         int top_blob_index = layers[i]->tops[0];
683 
684         size_t j = i + 1;
685         for (; j < layer_count; j++)
686         {
687             if (layers[j]->type != "BatchNorm")
688                 continue;
689 
690             if (layers[j]->bottoms.size() != 1)
691                 continue;
692 
693             if (layers[j]->bottoms[0] == top_blob_index)
694                 break;
695         }
696 
697         if (j == layer_count)
698             continue;
699 
700         // fuse Deconvolution - BatchNorm to Deconvolution
701         ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
702         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
703 
704         fprintf(stderr, "fuse_deconvolution_batchnorm %s %s\n", deconvolution->name.c_str(), batchnorm->name.c_str());
705 
706         {
707             int channels = batchnorm->channels;
708             float eps = batchnorm->eps;
709 
710             // a = bias - slope * mean / sqrt(var + eps)
711             // b = slope / sqrt(var + eps)
712             // value = value * b + a
713 
714             std::vector<float> a(channels);
715             std::vector<float> b(channels);
716             for (int i = 0; i < channels; i++)
717             {
718                 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
719                 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
720                 b[i] = batchnorm->slope_data[i] / sqrt_var;
721             }
722 
723             if (deconvolution->bias_term == 0)
724             {
725                 // init bias as zero
726                 deconvolution->bias_term = 1;
727                 deconvolution->bias_data = ncnn::Mat(channels);
728                 deconvolution->bias_data.fill(0.f);
729             }
730 
731             const int weight_per_outch = deconvolution->weight_data_size / channels;
732 
733             float* weight = deconvolution->weight_data;
734             float* bias = deconvolution->bias_data;
735             for (int i = 0; i < channels; i++)
736             {
737                 float* conv_weight_outch = weight + weight_per_outch * i;
738                 for (int j = 0; j < weight_per_outch; j++)
739                 {
740                     conv_weight_outch[j] *= b[i];
741                 }
742 
743                 bias[i] = bias[i] * b[i] + a[i];
744             }
745         }
746 
747         int top_blob_index_final = batchnorm->tops[0];
748         deconvolution->tops[0] = top_blob_index_final;
749         blobs[top_blob_index_final].producer = i;
750         batchnorm->type = "ncnnfused";
751     }
752 
753     return 0;
754 }
755 
756 int NetOptimize::fuse_deconvolution_mul()
757 {
758     const size_t layer_count = layers.size();
759     for (size_t i = 0; i < layer_count; i++)
760     {
761         if (layers[i]->type != "Deconvolution")
762             continue;
763 
764         // Deconvolution - BinaryOp
765         int top_blob_index = layers[i]->tops[0];
766 
767         size_t j = i + 1;
768         for (; j < layer_count; j++)
769         {
770             if (layers[j]->type != "BinaryOp")
771                 continue;
772 
773             if (layers[j]->bottoms.size() != 2)
774                 continue;
775 
776             if (layers[j]->bottoms[0] == top_blob_index)
777                 break;
778         }
779 
780         if (j == layer_count)
781             continue;
782 
783         // fuse Deconvolution - BinaryOp to Deconvolution
784         ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
785         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
786 
787         if (binaryop->op_type != 2 || binaryop->with_scalar)
788             continue;
789 
790         // MemoryData - ..... - BinaryOp
791         size_t k = 0;
792         for (; k < j; k++)
793         {
794             if (layers[k]->type != "MemoryData")
795                 continue;
796 
797             if (layers[k]->tops[0] == binaryop->bottoms[1])
798                 break;
799         }
800 
801         if (k == j)
802             continue;
803 
804         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
805 
806         int channels = deconvolution->num_output;
807 
808         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
809         {
810             // not bias-like broadcasting type
811             continue;
812         }
813 
814         fprintf(stderr, "fuse_deconvolution_mul %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
815 
816         {
817             const int weight_per_outch = deconvolution->weight_data_size / channels;
818 
819             float* weight = deconvolution->weight_data;
820             float* bias = deconvolution->bias_data;
821             for (int i = 0; i < channels; i++)
822             {
823                 float* conv_weight_outch = weight + weight_per_outch * i;
824                 for (int j = 0; j < weight_per_outch; j++)
825                 {
826                     conv_weight_outch[j] *= memorydata->data[i];
827                 }
828 
829                 if (bias)
830                 {
831                     bias[i] = bias[i] * memorydata->data[i];
832                 }
833             }
834         }
835 
836         int top_blob_index_final = binaryop->tops[0];
837         deconvolution->tops[0] = top_blob_index_final;
838         blobs[top_blob_index_final].producer = i;
839         binaryop->type = "ncnnfused";
840     }
841 
842     return 0;
843 }
844 
845 int NetOptimize::fuse_deconvolution_add()
846 {
847     const size_t layer_count = layers.size();
848     for (size_t i = 0; i < layer_count; i++)
849     {
850         if (layers[i]->type != "Deconvolution")
851             continue;
852 
853         // Deconvolution - BinaryOp
854         int top_blob_index = layers[i]->tops[0];
855 
856         size_t j = i + 1;
857         for (; j < layer_count; j++)
858         {
859             if (layers[j]->type != "BinaryOp")
860                 continue;
861 
862             if (layers[j]->bottoms.size() != 2)
863                 continue;
864 
865             if (layers[j]->bottoms[0] == top_blob_index)
866                 break;
867         }
868 
869         if (j == layer_count)
870             continue;
871 
872         // fuse Deconvolution - BinaryOp to Deconvolution
873         ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
874         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
875 
876         if (binaryop->op_type != 0 || binaryop->with_scalar)
877             continue;
878 
879         // MemoryData - ..... - BinaryOp
880         size_t k = 0;
881         for (; k < j; k++)
882         {
883             if (layers[k]->type != "MemoryData")
884                 continue;
885 
886             if (layers[k]->tops[0] == binaryop->bottoms[1])
887                 break;
888         }
889 
890         if (k == j)
891             continue;
892 
893         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
894 
895         int channels = deconvolution->num_output;
896 
897         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
898         {
899             // not bias-like broadcasting type
900             continue;
901         }
902 
903         fprintf(stderr, "fuse_deconvolution_add %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
904 
905         {
906             if (deconvolution->bias_term == 0)
907             {
908                 // init bias
909                 deconvolution->bias_term = 1;
910                 deconvolution->bias_data = memorydata->data;
911             }
912             else
913             {
914                 float* bias = deconvolution->bias_data;
915                 for (int i = 0; i < channels; i++)
916                 {
917                     bias[i] = bias[i] + memorydata->data[i];
918                 }
919             }
920         }
921 
922         int top_blob_index_final = binaryop->tops[0];
923         deconvolution->tops[0] = top_blob_index_final;
924         blobs[top_blob_index_final].producer = i;
925         binaryop->type = "ncnnfused";
926     }
927 
928     return 0;
929 }
930 
931 int NetOptimize::fuse_deconvolutiondepthwise_batchnorm()
932 {
933     const size_t layer_count = layers.size();
934     for (size_t i = 0; i < layer_count; i++)
935     {
936         if (layers[i]->type != "DeconvolutionDepthWise")
937             continue;
938 
939         // DeconvolutionDepthWise - BatchNorm
940         int top_blob_index = layers[i]->tops[0];
941 
942         size_t j = i + 1;
943         for (; j < layer_count; j++)
944         {
945             if (layers[j]->type != "BatchNorm")
946                 continue;
947 
948             if (layers[j]->bottoms.size() != 1)
949                 continue;
950 
951             if (layers[j]->bottoms[0] == top_blob_index)
952                 break;
953         }
954 
955         if (j == layer_count)
956             continue;
957 
958         // fuse DeconvolutionDepthWise - BatchNorm to DeconvolutionDepthWise
959         ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
960         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
961 
962         fprintf(stderr, "fuse_deconvolutiondepthwise_batchnorm %s %s\n", deconvolutiondepthwise->name.c_str(), batchnorm->name.c_str());
963 
964         {
965             int channels = batchnorm->channels;
966             float eps = batchnorm->eps;
967 
968             // a = bias - slope * mean / sqrt(var + eps)
969             // b = slope / sqrt(var + eps)
970             // value = value * b + a
971 
972             std::vector<float> a(channels);
973             std::vector<float> b(channels);
974             for (int i = 0; i < channels; i++)
975             {
976                 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
977                 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
978                 b[i] = batchnorm->slope_data[i] / sqrt_var;
979             }
980 
981             if (deconvolutiondepthwise->bias_term == 0)
982             {
983                 // init bias as zero
984                 deconvolutiondepthwise->bias_term = 1;
985                 deconvolutiondepthwise->bias_data = ncnn::Mat(channels);
986                 deconvolutiondepthwise->bias_data.fill(0.f);
987             }
988 
989             const int weight_per_outch = deconvolutiondepthwise->weight_data_size / channels;
990 
991             float* weight = deconvolutiondepthwise->weight_data;
992             float* bias = deconvolutiondepthwise->bias_data;
993             for (int i = 0; i < channels; i++)
994             {
995                 float* conv_weight_outch = weight + weight_per_outch * i;
996                 for (int j = 0; j < weight_per_outch; j++)
997                 {
998                     conv_weight_outch[j] *= b[i];
999                 }
1000 
1001                 bias[i] = bias[i] * b[i] + a[i];
1002             }
1003         }
1004 
1005         int top_blob_index_final = batchnorm->tops[0];
1006         deconvolutiondepthwise->tops[0] = top_blob_index_final;
1007         blobs[top_blob_index_final].producer = i;
1008         batchnorm->type = "ncnnfused";
1009     }
1010 
1011     return 0;
1012 }
1013 
1014 int NetOptimize::fuse_innerproduct_batchnorm()
1015 {
1016     const size_t layer_count = layers.size();
1017     for (size_t i = 0; i < layer_count; i++)
1018     {
1019         if (layers[i]->type != "InnerProduct")
1020             continue;
1021 
1022         // InnerProduct - BatchNorm
1023         int top_blob_index = layers[i]->tops[0];
1024 
1025         size_t j = i + 1;
1026         for (; j < layer_count; j++)
1027         {
1028             if (layers[j]->type != "BatchNorm")
1029                 continue;
1030 
1031             if (layers[j]->bottoms.size() != 1)
1032                 continue;
1033 
1034             if (layers[j]->bottoms[0] == top_blob_index)
1035                 break;
1036         }
1037 
1038         if (j == layer_count)
1039             continue;
1040 
1041         // fuse InnerProduct - BatchNorm to InnerProduct
1042         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1043         ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
1044 
1045         fprintf(stderr, "fuse_innerproduct_batchnorm %s %s\n", innerproduct->name.c_str(), batchnorm->name.c_str());
1046 
1047         {
1048             int channels = batchnorm->channels;
1049             float eps = batchnorm->eps;
1050 
1051             // a = bias - slope * mean / sqrt(var + eps)
1052             // b = slope / sqrt(var + eps)
1053             // value = value * b + a
1054 
1055             std::vector<float> a(channels);
1056             std::vector<float> b(channels);
1057             for (int i = 0; i < channels; i++)
1058             {
1059                 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
1060                 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
1061                 b[i] = batchnorm->slope_data[i] / sqrt_var;
1062             }
1063 
1064             if (innerproduct->bias_term == 0)
1065             {
1066                 // init bias as zero
1067                 innerproduct->bias_term = 1;
1068                 innerproduct->bias_data = ncnn::Mat(channels);
1069                 innerproduct->bias_data.fill(0.f);
1070             }
1071 
1072             const int weight_per_outch = innerproduct->weight_data_size / channels;
1073 
1074             float* weight = innerproduct->weight_data;
1075             float* bias = innerproduct->bias_data;
1076             for (int i = 0; i < channels; i++)
1077             {
1078                 float* conv_weight_outch = weight + weight_per_outch * i;
1079                 for (int j = 0; j < weight_per_outch; j++)
1080                 {
1081                     conv_weight_outch[j] *= b[i];
1082                 }
1083 
1084                 bias[i] = bias[i] * b[i] + a[i];
1085             }
1086         }
1087 
1088         int top_blob_index_final = batchnorm->tops[0];
1089         innerproduct->tops[0] = top_blob_index_final;
1090         blobs[top_blob_index_final].producer = i;
1091         batchnorm->type = "ncnnfused";
1092     }
1093 
1094     return 0;
1095 }
1096 
1097 int NetOptimize::fuse_innerproduct_add()
1098 {
1099     const size_t layer_count = layers.size();
1100     for (size_t i = 0; i < layer_count; i++)
1101     {
1102         if (layers[i]->type != "InnerProduct")
1103             continue;
1104 
1105         // InnerProduct - BinaryOp
1106         int top_blob_index = layers[i]->tops[0];
1107 
1108         size_t j = i + 1;
1109         for (; j < layer_count; j++)
1110         {
1111             if (layers[j]->type != "BinaryOp")
1112                 continue;
1113 
1114             if (layers[j]->bottoms.size() != 2)
1115                 continue;
1116 
1117             if (layers[j]->bottoms[0] == top_blob_index)
1118                 break;
1119         }
1120 
1121         if (j == layer_count)
1122             continue;
1123 
1124         // fuse InnerProduct - BinaryOp to InnerProduct
1125         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1126         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
1127 
1128         if (binaryop->op_type != 0 || binaryop->with_scalar)
1129             continue;
1130 
1131         // MemoryData - ..... - BinaryOp
1132         size_t k = 0;
1133         for (; k < j; k++)
1134         {
1135             if (layers[k]->type != "MemoryData")
1136                 continue;
1137 
1138             if (layers[k]->tops[0] == binaryop->bottoms[1])
1139                 break;
1140         }
1141 
1142         if (k == j)
1143             continue;
1144 
1145         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
1146 
1147         int channels = innerproduct->num_output;
1148 
1149         if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
1150         {
1151             // not bias-like broadcasting type
1152             continue;
1153         }
1154 
1155         fprintf(stderr, "fuse_innerproduct_add %s %s\n", innerproduct->name.c_str(), binaryop->name.c_str());
1156 
1157         {
1158             if (innerproduct->bias_term == 0)
1159             {
1160                 // init bias
1161                 innerproduct->bias_term = 1;
1162                 innerproduct->bias_data = memorydata->data;
1163             }
1164             else
1165             {
1166                 float* bias = innerproduct->bias_data;
1167                 for (int i = 0; i < channels; i++)
1168                 {
1169                     bias[i] = bias[i] + memorydata->data[i];
1170                 }
1171             }
1172         }
1173 
1174         int top_blob_index_final = binaryop->tops[0];
1175         innerproduct->tops[0] = top_blob_index_final;
1176         blobs[top_blob_index_final].producer = i;
1177         binaryop->type = "ncnnfused";
1178     }
1179 
1180     return 0;
1181 }
1182 
1183 int NetOptimize::fuse_innerproduct_dropout()
1184 {
1185     const size_t layer_count = layers.size();
1186     for (size_t i = 0; i < layer_count; i++)
1187     {
1188         if (layers[i]->type != "InnerProduct")
1189             continue;
1190 
1191         // InnerProduct - Dropout
1192         int top_blob_index = layers[i]->tops[0];
1193 
1194         size_t j = i + 1;
1195         for (; j < layer_count; j++)
1196         {
1197             if (layers[j]->type != "Dropout")
1198                 continue;
1199 
1200             if (layers[j]->bottoms.size() != 1)
1201                 continue;
1202 
1203             if (layers[j]->bottoms[0] == top_blob_index)
1204                 break;
1205         }
1206 
1207         if (j == layer_count)
1208             continue;
1209 
1210         // fuse InnerProduct - Dropout to InnerProduct
1211         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1212         ncnn::Dropout* dropout = (ncnn::Dropout*)layers[j];
1213 
1214         fprintf(stderr, "fuse_innerproduct_dropout %s %s\n", innerproduct->name.c_str(), dropout->name.c_str());
1215 
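        // at inference Dropout only multiplies its input by scale, so a
        // non-unit scale can be folded into the InnerProduct weights and bias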
1216         float scale = dropout->scale;
1217         if (scale != 1.f)
1218         {
1219             const int num_output = innerproduct->num_output;
1220             const int weight_per_outch = innerproduct->weight_data_size / num_output;
1221 
1222             float* weight = innerproduct->weight_data;
1223             for (int i = 0; i < num_output; i++)
1224             {
1225                 float* conv_weight_outch = weight + weight_per_outch * i;
1226                 for (int j = 0; j < weight_per_outch; j++)
1227                 {
1228                     conv_weight_outch[j] *= scale;
1229                 }
1230             }
1231 
1232             if (innerproduct->bias_term)
1233             {
1234                 float* bias = innerproduct->bias_data;
1235                 for (int i = 0; i < num_output; i++)
1236                 {
1237                     bias[i] *= scale;
1238                 }
1239             }
1240         }
1241 
1242         int top_blob_index_final = dropout->tops[0];
1243         innerproduct->tops[0] = top_blob_index_final;
1244         blobs[top_blob_index_final].producer = i;
1245         dropout->type = "ncnnfused";
1246     }
1247 
1248     return 0;
1249 }
1250 
1251 int NetOptimize::fuse_convolution_activation()
1252 {
1253     const size_t layer_count = layers.size();
1254     for (size_t i = 0; i < layer_count; i++)
1255     {
1256         if (layers[i]->type != "Convolution")
1257             continue;
1258 
1259         // Convolution - Activation
1260         int top_blob_index = layers[i]->tops[0];
1261 
1262         size_t j = i + 1;
1263         for (; j < layer_count; j++)
1264         {
1265             if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
1266                 continue;
1267 
1268             if (layers[j]->bottoms.size() != 1)
1269                 continue;
1270 
1271             if (layers[j]->bottoms[0] == top_blob_index)
1272                 break;
1273         }
1274 
1275         if (j == layer_count)
1276             continue;
1277 
1278         // fuse Convolution - Activation to Convolution
1279         ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
1280         ncnn::Layer* activation = layers[j];
1281 
1282         fprintf(stderr, "fuse_convolution_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());
1283 
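        // activation_type: 1 = ReLU, 2 = leaky ReLU (slope), 3 = Clip (min, max),
        // 4 = Sigmoid, 5 = Mish; extra parameters go into activation_params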
1284         if (activation->type == "ReLU")
1285         {
1286             ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1287 
1288             if (relu->slope == 0.f)
1289             {
1290                 convolution->activation_type = 1;
1291             }
1292             else
1293             {
1294                 convolution->activation_type = 2;
1295                 convolution->activation_params = ncnn::Mat(1);
1296                 convolution->activation_params[0] = relu->slope;
1297             }
1298         }
1299         else if (activation->type == "Clip")
1300         {
1301             ncnn::Clip* clip = (ncnn::Clip*)activation;
1302 
1303             convolution->activation_type = 3;
1304             convolution->activation_params = ncnn::Mat(2);
1305             convolution->activation_params[0] = clip->min;
1306             convolution->activation_params[1] = clip->max;
1307         }
1308         else if (activation->type == "Sigmoid")
1309         {
1310             convolution->activation_type = 4;
1311         }
1312         else if (activation->type == "Mish")
1313         {
1314             convolution->activation_type = 5;
1315         }
1316 
1317         int top_blob_index_final = activation->tops[0];
1318         convolution->tops[0] = top_blob_index_final;
1319         blobs[top_blob_index_final].producer = i;
1320         activation->type = "ncnnfused";
1321     }
1322 
1323     return 0;
1324 }
1325 
1326 int NetOptimize::fuse_convolutiondepthwise_activation()
1327 {
1328     const size_t layer_count = layers.size();
1329     for (size_t i = 0; i < layer_count; i++)
1330     {
1331         if (layers[i]->type != "ConvolutionDepthWise")
1332             continue;
1333 
1334         // ConvolutionDepthWise - Activation
1335         int top_blob_index = layers[i]->tops[0];
1336 
1337         size_t j = i + 1;
1338         for (; j < layer_count; j++)
1339         {
1340             if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
1341                 continue;
1342 
1343             if (layers[j]->bottoms.size() != 1)
1344                 continue;
1345 
1346             if (layers[j]->bottoms[0] == top_blob_index)
1347                 break;
1348         }
1349 
1350         if (j == layer_count)
1351             continue;
1352 
1353         // fuse ConvolutionDepthWise - Activation to ConvolutionDepthWise
1354         ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
1355         ncnn::Layer* activation = layers[j];
1356 
1357         fprintf(stderr, "fuse_convolutiondepthwise_activation %s %s\n", convolutiondepthwise->name.c_str(), activation->name.c_str());
1358 
1359         if (activation->type == "ReLU")
1360         {
1361             ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1362 
1363             if (relu->slope == 0.f)
1364             {
1365                 convolutiondepthwise->activation_type = 1;
1366             }
1367             else
1368             {
1369                 convolutiondepthwise->activation_type = 2;
1370                 convolutiondepthwise->activation_params = ncnn::Mat(1);
1371                 convolutiondepthwise->activation_params[0] = relu->slope;
1372             }
1373         }
1374         else if (activation->type == "Clip")
1375         {
1376             ncnn::Clip* clip = (ncnn::Clip*)activation;
1377 
1378             convolutiondepthwise->activation_type = 3;
1379             convolutiondepthwise->activation_params = ncnn::Mat(2);
1380             convolutiondepthwise->activation_params[0] = clip->min;
1381             convolutiondepthwise->activation_params[1] = clip->max;
1382         }
1383         else if (activation->type == "Sigmoid")
1384         {
1385             convolutiondepthwise->activation_type = 4;
1386         }
1387         else if (activation->type == "Mish")
1388         {
1389             convolutiondepthwise->activation_type = 5;
1390         }
1391 
1392         int top_blob_index_final = activation->tops[0];
1393         convolutiondepthwise->tops[0] = top_blob_index_final;
1394         blobs[top_blob_index_final].producer = i;
1395         activation->type = "ncnnfused";
1396     }
1397 
1398     return 0;
1399 }
1400 
1401 int NetOptimize::fuse_deconvolution_activation()
1402 {
1403     const size_t layer_count = layers.size();
1404     for (size_t i = 0; i < layer_count; i++)
1405     {
1406         if (layers[i]->type != "Deconvolution")
1407             continue;
1408 
1409         // Deconvolution - Activation
1410         int top_blob_index = layers[i]->tops[0];
1411 
1412         size_t j = i + 1;
1413         for (; j < layer_count; j++)
1414         {
1415             if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1416                 continue;
1417 
1418             if (layers[j]->bottoms.size() != 1)
1419                 continue;
1420 
1421             if (layers[j]->bottoms[0] == top_blob_index)
1422                 break;
1423         }
1424 
1425         if (j == layer_count)
1426             continue;
1427 
1428         // fuse Deconvolution - Activation to Deconvolution
1429         ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
1430         ncnn::Layer* activation = layers[j];
1431 
1432         fprintf(stderr, "fuse_deconvolution_activation %s %s\n", deconvolution->name.c_str(), activation->name.c_str());
1433 
1434         if (activation->type == "ReLU")
1435         {
1436             ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1437 
1438             if (relu->slope == 0.f)
1439             {
1440                 deconvolution->activation_type = 1;
1441             }
1442             else
1443             {
1444                 deconvolution->activation_type = 2;
1445                 deconvolution->activation_params = ncnn::Mat(1);
1446                 deconvolution->activation_params[0] = relu->slope;
1447             }
1448         }
1449         else if (activation->type == "Clip")
1450         {
1451             ncnn::Clip* clip = (ncnn::Clip*)activation;
1452 
1453             deconvolution->activation_type = 3;
1454             deconvolution->activation_params = ncnn::Mat(2);
1455             deconvolution->activation_params[0] = clip->min;
1456             deconvolution->activation_params[1] = clip->max;
1457         }
1458         else if (activation->type == "Sigmoid")
1459         {
1460             deconvolution->activation_type = 4;
1461         }
1462 
1463         int top_blob_index_final = activation->tops[0];
1464         deconvolution->tops[0] = top_blob_index_final;
1465         blobs[top_blob_index_final].producer = i;
1466         activation->type = "ncnnfused";
1467     }
1468 
1469     return 0;
1470 }
1471 
1472 int NetOptimize::fuse_deconvolutiondepthwise_activation()
1473 {
1474     const size_t layer_count = layers.size();
1475     for (size_t i = 0; i < layer_count; i++)
1476     {
1477         if (layers[i]->type != "DeconvolutionDepthWise")
1478             continue;
1479 
1480         // DeconvolutionDepthWise - Activation
1481         int top_blob_index = layers[i]->tops[0];
1482 
1483         size_t j = i + 1;
1484         for (; j < layer_count; j++)
1485         {
1486             if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1487                 continue;
1488 
1489             if (layers[j]->bottoms.size() != 1)
1490                 continue;
1491 
1492             if (layers[j]->bottoms[0] == top_blob_index)
1493                 break;
1494         }
1495 
1496         if (j == layer_count)
1497             continue;
1498 
1499         // fuse DeconvolutionDepthWise - Activation to DeconvolutionDepthWise
1500         ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
1501         ncnn::Layer* activation = layers[j];
1502 
1503         fprintf(stderr, "fuse_deconvolutiondepthwise_activation %s %s\n", deconvolutiondepthwise->name.c_str(), activation->name.c_str());
1504 
1505         if (activation->type == "ReLU")
1506         {
1507             ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1508 
1509             if (relu->slope == 0.f)
1510             {
1511                 deconvolutiondepthwise->activation_type = 1;
1512             }
1513             else
1514             {
1515                 deconvolutiondepthwise->activation_type = 2;
1516                 deconvolutiondepthwise->activation_params = ncnn::Mat(1);
1517                 deconvolutiondepthwise->activation_params[0] = relu->slope;
1518             }
1519         }
1520         else if (activation->type == "Clip")
1521         {
1522             ncnn::Clip* clip = (ncnn::Clip*)activation;
1523 
1524             deconvolutiondepthwise->activation_type = 3;
1525             deconvolutiondepthwise->activation_params = ncnn::Mat(2);
1526             deconvolutiondepthwise->activation_params[0] = clip->min;
1527             deconvolutiondepthwise->activation_params[1] = clip->max;
1528         }
1529         else if (activation->type == "Sigmoid")
1530         {
1531             deconvolutiondepthwise->activation_type = 4;
1532         }
1533 
1534         int top_blob_index_final = activation->tops[0];
1535         deconvolutiondepthwise->tops[0] = top_blob_index_final;
1536         blobs[top_blob_index_final].producer = i;
1537         activation->type = "ncnnfused";
1538     }
1539 
1540     return 0;
1541 }
1542 
1543 int NetOptimize::fuse_innerproduct_activation()
1544 {
1545     const size_t layer_count = layers.size();
1546     for (size_t i = 0; i < layer_count; i++)
1547     {
1548         if (layers[i]->type != "InnerProduct")
1549             continue;
1550 
1551         // InnerProduct - Activation
1552         int top_blob_index = layers[i]->tops[0];
1553 
1554         size_t j = i + 1;
1555         for (; j < layer_count; j++)
1556         {
1557             if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1558                 continue;
1559 
1560             if (layers[j]->bottoms.size() != 1)
1561                 continue;
1562 
1563             if (layers[j]->bottoms[0] == top_blob_index)
1564                 break;
1565         }
1566 
1567         if (j == layer_count)
1568             continue;
1569 
1570         // fuse InnerProduct - Activation to InnerProduct
1571         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1572         ncnn::Layer* activation = layers[j];
1573 
1574         fprintf(stderr, "fuse_innerproduct_activation %s %s\n", innerproduct->name.c_str(), activation->name.c_str());
1575 
1576         if (activation->type == "ReLU")
1577         {
1578             ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1579 
1580             if (relu->slope == 0.f)
1581             {
1582                 innerproduct->activation_type = 1;
1583             }
1584             else
1585             {
1586                 innerproduct->activation_type = 2;
1587                 innerproduct->activation_params = ncnn::Mat(1);
1588                 innerproduct->activation_params[0] = relu->slope;
1589             }
1590         }
1591         else if (activation->type == "Clip")
1592         {
1593             ncnn::Clip* clip = (ncnn::Clip*)activation;
1594 
1595             innerproduct->activation_type = 3;
1596             innerproduct->activation_params = ncnn::Mat(2);
1597             innerproduct->activation_params[0] = clip->min;
1598             innerproduct->activation_params[1] = clip->max;
1599         }
1600         else if (activation->type == "Sigmoid")
1601         {
1602             innerproduct->activation_type = 4;
1603         }
1604 
1605         int top_blob_index_final = activation->tops[0];
1606         innerproduct->tops[0] = top_blob_index_final;
1607         blobs[top_blob_index_final].producer = i;
1608         activation->type = "ncnnfused";
1609     }
1610 
1611     return 0;
1612 }
1613 
1614 int NetOptimize::fuse_memorydata_binaryop()
1615 {
1616     const size_t layer_count = layers.size();
1617     for (size_t i = 0; i < layer_count; i++)
1618     {
1619         if (layers[i]->type != "MemoryData")
1620             continue;
1621 
1622         // MemoryData - BinaryOp
1623         int top_blob_index = layers[i]->tops[0];
1624 
1625         size_t j = i + 1;
1626         for (; j < layer_count; j++)
1627         {
1628             if (layers[j]->type != "BinaryOp")
1629                 continue;
1630 
1631             if (layers[j]->bottoms.size() != 2)
1632                 continue;
1633 
1634             if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
1635                 break;
1636         }
1637 
1638         if (j == layer_count)
1639             continue;
1640 
1641         // fuse MemoryData - BinaryOp to BinaryOp
1642         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
1643         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
1644 
1645         if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
1646         {
1647             // not a scalar
1648             continue;
1649         }
1650 
1651         int memorydata_index = 1;
1652 
1653         if (binaryop->bottoms[0] == top_blob_index)
1654         {
1655             int op_type = binaryop->op_type;
1656 
1657             if (op_type == ncnn::BinaryOp::Operation_ADD
1658                     || op_type == ncnn::BinaryOp::Operation_MUL
1659                     || op_type == ncnn::BinaryOp::Operation_MAX
1660                     || op_type == ncnn::BinaryOp::Operation_MIN)
1661             {
1662                 memorydata_index = 0;
1663             }
1664             else if (op_type == ncnn::BinaryOp::Operation_SUB)
1665             {
1666                 binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
1667                 memorydata_index = 0;
1668             }
1669             else if (op_type == ncnn::BinaryOp::Operation_DIV)
1670             {
1671                 binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
1672                 memorydata_index = 0;
1673             }
1674             else
1675             {
1676                 // operand order cannot be swapped for this binaryop
1677                 continue;
1678             }
1679         }
1680 
1681         float scalar = memorydata->data[0];
1682 
1683         binaryop->with_scalar = 1;
1684         binaryop->b = scalar;
1685 
1686         fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());
1687 
1688         binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
1689         memorydata->type = "ncnnfused";
1690     }
1691 
1692     for (size_t i = 0; i < layer_count; i++)
1693     {
1694         if (layers[i]->type != "MemoryData")
1695             continue;
1696 
1697         // MemoryData - Split - BinaryOp
1698         int top_blob_index = layers[i]->tops[0];
1699 
1700         size_t j0 = i + 1;
1701         for (; j0 < layer_count; j0++)
1702         {
1703             if (layers[j0]->type != "Split")
1704                 continue;
1705 
1706             if (layers[j0]->bottoms.size() != 1)
1707                 continue;
1708 
1709             if (layers[j0]->bottoms[0] == top_blob_index)
1710                 break;
1711         }
1712 
1713         if (j0 == layer_count)
1714             continue;
1715 
1716         int split_top_blob_index = -1;
1717 
1718         size_t j1 = j0 + 1;
1719         for (; j1 < layer_count; j1++)
1720         {
1721             if (layers[j1]->type != "BinaryOp")
1722                 continue;
1723 
1724             if (layers[j1]->bottoms.size() != 2)
1725                 continue;
1726 
1727             for (int k = 0; k < (int)layers[j0]->tops.size(); k++)
1728             {
1729                 if (layers[j1]->bottoms[0] == layers[j0]->tops[k] || layers[j1]->bottoms[1] == layers[j0]->tops[k])
1730                 {
1731                     split_top_blob_index = k;
1732                     break;
1733                 }
1734             }
1735 
1736             if (split_top_blob_index != -1)
1737                 break;
1738         }
1739 
1740         if (j1 == layer_count)
1741             continue;
1742 
1743         // fuse MemoryData - Split - BinaryOp to BinaryOp
1744         ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
1745         ncnn::Split* split = (ncnn::Split*)layers[j0];
1746         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j1];
1747 
1748         if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
1749         {
1750             // not a scalar
1751             continue;
1752         }
1753 
1754         int memorydata_index = 1;
1755 
1756         if (binaryop->bottoms[0] == split->tops[split_top_blob_index])
1757         {
1758             int op_type = binaryop->op_type;
1759 
1760             if (op_type == ncnn::BinaryOp::Operation_ADD
1761                     || op_type == ncnn::BinaryOp::Operation_MUL
1762                     || op_type == ncnn::BinaryOp::Operation_MAX
1763                     || op_type == ncnn::BinaryOp::Operation_MIN)
1764             {
1765                 memorydata_index = 0;
1766             }
1767             else if (op_type == ncnn::BinaryOp::Operation_SUB)
1768             {
1769                 binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
1770                 memorydata_index = 0;
1771             }
1772             else if (op_type == ncnn::BinaryOp::Operation_DIV)
1773             {
1774                 binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
1775                 memorydata_index = 0;
1776             }
1777             else
1778             {
1779                 // operand order cannot be swapped for this binaryop
1780                 continue;
1781             }
1782         }
1783 
1784         float scalar = memorydata->data[0];
1785 
1786         binaryop->with_scalar = 1;
1787         binaryop->b = scalar;
1788 
1789         fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());
1790 
1791         binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
1792         split->tops.erase(split->tops.begin() + split_top_blob_index);
1793         if (split->tops.empty())
1794         {
1795             split->type = "ncnnfused";
1796             memorydata->type = "ncnnfused";
1797         }
1798 
1799         i--;
1800     }
1801 
1802     return 0;
1803 }
1804 
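// Rewrite a*x + b*y (two scalar-MUL BinaryOps feeding an ADD BinaryOp) as a
// single Eltwise SUM with coefficients [a, b]; e.g. out = 0.3*x + 0.7*y
// becomes Eltwise SUM with coeffs {0.3f, 0.7f}. If only one side is scaled,
// the missing coefficient defaults to 1.f. The ADD layer is replaced in place
// by the new Eltwise layer.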
1805 int NetOptimize::fuse_binaryop_eltwise()
1806 {
1807     const size_t layer_count = layers.size();
1808     for (size_t i = 0; i < layer_count; i++)
1809     {
1810         if (layers[i]->type != "BinaryOp")
1811             continue;
1812 
1813         if (layers[i]->bottoms.size() != 2)
1814             continue;
1815 
1816         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[i];
1817 
1818         if (binaryop->op_type != ncnn::BinaryOp::Operation_ADD)
1819             continue;
1820 
1821         if (binaryop->with_scalar)
1822             continue;
1823 
1824         // BinaryOp - BinaryOp - BinaryOp
1825         int bottom_blob_index_0 = binaryop->bottoms[0];
1826         int bottom_blob_index_1 = binaryop->bottoms[1];
1827 
1828         size_t j0 = 0;
1829         for (; j0 < i; j0++)
1830         {
1831             if (layers[j0]->type != "BinaryOp")
1832                 continue;
1833 
1834             if (layers[j0]->bottoms.size() != 1)
1835                 continue;
1836 
1837             if (((ncnn::BinaryOp*)layers[j0])->op_type != ncnn::BinaryOp::Operation_MUL)
1838                 continue;
1839 
1840             if (layers[j0]->tops[0] == bottom_blob_index_0)
1841                 break;
1842         }
1843 
1844         size_t j1 = 0;
1845         for (; j1 < i; j1++)
1846         {
1847             if (layers[j1]->type != "BinaryOp")
1848                 continue;
1849 
1850             if (layers[j1]->bottoms.size() != 1)
1851                 continue;
1852 
1853             if (((ncnn::BinaryOp*)layers[j1])->op_type != ncnn::BinaryOp::Operation_MUL)
1854                 continue;
1855 
1856             if (layers[j1]->tops[0] == bottom_blob_index_1)
1857                 break;
1858         }
1859 
1860         if (j0 == i && j1 == i)
1861             continue;
1862 
1863         ncnn::BinaryOp* binaryop0 = (ncnn::BinaryOp*)layers[j0];
1864         ncnn::BinaryOp* binaryop1 = (ncnn::BinaryOp*)layers[j1];
1865 
1866         fprintf(stderr, "fuse_binaryop_eltwise %s %s %s\n", binaryop0->name.c_str(), binaryop1->name.c_str(), binaryop->name.c_str());
1867 
1868         ncnn::Eltwise* eltwise = (ncnn::Eltwise*)ncnn::create_layer("Eltwise");
1869 
1870         eltwise->type = "Eltwise";
1871         eltwise->name = binaryop->name;
1872         eltwise->bottoms = binaryop->bottoms;
1873         eltwise->tops = binaryop->tops;
1874 
1875         ncnn::ParamDict pd;
1876         eltwise->load_param(pd);
1877 
1878         eltwise->op_type = ncnn::Eltwise::Operation_SUM;
1879 
1880         eltwise->coeffs = ncnn::Mat(2);
1881 
1882         if (j0 != i && j1 != i)
1883         {
1884             // fuse BinaryOp - BinaryOp - BinaryOp to Eltwise
1885             eltwise->coeffs[0] = binaryop0->b;
1886             eltwise->coeffs[1] = binaryop1->b;
1887 
1888             eltwise->bottoms[0] = binaryop0->bottoms[0];
1889             eltwise->bottoms[1] = binaryop1->bottoms[0];
1890 
1891             binaryop0->type = "ncnnfused";
1892             binaryop1->type = "ncnnfused";
1893         }
1894         if (j0 != i && j1 == i)
1895         {
1896             // fuse BinaryOp - X - BinaryOp to Eltwise
1897             eltwise->coeffs[0] = binaryop0->b;
1898             eltwise->coeffs[1] = 1.f;
1899 
1900             eltwise->bottoms[0] = binaryop0->bottoms[0];
1901 
1902             binaryop0->type = "ncnnfused";
1903         }
1904         if (j0 == i && j1 != i)
1905         {
1906             // fuse X - BinaryOp - BinaryOp to Eltwise
1907             eltwise->coeffs[0] = 1.f;
1908             eltwise->coeffs[1] = binaryop1->b;
1909 
1910             eltwise->bottoms[1] = binaryop1->bottoms[0];
1911 
1912             binaryop1->type = "ncnnfused";
1913         }
1914 
1915         layers[i] = eltwise;
1916         delete binaryop;
1917     }
1918 
1919     return 0;
1920 }
1921 
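// Dropout with scale == 1.f is an identity at inference time, so the layer is
// removed and its producer is rewired to write directly to the dropout's
// output blob.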
1922 int NetOptimize::eliminate_dropout()
1923 {
1924     const size_t layer_count = layers.size();
1925     for (size_t i = 0; i < layer_count; i++)
1926     {
1927         if (layers[i]->type != "Dropout")
1928             continue;
1929 
1930         ncnn::Dropout* dropout = (ncnn::Dropout*)layers[i];
1931         if (dropout->scale != 1.f)
1932             continue;
1933 
1934         // Any - Dropout
1935         int bottom_blob_index = layers[i]->bottoms[0];
1936 
1937         int j = i - 1;
1938         for (; j >= 0; j--)
1939         {
1940             if (layers[j]->type == "ncnnfused")
1941                 continue;
1942 
1943             if (layers[j]->tops.size() != 1)
1944                 continue;
1945 
1946             if (layers[j]->tops[0] == bottom_blob_index)
1947                 break;
1948         }
1949 
1950         if (j == -1)
1951             continue;
1952 
1953         ncnn::Layer* any = layers[j];
1954 
1955         fprintf(stderr, "eliminate_dropout %s %s\n", any->name.c_str(), dropout->name.c_str());
1956 
1957         int top_blob_index_final = dropout->tops[0];
1958         any->tops[0] = top_blob_index_final;
1959         blobs[top_blob_index_final].producer = j;
1960         dropout->type = "ncnnfused";
1961     }
1962 
1963     return 0;
1964 }
1965 
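// A non-global Pooling with 1x1 kernel, stride 1 and no padding is an
// identity mapping and is removed the same way.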
1966 int NetOptimize::eliminate_pooling1x1()
1967 {
1968     const size_t layer_count = layers.size();
1969     for (size_t i = 0; i < layer_count; i++)
1970     {
1971         if (layers[i]->type != "Pooling")
1972             continue;
1973 
1974         ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
1975         if (pooling->pad_left != 0 || pooling->pad_right != 0 || pooling->pad_top != 0 || pooling->pad_bottom != 0)
1976             continue;
1977 
1978         if (pooling->kernel_w != 1 || pooling->kernel_h != 1 || pooling->stride_w != 1 || pooling->stride_h != 1)
1979             continue;
1980 
1981         if (pooling->global_pooling != 0)
1982             continue;
1983 
1984         // Any - Pooling
1985         int bottom_blob_index = layers[i]->bottoms[0];
1986 
1987         int top_i = -1;
1988         int j = i - 1;
1989         for (; j >= 0; j--)
1990         {
1991             if (layers[j]->type == "ncnnfused")
1992                 continue;
1993 
1994             for (size_t k = 0; k < layers[j]->tops.size(); k++)
1995             {
1996                 if (layers[j]->tops[k] == bottom_blob_index)
1997                 {
1998                     top_i = k;
1999                     break;
2000                 }
2001             }
2002 
2003             if (top_i != -1)
2004                 break;
2005         }
2006 
2007         if (j == -1)
2008             continue;
2009 
2010         ncnn::Layer* any = layers[j];
2011 
2012         fprintf(stderr, "eliminate_pooling1x1 %s %s\n", any->name.c_str(), pooling->name.c_str());
2013 
2014         int top_blob_index_final = pooling->tops[0];
2015         any->tops[top_i] = top_blob_index_final;
2016         blobs[top_blob_index_final].producer = j;
2017         pooling->type = "ncnnfused";
2018     }
2019 
2020     return 0;
2021 }
2022 
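// Remove Noop layers. A Noop without inputs is simply marked as fused (its
// outputs lose their producer); otherwise the preceding layer is rewired to
// produce the Noop's output blob directly.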
2023 int NetOptimize::eliminate_noop()
2024 {
2025     const size_t layer_count = layers.size();
2026     for (size_t i = 0; i < layer_count; i++)
2027     {
2028         if (layers[i]->type != "Noop")
2029             continue;
2030 
2031         ncnn::Layer* noop = layers[i];
2032 
2033         if (noop->bottoms.empty())
2034         {
2035             // Noop
2036             fprintf(stderr, "eliminate_noop %s\n", noop->name.c_str());
2037 
2038             size_t top_blob_count = noop->tops.size();
2039             for (size_t j = 0; j < top_blob_count; j++)
2040             {
2041                 int top_blob_index_final = noop->tops[j];
2042                 blobs[top_blob_index_final].producer = -1;
2043             }
2044             noop->type = "ncnnfused";
2045 
2046             continue;
2047         }
2048 
2049         // Any - Noop
2050         int bottom_blob_index = noop->bottoms[0];
2051 
2052         int j = i - 1;
2053         int any_k = -1;
2054         for (; j >= 0; j--)
2055         {
2056             if (layers[j]->type == "ncnnfused")
2057                 continue;
2058 
2059             bool link_noop = false;
2060             size_t top_blob_count = layers[j]->tops.size();
2061             for (size_t k = 0; k < top_blob_count; k++)
2062             {
2063                 if (layers[j]->tops[k] == bottom_blob_index)
2064                 {
2065                     link_noop = true;
2066                     any_k = k;
2067                     break;
2068                 }
2069             }
2070 
2071             if (link_noop)
2072                 break;
2073         }
2074 
2075         if (j == -1 || any_k == -1)
2076             continue;
2077 
2078         ncnn::Layer* any = layers[j];
2079 
2080         fprintf(stderr, "eliminate_noop %s %s\n", any->name.c_str(), noop->name.c_str());
2081 
2082         int top_blob_index_final = noop->tops[0];
2083         any->tops[any_k] = top_blob_index_final;
2084         blobs[top_blob_index_final].producer = j;
2085 
2086         noop->type = "ncnnfused";
2087     }
2088 
2089     return 0;
2090 }
2091 
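// A Split whose outputs are consumed at most once is redundant: the single
// surviving output blob is attached directly to the producer of the Split's
// input.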
2092 int NetOptimize::eliminate_split()
2093 {
2094     const size_t layer_count = layers.size();
2095     for (size_t i = 0; i < layer_count; i++)
2096     {
2097         if (layers[i]->type != "Split")
2098             continue;
2099 
2100         ncnn::Layer* split = layers[i];
2101 
2102         int real_split_output_count = 0;
2103         int real_split_top_blob_index = -1;
2104         size_t top_blob_count = split->tops.size();
2105         for (size_t j = 0; j < top_blob_count; j++)
2106         {
2107             int top_blob_index_final = split->tops[j];
2108             if (blobs[top_blob_index_final].consumer != -1)
2109             {
2110                 real_split_output_count += 1;
2111                 real_split_top_blob_index = j;
2112             }
2113         }
2114 
2115         if (real_split_output_count > 1)
2116             continue;
2117 
2118         // Any - Split
2119         int bottom_blob_index = split->bottoms[0];
2120 
2121         int top_i = -1;
2122         int j = i - 1;
2123         for (; j >= 0; j--)
2124         {
2125             if (layers[j]->type == "ncnnfused")
2126                 continue;
2127 
2128             for (size_t k = 0; k < layers[j]->tops.size(); k++)
2129             {
2130                 if (layers[j]->tops[k] == bottom_blob_index)
2131                 {
2132                     top_i = k;
2133                     break;
2134                 }
2135             }
2136 
2137             if (top_i != -1)
2138                 break;
2139         }
2140 
2141         if (j == -1)
2142             continue;
2143 
2144         ncnn::Layer* any = layers[j];
2145 
2146         fprintf(stderr, "eliminate_split %s %s\n", any->name.c_str(), split->name.c_str());
2147 
2148         int top_blob_index_final = split->tops[real_split_top_blob_index];
2149         any->tops[top_i] = top_blob_index_final;
2150         blobs[top_blob_index_final].producer = j;
2151         split->type = "ncnnfused";
2152     }
2153 
2154     return 0;
2155 }
2156 
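// Drop MemoryData layers whose output blob is never consumed by any remaining
// (non-fused) layer, e.g. constants left behind by earlier fusions.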
2157 int NetOptimize::eliminate_orphaned_memorydata()
2158 {
2159     const size_t layer_count = layers.size();
2160     for (size_t i = 0; i < layer_count; i++)
2161     {
2162         if (layers[i]->type != "MemoryData")
2163             continue;
2164 
2165         // MemoryData - X
2166         int top_blob_index = layers[i]->tops[0];
2167 
2168         size_t j = i + 1;
2169         for (; j < layer_count; j++)
2170         {
2171             if (layers[j]->type == "ncnnfused")
2172                 continue;
2173 
2174             bool orphaned = true;
2175             for (size_t k = 0; k < layers[j]->bottoms.size(); k++)
2176             {
2177                 if (layers[j]->bottoms[k] == top_blob_index)
2178                 {
2179                     orphaned = false;
2180                     break;
2181                 }
2182             }
2183 
2184             if (!orphaned)
2185                 break;
2186         }
2187 
2188         if (j < layer_count)
2189             continue;
2190 
2191         // assert orphaned == true
2192         fprintf(stderr, "eliminate_orphaned_memorydata %s\n", layers[i]->name.c_str());
2193 
2194         layers[i]->type = "ncnnfused";
2195     }
2196 
2197     return 0;
2198 }
2199 
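// Global pooling already produces a per-channel scalar, so a following
// Reshape that only flattens (h and c left at the -233 "unset" value, no
// permute) changes nothing and can be removed.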
2200 int NetOptimize::eliminate_reshape_after_global_pooling()
2201 {
2202     const size_t layer_count = layers.size();
2203     for (size_t i = 0; i < layer_count; i++)
2204     {
2205         if (layers[i]->type != "Pooling")
2206             continue;
2207 
2208         ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2209         if (pooling->global_pooling == 0)
2210             continue;
2211 
2212         // Pooling - Reshape
2213         int top_blob_index = layers[i]->tops[0];
2214 
2215         size_t j = i + 1;
2216         for (; j < layer_count; j++)
2217         {
2218             if (layers[j]->type != "Reshape")
2219                 continue;
2220 
2221             if (layers[j]->bottoms.size() != 1)
2222                 continue;
2223 
2224             if (layers[j]->bottoms[0] == top_blob_index)
2225                 break;
2226         }
2227 
2228         if (j == layer_count)
2229             continue;
2230 
2231         ncnn::Reshape* reshape = (ncnn::Reshape*)layers[j];
2232         if (reshape->h != -233 || reshape->c != -233 || reshape->permute != 0)
2233             continue;
2234 
2235         fprintf(stderr, "eliminate_reshape_after_global_pooling %s %s\n", pooling->name.c_str(), reshape->name.c_str());
2236 
2237         int top_blob_index_final = reshape->tops[0];
2238         pooling->tops[0] = top_blob_index_final;
2239         blobs[top_blob_index_final].producer = i;
2240         reshape->type = "ncnnfused";
2241     }
2242 
2243     return 0;
2244 }
2245 
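// Likewise, a Flatten directly after global pooling is redundant.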
2246 int NetOptimize::eliminate_flatten_after_global_pooling()
2247 {
2248     const size_t layer_count = layers.size();
2249     for (size_t i = 0; i < layer_count; i++)
2250     {
2251         if (layers[i]->type != "Pooling")
2252             continue;
2253 
2254         ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2255         if (pooling->global_pooling == 0)
2256             continue;
2257 
2258         // Pooling - Flatten
2259         int top_blob_index = layers[i]->tops[0];
2260 
2261         size_t j = i + 1;
2262         for (; j < layer_count; j++)
2263         {
2264             if (layers[j]->type != "Flatten")
2265                 continue;
2266 
2267             if (layers[j]->bottoms.size() != 1)
2268                 continue;
2269 
2270             if (layers[j]->bottoms[0] == top_blob_index)
2271                 break;
2272         }
2273 
2274         if (j == layer_count)
2275             continue;
2276 
2277         ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2278 
2279         fprintf(stderr, "eliminate_flatten_after_global_pooling %s %s\n", pooling->name.c_str(), flatten->name.c_str());
2280 
2281         int top_blob_index_final = flatten->tops[0];
2282         pooling->tops[0] = top_blob_index_final;
2283         blobs[top_blob_index_final].producer = i;
2284         flatten->type = "ncnnfused";
2285     }
2286 
2287     return 0;
2288 }
2289 
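// InnerProduct output is already one-dimensional, so a trailing Flatten is a
// no-op and can be removed.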
2290 int NetOptimize::eliminate_flatten_after_innerproduct()
2291 {
2292     const size_t layer_count = layers.size();
2293     for (size_t i = 0; i < layer_count; i++)
2294     {
2295         if (layers[i]->type != "InnerProduct")
2296             continue;
2297 
2298         // InnerProduct - Flatten
2299         int top_blob_index = layers[i]->tops[0];
2300 
2301         size_t j = i + 1;
2302         for (; j < layer_count; j++)
2303         {
2304             if (layers[j]->type != "Flatten")
2305                 continue;
2306 
2307             if (layers[j]->bottoms.size() != 1)
2308                 continue;
2309 
2310             if (layers[j]->bottoms[0] == top_blob_index)
2311                 break;
2312         }
2313 
2314         if (j == layer_count)
2315             continue;
2316 
2317         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2318         ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2319 
2320         fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());
2321 
2322         int top_blob_index_final = flatten->tops[0];
2323         innerproduct->tops[0] = top_blob_index_final;
2324         blobs[top_blob_index_final].producer = i;
2325         flatten->type = "ncnnfused";
2326     }
2327 
2328     return 0;
2329 }
2330 
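// A Reshape to w=1, h=1 (no permute) in front of a two-input BinaryOp only
// exists to make broadcasting explicit; this pass assumes the BinaryOp can
// consume the un-reshaped blob directly and wires it in, bypassing the
// Reshape.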
2331 int NetOptimize::eliminate_reshape_before_binaryop()
2332 {
2333     const size_t layer_count = layers.size();
2334     for (size_t i = 0; i < layer_count; i++)
2335     {
2336         if (layers[i]->type != "Reshape")
2337             continue;
2338 
2339         ncnn::Reshape* reshape = (ncnn::Reshape*)layers[i];
2340         if (reshape->w != 1 || reshape->h != 1 || reshape->permute != 0)
2341             continue;
2342 
2343         // Reshape - BinaryOp
2344         int top_blob_index = layers[i]->tops[0];
2345 
2346         size_t j = i + 1;
2347         for (; j < layer_count; j++)
2348         {
2349             if (layers[j]->type != "BinaryOp")
2350                 continue;
2351 
2352             if (layers[j]->bottoms.size() != 2)
2353                 continue;
2354 
2355             if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
2356                 break;
2357         }
2358 
2359         if (j == layer_count)
2360             continue;
2361 
2362         ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
2363 
2364         fprintf(stderr, "eliminate_reshape_before_binaryop %s %s\n", reshape->name.c_str(), binaryop->name.c_str());
2365 
2366         int bottom_blob_index_final = reshape->bottoms[0];
2367         if (layers[j]->bottoms[0] == top_blob_index)
2368             binaryop->bottoms[0] = bottom_blob_index_final;
2369         if (layers[j]->bottoms[1] == top_blob_index)
2370             binaryop->bottoms[1] = bottom_blob_index_final;
2371         blobs[bottom_blob_index_final].consumer = j;
2372         reshape->type = "ncnnfused";
2373     }
2374 
2375     return 0;
2376 }
2377 
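// Two chained Reduction layers with operation type 3 (taken here to be mean),
// each reducing a single spatial axis (2 or 3, then 2) with coeff 1 and no
// reduce_all, compute the same result as global average pooling, so they are
// replaced by one Pooling layer with pooling_type 1 and global_pooling set.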
2378 int NetOptimize::replace_reduction_with_global_pooling()
2379 {
2380     const size_t layer_count = layers.size();
2381     for (size_t i = 0; i < layer_count; i++)
2382     {
2383         if (layers[i]->type != "Reduction")
2384             continue;
2385 
2386         ncnn::Reduction* reduction1 = (ncnn::Reduction*)layers[i];
2387         if (reduction1->operation != 3 || reduction1->reduce_all != 0 || reduction1->coeff != 1.f)
2388             continue;
2389 
2390         if (reduction1->axes.w != 1)
2391             continue;
2392 
2393         const int* axes_ptr = reduction1->axes;
2394         if (axes_ptr[0] != 2 && axes_ptr[0] != 3)
2395             continue;
2396 
2397         // Reduction(2/3) - Reduction(2)
2398         int top_blob_index = layers[i]->tops[0];
2399 
2400         size_t j = i + 1;
2401         for (; j < layer_count; j++)
2402         {
2403             if (layers[j]->type != "Reduction")
2404                 continue;
2405 
2406             if (layers[j]->bottoms.size() != 1)
2407                 continue;
2408 
2409             if (layers[j]->bottoms[0] == top_blob_index)
2410                 break;
2411         }
2412 
2413         if (j == layer_count)
2414             continue;
2415 
2416         ncnn::Reduction* reduction2 = (ncnn::Reduction*)layers[j];
2417         if (reduction2->operation != 3 || reduction2->reduce_all != 0 || reduction2->coeff != 1.f)
2418             continue;
2419 
2420         if (reduction2->axes.w != 1)
2421             continue;
2422 
2423         const int* axes2_ptr = reduction2->axes;
2424         if (axes2_ptr[0] != 2)
2425             continue;
2426 
2427         fprintf(stderr, "replace_reduction_with_global_pooling %s %s\n", reduction1->name.c_str(), reduction2->name.c_str());
2428 
2429         ncnn::Pooling* pooling = (ncnn::Pooling*)ncnn::create_layer("Pooling");
2430 
2431         pooling->type = "Pooling";
2432         pooling->name = reduction2->name;
2433         pooling->bottoms = reduction2->bottoms;
2434         pooling->tops = reduction2->tops;
2435 
2436         ncnn::ParamDict pd;
2437         pooling->load_param(pd);
2438 
2439         pooling->pooling_type = 1;
2440         pooling->global_pooling = 1;
2441 
2442         layers[j] = pooling;
2443         delete reduction2;
2444 
2445         int bottom_blob_index_final = reduction1->bottoms[0];
2446         pooling->bottoms[0] = bottom_blob_index_final;
2447         blobs[bottom_blob_index_final].consumer = j;
2448         reduction1->type = "ncnnfused";
2449     }
2450 
2451     return 0;
2452 }
2453 
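// PReLU with a single shared slope is equivalent to leaky ReLU, so it is
// replaced by a ReLU layer carrying that slope.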
2454 int NetOptimize::replace_prelu_with_leaky_relu()
2455 {
2456     const size_t layer_count = layers.size();
2457     for (size_t i = 0; i < layer_count; i++)
2458     {
2459         if (layers[i]->type != "PReLU")
2460             continue;
2461 
2462         ncnn::PReLU* prelu = (ncnn::PReLU*)layers[i];
2463         if (prelu->num_slope != 1)
2464             continue;
2465 
2466         fprintf(stderr, "replace_prelu_with_leaky_relu %s\n", prelu->name.c_str());
2467 
2468         ncnn::ReLU* relu = (ncnn::ReLU*)ncnn::create_layer("ReLU");
2469 
2470         relu->type = "ReLU";
2471         relu->name = prelu->name;
2472         relu->bottoms = prelu->bottoms;
2473         relu->tops = prelu->tops;
2474 
2475         ncnn::ParamDict pd;
2476         relu->load_param(pd);
2477 
2478         relu->slope = prelu->slope_data[0];
2479 
2480         layers[i] = relu;
2481         delete prelu;
2482     }
2483 
2484     return 0;
2485 }
2486 
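// After global pooling the spatial size is 1x1, so a Convolution degenerates
// to a fully connected layer. The Convolution is replaced by an InnerProduct
// that reuses the same weights, bias, int8 scales and fused activation.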
2487 int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
2488 {
2489     const size_t layer_count = layers.size();
2490     for (size_t i = 0; i < layer_count; i++)
2491     {
2492         if (layers[i]->type != "Pooling")
2493             continue;
2494 
2495         ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2496         if (pooling->global_pooling == 0)
2497             continue;
2498 
2499         // Pooling - Convolution
2500         int top_blob_index = layers[i]->tops[0];
2501 
2502         size_t j = i + 1;
2503         for (; j < layer_count; j++)
2504         {
2505             if (layers[j]->type != "Convolution")
2506                 continue;
2507 
2508             if (layers[j]->bottoms.size() != 1)
2509                 continue;
2510 
2511             if (layers[j]->bottoms[0] == top_blob_index)
2512                 break;
2513         }
2514 
2515         if (j == layer_count)
2516             continue;
2517 
2518         ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2519 
2520         fprintf(stderr, "replace_convolution_with_innerproduct_after_global_pooling %s %s\n", pooling->name.c_str(), convolution->name.c_str());
2521 
2522         ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");
2523 
2524         innerproduct->type = "InnerProduct";
2525         innerproduct->name = convolution->name;
2526         innerproduct->bottoms = convolution->bottoms;
2527         innerproduct->tops = convolution->tops;
2528 
2529         ncnn::ParamDict pd;
2530         innerproduct->load_param(pd);
2531 
2532         innerproduct->num_output = convolution->num_output;
2533         innerproduct->bias_term = convolution->bias_term;
2534         innerproduct->weight_data_size = convolution->weight_data_size;
2535         innerproduct->int8_scale_term = convolution->int8_scale_term;
2536 
2537         innerproduct->weight_data = convolution->weight_data;
2538         innerproduct->bias_data = convolution->bias_data;
2539 #if NCNN_INT8
2540         innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
2541         innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2542 #endif
2543 
2544         innerproduct->activation_type = convolution->activation_type;
2545         innerproduct->activation_params = convolution->activation_params;
2546 
2547         layers[j] = innerproduct;
2548         delete convolution;
2549     }
2550 
2551     return 0;
2552 }
2553 
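// The same replacement applies to a Convolution that follows an InnerProduct,
// whose output is 1x1 spatially. The outer loop repeats until no further
// replacement is made, so chains of convolutions are converted one by one.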
2554 int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
2555 {
2556     const size_t layer_count = layers.size();
2557     for (;;)
2558     {
2559         bool replaced = false;
2560 
2561         for (size_t i = 0; i < layer_count; i++)
2562         {
2563             if (layers[i]->type != "InnerProduct")
2564                 continue;
2565 
2566             // InnerProduct - Convolution
2567             int top_blob_index = layers[i]->tops[0];
2568 
2569             size_t j = i + 1;
2570             for (; j < layer_count; j++)
2571             {
2572                 if (layers[j]->type != "Convolution")
2573                     continue;
2574 
2575                 if (layers[j]->bottoms.size() != 1)
2576                     continue;
2577 
2578                 if (layers[j]->bottoms[0] == top_blob_index)
2579                     break;
2580             }
2581 
2582             if (j == layer_count)
2583                 continue;
2584 
2585             ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2586             ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2587 
2588             fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());
2589 
2590             ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");
2591 
2592             innerproduct2->type = "InnerProduct";
2593             innerproduct2->name = convolution->name;
2594             innerproduct2->bottoms = convolution->bottoms;
2595             innerproduct2->tops = convolution->tops;
2596 
2597             ncnn::ParamDict pd;
2598             innerproduct2->load_param(pd);
2599 
2600             innerproduct2->num_output = convolution->num_output;
2601             innerproduct2->bias_term = convolution->bias_term;
2602             innerproduct2->weight_data_size = convolution->weight_data_size;
2603             innerproduct2->int8_scale_term = convolution->int8_scale_term;
2604 
2605             innerproduct2->weight_data = convolution->weight_data;
2606             innerproduct2->bias_data = convolution->bias_data;
2607 #if NCNN_INT8
2608             innerproduct2->weight_data_int8_scales = convolution->weight_data_int8_scales;
2609             innerproduct2->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2610 #endif
2611 
2612             innerproduct2->activation_type = convolution->activation_type;
2613             innerproduct2->activation_params = convolution->activation_params;
2614 
2615             layers[j] = innerproduct2;
2616             delete convolution;
2617 
2618             replaced = true;
2619         }
2620 
2621         if (!replaced)
2622             break;
2623     }
2624 
2625     return 0;
2626 }
2627 
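// Example invocation (binary and file names are illustrative):
//   ncnnoptimize squeezenet.param squeezenet.bin squeezenet-opt.param squeezenet-opt.bin 0
// A flag of 65536 (or 1) selects storage_type 1, commonly used to request
// fp16 weight storage. Passing "null" as inbin loads zero-filled weights
// through DataReaderFromEmpty and sets gen_random_weight, presumably so the
// saved model gets generated weights; this is handy for exercising a .param
// file without the real model binary.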
2628 int main(int argc, char** argv)
2629 {
2630     if (argc < 6)
2631     {
2632         fprintf(stderr, "usage: %s [inparam] [inbin] [outparam] [outbin] [flag] [cutstart] [cutend]\n", argv[0]);
2633         return -1;
2634     }
2635 
2636     const char* inparam = argv[1];
2637     const char* inbin = argv[2];
2638     const char* outparam = argv[3];
2639     const char* outbin = argv[4];
2640     int flag = atoi(argv[5]);
2641     const char* cutstartname = nullptr;
2642     const char* cutendname = nullptr;
2643 
2644     if (argc > 6)
2645     {
2646         cutstartname = argv[6];
2647     }
2648 
2649     if (argc > 7)
2650     {
2651         cutendname = argv[7];
2652     }
2653 
2654     NetOptimize optimizer;
2655 
2656     if (flag == 65536 || flag == 1)
2657     {
2658         optimizer.storage_type = 1;
2659     }
2660     else
2661     {
2662         optimizer.storage_type = 0;
2663     }
2664 
2665     optimizer.load_param(inparam);
2666 
2667     if (strcmp(inbin, "null") == 0)
2668     {
2669         DataReaderFromEmpty dr;
2670         optimizer.load_model(dr);
2671         optimizer.gen_random_weight = true;
2672     }
2673     else
2674         optimizer.load_model(inbin);
2675 
2676     if (optimizer.set_cutparam(cutstartname, cutendname) < 0)
2677     {
2678         return -1;
2679     }
2680 
2681     optimizer.fuse_batchnorm_scale();
2682     optimizer.fuse_convolution_batchnorm();
2683     optimizer.fuse_convolution_mul();
2684     optimizer.fuse_convolution_add();
2685     optimizer.fuse_convolutiondepthwise_batchnorm();
2686     optimizer.fuse_convolutiondepthwise_mul();
2687     optimizer.fuse_convolutiondepthwise_add();
2688     optimizer.fuse_deconvolution_batchnorm();
2689     optimizer.fuse_deconvolution_mul();
2690     optimizer.fuse_deconvolution_add();
2691     optimizer.fuse_deconvolutiondepthwise_batchnorm();
2692     optimizer.fuse_innerproduct_batchnorm();
2693     optimizer.fuse_innerproduct_add();
2694     optimizer.fuse_innerproduct_dropout();
2695 
2696     optimizer.replace_reduction_with_global_pooling();
2697     optimizer.replace_prelu_with_leaky_relu();
2698 
2699     optimizer.fuse_convolution_activation();
2700     optimizer.fuse_convolutiondepthwise_activation();
2701     optimizer.fuse_deconvolution_activation();
2702     optimizer.fuse_deconvolutiondepthwise_activation();
2703     optimizer.fuse_innerproduct_activation();
2704     optimizer.fuse_memorydata_binaryop();
2705     optimizer.fuse_binaryop_eltwise();
2706 
2707     optimizer.eliminate_dropout();
2708     optimizer.eliminate_pooling1x1();
2709     optimizer.eliminate_noop();
2710     optimizer.eliminate_split();
2711     optimizer.eliminate_flatten_after_global_pooling();
2712     optimizer.eliminate_reshape_after_global_pooling();
2713     optimizer.eliminate_reshape_before_binaryop();
2714 
2715     optimizer.replace_convolution_with_innerproduct_after_global_pooling();
2716     optimizer.replace_convolution_with_innerproduct_after_innerproduct();
2717 
2718     optimizer.eliminate_flatten_after_innerproduct();
2719     optimizer.eliminate_orphaned_memorydata();
2720 
2721     optimizer.shape_inference();
2722 
2723     optimizer.estimate_memory_footprint();
2724 
2725     optimizer.save(outparam, outbin);
2726 
2727     return 0;
2728 }
2729