// Tencent is pleased to support the open source community by making ncnn available.
//
// Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
//
// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
// in compliance with the License. You may obtain a copy of the License at
//
// https://opensource.org/licenses/BSD-3-Clause
//
// Unless required by applicable law or agreed to in writing, software distributed
// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
// CONDITIONS OF ANY KIND, either express or implied. See the License for the
// specific language governing permissions and limitations under the License.

#ifdef _MSC_VER
#define _CRT_SECURE_NO_DEPRECATE
#endif

#include <math.h>
#include <stdio.h>
#include <string.h>

#include <algorithm>
#include <map>
#include <set>
#include <vector>

// ncnn public header
#include "datareader.h"
#include "layer.h"
#include "layer_type.h"
#include "net.h"

// ncnn private header
#include "modelwriter.h"

33 class DataReaderFromEmpty : public ncnn::DataReader
34 {
35 public:
scan(const char * format,void * p) const36 virtual int scan(const char* format, void* p) const
37 {
38 return 0;
39 }
read(void * buf,size_t size) const40 virtual size_t read(void* buf, size_t size) const
41 {
42 memset(buf, 0, size);
43 return size;
44 }
45 };
46
47 class NetOptimize : public ModelWriter
48 {
49 public:
50 NetOptimize();
51
52 public:
53 int fuse_batchnorm_scale();
54 int fuse_convolution_batchnorm();
55 int fuse_convolution_mul();
56 int fuse_convolution_add();
57 int fuse_convolutiondepthwise_batchnorm();
58 int fuse_convolutiondepthwise_mul();
59 int fuse_convolutiondepthwise_add();
60 int fuse_deconvolution_batchnorm();
61 int fuse_deconvolution_mul();
62 int fuse_deconvolution_add();
63 int fuse_deconvolutiondepthwise_batchnorm();
64 int fuse_innerproduct_batchnorm();
65 int fuse_innerproduct_add();
66 int fuse_innerproduct_dropout();
67 int fuse_convolution_activation();
68 int fuse_convolutiondepthwise_activation();
69 int fuse_deconvolution_activation();
70 int fuse_deconvolutiondepthwise_activation();
71 int fuse_innerproduct_activation();
72 int fuse_memorydata_binaryop();
73 int fuse_binaryop_eltwise();
74
75 int eliminate_dropout();
76 int eliminate_pooling1x1();
77 int eliminate_noop();
78 int eliminate_split();
79 int eliminate_orphaned_memorydata();
80 int eliminate_flatten_after_global_pooling();
81 int eliminate_reshape_after_global_pooling();
82 int eliminate_flatten_after_innerproduct();
83 int eliminate_reshape_before_binaryop();
84
85 int replace_reduction_with_global_pooling();
86 int replace_prelu_with_leaky_relu();
87 int replace_convolution_with_innerproduct_after_global_pooling();
88 int replace_convolution_with_innerproduct_after_innerproduct();
89 };
90
NetOptimize()91 NetOptimize::NetOptimize()
92 : ModelWriter()
93 {
94 }
95
fuse_batchnorm_scale()96 int NetOptimize::fuse_batchnorm_scale()
97 {
98 const size_t layer_count = layers.size();
99 for (size_t i = 0; i < layer_count; i++)
100 {
101 if (layers[i]->type != "BatchNorm")
102 continue;
103
104 // BatchNorm - Scale
105 int top_blob_index = layers[i]->tops[0];
106
107 size_t j = i + 1;
108 for (; j < layer_count; j++)
109 {
110 if (layers[j]->type != "Scale")
111 continue;
112
113 if (layers[j]->bottoms.size() != 1)
114 continue;
115
116 if (layers[j]->bottoms[0] == top_blob_index)
117 break;
118 }
119
120 if (j == layer_count)
121 continue;
122
123 // fuse BatchNorm - Scale to BatchNorm
124 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[i];
125 ncnn::Scale* scale = (ncnn::Scale*)layers[j];
126
127 fprintf(stderr, "fuse_batchnorm_scale %s %s\n", batchnorm->name.c_str(), scale->name.c_str());
128
129 {
130 // v = ((v - mean) / sqrt(var + eps) * slope + bias) * s + b
131 // = (v - mean) / sqrt(var + eps) * (slope * s) + (bias * s + b)
132
133 int channels = batchnorm->channels;
134
135 float* slope = batchnorm->slope_data;
136 float* bias = batchnorm->bias_data;
137
138 for (int q = 0; q < channels; q++)
139 {
140 slope[q] = slope[q] * scale->scale_data[q];
141 if (scale->bias_term)
142 bias[q] = bias[q] * scale->scale_data[q] + scale->bias_data[q];
143 else
144 bias[q] = bias[q] * scale->scale_data[q];
145 }
146 }
147
148 int top_blob_index_final = scale->tops[0];
149 batchnorm->tops[0] = top_blob_index_final;
150 blobs[top_blob_index_final].producer = i;
151 scale->type = "ncnnfused";
152 }
153
154 return 0;
155 }
156
fuse_convolution_batchnorm()157 int NetOptimize::fuse_convolution_batchnorm()
158 {
159 const size_t layer_count = layers.size();
160 for (size_t i = 0; i < layer_count; i++)
161 {
162 if (layers[i]->type != "Convolution")
163 continue;
164
165 // Convolution - BatchNorm
166 int top_blob_index = layers[i]->tops[0];
167
168 size_t j = i + 1;
169 for (; j < layer_count; j++)
170 {
171 if (layers[j]->type != "BatchNorm")
172 continue;
173
174 if (layers[j]->bottoms.size() != 1)
175 continue;
176
177 if (layers[j]->bottoms[0] == top_blob_index)
178 break;
179 }
180
181 if (j == layer_count)
182 continue;
183
184 // fuse Convolution - BatchNorm to Convolution
185 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
186 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
187
188 fprintf(stderr, "fuse_convolution_batchnorm %s %s\n", convolution->name.c_str(), batchnorm->name.c_str());
189
190 {
191 int channels = batchnorm->channels;
192 float eps = batchnorm->eps;
193
194 // a = bias - slope * mean / sqrt(var + eps)
195 // b = slope / sqrt(var + eps)
196 // value = value * b + a
197
198 std::vector<float> a(channels);
199 std::vector<float> b(channels);
200 for (int i = 0; i < channels; i++)
201 {
202 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
203 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
204 b[i] = batchnorm->slope_data[i] / sqrt_var;
205 }
206
207 if (convolution->bias_term == 0)
208 {
209 // init bias as zero
210 convolution->bias_term = 1;
211 convolution->bias_data = ncnn::Mat(channels);
212 convolution->bias_data.fill(0.f);
213 }
214
215 const int weight_per_outch = convolution->weight_data_size / channels;
216
217 float* weight = convolution->weight_data;
218 float* bias = convolution->bias_data;
219 for (int i = 0; i < channels; i++)
220 {
221 float* conv_weight_outch = weight + weight_per_outch * i;
222 for (int j = 0; j < weight_per_outch; j++)
223 {
224 conv_weight_outch[j] *= b[i];
225 }
226
227 bias[i] = bias[i] * b[i] + a[i];
228 }
229 }
230
231 int top_blob_index_final = batchnorm->tops[0];
232 convolution->tops[0] = top_blob_index_final;
233 blobs[top_blob_index_final].producer = i;
234 batchnorm->type = "ncnnfused";
235 }
236
237 return 0;
238 }
239
fuse_convolution_mul()240 int NetOptimize::fuse_convolution_mul()
241 {
242 const size_t layer_count = layers.size();
243 for (size_t i = 0; i < layer_count; i++)
244 {
245 if (layers[i]->type != "Convolution")
246 continue;
247
248 // Convolution - BinaryOp
249 int top_blob_index = layers[i]->tops[0];
250
251 size_t j = i + 1;
252 for (; j < layer_count; j++)
253 {
254 if (layers[j]->type != "BinaryOp")
255 continue;
256
257 if (layers[j]->bottoms.size() != 2)
258 continue;
259
260 if (layers[j]->bottoms[0] == top_blob_index)
261 break;
262 }
263
264 if (j == layer_count)
265 continue;
266
267 // fuse Convolution - BinaryOp to Convolution
268 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
269 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
270
271 if (binaryop->op_type != 2 || binaryop->with_scalar)
272 continue;
273
274 // MemoryData - ..... - BinaryOp
275 size_t k = 0;
276 for (; k < j; k++)
277 {
278 if (layers[k]->type != "MemoryData")
279 continue;
280
281 if (layers[k]->tops[0] == binaryop->bottoms[1])
282 break;
283 }
284
285 if (k == j)
286 continue;
287
288 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
289
290 int channels = convolution->num_output;
291
292 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
293 {
294 // not bias-like broadcasting type
295 continue;
296 }
297
298 fprintf(stderr, "fuse_convolution_mul %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
299
300 {
301 const int weight_per_outch = convolution->weight_data_size / channels;
302
303 float* weight = convolution->weight_data;
304 float* bias = convolution->bias_data;
305 for (int i = 0; i < channels; i++)
306 {
307 float* conv_weight_outch = weight + weight_per_outch * i;
308 for (int j = 0; j < weight_per_outch; j++)
309 {
310 conv_weight_outch[j] *= memorydata->data[i];
311 }
312
313 if (bias)
314 {
315 bias[i] = bias[i] * memorydata->data[i];
316 }
317 }
318 }
319
320 int top_blob_index_final = binaryop->tops[0];
321 convolution->tops[0] = top_blob_index_final;
322 blobs[top_blob_index_final].producer = i;
323 binaryop->type = "ncnnfused";
324 }
325
326 return 0;
327 }
328
fuse_convolution_add()329 int NetOptimize::fuse_convolution_add()
330 {
331 const size_t layer_count = layers.size();
332 for (size_t i = 0; i < layer_count; i++)
333 {
334 if (layers[i]->type != "Convolution")
335 continue;
336
337 // Convolution - BinaryOp
338 int top_blob_index = layers[i]->tops[0];
339
340 size_t j = i + 1;
341 for (; j < layer_count; j++)
342 {
343 if (layers[j]->type != "BinaryOp")
344 continue;
345
346 if (layers[j]->bottoms.size() != 2)
347 continue;
348
349 if (layers[j]->bottoms[0] == top_blob_index)
350 break;
351 }
352
353 if (j == layer_count)
354 continue;
355
356 // fuse Convolution - BinaryOp to Convolution
357 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
358 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
359
360 if (binaryop->op_type != 0 || binaryop->with_scalar)
361 continue;
362
363 // MemoryData - ..... - BinaryOp
364 size_t k = 0;
365 for (; k < j; k++)
366 {
367 if (layers[k]->type != "MemoryData")
368 continue;
369
370 if (layers[k]->tops[0] == binaryop->bottoms[1])
371 break;
372 }
373
374 if (k == j)
375 continue;
376
377 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
378
379 int channels = convolution->num_output;
380
381 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
382 {
383 // not bias-like broadcasting type
384 continue;
385 }
386
387 fprintf(stderr, "fuse_convolution_add %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
388
389 {
390 if (convolution->bias_term == 0)
391 {
392 // init bias
393 convolution->bias_term = 1;
394 convolution->bias_data = memorydata->data;
395 }
396 else
397 {
398 float* bias = convolution->bias_data;
399 for (int i = 0; i < channels; i++)
400 {
401 bias[i] = bias[i] + memorydata->data[i];
402 }
403 }
404 }
405
406 int top_blob_index_final = binaryop->tops[0];
407 convolution->tops[0] = top_blob_index_final;
408 blobs[top_blob_index_final].producer = i;
409 binaryop->type = "ncnnfused";
410 }
411
412 return 0;
413 }
414
fuse_convolutiondepthwise_batchnorm()415 int NetOptimize::fuse_convolutiondepthwise_batchnorm()
416 {
417 const size_t layer_count = layers.size();
418 for (size_t i = 0; i < layer_count; i++)
419 {
420 if (layers[i]->type != "ConvolutionDepthWise")
421 continue;
422
423 // ConvolutionDepthWise - BatchNorm
424 int top_blob_index = layers[i]->tops[0];
425
426 size_t j = i + 1;
427 for (; j < layer_count; j++)
428 {
429 if (layers[j]->type != "BatchNorm")
430 continue;
431
432 if (layers[j]->bottoms.size() != 1)
433 continue;
434
435 if (layers[j]->bottoms[0] == top_blob_index)
436 break;
437 }
438
439 if (j == layer_count)
440 continue;
441
442 // fuse ConvolutionDepthWise - BatchNorm to ConvolutionDepthWise
443 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
444 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
445
446 fprintf(stderr, "fuse_convolutiondepthwise_batchnorm %s %s\n", convolutiondepthwise->name.c_str(), batchnorm->name.c_str());
447
448 {
449 int channels = batchnorm->channels;
450 float eps = batchnorm->eps;
451
452 // a = bias - slope * mean / sqrt(var + eps)
453 // b = slope / sqrt(var + eps)
454 // value = value * b + a
455
456 std::vector<float> a(channels);
457 std::vector<float> b(channels);
458 for (int i = 0; i < channels; i++)
459 {
460 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
461 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
462 b[i] = batchnorm->slope_data[i] / sqrt_var;
463 }
464
465 if (convolutiondepthwise->bias_term == 0)
466 {
467 // init bias as zero
468 convolutiondepthwise->bias_term = 1;
469 convolutiondepthwise->bias_data = ncnn::Mat(channels);
470 convolutiondepthwise->bias_data.fill(0.f);
471 }
472
473 const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
474
475 float* weight = convolutiondepthwise->weight_data;
476 float* bias = convolutiondepthwise->bias_data;
477 for (int i = 0; i < channels; i++)
478 {
479 float* conv_weight_outch = weight + weight_per_outch * i;
480 for (int j = 0; j < weight_per_outch; j++)
481 {
482 conv_weight_outch[j] *= b[i];
483 }
484
485 bias[i] = bias[i] * b[i] + a[i];
486 }
487 }
488
489 int top_blob_index_final = batchnorm->tops[0];
490 convolutiondepthwise->tops[0] = top_blob_index_final;
491 blobs[top_blob_index_final].producer = i;
492 batchnorm->type = "ncnnfused";
493 }
494
495 return 0;
496 }
497
fuse_convolutiondepthwise_mul()498 int NetOptimize::fuse_convolutiondepthwise_mul()
499 {
500 const size_t layer_count = layers.size();
501 for (size_t i = 0; i < layer_count; i++)
502 {
503 if (layers[i]->type != "ConvolutionDepthWise")
504 continue;
505
506 // ConvolutionDepthWise - BinaryOp
507 int top_blob_index = layers[i]->tops[0];
508
509 size_t j = i + 1;
510 for (; j < layer_count; j++)
511 {
512 if (layers[j]->type != "BinaryOp")
513 continue;
514
515 if (layers[j]->bottoms.size() != 2)
516 continue;
517
518 if (layers[j]->bottoms[0] == top_blob_index)
519 break;
520 }
521
522 if (j == layer_count)
523 continue;
524
525 // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
526 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
527 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
528
529 if (binaryop->op_type != 2 || binaryop->with_scalar)
530 continue;
531
532 // MemoryData - ..... - BinaryOp
533 size_t k = 0;
534 for (; k < j; k++)
535 {
536 if (layers[k]->type != "MemoryData")
537 continue;
538
539 if (layers[k]->tops[0] == binaryop->bottoms[1])
540 break;
541 }
542
543 if (k == j)
544 continue;
545
546 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
547
548 int channels = convolutiondepthwise->num_output;
549
550 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
551 {
552 // not bias-like broadcasting type
553 continue;
554 }
555
556 fprintf(stderr, "fuse_convolutiondepthwise_mul %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
557
558 {
559 const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
560
561 float* weight = convolutiondepthwise->weight_data;
562 float* bias = convolutiondepthwise->bias_data;
563 for (int i = 0; i < channels; i++)
564 {
565 float* conv_weight_outch = weight + weight_per_outch * i;
566 for (int j = 0; j < weight_per_outch; j++)
567 {
568 conv_weight_outch[j] *= memorydata->data[i];
569 }
570
571 if (bias)
572 {
573 bias[i] = bias[i] * memorydata->data[i];
574 }
575 }
576 }
577
578 int top_blob_index_final = binaryop->tops[0];
579 convolutiondepthwise->tops[0] = top_blob_index_final;
580 blobs[top_blob_index_final].producer = i;
581 binaryop->type = "ncnnfused";
582 }
583
584 return 0;
585 }
586
fuse_convolutiondepthwise_add()587 int NetOptimize::fuse_convolutiondepthwise_add()
588 {
589 const size_t layer_count = layers.size();
590 for (size_t i = 0; i < layer_count; i++)
591 {
592 if (layers[i]->type != "ConvolutionDepthWise")
593 continue;
594
595 // ConvolutionDepthWise - BinaryOp
596 int top_blob_index = layers[i]->tops[0];
597
598 size_t j = i + 1;
599 for (; j < layer_count; j++)
600 {
601 if (layers[j]->type != "BinaryOp")
602 continue;
603
604 if (layers[j]->bottoms.size() != 2)
605 continue;
606
607 if (layers[j]->bottoms[0] == top_blob_index)
608 break;
609 }
610
611 if (j == layer_count)
612 continue;
613
614 // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
615 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
616 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
617
618 if (binaryop->op_type != 0 || binaryop->with_scalar)
619 continue;
620
621 // MemoryData - ..... - BinaryOp
622 size_t k = 0;
623 for (; k < j; k++)
624 {
625 if (layers[k]->type != "MemoryData")
626 continue;
627
628 if (layers[k]->tops[0] == binaryop->bottoms[1])
629 break;
630 }
631
632 if (k == j)
633 continue;
634
635 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
636
637 int channels = convolutiondepthwise->num_output;
638
639 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
640 {
641 // not bias-like broadcasting type
642 continue;
643 }
644
645 fprintf(stderr, "fuse_convolutiondepthwise_add %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
646
647 {
648 if (convolutiondepthwise->bias_term == 0)
649 {
650 // init bias
651 convolutiondepthwise->bias_term = 1;
652 convolutiondepthwise->bias_data = memorydata->data;
653 }
654 else
655 {
656 float* bias = convolutiondepthwise->bias_data;
657 for (int i = 0; i < channels; i++)
658 {
659 bias[i] = bias[i] + memorydata->data[i];
660 }
661 }
662 }
663
664 int top_blob_index_final = binaryop->tops[0];
665 convolutiondepthwise->tops[0] = top_blob_index_final;
666 blobs[top_blob_index_final].producer = i;
667 binaryop->type = "ncnnfused";
668 }
669
670 return 0;
671 }
672
fuse_deconvolution_batchnorm()673 int NetOptimize::fuse_deconvolution_batchnorm()
674 {
675 const size_t layer_count = layers.size();
676 for (size_t i = 0; i < layer_count; i++)
677 {
678 if (layers[i]->type != "Deconvolution")
679 continue;
680
681 // Deconvolution - BatchNorm
682 int top_blob_index = layers[i]->tops[0];
683
684 size_t j = i + 1;
685 for (; j < layer_count; j++)
686 {
687 if (layers[j]->type != "BatchNorm")
688 continue;
689
690 if (layers[j]->bottoms.size() != 1)
691 continue;
692
693 if (layers[j]->bottoms[0] == top_blob_index)
694 break;
695 }
696
697 if (j == layer_count)
698 continue;
699
700 // fuse Deconvolution - BatchNorm to Deconvolution
701 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
702 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
703
704 fprintf(stderr, "fuse_deconvolution_batchnorm %s %s\n", deconvolution->name.c_str(), batchnorm->name.c_str());
705
706 {
707 int channels = batchnorm->channels;
708 float eps = batchnorm->eps;
709
710 // a = bias - slope * mean / sqrt(var + eps)
711 // b = slope / sqrt(var + eps)
712 // value = value * b + a
713
714 std::vector<float> a(channels);
715 std::vector<float> b(channels);
716 for (int i = 0; i < channels; i++)
717 {
718 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
719 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
720 b[i] = batchnorm->slope_data[i] / sqrt_var;
721 }
722
723 if (deconvolution->bias_term == 0)
724 {
725 // init bias as zero
726 deconvolution->bias_term = 1;
727 deconvolution->bias_data = ncnn::Mat(channels);
728 deconvolution->bias_data.fill(0.f);
729 }
730
731 const int weight_per_outch = deconvolution->weight_data_size / channels;
732
733 float* weight = deconvolution->weight_data;
734 float* bias = deconvolution->bias_data;
735 for (int i = 0; i < channels; i++)
736 {
737 float* conv_weight_outch = weight + weight_per_outch * i;
738 for (int j = 0; j < weight_per_outch; j++)
739 {
740 conv_weight_outch[j] *= b[i];
741 }
742
743 bias[i] = bias[i] * b[i] + a[i];
744 }
745 }
746
747 int top_blob_index_final = batchnorm->tops[0];
748 deconvolution->tops[0] = top_blob_index_final;
749 blobs[top_blob_index_final].producer = i;
750 batchnorm->type = "ncnnfused";
751 }
752
753 return 0;
754 }
755
fuse_deconvolution_mul()756 int NetOptimize::fuse_deconvolution_mul()
757 {
758 const size_t layer_count = layers.size();
759 for (size_t i = 0; i < layer_count; i++)
760 {
761 if (layers[i]->type != "Deconvolution")
762 continue;
763
764 // Deconvolution - BinaryOp
765 int top_blob_index = layers[i]->tops[0];
766
767 size_t j = i + 1;
768 for (; j < layer_count; j++)
769 {
770 if (layers[j]->type != "BinaryOp")
771 continue;
772
773 if (layers[j]->bottoms.size() != 2)
774 continue;
775
776 if (layers[j]->bottoms[0] == top_blob_index)
777 break;
778 }
779
780 if (j == layer_count)
781 continue;
782
783 // fuse Deconvolution - BinaryOp to Deconvolution
784 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
785 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
786
787 if (binaryop->op_type != 2 || binaryop->with_scalar)
788 continue;
789
790 // MemoryData - ..... - BinaryOp
791 size_t k = 0;
792 for (; k < j; k++)
793 {
794 if (layers[k]->type != "MemoryData")
795 continue;
796
797 if (layers[k]->tops[0] == binaryop->bottoms[1])
798 break;
799 }
800
801 if (k == j)
802 continue;
803
804 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
805
806 int channels = deconvolution->num_output;
807
808 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
809 {
810 // not bias-like broadcasting type
811 continue;
812 }
813
814 fprintf(stderr, "fuse_deconvolution_mul %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
815
816 {
817 const int weight_per_outch = deconvolution->weight_data_size / channels;
818
819 float* weight = deconvolution->weight_data;
820 float* bias = deconvolution->bias_data;
821 for (int i = 0; i < channels; i++)
822 {
823 float* conv_weight_outch = weight + weight_per_outch * i;
824 for (int j = 0; j < weight_per_outch; j++)
825 {
826 conv_weight_outch[j] *= memorydata->data[i];
827 }
828
829 if (bias)
830 {
831 bias[i] = bias[i] * memorydata->data[i];
832 }
833 }
834 }
835
836 int top_blob_index_final = binaryop->tops[0];
837 deconvolution->tops[0] = top_blob_index_final;
838 blobs[top_blob_index_final].producer = i;
839 binaryop->type = "ncnnfused";
840 }
841
842 return 0;
843 }
844
fuse_deconvolution_add()845 int NetOptimize::fuse_deconvolution_add()
846 {
847 const size_t layer_count = layers.size();
848 for (size_t i = 0; i < layer_count; i++)
849 {
850 if (layers[i]->type != "Deconvolution")
851 continue;
852
853 // Deconvolution - BinaryOp
854 int top_blob_index = layers[i]->tops[0];
855
856 size_t j = i + 1;
857 for (; j < layer_count; j++)
858 {
859 if (layers[j]->type != "BinaryOp")
860 continue;
861
862 if (layers[j]->bottoms.size() != 2)
863 continue;
864
865 if (layers[j]->bottoms[0] == top_blob_index)
866 break;
867 }
868
869 if (j == layer_count)
870 continue;
871
872 // fuse Deconvolution - BinaryOp to Deconvolution
873 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
874 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
875
876 if (binaryop->op_type != 0 || binaryop->with_scalar)
877 continue;
878
879 // MemoryData - ..... - BinaryOp
880 size_t k = 0;
881 for (; k < j; k++)
882 {
883 if (layers[k]->type != "MemoryData")
884 continue;
885
886 if (layers[k]->tops[0] == binaryop->bottoms[1])
887 break;
888 }
889
890 if (k == j)
891 continue;
892
893 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
894
895 int channels = deconvolution->num_output;
896
897 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
898 {
899 // not bias-like broadcasting type
900 continue;
901 }
902
903 fprintf(stderr, "fuse_deconvolution_add %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
904
905 {
906 if (deconvolution->bias_term == 0)
907 {
908 // init bias
909 deconvolution->bias_term = 1;
910 deconvolution->bias_data = memorydata->data;
911 }
912 else
913 {
914 float* bias = deconvolution->bias_data;
915 for (int i = 0; i < channels; i++)
916 {
917 bias[i] = bias[i] + memorydata->data[i];
918 }
919 }
920 }
921
922 int top_blob_index_final = binaryop->tops[0];
923 deconvolution->tops[0] = top_blob_index_final;
924 blobs[top_blob_index_final].producer = i;
925 binaryop->type = "ncnnfused";
926 }
927
928 return 0;
929 }
930
fuse_deconvolutiondepthwise_batchnorm()931 int NetOptimize::fuse_deconvolutiondepthwise_batchnorm()
932 {
933 const size_t layer_count = layers.size();
934 for (size_t i = 0; i < layer_count; i++)
935 {
936 if (layers[i]->type != "DeconvolutionDepthWise")
937 continue;
938
939 // DeconvolutionDepthWise - BatchNorm
940 int top_blob_index = layers[i]->tops[0];
941
942 size_t j = i + 1;
943 for (; j < layer_count; j++)
944 {
945 if (layers[j]->type != "BatchNorm")
946 continue;
947
948 if (layers[j]->bottoms.size() != 1)
949 continue;
950
951 if (layers[j]->bottoms[0] == top_blob_index)
952 break;
953 }
954
955 if (j == layer_count)
956 continue;
957
958 // fuse DeconvolutionDepthWise - BatchNorm to DeconvolutionDepthWise
959 ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
960 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
961
962 fprintf(stderr, "fuse_deconvolutiondepthwise_batchnorm %s %s\n", deconvolutiondepthwise->name.c_str(), batchnorm->name.c_str());
963
964 {
965 int channels = batchnorm->channels;
966 float eps = batchnorm->eps;
967
968 // a = bias - slope * mean / sqrt(var + eps)
969 // b = slope / sqrt(var + eps)
970 // value = value * b + a
971
972 std::vector<float> a(channels);
973 std::vector<float> b(channels);
974 for (int i = 0; i < channels; i++)
975 {
976 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
977 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
978 b[i] = batchnorm->slope_data[i] / sqrt_var;
979 }
980
981 if (deconvolutiondepthwise->bias_term == 0)
982 {
983 // init bias as zero
984 deconvolutiondepthwise->bias_term = 1;
985 deconvolutiondepthwise->bias_data = ncnn::Mat(channels);
986 deconvolutiondepthwise->bias_data.fill(0.f);
987 }
988
989 const int weight_per_outch = deconvolutiondepthwise->weight_data_size / channels;
990
991 float* weight = deconvolutiondepthwise->weight_data;
992 float* bias = deconvolutiondepthwise->bias_data;
993 for (int i = 0; i < channels; i++)
994 {
995 float* conv_weight_outch = weight + weight_per_outch * i;
996 for (int j = 0; j < weight_per_outch; j++)
997 {
998 conv_weight_outch[j] *= b[i];
999 }
1000
1001 bias[i] = bias[i] * b[i] + a[i];
1002 }
1003 }
1004
1005 int top_blob_index_final = batchnorm->tops[0];
1006 deconvolutiondepthwise->tops[0] = top_blob_index_final;
1007 blobs[top_blob_index_final].producer = i;
1008 batchnorm->type = "ncnnfused";
1009 }
1010
1011 return 0;
1012 }
1013
fuse_innerproduct_batchnorm()1014 int NetOptimize::fuse_innerproduct_batchnorm()
1015 {
1016 const size_t layer_count = layers.size();
1017 for (size_t i = 0; i < layer_count; i++)
1018 {
1019 if (layers[i]->type != "InnerProduct")
1020 continue;
1021
1022 // InnerProduct - BatchNorm
1023 int top_blob_index = layers[i]->tops[0];
1024
1025 size_t j = i + 1;
1026 for (; j < layer_count; j++)
1027 {
1028 if (layers[j]->type != "BatchNorm")
1029 continue;
1030
1031 if (layers[j]->bottoms.size() != 1)
1032 continue;
1033
1034 if (layers[j]->bottoms[0] == top_blob_index)
1035 break;
1036 }
1037
1038 if (j == layer_count)
1039 continue;
1040
1041 // fuse InnerProduct - BatchNorm to InnerProduct
1042 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1043 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
1044
1045 fprintf(stderr, "fuse_innerproduct_batchnorm %s %s\n", innerproduct->name.c_str(), batchnorm->name.c_str());
1046
1047 {
1048 int channels = batchnorm->channels;
1049 float eps = batchnorm->eps;
1050
1051 // a = bias - slope * mean / sqrt(var + eps)
1052 // b = slope / sqrt(var + eps)
1053 // value = value * b + a
1054
1055 std::vector<float> a(channels);
1056 std::vector<float> b(channels);
1057 for (int i = 0; i < channels; i++)
1058 {
1059 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
1060 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
1061 b[i] = batchnorm->slope_data[i] / sqrt_var;
1062 }
1063
1064 if (innerproduct->bias_term == 0)
1065 {
1066 // init bias as zero
1067 innerproduct->bias_term = 1;
1068 innerproduct->bias_data = ncnn::Mat(channels);
1069 innerproduct->bias_data.fill(0.f);
1070 }
1071
1072 const int weight_per_outch = innerproduct->weight_data_size / channels;
1073
1074 float* weight = innerproduct->weight_data;
1075 float* bias = innerproduct->bias_data;
1076 for (int i = 0; i < channels; i++)
1077 {
1078 float* conv_weight_outch = weight + weight_per_outch * i;
1079 for (int j = 0; j < weight_per_outch; j++)
1080 {
1081 conv_weight_outch[j] *= b[i];
1082 }
1083
1084 bias[i] = bias[i] * b[i] + a[i];
1085 }
1086 }
1087
1088 int top_blob_index_final = batchnorm->tops[0];
1089 innerproduct->tops[0] = top_blob_index_final;
1090 blobs[top_blob_index_final].producer = i;
1091 batchnorm->type = "ncnnfused";
1092 }
1093
1094 return 0;
1095 }
1096
fuse_innerproduct_add()1097 int NetOptimize::fuse_innerproduct_add()
1098 {
1099 const size_t layer_count = layers.size();
1100 for (size_t i = 0; i < layer_count; i++)
1101 {
1102 if (layers[i]->type != "InnerProduct")
1103 continue;
1104
1105 // InnerProduct - BinaryOp
1106 int top_blob_index = layers[i]->tops[0];
1107
1108 size_t j = i + 1;
1109 for (; j < layer_count; j++)
1110 {
1111 if (layers[j]->type != "BinaryOp")
1112 continue;
1113
1114 if (layers[j]->bottoms.size() != 2)
1115 continue;
1116
1117 if (layers[j]->bottoms[0] == top_blob_index)
1118 break;
1119 }
1120
1121 if (j == layer_count)
1122 continue;
1123
1124 // fuse InnerProduct - BinaryOp to InnerProduct
1125 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1126 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
1127
1128 if (binaryop->op_type != 0 || binaryop->with_scalar)
1129 continue;
1130
1131 // MemoryData - ..... - BinaryOp
1132 size_t k = 0;
1133 for (; k < j; k++)
1134 {
1135 if (layers[k]->type != "MemoryData")
1136 continue;
1137
1138 if (layers[k]->tops[0] == binaryop->bottoms[1])
1139 break;
1140 }
1141
1142 if (k == j)
1143 continue;
1144
1145 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
1146
1147 int channels = innerproduct->num_output;
1148
1149 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
1150 {
1151 // not bias-like broadcasting type
1152 continue;
1153 }
1154
1155 fprintf(stderr, "fuse_innerproduct_add %s %s\n", innerproduct->name.c_str(), binaryop->name.c_str());
1156
1157 {
1158 if (innerproduct->bias_term == 0)
1159 {
1160 // init bias
1161 innerproduct->bias_term = 1;
1162 innerproduct->bias_data = memorydata->data;
1163 }
1164 else
1165 {
1166 float* bias = innerproduct->bias_data;
1167 for (int i = 0; i < channels; i++)
1168 {
1169 bias[i] = bias[i] + memorydata->data[i];
1170 }
1171 }
1172 }
1173
1174 int top_blob_index_final = binaryop->tops[0];
1175 innerproduct->tops[0] = top_blob_index_final;
1176 blobs[top_blob_index_final].producer = i;
1177 binaryop->type = "ncnnfused";
1178 }
1179
1180 return 0;
1181 }
1182
fuse_innerproduct_dropout()1183 int NetOptimize::fuse_innerproduct_dropout()
1184 {
1185 const size_t layer_count = layers.size();
1186 for (size_t i = 0; i < layer_count; i++)
1187 {
1188 if (layers[i]->type != "InnerProduct")
1189 continue;
1190
1191 // InnerProduct - Dropout
1192 int top_blob_index = layers[i]->tops[0];
1193
1194 size_t j = i + 1;
1195 for (; j < layer_count; j++)
1196 {
1197 if (layers[j]->type != "Dropout")
1198 continue;
1199
1200 if (layers[j]->bottoms.size() != 1)
1201 continue;
1202
1203 if (layers[j]->bottoms[0] == top_blob_index)
1204 break;
1205 }
1206
1207 if (j == layer_count)
1208 continue;
1209
1210 // fuse InnerProduct - Dropout to InnerProduct
1211 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1212 ncnn::Dropout* dropout = (ncnn::Dropout*)layers[j];
1213
1214 fprintf(stderr, "fuse_innerproduct_dropout %s %s\n", innerproduct->name.c_str(), dropout->name.c_str());
1215
1216 float scale = dropout->scale;
1217 if (scale != 1.f)
1218 {
1219 const int num_output = innerproduct->num_output;
1220 const int weight_per_outch = innerproduct->weight_data_size / num_output;
1221
1222 float* weight = innerproduct->weight_data;
1223 for (int i = 0; i < num_output; i++)
1224 {
1225 float* conv_weight_outch = weight + weight_per_outch * i;
1226 for (int j = 0; j < weight_per_outch; j++)
1227 {
1228 conv_weight_outch[j] *= scale;
1229 }
1230 }
1231
1232 if (innerproduct->bias_term)
1233 {
1234 float* bias = innerproduct->bias_data;
1235 for (int i = 0; i < num_output; i++)
1236 {
1237 bias[i] *= scale;
1238 }
1239 }
1240 }
1241
1242 int top_blob_index_final = dropout->tops[0];
1243 innerproduct->tops[0] = top_blob_index_final;
1244 blobs[top_blob_index_final].producer = i;
1245 dropout->type = "ncnnfused";
1246 }
1247
1248 return 0;
1249 }
1250
fuse_convolution_activation()1251 int NetOptimize::fuse_convolution_activation()
1252 {
1253 const size_t layer_count = layers.size();
1254 for (size_t i = 0; i < layer_count; i++)
1255 {
1256 if (layers[i]->type != "Convolution")
1257 continue;
1258
1259 // Convolution - Activation
1260 int top_blob_index = layers[i]->tops[0];
1261
1262 size_t j = i + 1;
1263 for (; j < layer_count; j++)
1264 {
1265 if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
1266 continue;
1267
1268 if (layers[j]->bottoms.size() != 1)
1269 continue;
1270
1271 if (layers[j]->bottoms[0] == top_blob_index)
1272 break;
1273 }
1274
1275 if (j == layer_count)
1276 continue;
1277
1278 // fuse Convolution - Activation to Convolution
1279 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
1280 ncnn::Layer* activation = layers[j];
1281
1282 fprintf(stderr, "fuse_convolution_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());
1283
1284 if (activation->type == "ReLU")
1285 {
1286 ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1287
1288 if (relu->slope == 0.f)
1289 {
1290 convolution->activation_type = 1;
1291 }
1292 else
1293 {
1294 convolution->activation_type = 2;
1295 convolution->activation_params = ncnn::Mat(1);
1296 convolution->activation_params[0] = relu->slope;
1297 }
1298 }
1299 else if (activation->type == "Clip")
1300 {
1301 ncnn::Clip* clip = (ncnn::Clip*)activation;
1302
1303 convolution->activation_type = 3;
1304 convolution->activation_params = ncnn::Mat(2);
1305 convolution->activation_params[0] = clip->min;
1306 convolution->activation_params[1] = clip->max;
1307 }
1308 else if (activation->type == "Sigmoid")
1309 {
1310 convolution->activation_type = 4;
1311 }
1312 else if (activation->type == "Mish")
1313 {
1314 convolution->activation_type = 5;
1315 }
1316
1317 int top_blob_index_final = activation->tops[0];
1318 convolution->tops[0] = top_blob_index_final;
1319 blobs[top_blob_index_final].producer = i;
1320 activation->type = "ncnnfused";
1321 }
1322
1323 return 0;
1324 }
1325
fuse_convolutiondepthwise_activation()1326 int NetOptimize::fuse_convolutiondepthwise_activation()
1327 {
1328 const size_t layer_count = layers.size();
1329 for (size_t i = 0; i < layer_count; i++)
1330 {
1331 if (layers[i]->type != "ConvolutionDepthWise")
1332 continue;
1333
1334 // ConvolutionDepthWise - Activation
1335 int top_blob_index = layers[i]->tops[0];
1336
1337 size_t j = i + 1;
1338 for (; j < layer_count; j++)
1339 {
1340 if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
1341 continue;
1342
1343 if (layers[j]->bottoms.size() != 1)
1344 continue;
1345
1346 if (layers[j]->bottoms[0] == top_blob_index)
1347 break;
1348 }
1349
1350 if (j == layer_count)
1351 continue;
1352
1353 // fuse ConvolutionDepthWise - Activation to ConvolutionDepthWise
1354 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
1355 ncnn::Layer* activation = layers[j];
1356
1357 fprintf(stderr, "fuse_convolutiondepthwise_activation %s %s\n", convolutiondepthwise->name.c_str(), activation->name.c_str());
1358
1359 if (activation->type == "ReLU")
1360 {
1361 ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1362
1363 if (relu->slope == 0.f)
1364 {
1365 convolutiondepthwise->activation_type = 1;
1366 }
1367 else
1368 {
1369 convolutiondepthwise->activation_type = 2;
1370 convolutiondepthwise->activation_params = ncnn::Mat(1);
1371 convolutiondepthwise->activation_params[0] = relu->slope;
1372 }
1373 }
1374 else if (activation->type == "Clip")
1375 {
1376 ncnn::Clip* clip = (ncnn::Clip*)activation;
1377
1378 convolutiondepthwise->activation_type = 3;
1379 convolutiondepthwise->activation_params = ncnn::Mat(2);
1380 convolutiondepthwise->activation_params[0] = clip->min;
1381 convolutiondepthwise->activation_params[1] = clip->max;
1382 }
1383 else if (activation->type == "Sigmoid")
1384 {
1385 convolutiondepthwise->activation_type = 4;
1386 }
1387 else if (activation->type == "Mish")
1388 {
1389 convolutiondepthwise->activation_type = 5;
1390 }
1391
1392 int top_blob_index_final = activation->tops[0];
1393 convolutiondepthwise->tops[0] = top_blob_index_final;
1394 blobs[top_blob_index_final].producer = i;
1395 activation->type = "ncnnfused";
1396 }
1397
1398 return 0;
1399 }
1400
fuse_deconvolution_activation()1401 int NetOptimize::fuse_deconvolution_activation()
1402 {
1403 const size_t layer_count = layers.size();
1404 for (size_t i = 0; i < layer_count; i++)
1405 {
1406 if (layers[i]->type != "Deconvolution")
1407 continue;
1408
1409 // Deconvolution - Activation
1410 int top_blob_index = layers[i]->tops[0];
1411
1412 size_t j = i + 1;
1413 for (; j < layer_count; j++)
1414 {
1415 if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1416 continue;
1417
1418 if (layers[j]->bottoms.size() != 1)
1419 continue;
1420
1421 if (layers[j]->bottoms[0] == top_blob_index)
1422 break;
1423 }
1424
1425 if (j == layer_count)
1426 continue;
1427
1428 // fuse Deconvolution - Activation to Deconvolution
1429 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
1430 ncnn::Layer* activation = layers[j];
1431
1432 fprintf(stderr, "fuse_deconvolution_activation %s %s\n", deconvolution->name.c_str(), activation->name.c_str());
1433
1434 if (activation->type == "ReLU")
1435 {
1436 ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1437
1438 if (relu->slope == 0.f)
1439 {
1440 deconvolution->activation_type = 1;
1441 }
1442 else
1443 {
1444 deconvolution->activation_type = 2;
1445 deconvolution->activation_params = ncnn::Mat(1);
1446 deconvolution->activation_params[0] = relu->slope;
1447 }
1448 }
1449 else if (activation->type == "Clip")
1450 {
1451 ncnn::Clip* clip = (ncnn::Clip*)activation;
1452
1453 deconvolution->activation_type = 3;
1454 deconvolution->activation_params = ncnn::Mat(2);
1455 deconvolution->activation_params[0] = clip->min;
1456 deconvolution->activation_params[1] = clip->max;
1457 }
1458 else if (activation->type == "Sigmoid")
1459 {
1460 deconvolution->activation_type = 4;
1461 }
1462
1463 int top_blob_index_final = activation->tops[0];
1464 deconvolution->tops[0] = top_blob_index_final;
1465 blobs[top_blob_index_final].producer = i;
1466 activation->type = "ncnnfused";
1467 }
1468
1469 return 0;
1470 }
1471
fuse_deconvolutiondepthwise_activation()1472 int NetOptimize::fuse_deconvolutiondepthwise_activation()
1473 {
1474 const size_t layer_count = layers.size();
1475 for (size_t i = 0; i < layer_count; i++)
1476 {
1477 if (layers[i]->type != "DeconvolutionDepthWise")
1478 continue;
1479
1480 // DeconvolutionDepthWise - Activation
1481 int top_blob_index = layers[i]->tops[0];
1482
1483 size_t j = i + 1;
1484 for (; j < layer_count; j++)
1485 {
1486 if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1487 continue;
1488
1489 if (layers[j]->bottoms.size() != 1)
1490 continue;
1491
1492 if (layers[j]->bottoms[0] == top_blob_index)
1493 break;
1494 }
1495
1496 if (j == layer_count)
1497 continue;
1498
1499 // fuse DeconvolutionDepthWise - Activation to DeconvolutionDepthWise
1500 ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
1501 ncnn::Layer* activation = layers[j];
1502
1503 fprintf(stderr, "fuse_deconvolutiondepthwise_activation %s %s\n", deconvolutiondepthwise->name.c_str(), activation->name.c_str());
1504
1505 if (activation->type == "ReLU")
1506 {
1507 ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1508
1509 if (relu->slope == 0.f)
1510 {
1511 deconvolutiondepthwise->activation_type = 1;
1512 }
1513 else
1514 {
1515 deconvolutiondepthwise->activation_type = 2;
1516 deconvolutiondepthwise->activation_params = ncnn::Mat(1);
1517 deconvolutiondepthwise->activation_params[0] = relu->slope;
1518 }
1519 }
1520 else if (activation->type == "Clip")
1521 {
1522 ncnn::Clip* clip = (ncnn::Clip*)activation;
1523
1524 deconvolutiondepthwise->activation_type = 3;
1525 deconvolutiondepthwise->activation_params = ncnn::Mat(2);
1526 deconvolutiondepthwise->activation_params[0] = clip->min;
1527 deconvolutiondepthwise->activation_params[1] = clip->max;
1528 }
1529 else if (activation->type == "Sigmoid")
1530 {
1531 deconvolutiondepthwise->activation_type = 4;
1532 }
1533
1534 int top_blob_index_final = activation->tops[0];
1535 deconvolutiondepthwise->tops[0] = top_blob_index_final;
1536 blobs[top_blob_index_final].producer = i;
1537 activation->type = "ncnnfused";
1538 }
1539
1540 return 0;
1541 }
1542
fuse_innerproduct_activation()1543 int NetOptimize::fuse_innerproduct_activation()
1544 {
1545 const size_t layer_count = layers.size();
1546 for (size_t i = 0; i < layer_count; i++)
1547 {
1548 if (layers[i]->type != "InnerProduct")
1549 continue;
1550
1551 // InnerProduct - Activation
1552 int top_blob_index = layers[i]->tops[0];
1553
1554 size_t j = i + 1;
1555 for (; j < layer_count; j++)
1556 {
1557 if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
1558 continue;
1559
1560 if (layers[j]->bottoms.size() != 1)
1561 continue;
1562
1563 if (layers[j]->bottoms[0] == top_blob_index)
1564 break;
1565 }
1566
1567 if (j == layer_count)
1568 continue;
1569
1570 // fuse InnerProduct - Activation to InnerProduct
1571 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1572 ncnn::Layer* activation = layers[j];
1573
1574 fprintf(stderr, "fuse_innerproduct_activation %s %s\n", innerproduct->name.c_str(), activation->name.c_str());
1575
1576 if (activation->type == "ReLU")
1577 {
1578 ncnn::ReLU* relu = (ncnn::ReLU*)activation;
1579
1580 if (relu->slope == 0.f)
1581 {
1582 innerproduct->activation_type = 1;
1583 }
1584 else
1585 {
1586 innerproduct->activation_type = 2;
1587 innerproduct->activation_params = ncnn::Mat(1);
1588 innerproduct->activation_params[0] = relu->slope;
1589 }
1590 }
1591 else if (activation->type == "Clip")
1592 {
1593 ncnn::Clip* clip = (ncnn::Clip*)activation;
1594
1595 innerproduct->activation_type = 3;
1596 innerproduct->activation_params = ncnn::Mat(2);
1597 innerproduct->activation_params[0] = clip->min;
1598 innerproduct->activation_params[1] = clip->max;
1599 }
1600 else if (activation->type == "Sigmoid")
1601 {
1602 innerproduct->activation_type = 4;
1603 }
1604
1605 int top_blob_index_final = activation->tops[0];
1606 innerproduct->tops[0] = top_blob_index_final;
1607 blobs[top_blob_index_final].producer = i;
1608 activation->type = "ncnnfused";
1609 }
1610
1611 return 0;
1612 }
1613
int NetOptimize::fuse_memorydata_binaryop()
{
    // Fold a scalar (1x1x1) MemoryData feeding a two-input BinaryOp into the
    // BinaryOp itself (with_scalar + b), then retire the MemoryData layer.
    // Pass 1 handles MemoryData -> BinaryOp directly; pass 2 handles
    // MemoryData -> Split -> BinaryOp, peeling one split output per visit.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find the first downstream two-input BinaryOp consuming this blob
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse MemoryData - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        // which bottom slot the MemoryData occupies; default assumes slot 1
        int memorydata_index = 1;

        if (binaryop->bottoms[0] == top_blob_index)
        {
            // scalar is the LEFT operand: commutative ops keep their type,
            // sub/div must flip to the reversed variants (rsub/rdiv)
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        // embed the constant into the binaryop as its scalar operand
        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        // drop the now-unused input slot and retire the MemoryData layer
        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        memorydata->type = "ncnnfused";
    }

    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - Split - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find the Split fanning out this MemoryData blob
        size_t j0 = i + 1;
        for (; j0 < layer_count; j0++)
        {
            if (layers[j0]->type != "Split")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (layers[j0]->bottoms[0] == top_blob_index)
                break;
        }

        if (j0 == layer_count)
            continue;

        // find a two-input BinaryOp consuming any of the split outputs,
        // remembering which split top it uses
        int split_top_blob_index = -1;

        size_t j1 = j0 + 1;
        for (; j1 < layer_count; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 2)
                continue;

            for (int k = 0; k < (int)layers[j0]->tops.size(); k++)
            {
                if (layers[j1]->bottoms[0] == layers[j0]->tops[k] || layers[j1]->bottoms[1] == layers[j0]->tops[k])
                {
                    split_top_blob_index = k;
                    break;
                }
            }

            if (split_top_blob_index != -1)
                break;
        }

        if (j1 == layer_count)
            continue;

        // fuse MemoryData - Split - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::Split* split = (ncnn::Split*)layers[j0];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j1];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        // same slot/commutativity handling as in the direct pass above
        int memorydata_index = 1;

        if (binaryop->bottoms[0] == split->tops[split_top_blob_index])
        {
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        // consume one split output; the Split (and its MemoryData source)
        // are only retired once every output has been fused away
        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        split->tops.erase(split->tops.begin() + split_top_blob_index);
        if (split->tops.empty())
        {
            split->type = "ncnnfused";
            memorydata->type = "ncnnfused";
        }

        // re-visit the same MemoryData: the Split may feed more BinaryOps
        // (the unsigned wrap at i == 0 is cancelled by the loop increment)
        i--;
    }

    return 0;
}
1804
int NetOptimize::fuse_binaryop_eltwise()
{
    // Rewrite the pattern  (x * a) + (y * b)  — scalar-MUL BinaryOps feeding
    // an ADD BinaryOp — as a single Eltwise SUM with coefficients {a, b}.
    // Either (or both) scalar-MUL producers may be present; a missing one
    // contributes coefficient 1.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "BinaryOp")
            continue;

        if (layers[i]->bottoms.size() != 2)
            continue;

        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[i];

        // the root must be a plain two-input ADD (no scalar folded in)
        if (binaryop->op_type != ncnn::BinaryOp::Operation_ADD)
            continue;

        if (binaryop->with_scalar)
            continue;

        // BinaryOp - BinaryOp - BinaryOp
        int bottom_blob_index_0 = binaryop->bottoms[0];
        int bottom_blob_index_1 = binaryop->bottoms[1];

        // look upstream for a scalar-MUL producer of the first operand
        size_t j0 = 0;
        for (; j0 < i; j0++)
        {
            if (layers[j0]->type != "BinaryOp")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j0])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j0]->tops[0] == bottom_blob_index_0)
                break;
        }

        // look upstream for a scalar-MUL producer of the second operand
        size_t j1 = 0;
        for (; j1 < i; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j1])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j1]->tops[0] == bottom_blob_index_1)
                break;
        }

        // need at least one scalar-MUL producer to make the fusion worthwhile
        if (j0 == i && j1 == i)
            continue;

        // NOTE: when a search failed (j0 == i or j1 == i) the corresponding
        // pointer aliases the root ADD and is only used in the branches below
        // where the search succeeded
        ncnn::BinaryOp* binaryop0 = (ncnn::BinaryOp*)layers[j0];
        ncnn::BinaryOp* binaryop1 = (ncnn::BinaryOp*)layers[j1];

        fprintf(stderr, "fuse_binaryop_eltwise %s %s %s\n", binaryop0->name.c_str(), binaryop1->name.c_str(), binaryop->name.c_str());

        // build the replacement Eltwise, inheriting the root's graph position
        ncnn::Eltwise* eltwise = (ncnn::Eltwise*)ncnn::create_layer("Eltwise");

        eltwise->type = "Eltwise";
        eltwise->name = binaryop->name;
        eltwise->bottoms = binaryop->bottoms;
        eltwise->tops = binaryop->tops;

        ncnn::ParamDict pd;
        eltwise->load_param(pd);

        eltwise->op_type = ncnn::Eltwise::Operation_SUM;

        eltwise->coeffs = ncnn::Mat(2);

        if (j0 != i && j1 != i)
        {
            // fuse BinaryOp - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = binaryop1->b;

            // bypass both MULs: take their inputs directly
            eltwise->bottoms[0] = binaryop0->bottoms[0];
            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop0->type = "ncnnfused";
            binaryop1->type = "ncnnfused";
        }
        if (j0 != i && j1 == i)
        {
            // fuse BinaryOp - X - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = 1.f;

            eltwise->bottoms[0] = binaryop0->bottoms[0];

            binaryop0->type = "ncnnfused";
        }
        if (j0 == i && j1 != i)
        {
            // fuse X - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = 1.f;
            eltwise->coeffs[1] = binaryop1->b;

            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop1->type = "ncnnfused";
        }

        // swap the root ADD out of the graph and free it
        layers[i] = eltwise;
        delete binaryop;
    }

    return 0;
}
1921
eliminate_dropout()1922 int NetOptimize::eliminate_dropout()
1923 {
1924 const size_t layer_count = layers.size();
1925 for (size_t i = 0; i < layer_count; i++)
1926 {
1927 if (layers[i]->type != "Dropout")
1928 continue;
1929
1930 ncnn::Dropout* dropout = (ncnn::Dropout*)layers[i];
1931 if (dropout->scale != 1.f)
1932 continue;
1933
1934 // Any - Dropout
1935 int bottom_blob_index = layers[i]->bottoms[0];
1936
1937 int j = i - 1;
1938 for (; j >= 0; j--)
1939 {
1940 if (layers[j]->type == "ncnnfused")
1941 continue;
1942
1943 if (layers[j]->tops.size() != 1)
1944 continue;
1945
1946 if (layers[j]->tops[0] == bottom_blob_index)
1947 break;
1948 }
1949
1950 if (j == -1)
1951 continue;
1952
1953 ncnn::Layer* any = layers[j];
1954
1955 fprintf(stderr, "eliminate_dropout %s %s\n", any->name.c_str(), dropout->name.c_str());
1956
1957 int top_blob_index_final = dropout->tops[0];
1958 any->tops[0] = top_blob_index_final;
1959 blobs[top_blob_index_final].producer = j;
1960 dropout->type = "ncnnfused";
1961 }
1962
1963 return 0;
1964 }
1965
int NetOptimize::eliminate_pooling1x1()
{
    // Remove no-op Pooling layers: 1x1 kernel, stride 1, no padding and not
    // global — such a pooling passes its input through unchanged.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->pad_left != 0 || pooling->pad_right != 0 || pooling->pad_top != 0 || pooling->pad_bottom != 0)
            continue;

        if (pooling->kernel_w != 1 || pooling->kernel_h != 1 || pooling->stride_w != 1 || pooling->stride_h != 1)
            continue;

        if (pooling->global_pooling != 0)
            continue;

        // Any - Pooling
        int bottom_blob_index = layers[i]->bottoms[0];

        // walk backwards for the live producer of our input blob; top_i
        // remembers which of its output slots feeds us
        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_pooling1x1 %s %s\n", any->name.c_str(), pooling->name.c_str());

        // rewire: the producer emits the pooling's output blob directly
        int top_blob_index_final = pooling->tops[0];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        pooling->type = "ncnnfused";
    }

    return 0;
}
2022
int NetOptimize::eliminate_noop()
{
    // Remove Noop layers. A source-less Noop is simply retired (its outputs
    // lose their producer); a Noop with an input is bypassed by rewiring its
    // producer to emit the Noop's output blob directly.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Noop")
            continue;

        ncnn::Layer* noop = layers[i];

        if (noop->bottoms.empty())
        {
            // Noop with no input: orphan its outputs and retire it
            fprintf(stderr, "eliminate_noop %s\n", noop->name.c_str());

            size_t top_blob_count = noop->tops.size();
            for (size_t j = 0; j < top_blob_count; j++)
            {
                int top_blob_index_final = noop->tops[j];
                blobs[top_blob_index_final].producer = -1;
            }
            noop->type = "ncnnfused";

            continue;
        }

        // Any - Noop
        int bottom_blob_index = noop->bottoms[0];

        // walk backwards for the live producer of our input blob; any_k
        // remembers which of its output slots feeds us
        int j = i - 1;
        int any_k = -1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            bool link_noop = false;
            size_t top_blob_count = layers[j]->tops.size();
            for (size_t k = 0; k < top_blob_count; k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    link_noop = true;
                    any_k = k;
                    break;
                }
            }

            if (link_noop)
                break;
        }

        if (j == -1 || any_k == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_noop %s %s\n", any->name.c_str(), noop->name.c_str());

        // rewire: the producer emits the noop's output blob directly
        int top_blob_index_final = noop->tops[0];
        any->tops[any_k] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;

        noop->type = "ncnnfused";
    }

    return 0;
}
2091
int NetOptimize::eliminate_split()
{
    // Remove Split layers whose outputs have at most one live consumer: the
    // producer of the split input can feed that consumer directly.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Split")
            continue;

        ncnn::Layer* split = layers[i];

        // count the split outputs that actually feed a consumer
        int real_split_output_count = 0;
        int real_split_top_blob_index = -1;
        size_t top_blob_count = split->tops.size();
        for (size_t j = 0; j < top_blob_count; j++)
        {
            int top_blob_index_final = split->tops[j];
            if (blobs[top_blob_index_final].consumer != -1)
            {
                real_split_output_count += 1;
                real_split_top_blob_index = j;
            }
        }

        // NOTE(review): if no output is consumed, real_split_top_blob_index
        // stays -1 and split->tops[-1] below would be out of range —
        // presumably unreachable for well-formed graphs; verify upstream
        if (real_split_output_count > 1)
            continue;

        // Any - Split
        int bottom_blob_index = split->bottoms[0];

        // walk backwards for the live producer of the split input; top_i
        // remembers which of its output slots feeds us
        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_split %s %s\n", any->name.c_str(), split->name.c_str());

        // rewire: the producer emits the surviving split output directly
        int top_blob_index_final = split->tops[real_split_top_blob_index];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        split->type = "ncnnfused";
    }

    return 0;
}
2156
eliminate_orphaned_memorydata()2157 int NetOptimize::eliminate_orphaned_memorydata()
2158 {
2159 const size_t layer_count = layers.size();
2160 for (size_t i = 0; i < layer_count; i++)
2161 {
2162 if (layers[i]->type != "MemoryData")
2163 continue;
2164
2165 // MemoryData - X
2166 int top_blob_index = layers[i]->tops[0];
2167
2168 size_t j = i + 1;
2169 for (; j < layer_count; j++)
2170 {
2171 if (layers[j]->type == "ncnnfused")
2172 continue;
2173
2174 bool orphaned = true;
2175 for (size_t k = 0; k < layers[j]->bottoms.size(); k++)
2176 {
2177 if (layers[j]->bottoms[k] == top_blob_index)
2178 {
2179 orphaned = false;
2180 break;
2181 }
2182 }
2183
2184 if (!orphaned)
2185 break;
2186 }
2187
2188 if (j < layer_count)
2189 continue;
2190
2191 // assert orphaned == true
2192 fprintf(stderr, "eliminate_orphaned_memorydata %s\n", layers[i]->name.c_str());
2193
2194 layers[i]->type = "ncnnfused";
2195 }
2196
2197 return 0;
2198 }
2199
eliminate_reshape_after_global_pooling()2200 int NetOptimize::eliminate_reshape_after_global_pooling()
2201 {
2202 const size_t layer_count = layers.size();
2203 for (size_t i = 0; i < layer_count; i++)
2204 {
2205 if (layers[i]->type != "Pooling")
2206 continue;
2207
2208 ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2209 if (pooling->global_pooling == 0)
2210 continue;
2211
2212 // Pooling - Reshape
2213 int top_blob_index = layers[i]->tops[0];
2214
2215 size_t j = i + 1;
2216 for (; j < layer_count; j++)
2217 {
2218 if (layers[j]->type != "Reshape")
2219 continue;
2220
2221 if (layers[j]->bottoms.size() != 1)
2222 continue;
2223
2224 if (layers[j]->bottoms[0] == top_blob_index)
2225 break;
2226 }
2227
2228 if (j == layer_count)
2229 continue;
2230
2231 ncnn::Reshape* reshape = (ncnn::Reshape*)layers[j];
2232 if (reshape->h != -233 || reshape->c != -233 || reshape->permute != 0)
2233 continue;
2234
2235 fprintf(stderr, "eliminate_reshape_after_global_pooling %s %s\n", pooling->name.c_str(), reshape->name.c_str());
2236
2237 int top_blob_index_final = reshape->tops[0];
2238 pooling->tops[0] = top_blob_index_final;
2239 blobs[top_blob_index_final].producer = i;
2240 reshape->type = "ncnnfused";
2241 }
2242
2243 return 0;
2244 }
2245
eliminate_flatten_after_global_pooling()2246 int NetOptimize::eliminate_flatten_after_global_pooling()
2247 {
2248 const size_t layer_count = layers.size();
2249 for (size_t i = 0; i < layer_count; i++)
2250 {
2251 if (layers[i]->type != "Pooling")
2252 continue;
2253
2254 ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2255 if (pooling->global_pooling == 0)
2256 continue;
2257
2258 // Pooling - Flatten
2259 int top_blob_index = layers[i]->tops[0];
2260
2261 size_t j = i + 1;
2262 for (; j < layer_count; j++)
2263 {
2264 if (layers[j]->type != "Flatten")
2265 continue;
2266
2267 if (layers[j]->bottoms.size() != 1)
2268 continue;
2269
2270 if (layers[j]->bottoms[0] == top_blob_index)
2271 break;
2272 }
2273
2274 if (j == layer_count)
2275 continue;
2276
2277 ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2278
2279 fprintf(stderr, "eliminate_flatten_after_global_pooling %s %s\n", pooling->name.c_str(), flatten->name.c_str());
2280
2281 int top_blob_index_final = flatten->tops[0];
2282 pooling->tops[0] = top_blob_index_final;
2283 blobs[top_blob_index_final].producer = i;
2284 flatten->type = "ncnnfused";
2285 }
2286
2287 return 0;
2288 }
2289
eliminate_flatten_after_innerproduct()2290 int NetOptimize::eliminate_flatten_after_innerproduct()
2291 {
2292 const size_t layer_count = layers.size();
2293 for (size_t i = 0; i < layer_count; i++)
2294 {
2295 if (layers[i]->type != "InnerProduct")
2296 continue;
2297
2298 // InnerProduct - Flatten
2299 int top_blob_index = layers[i]->tops[0];
2300
2301 size_t j = i + 1;
2302 for (; j < layer_count; j++)
2303 {
2304 if (layers[j]->type != "Flatten")
2305 continue;
2306
2307 if (layers[j]->bottoms.size() != 1)
2308 continue;
2309
2310 if (layers[j]->bottoms[0] == top_blob_index)
2311 break;
2312 }
2313
2314 if (j == layer_count)
2315 continue;
2316
2317 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2318 ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2319
2320 fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());
2321
2322 int top_blob_index_final = flatten->tops[0];
2323 innerproduct->tops[0] = top_blob_index_final;
2324 blobs[top_blob_index_final].producer = i;
2325 flatten->type = "ncnnfused";
2326 }
2327
2328 return 0;
2329 }
2330
eliminate_reshape_before_binaryop()2331 int NetOptimize::eliminate_reshape_before_binaryop()
2332 {
2333 const size_t layer_count = layers.size();
2334 for (size_t i = 0; i < layer_count; i++)
2335 {
2336 if (layers[i]->type != "Reshape")
2337 continue;
2338
2339 ncnn::Reshape* reshape = (ncnn::Reshape*)layers[i];
2340 if (reshape->w != 1 || reshape->h != 1 || reshape->permute != 0)
2341 continue;
2342
2343 // Reshape - BinaryOp
2344 int top_blob_index = layers[i]->tops[0];
2345
2346 size_t j = i + 1;
2347 for (; j < layer_count; j++)
2348 {
2349 if (layers[j]->type != "BinaryOp")
2350 continue;
2351
2352 if (layers[j]->bottoms.size() != 2)
2353 continue;
2354
2355 if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
2356 break;
2357 }
2358
2359 if (j == layer_count)
2360 continue;
2361
2362 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
2363
2364 fprintf(stderr, "eliminate_reshape_before_binaryop %s %s\n", reshape->name.c_str(), binaryop->name.c_str());
2365
2366 int bottom_blob_index_final = reshape->bottoms[0];
2367 if (layers[j]->bottoms[0] == top_blob_index)
2368 binaryop->bottoms[0] = bottom_blob_index_final;
2369 if (layers[j]->bottoms[1] == top_blob_index)
2370 binaryop->bottoms[1] = bottom_blob_index_final;
2371 blobs[bottom_blob_index_final].consumer = j;
2372 reshape->type = "ncnnfused";
2373 }
2374
2375 return 0;
2376 }
2377
int NetOptimize::replace_reduction_with_global_pooling()
{
    // Replace a pair of chained mean Reductions over the spatial axes with a
    // single global average Pooling layer, which has much faster kernels.
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Reduction")
            continue;

        // first reduction must be operation 3 with coeff 1 and not
        // reduce_all — presumably mean over a single axis; the pooling_type
        // 1 (average) substitution below relies on that
        ncnn::Reduction* reduction1 = (ncnn::Reduction*)layers[i];
        if (reduction1->operation != 3 || reduction1->reduce_all != 0 || reduction1->coeff != 1.f)
            continue;

        // exactly one reduced axis, and it must be a spatial axis (2 or 3)
        if (reduction1->axes.w != 1)
            continue;

        const int* axes_ptr = reduction1->axes;
        if (axes_ptr[0] != 2 && axes_ptr[0] != 3)
            continue;

        // Reduction(2/3) - Reduction(2)
        int top_blob_index = layers[i]->tops[0];

        // find the second Reduction consuming the first one's output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Reduction")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // second reduction must be the same mean-style op over axis 2
        ncnn::Reduction* reduction2 = (ncnn::Reduction*)layers[j];
        if (reduction2->operation != 3 || reduction2->reduce_all != 0 || reduction2->coeff != 1.f)
            continue;

        if (reduction2->axes.w != 1)
            continue;

        const int* axes2_ptr = reduction2->axes;
        if (axes2_ptr[0] != 2)
            continue;

        fprintf(stderr, "replace_reduction_with_global_pooling %s %s\n", reduction1->name.c_str(), reduction2->name.c_str());

        // build the replacement Pooling, inheriting reduction2's position
        ncnn::Pooling* pooling = (ncnn::Pooling*)ncnn::create_layer("Pooling");

        pooling->type = "Pooling";
        pooling->name = reduction2->name;
        pooling->bottoms = reduction2->bottoms;
        pooling->tops = reduction2->tops;

        ncnn::ParamDict pd;
        pooling->load_param(pd);

        // pooling_type 1 = average, over the whole spatial extent
        pooling->pooling_type = 1;
        pooling->global_pooling = 1;

        // swap reduction2 out of the graph and free it
        layers[j] = pooling;
        delete reduction2;

        // bypass reduction1: the pooling reads its input blob directly
        int bottom_blob_index_final = reduction1->bottoms[0];
        pooling->bottoms[0] = bottom_blob_index_final;
        blobs[bottom_blob_index_final].consumer = j;
        reduction1->type = "ncnnfused";
    }

    return 0;
}
2453
replace_prelu_with_leaky_relu()2454 int NetOptimize::replace_prelu_with_leaky_relu()
2455 {
2456 const size_t layer_count = layers.size();
2457 for (size_t i = 0; i < layer_count; i++)
2458 {
2459 if (layers[i]->type != "PReLU")
2460 continue;
2461
2462 ncnn::PReLU* prelu = (ncnn::PReLU*)layers[i];
2463 if (prelu->num_slope != 1)
2464 continue;
2465
2466 fprintf(stderr, "replace_prelu_with_leaky_relu %s\n", prelu->name.c_str());
2467
2468 ncnn::ReLU* relu = (ncnn::ReLU*)ncnn::create_layer("ReLU");
2469
2470 relu->type = "ReLU";
2471 relu->name = prelu->name;
2472 relu->bottoms = prelu->bottoms;
2473 relu->tops = prelu->tops;
2474
2475 ncnn::ParamDict pd;
2476 relu->load_param(pd);
2477
2478 relu->slope = prelu->slope_data[0];
2479
2480 layers[i] = relu;
2481 delete prelu;
2482 }
2483
2484 return 0;
2485 }
2486
replace_convolution_with_innerproduct_after_global_pooling()2487 int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
2488 {
2489 const size_t layer_count = layers.size();
2490 for (size_t i = 0; i < layer_count; i++)
2491 {
2492 if (layers[i]->type != "Pooling")
2493 continue;
2494
2495 ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2496 if (pooling->global_pooling == 0)
2497 continue;
2498
2499 // Pooling - Convolution
2500 int top_blob_index = layers[i]->tops[0];
2501
2502 size_t j = i + 1;
2503 for (; j < layer_count; j++)
2504 {
2505 if (layers[j]->type != "Convolution")
2506 continue;
2507
2508 if (layers[j]->bottoms.size() != 1)
2509 continue;
2510
2511 if (layers[j]->bottoms[0] == top_blob_index)
2512 break;
2513 }
2514
2515 if (j == layer_count)
2516 continue;
2517
2518 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2519
2520 fprintf(stderr, "replace_convolution_with_innerproduct_after_global_pooling %s %s\n", pooling->name.c_str(), convolution->name.c_str());
2521
2522 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");
2523
2524 innerproduct->type = "InnerProduct";
2525 innerproduct->name = convolution->name;
2526 innerproduct->bottoms = convolution->bottoms;
2527 innerproduct->tops = convolution->tops;
2528
2529 ncnn::ParamDict pd;
2530 innerproduct->load_param(pd);
2531
2532 innerproduct->num_output = convolution->num_output;
2533 innerproduct->bias_term = convolution->bias_term;
2534 innerproduct->weight_data_size = convolution->weight_data_size;
2535 innerproduct->int8_scale_term = convolution->int8_scale_term;
2536
2537 innerproduct->weight_data = convolution->weight_data;
2538 innerproduct->bias_data = convolution->bias_data;
2539 #if NCNN_INT8
2540 innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
2541 innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2542 #endif
2543
2544 innerproduct->activation_type = convolution->activation_type;
2545 innerproduct->activation_params = convolution->activation_params;
2546
2547 layers[j] = innerproduct;
2548 delete convolution;
2549 }
2550
2551 return 0;
2552 }
2553
replace_convolution_with_innerproduct_after_innerproduct()2554 int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
2555 {
2556 const size_t layer_count = layers.size();
2557 for (;;)
2558 {
2559 bool replaced = false;
2560
2561 for (size_t i = 0; i < layer_count; i++)
2562 {
2563 if (layers[i]->type != "InnerProduct")
2564 continue;
2565
2566 // InnerProduct - Convolution
2567 int top_blob_index = layers[i]->tops[0];
2568
2569 size_t j = i + 1;
2570 for (; j < layer_count; j++)
2571 {
2572 if (layers[j]->type != "Convolution")
2573 continue;
2574
2575 if (layers[j]->bottoms.size() != 1)
2576 continue;
2577
2578 if (layers[j]->bottoms[0] == top_blob_index)
2579 break;
2580 }
2581
2582 if (j == layer_count)
2583 continue;
2584
2585 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2586 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2587
2588 fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());
2589
2590 ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");
2591
2592 innerproduct2->type = "InnerProduct";
2593 innerproduct2->name = convolution->name;
2594 innerproduct2->bottoms = convolution->bottoms;
2595 innerproduct2->tops = convolution->tops;
2596
2597 ncnn::ParamDict pd;
2598 innerproduct2->load_param(pd);
2599
2600 innerproduct2->num_output = convolution->num_output;
2601 innerproduct2->bias_term = convolution->bias_term;
2602 innerproduct2->weight_data_size = convolution->weight_data_size;
2603 innerproduct->int8_scale_term = convolution->int8_scale_term;
2604
2605 innerproduct2->weight_data = convolution->weight_data;
2606 innerproduct2->bias_data = convolution->bias_data;
2607 #if NCNN_INT8
2608 innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
2609 innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2610 #endif
2611
2612 innerproduct2->activation_type = convolution->activation_type;
2613 innerproduct2->activation_params = convolution->activation_params;
2614
2615 layers[j] = innerproduct2;
2616 delete convolution;
2617
2618 replaced = true;
2619 }
2620
2621 if (!replaced)
2622 break;
2623 }
2624
2625 return 0;
2626 }
2627
main(int argc,char ** argv)2628 int main(int argc, char** argv)
2629 {
2630 if (argc < 6)
2631 {
2632 fprintf(stderr, "usage: %s [inparam] [inbin] [outparam] [outbin] [flag] [cutstart] [cutend]\n", argv[0]);
2633 return -1;
2634 }
2635
2636 const char* inparam = argv[1];
2637 const char* inbin = argv[2];
2638 const char* outparam = argv[3];
2639 const char* outbin = argv[4];
2640 int flag = atoi(argv[5]);
2641 const char* cutstartname = nullptr;
2642 const char* cutendname = nullptr;
2643
2644 if (argc > 6)
2645 {
2646 cutstartname = argv[6];
2647 }
2648
2649 if (argc > 7)
2650 {
2651 cutendname = argv[7];
2652 }
2653
2654 NetOptimize optimizer;
2655
2656 if (flag == 65536 || flag == 1)
2657 {
2658 optimizer.storage_type = 1;
2659 }
2660 else
2661 {
2662 optimizer.storage_type = 0;
2663 }
2664
2665 optimizer.load_param(inparam);
2666
2667 if (strcmp(inbin, "null") == 0)
2668 {
2669 DataReaderFromEmpty dr;
2670 optimizer.load_model(dr);
2671 optimizer.gen_random_weight = true;
2672 }
2673 else
2674 optimizer.load_model(inbin);
2675
2676 if (optimizer.set_cutparam(cutstartname, cutendname) < 0)
2677 {
2678 return -1;
2679 }
2680
2681 optimizer.fuse_batchnorm_scale();
2682 optimizer.fuse_convolution_batchnorm();
2683 optimizer.fuse_convolution_mul();
2684 optimizer.fuse_convolution_add();
2685 optimizer.fuse_convolutiondepthwise_batchnorm();
2686 optimizer.fuse_convolutiondepthwise_mul();
2687 optimizer.fuse_convolutiondepthwise_add();
2688 optimizer.fuse_deconvolution_batchnorm();
2689 optimizer.fuse_deconvolution_mul();
2690 optimizer.fuse_deconvolution_add();
2691 optimizer.fuse_deconvolutiondepthwise_batchnorm();
2692 optimizer.fuse_innerproduct_batchnorm();
2693 optimizer.fuse_innerproduct_add();
2694 optimizer.fuse_innerproduct_dropout();
2695
2696 optimizer.replace_reduction_with_global_pooling();
2697 optimizer.replace_prelu_with_leaky_relu();
2698
2699 optimizer.fuse_convolution_activation();
2700 optimizer.fuse_convolutiondepthwise_activation();
2701 optimizer.fuse_deconvolution_activation();
2702 optimizer.fuse_deconvolutiondepthwise_activation();
2703 optimizer.fuse_innerproduct_activation();
2704 optimizer.fuse_memorydata_binaryop();
2705 optimizer.fuse_binaryop_eltwise();
2706
2707 optimizer.eliminate_dropout();
2708 optimizer.eliminate_pooling1x1();
2709 optimizer.eliminate_noop();
2710 optimizer.eliminate_split();
2711 optimizer.eliminate_flatten_after_global_pooling();
2712 optimizer.eliminate_reshape_after_global_pooling();
2713 optimizer.eliminate_reshape_before_binaryop();
2714
2715 optimizer.replace_convolution_with_innerproduct_after_global_pooling();
2716 optimizer.replace_convolution_with_innerproduct_after_innerproduct();
2717
2718 optimizer.eliminate_flatten_after_innerproduct();
2719 optimizer.eliminate_orphaned_memorydata();
2720
2721 optimizer.shape_inference();
2722
2723 optimizer.estimate_memory_footprint();
2724
2725 optimizer.save(outparam, outbin);
2726
2727 return 0;
2728 }
2729