1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2019 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #ifdef _MSC_VER
16 #define _CRT_SECURE_NO_DEPRECATE
17 #endif
18
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <map>
#include <set>
#include <vector>
23
24 // ncnn public header
25 #include "datareader.h"
26 #include "layer.h"
27 #include "layer_type.h"
28 #include "net.h"
29
30 // ncnn private header
31 #include "modelwriter.h"
32
// DataReader stub that yields all-zero model weights.
// Lets the optimizer load a param graph without a real .bin weight file.
class DataReaderFromEmpty : public ncnn::DataReader
{
public:
    // Text parsing: report zero items matched (nothing to scan).
    virtual int scan(const char* format, void* p) const
    {
        return 0;
    }
    // Binary read: fill the destination with zeros and claim full success,
    // so layer loaders receive zero-initialized weights of the right size.
    virtual size_t read(void* buf, size_t size) const
    {
        memset(buf, 0, size);
        return size;
    }
};
46
// Graph-rewriting optimizer: each pass scans the loaded layer list and either
// folds adjacent layers together, removes no-op layers, or replaces a layer
// with a cheaper equivalent. Fused/eliminated layers are retyped "ncnnfused"
// so the ModelWriter skips them on output.
class NetOptimize : public ModelWriter
{
public:
    NetOptimize();

public:
    // fusion passes: absorb the second layer's math into the first layer's weights
    int fuse_batchnorm_scale();
    int fuse_convolution_batchnorm();
    int fuse_convolution_mul();
    int fuse_convolution_add();
    int fuse_convolutiondepthwise_batchnorm();
    int fuse_convolutiondepthwise_mul();
    int fuse_convolutiondepthwise_add();
    int fuse_deconvolution_batchnorm();
    int fuse_deconvolution_mul();
    int fuse_deconvolution_add();
    int fuse_deconvolutiondepthwise_batchnorm();
    int fuse_innerproduct_batchnorm();
    int fuse_innerproduct_add();
    int fuse_innerproduct_dropout();
    int fuse_convolution_activation();
    int fuse_convolutiondepthwise_activation();
    int fuse_deconvolution_activation();
    int fuse_deconvolutiondepthwise_activation();
    int fuse_innerproduct_activation();
    int fuse_memorydata_binaryop();
    int fuse_binaryop_eltwise();

    // elimination passes: drop layers that do not change inference results
    int eliminate_dropout();
    int eliminate_pooling1x1();
    int eliminate_noop();
    int eliminate_split();
    int eliminate_orphaned_memorydata();
    int eliminate_flatten_after_global_pooling();
    int eliminate_reshape_after_global_pooling();
    int eliminate_flatten_after_innerproduct();
    int eliminate_reshape_before_binaryop();

    // replacement passes: swap a layer for a cheaper equivalent form
    int replace_reduction_with_global_pooling();
    int replace_prelu_with_leaky_relu();
    int replace_convolution_with_innerproduct_after_global_pooling();
    int replace_convolution_with_innerproduct_after_innerproduct();
};
90
NetOptimize()91 NetOptimize::NetOptimize()
92 : ModelWriter()
93 {
94 }
95
fuse_batchnorm_scale()96 int NetOptimize::fuse_batchnorm_scale()
97 {
98 const size_t layer_count = layers.size();
99 for (size_t i = 0; i < layer_count; i++)
100 {
101 if (layers[i]->type != "BatchNorm")
102 continue;
103
104 // BatchNorm - Scale
105 int top_blob_index = layers[i]->tops[0];
106
107 size_t j = i + 1;
108 for (; j < layer_count; j++)
109 {
110 if (layers[j]->type != "Scale")
111 continue;
112
113 if (layers[j]->bottoms.size() != 1)
114 continue;
115
116 if (layers[j]->bottoms[0] == top_blob_index)
117 break;
118 }
119
120 if (j == layer_count)
121 continue;
122
123 // fuse BatchNorm - Scale to BatchNorm
124 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[i];
125 ncnn::Scale* scale = (ncnn::Scale*)layers[j];
126
127 fprintf(stderr, "fuse_batchnorm_scale %s %s\n", batchnorm->name.c_str(), scale->name.c_str());
128
129 {
130 // v = ((v - mean) / sqrt(var + eps) * slope + bias) * s + b
131 // = (v - mean) / sqrt(var + eps) * (slope * s) + (bias * s + b)
132
133 int channels = batchnorm->channels;
134
135 float* slope = batchnorm->slope_data;
136 float* bias = batchnorm->bias_data;
137
138 for (int q = 0; q < channels; q++)
139 {
140 slope[q] = slope[q] * scale->scale_data[q];
141 if (scale->bias_term)
142 bias[q] = bias[q] * scale->scale_data[q] + scale->bias_data[q];
143 else
144 bias[q] = bias[q] * scale->scale_data[q];
145 }
146 }
147
148 int top_blob_index_final = scale->tops[0];
149 batchnorm->tops[0] = top_blob_index_final;
150 blobs[top_blob_index_final].producer = i;
151 scale->type = "ncnnfused";
152 }
153
154 return 0;
155 }
156
fuse_convolution_batchnorm()157 int NetOptimize::fuse_convolution_batchnorm()
158 {
159 const size_t layer_count = layers.size();
160 for (size_t i = 0; i < layer_count; i++)
161 {
162 if (layers[i]->type != "Convolution")
163 continue;
164
165 // Convolution - BatchNorm
166 int top_blob_index = layers[i]->tops[0];
167
168 size_t j = i + 1;
169 for (; j < layer_count; j++)
170 {
171 if (layers[j]->type != "BatchNorm")
172 continue;
173
174 if (layers[j]->bottoms.size() != 1)
175 continue;
176
177 if (layers[j]->bottoms[0] == top_blob_index)
178 break;
179 }
180
181 if (j == layer_count)
182 continue;
183
184 // fuse Convolution - BatchNorm to Convolution
185 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
186 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
187
188 fprintf(stderr, "fuse_convolution_batchnorm %s %s\n", convolution->name.c_str(), batchnorm->name.c_str());
189
190 {
191 int channels = batchnorm->channels;
192 float eps = batchnorm->eps;
193
194 // a = bias - slope * mean / sqrt(var + eps)
195 // b = slope / sqrt(var + eps)
196 // value = value * b + a
197
198 std::vector<float> a(channels);
199 std::vector<float> b(channels);
200 for (int i = 0; i < channels; i++)
201 {
202 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
203 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
204 b[i] = batchnorm->slope_data[i] / sqrt_var;
205 }
206
207 if (convolution->bias_term == 0)
208 {
209 // init bias as zero
210 convolution->bias_term = 1;
211 convolution->bias_data = ncnn::Mat(channels);
212 convolution->bias_data.fill(0.f);
213 }
214
215 const int weight_per_outch = convolution->weight_data_size / channels;
216
217 float* weight = convolution->weight_data;
218 float* bias = convolution->bias_data;
219 for (int i = 0; i < channels; i++)
220 {
221 float* conv_weight_outch = weight + weight_per_outch * i;
222 for (int j = 0; j < weight_per_outch; j++)
223 {
224 conv_weight_outch[j] *= b[i];
225 }
226
227 bias[i] = bias[i] * b[i] + a[i];
228 }
229 }
230
231 int top_blob_index_final = batchnorm->tops[0];
232 convolution->tops[0] = top_blob_index_final;
233 blobs[top_blob_index_final].producer = i;
234 batchnorm->type = "ncnnfused";
235 }
236
237 return 0;
238 }
239
fuse_convolution_mul()240 int NetOptimize::fuse_convolution_mul()
241 {
242 const size_t layer_count = layers.size();
243 for (size_t i = 0; i < layer_count; i++)
244 {
245 if (layers[i]->type != "Convolution")
246 continue;
247
248 // Convolution - BinaryOp
249 int top_blob_index = layers[i]->tops[0];
250
251 size_t j = i + 1;
252 for (; j < layer_count; j++)
253 {
254 if (layers[j]->type != "BinaryOp")
255 continue;
256
257 if (layers[j]->bottoms.size() != 2)
258 continue;
259
260 if (layers[j]->bottoms[0] == top_blob_index)
261 break;
262 }
263
264 if (j == layer_count)
265 continue;
266
267 // fuse Convolution - BinaryOp to Convolution
268 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
269 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
270
271 if (binaryop->op_type != 2 || binaryop->with_scalar)
272 continue;
273
274 // MemoryData - ..... - BinaryOp
275 size_t k = 0;
276 for (; k < j; k++)
277 {
278 if (layers[k]->type != "MemoryData")
279 continue;
280
281 if (layers[k]->tops[0] == binaryop->bottoms[1])
282 break;
283 }
284
285 if (k == j)
286 continue;
287
288 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
289
290 int channels = convolution->num_output;
291
292 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
293 {
294 // not bias-like broadcasting type
295 continue;
296 }
297
298 fprintf(stderr, "fuse_convolution_mul %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
299
300 {
301 const int weight_per_outch = convolution->weight_data_size / channels;
302
303 float* weight = convolution->weight_data;
304 float* bias = convolution->bias_data;
305 for (int i = 0; i < channels; i++)
306 {
307 float* conv_weight_outch = weight + weight_per_outch * i;
308 for (int j = 0; j < weight_per_outch; j++)
309 {
310 conv_weight_outch[j] *= memorydata->data[i];
311 }
312
313 if (bias)
314 {
315 bias[i] = bias[i] * memorydata->data[i];
316 }
317 }
318 }
319
320 int top_blob_index_final = binaryop->tops[0];
321 convolution->tops[0] = top_blob_index_final;
322 blobs[top_blob_index_final].producer = i;
323 binaryop->type = "ncnnfused";
324 }
325
326 return 0;
327 }
328
fuse_convolution_add()329 int NetOptimize::fuse_convolution_add()
330 {
331 const size_t layer_count = layers.size();
332 for (size_t i = 0; i < layer_count; i++)
333 {
334 if (layers[i]->type != "Convolution")
335 continue;
336
337 // Convolution - BinaryOp
338 int top_blob_index = layers[i]->tops[0];
339
340 size_t j = i + 1;
341 for (; j < layer_count; j++)
342 {
343 if (layers[j]->type != "BinaryOp")
344 continue;
345
346 if (layers[j]->bottoms.size() != 2)
347 continue;
348
349 if (layers[j]->bottoms[0] == top_blob_index)
350 break;
351 }
352
353 if (j == layer_count)
354 continue;
355
356 // fuse Convolution - BinaryOp to Convolution
357 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
358 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
359
360 if (binaryop->op_type != 0 || binaryop->with_scalar)
361 continue;
362
363 // MemoryData - ..... - BinaryOp
364 size_t k = 0;
365 for (; k < j; k++)
366 {
367 if (layers[k]->type != "MemoryData")
368 continue;
369
370 if (layers[k]->tops[0] == binaryop->bottoms[1])
371 break;
372 }
373
374 if (k == j)
375 continue;
376
377 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
378
379 int channels = convolution->num_output;
380
381 bool broadcasting_type_ok = false;
382 if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
383 broadcasting_type_ok = true;
384 if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
385 broadcasting_type_ok = true;
386
387 if (!broadcasting_type_ok)
388 {
389 // not bias-like broadcasting type
390 continue;
391 }
392
393 fprintf(stderr, "fuse_convolution_add %s %s\n", convolution->name.c_str(), binaryop->name.c_str());
394
395 ncnn::Mat bias_data = memorydata->data.reshape(channels);
396 {
397 if (convolution->bias_term == 0)
398 {
399 // init bias
400 convolution->bias_term = 1;
401 convolution->bias_data = bias_data;
402 }
403 else
404 {
405 float* bias = convolution->bias_data;
406 for (int i = 0; i < channels; i++)
407 {
408 bias[i] = bias[i] + bias_data[i];
409 }
410 }
411 }
412
413 int top_blob_index_final = binaryop->tops[0];
414 convolution->tops[0] = top_blob_index_final;
415 blobs[top_blob_index_final].producer = i;
416 binaryop->type = "ncnnfused";
417 }
418
419 return 0;
420 }
421
fuse_convolutiondepthwise_batchnorm()422 int NetOptimize::fuse_convolutiondepthwise_batchnorm()
423 {
424 const size_t layer_count = layers.size();
425 for (size_t i = 0; i < layer_count; i++)
426 {
427 if (layers[i]->type != "ConvolutionDepthWise")
428 continue;
429
430 // ConvolutionDepthWise - BatchNorm
431 int top_blob_index = layers[i]->tops[0];
432
433 size_t j = i + 1;
434 for (; j < layer_count; j++)
435 {
436 if (layers[j]->type != "BatchNorm")
437 continue;
438
439 if (layers[j]->bottoms.size() != 1)
440 continue;
441
442 if (layers[j]->bottoms[0] == top_blob_index)
443 break;
444 }
445
446 if (j == layer_count)
447 continue;
448
449 // fuse ConvolutionDepthWise - BatchNorm to ConvolutionDepthWise
450 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
451 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
452
453 fprintf(stderr, "fuse_convolutiondepthwise_batchnorm %s %s\n", convolutiondepthwise->name.c_str(), batchnorm->name.c_str());
454
455 {
456 int channels = batchnorm->channels;
457 float eps = batchnorm->eps;
458
459 // a = bias - slope * mean / sqrt(var + eps)
460 // b = slope / sqrt(var + eps)
461 // value = value * b + a
462
463 std::vector<float> a(channels);
464 std::vector<float> b(channels);
465 for (int i = 0; i < channels; i++)
466 {
467 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
468 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
469 b[i] = batchnorm->slope_data[i] / sqrt_var;
470 }
471
472 if (convolutiondepthwise->bias_term == 0)
473 {
474 // init bias as zero
475 convolutiondepthwise->bias_term = 1;
476 convolutiondepthwise->bias_data = ncnn::Mat(channels);
477 convolutiondepthwise->bias_data.fill(0.f);
478 }
479
480 const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
481
482 float* weight = convolutiondepthwise->weight_data;
483 float* bias = convolutiondepthwise->bias_data;
484 for (int i = 0; i < channels; i++)
485 {
486 float* conv_weight_outch = weight + weight_per_outch * i;
487 for (int j = 0; j < weight_per_outch; j++)
488 {
489 conv_weight_outch[j] *= b[i];
490 }
491
492 bias[i] = bias[i] * b[i] + a[i];
493 }
494 }
495
496 int top_blob_index_final = batchnorm->tops[0];
497 convolutiondepthwise->tops[0] = top_blob_index_final;
498 blobs[top_blob_index_final].producer = i;
499 batchnorm->type = "ncnnfused";
500 }
501
502 return 0;
503 }
504
fuse_convolutiondepthwise_mul()505 int NetOptimize::fuse_convolutiondepthwise_mul()
506 {
507 const size_t layer_count = layers.size();
508 for (size_t i = 0; i < layer_count; i++)
509 {
510 if (layers[i]->type != "ConvolutionDepthWise")
511 continue;
512
513 // ConvolutionDepthWise - BinaryOp
514 int top_blob_index = layers[i]->tops[0];
515
516 size_t j = i + 1;
517 for (; j < layer_count; j++)
518 {
519 if (layers[j]->type != "BinaryOp")
520 continue;
521
522 if (layers[j]->bottoms.size() != 2)
523 continue;
524
525 if (layers[j]->bottoms[0] == top_blob_index)
526 break;
527 }
528
529 if (j == layer_count)
530 continue;
531
532 // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
533 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
534 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
535
536 if (binaryop->op_type != 2 || binaryop->with_scalar)
537 continue;
538
539 // MemoryData - ..... - BinaryOp
540 size_t k = 0;
541 for (; k < j; k++)
542 {
543 if (layers[k]->type != "MemoryData")
544 continue;
545
546 if (layers[k]->tops[0] == binaryop->bottoms[1])
547 break;
548 }
549
550 if (k == j)
551 continue;
552
553 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
554
555 int channels = convolutiondepthwise->num_output;
556
557 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
558 {
559 // not bias-like broadcasting type
560 continue;
561 }
562
563 fprintf(stderr, "fuse_convolutiondepthwise_mul %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
564
565 {
566 const int weight_per_outch = convolutiondepthwise->weight_data_size / channels;
567
568 float* weight = convolutiondepthwise->weight_data;
569 float* bias = convolutiondepthwise->bias_data;
570 for (int i = 0; i < channels; i++)
571 {
572 float* conv_weight_outch = weight + weight_per_outch * i;
573 for (int j = 0; j < weight_per_outch; j++)
574 {
575 conv_weight_outch[j] *= memorydata->data[i];
576 }
577
578 if (bias)
579 {
580 bias[i] = bias[i] * memorydata->data[i];
581 }
582 }
583 }
584
585 int top_blob_index_final = binaryop->tops[0];
586 convolutiondepthwise->tops[0] = top_blob_index_final;
587 blobs[top_blob_index_final].producer = i;
588 binaryop->type = "ncnnfused";
589 }
590
591 return 0;
592 }
593
fuse_convolutiondepthwise_add()594 int NetOptimize::fuse_convolutiondepthwise_add()
595 {
596 const size_t layer_count = layers.size();
597 for (size_t i = 0; i < layer_count; i++)
598 {
599 if (layers[i]->type != "ConvolutionDepthWise")
600 continue;
601
602 // ConvolutionDepthWise - BinaryOp
603 int top_blob_index = layers[i]->tops[0];
604
605 size_t j = i + 1;
606 for (; j < layer_count; j++)
607 {
608 if (layers[j]->type != "BinaryOp")
609 continue;
610
611 if (layers[j]->bottoms.size() != 2)
612 continue;
613
614 if (layers[j]->bottoms[0] == top_blob_index)
615 break;
616 }
617
618 if (j == layer_count)
619 continue;
620
621 // fuse ConvolutionDepthWise - BinaryOp to ConvolutionDepthWise
622 ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
623 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
624
625 if (binaryop->op_type != 0 || binaryop->with_scalar)
626 continue;
627
628 // MemoryData - ..... - BinaryOp
629 size_t k = 0;
630 for (; k < j; k++)
631 {
632 if (layers[k]->type != "MemoryData")
633 continue;
634
635 if (layers[k]->tops[0] == binaryop->bottoms[1])
636 break;
637 }
638
639 if (k == j)
640 continue;
641
642 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
643
644 int channels = convolutiondepthwise->num_output;
645
646 bool broadcasting_type_ok = false;
647 if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
648 broadcasting_type_ok = true;
649 if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
650 broadcasting_type_ok = true;
651
652 if (!broadcasting_type_ok)
653 {
654 // not bias-like broadcasting type
655 continue;
656 }
657
658 fprintf(stderr, "fuse_convolutiondepthwise_add %s %s\n", convolutiondepthwise->name.c_str(), binaryop->name.c_str());
659
660 ncnn::Mat bias_data = memorydata->data.reshape(channels);
661 {
662 if (convolutiondepthwise->bias_term == 0)
663 {
664 // init bias
665 convolutiondepthwise->bias_term = 1;
666 convolutiondepthwise->bias_data = bias_data;
667 }
668 else
669 {
670 float* bias = convolutiondepthwise->bias_data;
671 for (int i = 0; i < channels; i++)
672 {
673 bias[i] = bias[i] + bias_data[i];
674 }
675 }
676 }
677
678 int top_blob_index_final = binaryop->tops[0];
679 convolutiondepthwise->tops[0] = top_blob_index_final;
680 blobs[top_blob_index_final].producer = i;
681 binaryop->type = "ncnnfused";
682 }
683
684 return 0;
685 }
686
fuse_deconvolution_batchnorm()687 int NetOptimize::fuse_deconvolution_batchnorm()
688 {
689 const size_t layer_count = layers.size();
690 for (size_t i = 0; i < layer_count; i++)
691 {
692 if (layers[i]->type != "Deconvolution")
693 continue;
694
695 // Deconvolution - BatchNorm
696 int top_blob_index = layers[i]->tops[0];
697
698 size_t j = i + 1;
699 for (; j < layer_count; j++)
700 {
701 if (layers[j]->type != "BatchNorm")
702 continue;
703
704 if (layers[j]->bottoms.size() != 1)
705 continue;
706
707 if (layers[j]->bottoms[0] == top_blob_index)
708 break;
709 }
710
711 if (j == layer_count)
712 continue;
713
714 // fuse Deconvolution - BatchNorm to Deconvolution
715 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
716 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
717
718 fprintf(stderr, "fuse_deconvolution_batchnorm %s %s\n", deconvolution->name.c_str(), batchnorm->name.c_str());
719
720 {
721 int channels = batchnorm->channels;
722 float eps = batchnorm->eps;
723
724 // a = bias - slope * mean / sqrt(var + eps)
725 // b = slope / sqrt(var + eps)
726 // value = value * b + a
727
728 std::vector<float> a(channels);
729 std::vector<float> b(channels);
730 for (int i = 0; i < channels; i++)
731 {
732 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
733 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
734 b[i] = batchnorm->slope_data[i] / sqrt_var;
735 }
736
737 if (deconvolution->bias_term == 0)
738 {
739 // init bias as zero
740 deconvolution->bias_term = 1;
741 deconvolution->bias_data = ncnn::Mat(channels);
742 deconvolution->bias_data.fill(0.f);
743 }
744
745 const int weight_per_outch = deconvolution->weight_data_size / channels;
746
747 float* weight = deconvolution->weight_data;
748 float* bias = deconvolution->bias_data;
749 for (int i = 0; i < channels; i++)
750 {
751 float* conv_weight_outch = weight + weight_per_outch * i;
752 for (int j = 0; j < weight_per_outch; j++)
753 {
754 conv_weight_outch[j] *= b[i];
755 }
756
757 bias[i] = bias[i] * b[i] + a[i];
758 }
759 }
760
761 int top_blob_index_final = batchnorm->tops[0];
762 deconvolution->tops[0] = top_blob_index_final;
763 blobs[top_blob_index_final].producer = i;
764 batchnorm->type = "ncnnfused";
765 }
766
767 return 0;
768 }
769
fuse_deconvolution_mul()770 int NetOptimize::fuse_deconvolution_mul()
771 {
772 const size_t layer_count = layers.size();
773 for (size_t i = 0; i < layer_count; i++)
774 {
775 if (layers[i]->type != "Deconvolution")
776 continue;
777
778 // Deconvolution - BinaryOp
779 int top_blob_index = layers[i]->tops[0];
780
781 size_t j = i + 1;
782 for (; j < layer_count; j++)
783 {
784 if (layers[j]->type != "BinaryOp")
785 continue;
786
787 if (layers[j]->bottoms.size() != 2)
788 continue;
789
790 if (layers[j]->bottoms[0] == top_blob_index)
791 break;
792 }
793
794 if (j == layer_count)
795 continue;
796
797 // fuse Deconvolution - BinaryOp to Deconvolution
798 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
799 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
800
801 if (binaryop->op_type != 2 || binaryop->with_scalar)
802 continue;
803
804 // MemoryData - ..... - BinaryOp
805 size_t k = 0;
806 for (; k < j; k++)
807 {
808 if (layers[k]->type != "MemoryData")
809 continue;
810
811 if (layers[k]->tops[0] == binaryop->bottoms[1])
812 break;
813 }
814
815 if (k == j)
816 continue;
817
818 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
819
820 int channels = deconvolution->num_output;
821
822 if (memorydata->w != channels || memorydata->h != 0 || memorydata->c != 0)
823 {
824 // not bias-like broadcasting type
825 continue;
826 }
827
828 fprintf(stderr, "fuse_deconvolution_mul %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
829
830 {
831 const int weight_per_outch = deconvolution->weight_data_size / channels;
832
833 float* weight = deconvolution->weight_data;
834 float* bias = deconvolution->bias_data;
835 for (int i = 0; i < channels; i++)
836 {
837 float* conv_weight_outch = weight + weight_per_outch * i;
838 for (int j = 0; j < weight_per_outch; j++)
839 {
840 conv_weight_outch[j] *= memorydata->data[i];
841 }
842
843 if (bias)
844 {
845 bias[i] = bias[i] * memorydata->data[i];
846 }
847 }
848 }
849
850 int top_blob_index_final = binaryop->tops[0];
851 deconvolution->tops[0] = top_blob_index_final;
852 blobs[top_blob_index_final].producer = i;
853 binaryop->type = "ncnnfused";
854 }
855
856 return 0;
857 }
858
fuse_deconvolution_add()859 int NetOptimize::fuse_deconvolution_add()
860 {
861 const size_t layer_count = layers.size();
862 for (size_t i = 0; i < layer_count; i++)
863 {
864 if (layers[i]->type != "Deconvolution")
865 continue;
866
867 // Deconvolution - BinaryOp
868 int top_blob_index = layers[i]->tops[0];
869
870 size_t j = i + 1;
871 for (; j < layer_count; j++)
872 {
873 if (layers[j]->type != "BinaryOp")
874 continue;
875
876 if (layers[j]->bottoms.size() != 2)
877 continue;
878
879 if (layers[j]->bottoms[0] == top_blob_index)
880 break;
881 }
882
883 if (j == layer_count)
884 continue;
885
886 // fuse Deconvolution - BinaryOp to Deconvolution
887 ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
888 ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];
889
890 if (binaryop->op_type != 0 || binaryop->with_scalar)
891 continue;
892
893 // MemoryData - ..... - BinaryOp
894 size_t k = 0;
895 for (; k < j; k++)
896 {
897 if (layers[k]->type != "MemoryData")
898 continue;
899
900 if (layers[k]->tops[0] == binaryop->bottoms[1])
901 break;
902 }
903
904 if (k == j)
905 continue;
906
907 ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];
908
909 int channels = deconvolution->num_output;
910
911 bool broadcasting_type_ok = false;
912 if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
913 broadcasting_type_ok = true;
914 if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
915 broadcasting_type_ok = true;
916
917 if (!broadcasting_type_ok)
918 {
919 // not bias-like broadcasting type
920 continue;
921 }
922
923 fprintf(stderr, "fuse_deconvolution_add %s %s\n", deconvolution->name.c_str(), binaryop->name.c_str());
924
925 ncnn::Mat bias_data = memorydata->data.reshape(channels);
926 {
927 if (deconvolution->bias_term == 0)
928 {
929 // init bias
930 deconvolution->bias_term = 1;
931 deconvolution->bias_data = bias_data;
932 }
933 else
934 {
935 float* bias = deconvolution->bias_data;
936 for (int i = 0; i < channels; i++)
937 {
938 bias[i] = bias[i] + bias_data[i];
939 }
940 }
941 }
942
943 int top_blob_index_final = binaryop->tops[0];
944 deconvolution->tops[0] = top_blob_index_final;
945 blobs[top_blob_index_final].producer = i;
946 binaryop->type = "ncnnfused";
947 }
948
949 return 0;
950 }
951
fuse_deconvolutiondepthwise_batchnorm()952 int NetOptimize::fuse_deconvolutiondepthwise_batchnorm()
953 {
954 const size_t layer_count = layers.size();
955 for (size_t i = 0; i < layer_count; i++)
956 {
957 if (layers[i]->type != "DeconvolutionDepthWise")
958 continue;
959
960 // DeconvolutionDepthWise - BatchNorm
961 int top_blob_index = layers[i]->tops[0];
962
963 size_t j = i + 1;
964 for (; j < layer_count; j++)
965 {
966 if (layers[j]->type != "BatchNorm")
967 continue;
968
969 if (layers[j]->bottoms.size() != 1)
970 continue;
971
972 if (layers[j]->bottoms[0] == top_blob_index)
973 break;
974 }
975
976 if (j == layer_count)
977 continue;
978
979 // fuse DeconvolutionDepthWise - BatchNorm to DeconvolutionDepthWise
980 ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
981 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
982
983 fprintf(stderr, "fuse_deconvolutiondepthwise_batchnorm %s %s\n", deconvolutiondepthwise->name.c_str(), batchnorm->name.c_str());
984
985 {
986 int channels = batchnorm->channels;
987 float eps = batchnorm->eps;
988
989 // a = bias - slope * mean / sqrt(var + eps)
990 // b = slope / sqrt(var + eps)
991 // value = value * b + a
992
993 std::vector<float> a(channels);
994 std::vector<float> b(channels);
995 for (int i = 0; i < channels; i++)
996 {
997 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
998 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
999 b[i] = batchnorm->slope_data[i] / sqrt_var;
1000 }
1001
1002 if (deconvolutiondepthwise->bias_term == 0)
1003 {
1004 // init bias as zero
1005 deconvolutiondepthwise->bias_term = 1;
1006 deconvolutiondepthwise->bias_data = ncnn::Mat(channels);
1007 deconvolutiondepthwise->bias_data.fill(0.f);
1008 }
1009
1010 const int weight_per_outch = deconvolutiondepthwise->weight_data_size / channels;
1011
1012 float* weight = deconvolutiondepthwise->weight_data;
1013 float* bias = deconvolutiondepthwise->bias_data;
1014 for (int i = 0; i < channels; i++)
1015 {
1016 float* conv_weight_outch = weight + weight_per_outch * i;
1017 for (int j = 0; j < weight_per_outch; j++)
1018 {
1019 conv_weight_outch[j] *= b[i];
1020 }
1021
1022 bias[i] = bias[i] * b[i] + a[i];
1023 }
1024 }
1025
1026 int top_blob_index_final = batchnorm->tops[0];
1027 deconvolutiondepthwise->tops[0] = top_blob_index_final;
1028 blobs[top_blob_index_final].producer = i;
1029 batchnorm->type = "ncnnfused";
1030 }
1031
1032 return 0;
1033 }
1034
fuse_innerproduct_batchnorm()1035 int NetOptimize::fuse_innerproduct_batchnorm()
1036 {
1037 const size_t layer_count = layers.size();
1038 for (size_t i = 0; i < layer_count; i++)
1039 {
1040 if (layers[i]->type != "InnerProduct")
1041 continue;
1042
1043 // InnerProduct - BatchNorm
1044 int top_blob_index = layers[i]->tops[0];
1045
1046 size_t j = i + 1;
1047 for (; j < layer_count; j++)
1048 {
1049 if (layers[j]->type != "BatchNorm")
1050 continue;
1051
1052 if (layers[j]->bottoms.size() != 1)
1053 continue;
1054
1055 if (layers[j]->bottoms[0] == top_blob_index)
1056 break;
1057 }
1058
1059 if (j == layer_count)
1060 continue;
1061
1062 // fuse InnerProduct - BatchNorm to InnerProduct
1063 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1064 ncnn::BatchNorm* batchnorm = (ncnn::BatchNorm*)layers[j];
1065
1066 fprintf(stderr, "fuse_innerproduct_batchnorm %s %s\n", innerproduct->name.c_str(), batchnorm->name.c_str());
1067
1068 {
1069 int channels = batchnorm->channels;
1070 float eps = batchnorm->eps;
1071
1072 // a = bias - slope * mean / sqrt(var + eps)
1073 // b = slope / sqrt(var + eps)
1074 // value = value * b + a
1075
1076 std::vector<float> a(channels);
1077 std::vector<float> b(channels);
1078 for (int i = 0; i < channels; i++)
1079 {
1080 float sqrt_var = static_cast<float>(sqrt(batchnorm->var_data[i] + eps));
1081 a[i] = batchnorm->bias_data[i] - batchnorm->slope_data[i] * batchnorm->mean_data[i] / sqrt_var;
1082 b[i] = batchnorm->slope_data[i] / sqrt_var;
1083 }
1084
1085 if (innerproduct->bias_term == 0)
1086 {
1087 // init bias as zero
1088 innerproduct->bias_term = 1;
1089 innerproduct->bias_data = ncnn::Mat(channels);
1090 innerproduct->bias_data.fill(0.f);
1091 }
1092
1093 const int weight_per_outch = innerproduct->weight_data_size / channels;
1094
1095 float* weight = innerproduct->weight_data;
1096 float* bias = innerproduct->bias_data;
1097 for (int i = 0; i < channels; i++)
1098 {
1099 float* conv_weight_outch = weight + weight_per_outch * i;
1100 for (int j = 0; j < weight_per_outch; j++)
1101 {
1102 conv_weight_outch[j] *= b[i];
1103 }
1104
1105 bias[i] = bias[i] * b[i] + a[i];
1106 }
1107 }
1108
1109 int top_blob_index_final = batchnorm->tops[0];
1110 innerproduct->tops[0] = top_blob_index_final;
1111 blobs[top_blob_index_final].producer = i;
1112 batchnorm->type = "ncnnfused";
1113 }
1114
1115 return 0;
1116 }
1117
// Fold a bias-like elementwise addition (InnerProduct -> BinaryOp(ADD) whose
// second operand is a MemoryData constant) into the InnerProduct bias term.
// Fused layers are retyped to "ncnnfused" so the model writer skips them.
// Always returns 0.
int NetOptimize::fuse_innerproduct_add()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find the first two-input BinaryOp that consumes the InnerProduct
        // output as its first operand
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - BinaryOp to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        // only plain tensor+tensor addition qualifies (op_type 0 == ADD,
        // and a scalar variant is handled elsewhere)
        if (binaryop->op_type != 0 || binaryop->with_scalar)
            continue;

        // MemoryData - ..... - BinaryOp
        // the second operand must be a constant produced by a MemoryData layer
        size_t k = 0;
        for (; k < j; k++)
        {
            if (layers[k]->type != "MemoryData")
                continue;

            if (layers[k]->tops[0] == binaryop->bottoms[1])
                break;
        }

        if (k == j)
            continue;

        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[k];

        int channels = innerproduct->num_output;

        // the constant must broadcast like a per-output-channel bias:
        // a 1-d vector of length num_output, or a 1x1xnum_output blob
        bool broadcasting_type_ok = false;
        if (memorydata->w == channels && memorydata->h == 0 && memorydata->c == 0)
            broadcasting_type_ok = true;
        if (memorydata->w == 1 && memorydata->h == 1 && memorydata->c == channels)
            broadcasting_type_ok = true;

        if (!broadcasting_type_ok)
        {
            // not bias-like broadcasting type
            continue;
        }

        fprintf(stderr, "fuse_innerproduct_add %s %s\n", innerproduct->name.c_str(), binaryop->name.c_str());

        ncnn::Mat bias_data = memorydata->data.reshape(channels);
        {
            if (innerproduct->bias_term == 0)
            {
                // init bias
                innerproduct->bias_term = 1;
                innerproduct->bias_data = bias_data;
            }
            else
            {
                // accumulate the constant into the existing bias
                float* bias = innerproduct->bias_data;
                for (int i = 0; i < channels; i++)
                {
                    bias[i] = bias[i] + bias_data[i];
                }
            }
        }

        // rewire the graph: the InnerProduct now produces the BinaryOp output blob
        int top_blob_index_final = binaryop->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        binaryop->type = "ncnnfused";
    }

    return 0;
}
1210
fuse_innerproduct_dropout()1211 int NetOptimize::fuse_innerproduct_dropout()
1212 {
1213 const size_t layer_count = layers.size();
1214 for (size_t i = 0; i < layer_count; i++)
1215 {
1216 if (layers[i]->type != "InnerProduct")
1217 continue;
1218
1219 // InnerProduct - Dropout
1220 int top_blob_index = layers[i]->tops[0];
1221
1222 size_t j = i + 1;
1223 for (; j < layer_count; j++)
1224 {
1225 if (layers[j]->type != "Dropout")
1226 continue;
1227
1228 if (layers[j]->bottoms.size() != 1)
1229 continue;
1230
1231 if (layers[j]->bottoms[0] == top_blob_index)
1232 break;
1233 }
1234
1235 if (j == layer_count)
1236 continue;
1237
1238 // fuse InnerProduct - Dropout to InnerProduct
1239 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
1240 ncnn::Dropout* dropout = (ncnn::Dropout*)layers[j];
1241
1242 fprintf(stderr, "fuse_innerproduct_dropout %s %s\n", innerproduct->name.c_str(), dropout->name.c_str());
1243
1244 float scale = dropout->scale;
1245 if (scale != 1.f)
1246 {
1247 const int num_output = innerproduct->num_output;
1248 const int weight_per_outch = innerproduct->weight_data_size / num_output;
1249
1250 float* weight = innerproduct->weight_data;
1251 for (int i = 0; i < num_output; i++)
1252 {
1253 float* conv_weight_outch = weight + weight_per_outch * i;
1254 for (int j = 0; j < weight_per_outch; j++)
1255 {
1256 conv_weight_outch[j] *= scale;
1257 }
1258 }
1259
1260 if (innerproduct->bias_term)
1261 {
1262 float* bias = innerproduct->bias_data;
1263 for (int i = 0; i < num_output; i++)
1264 {
1265 bias[i] *= scale;
1266 }
1267 }
1268 }
1269
1270 int top_blob_index_final = dropout->tops[0];
1271 innerproduct->tops[0] = top_blob_index_final;
1272 blobs[top_blob_index_final].producer = i;
1273 dropout->type = "ncnnfused";
1274 }
1275
1276 return 0;
1277 }
1278
// Fold a trailing activation layer into the preceding Convolution (first pass)
// or Convolution1D (second pass) via the layer's built-in activation_type:
//   1=ReLU, 2=LeakyReLU(slope), 3=Clip(min,max), 4=Sigmoid, 5=Mish, 6=HardSwish(alpha,beta).
// Note: HardSwish is fused for Convolution only, not Convolution1D.
// Always returns 0.
int NetOptimize::fuse_convolution_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution")
            continue;

        // Convolution - Activation
        int top_blob_index = layers[i]->tops[0];

        // find the first supported single-input activation consuming this output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution - Activation to Convolution
        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolution_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                // plain ReLU
                convolution->activation_type = 1;
            }
            else
            {
                // leaky ReLU carries its slope in activation_params
                convolution->activation_type = 2;
                convolution->activation_params = ncnn::Mat(1);
                convolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolution->activation_type = 3;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = clip->min;
            convolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolution->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolution->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            convolution->activation_type = 6;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = hardswish->alpha;
            convolution->activation_params[1] = hardswish->beta;
        }

        // rewire the graph: the Convolution now produces the activation output blob
        int top_blob_index_final = activation->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    // second pass: same fusion for Convolution1D (no HardSwish support here)
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Convolution1D")
            continue;

        // Convolution1D - Activation
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Convolution1D - Activation to Convolution1D
        ncnn::Convolution1D* convolution = (ncnn::Convolution1D*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolution1d_activation %s %s\n", convolution->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                convolution->activation_type = 1;
            }
            else
            {
                convolution->activation_type = 2;
                convolution->activation_params = ncnn::Mat(1);
                convolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolution->activation_type = 3;
            convolution->activation_params = ncnn::Mat(2);
            convolution->activation_params[0] = clip->min;
            convolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolution->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolution->activation_type = 5;
        }

        int top_blob_index_final = activation->tops[0];
        convolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}
1431
// Fold a trailing activation layer into the preceding ConvolutionDepthWise
// using its built-in activation_type:
//   1=ReLU, 2=LeakyReLU(slope), 3=Clip(min,max), 4=Sigmoid, 5=Mish, 6=HardSwish(alpha,beta).
// Always returns 0.
int NetOptimize::fuse_convolutiondepthwise_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "ConvolutionDepthWise")
            continue;

        // ConvolutionDepthWise - Activation
        int top_blob_index = layers[i]->tops[0];

        // find the first supported single-input activation consuming this output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse ConvolutionDepthWise - Activation to ConvolutionDepthWise
        ncnn::ConvolutionDepthWise* convolutiondepthwise = (ncnn::ConvolutionDepthWise*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_convolutiondepthwise_activation %s %s\n", convolutiondepthwise->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                // plain ReLU
                convolutiondepthwise->activation_type = 1;
            }
            else
            {
                // leaky ReLU carries its slope in activation_params
                convolutiondepthwise->activation_type = 2;
                convolutiondepthwise->activation_params = ncnn::Mat(1);
                convolutiondepthwise->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            convolutiondepthwise->activation_type = 3;
            convolutiondepthwise->activation_params = ncnn::Mat(2);
            convolutiondepthwise->activation_params[0] = clip->min;
            convolutiondepthwise->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            convolutiondepthwise->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            convolutiondepthwise->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            convolutiondepthwise->activation_type = 6;
            convolutiondepthwise->activation_params = ncnn::Mat(2);
            convolutiondepthwise->activation_params[0] = hardswish->alpha;
            convolutiondepthwise->activation_params[1] = hardswish->beta;
        }

        // rewire the graph: depthwise convolution now produces the activation output blob
        int top_blob_index_final = activation->tops[0];
        convolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}
1515
// Fold a trailing activation layer into the preceding Deconvolution via its
// built-in activation_type: 1=ReLU, 2=LeakyReLU(slope), 3=Clip(min,max), 4=Sigmoid.
// Note: only ReLU/Clip/Sigmoid are fused here (no Mish/HardSwish).
// Always returns 0.
int NetOptimize::fuse_deconvolution_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Deconvolution")
            continue;

        // Deconvolution - Activation
        int top_blob_index = layers[i]->tops[0];

        // find the first supported single-input activation consuming this output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse Deconvolution - Activation to Deconvolution
        ncnn::Deconvolution* deconvolution = (ncnn::Deconvolution*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_deconvolution_activation %s %s\n", deconvolution->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                // plain ReLU
                deconvolution->activation_type = 1;
            }
            else
            {
                // leaky ReLU carries its slope in activation_params
                deconvolution->activation_type = 2;
                deconvolution->activation_params = ncnn::Mat(1);
                deconvolution->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            deconvolution->activation_type = 3;
            deconvolution->activation_params = ncnn::Mat(2);
            deconvolution->activation_params[0] = clip->min;
            deconvolution->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            deconvolution->activation_type = 4;
        }

        // rewire the graph: the Deconvolution now produces the activation output blob
        int top_blob_index_final = activation->tops[0];
        deconvolution->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}
1586
// Fold a trailing activation layer into the preceding DeconvolutionDepthWise
// via its built-in activation_type: 1=ReLU, 2=LeakyReLU(slope), 3=Clip(min,max), 4=Sigmoid.
// Note: only ReLU/Clip/Sigmoid are fused here (no Mish/HardSwish).
// Always returns 0.
int NetOptimize::fuse_deconvolutiondepthwise_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "DeconvolutionDepthWise")
            continue;

        // DeconvolutionDepthWise - Activation
        int top_blob_index = layers[i]->tops[0];

        // find the first supported single-input activation consuming this output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse DeconvolutionDepthWise - Activation to DeconvolutionDepthWise
        ncnn::DeconvolutionDepthWise* deconvolutiondepthwise = (ncnn::DeconvolutionDepthWise*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_deconvolutiondepthwise_activation %s %s\n", deconvolutiondepthwise->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                // plain ReLU
                deconvolutiondepthwise->activation_type = 1;
            }
            else
            {
                // leaky ReLU carries its slope in activation_params
                deconvolutiondepthwise->activation_type = 2;
                deconvolutiondepthwise->activation_params = ncnn::Mat(1);
                deconvolutiondepthwise->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            deconvolutiondepthwise->activation_type = 3;
            deconvolutiondepthwise->activation_params = ncnn::Mat(2);
            deconvolutiondepthwise->activation_params[0] = clip->min;
            deconvolutiondepthwise->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            deconvolutiondepthwise->activation_type = 4;
        }

        // rewire the graph: the layer now produces the activation output blob
        int top_blob_index_final = activation->tops[0];
        deconvolutiondepthwise->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}
1657
// Fold a trailing activation layer into the preceding InnerProduct via its
// built-in activation_type:
//   1=ReLU, 2=LeakyReLU(slope), 3=Clip(min,max), 4=Sigmoid, 5=Mish, 6=HardSwish(alpha,beta).
// Always returns 0.
int NetOptimize::fuse_innerproduct_activation()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "InnerProduct")
            continue;

        // InnerProduct - Activation
        int top_blob_index = layers[i]->tops[0];

        // find the first supported single-input activation consuming this output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "ReLU" && layers[j]->type != "Clip" && layers[j]->type != "Sigmoid" && layers[j]->type != "Mish" && layers[j]->type != "HardSwish")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse InnerProduct - Activation to InnerProduct
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
        ncnn::Layer* activation = layers[j];

        fprintf(stderr, "fuse_innerproduct_activation %s %s\n", innerproduct->name.c_str(), activation->name.c_str());

        if (activation->type == "ReLU")
        {
            ncnn::ReLU* relu = (ncnn::ReLU*)activation;

            if (relu->slope == 0.f)
            {
                // plain ReLU
                innerproduct->activation_type = 1;
            }
            else
            {
                // leaky ReLU carries its slope in activation_params
                innerproduct->activation_type = 2;
                innerproduct->activation_params = ncnn::Mat(1);
                innerproduct->activation_params[0] = relu->slope;
            }
        }
        else if (activation->type == "Clip")
        {
            ncnn::Clip* clip = (ncnn::Clip*)activation;

            innerproduct->activation_type = 3;
            innerproduct->activation_params = ncnn::Mat(2);
            innerproduct->activation_params[0] = clip->min;
            innerproduct->activation_params[1] = clip->max;
        }
        else if (activation->type == "Sigmoid")
        {
            innerproduct->activation_type = 4;
        }
        else if (activation->type == "Mish")
        {
            innerproduct->activation_type = 5;
        }
        else if (activation->type == "HardSwish")
        {
            ncnn::HardSwish* hardswish = (ncnn::HardSwish*)activation;

            innerproduct->activation_type = 6;
            innerproduct->activation_params = ncnn::Mat(2);
            innerproduct->activation_params[0] = hardswish->alpha;
            innerproduct->activation_params[1] = hardswish->beta;
        }

        // rewire the graph: the InnerProduct now produces the activation output blob
        int top_blob_index_final = activation->tops[0];
        innerproduct->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        activation->type = "ncnnfused";
    }

    return 0;
}
1741
// Replace a scalar MemoryData operand of a BinaryOp with the BinaryOp's
// with_scalar/b fast path. Two patterns are handled:
//   1) MemoryData -> BinaryOp
//   2) MemoryData -> Split -> BinaryOp (one Split output at a time)
// When the scalar feeds the FIRST operand, commutative ops keep their type
// and SUB/DIV are rewritten to RSUB/RDIV so the operand order stays correct.
// Always returns 0.
int NetOptimize::fuse_memorydata_binaryop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find a two-input BinaryOp consuming the MemoryData output on either side
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // fuse MemoryData - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        // which bottom slot holds the MemoryData blob (default: second)
        int memorydata_index = 1;

        if (binaryop->bottoms[0] == top_blob_index)
        {
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                // commutative: safe to treat scalar as the second operand
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                // scalar - x  ==>  reverse-subtract with scalar b
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                // scalar / x  ==>  reverse-divide with scalar b
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        // drop the constant input; the MemoryData layer becomes dead
        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        memorydata->type = "ncnnfused";
    }

    // second pass: the scalar constant is shared through a Split layer
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "MemoryData")
            continue;

        // MemoryData - Split - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find the Split fed by this MemoryData
        size_t j0 = i + 1;
        for (; j0 < layer_count; j0++)
        {
            if (layers[j0]->type != "Split")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (layers[j0]->bottoms[0] == top_blob_index)
                break;
        }

        if (j0 == layer_count)
            continue;

        // which Split output feeds the matched BinaryOp
        int split_top_blob_index = -1;

        size_t j1 = j0 + 1;
        for (; j1 < layer_count; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 2)
                continue;

            for (int k = 0; k < (int)layers[j0]->tops.size(); k++)
            {
                if (layers[j1]->bottoms[0] == layers[j0]->tops[k] || layers[j1]->bottoms[1] == layers[j0]->tops[k])
                {
                    split_top_blob_index = k;
                    break;
                }
            }

            if (split_top_blob_index != -1)
                break;
        }

        if (j1 == layer_count)
            continue;

        // fuse MemoryData - Split - BinaryOp to BinaryOp
        ncnn::MemoryData* memorydata = (ncnn::MemoryData*)layers[i];
        ncnn::Split* split = (ncnn::Split*)layers[j0];
        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j1];

        if (memorydata->w != 1 || memorydata->h != 0 || memorydata->c != 0)
        {
            // not a scalar
            continue;
        }

        // which bottom slot holds the Split blob (default: second)
        int memorydata_index = 1;

        if (binaryop->bottoms[0] == split->tops[split_top_blob_index])
        {
            int op_type = binaryop->op_type;

            if (op_type == ncnn::BinaryOp::Operation_ADD
                    || op_type == ncnn::BinaryOp::Operation_MUL
                    || op_type == ncnn::BinaryOp::Operation_MAX
                    || op_type == ncnn::BinaryOp::Operation_MIN)
            {
                // commutative: safe to treat scalar as the second operand
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_SUB)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RSUB;
                memorydata_index = 0;
            }
            else if (op_type == ncnn::BinaryOp::Operation_DIV)
            {
                binaryop->op_type = ncnn::BinaryOp::Operation_RDIV;
                memorydata_index = 0;
            }
            else
            {
                // non interchangeable binaryop
                continue;
            }
        }

        float scalar = memorydata->data[0];

        binaryop->with_scalar = 1;
        binaryop->b = scalar;

        fprintf(stderr, "fuse_memorydata_binaryop %s %s\n", memorydata->name.c_str(), binaryop->name.c_str());

        // drop this consumer and the consumed Split output; when the Split runs
        // out of outputs, both the Split and the MemoryData become dead
        binaryop->bottoms.erase(binaryop->bottoms.begin() + memorydata_index);
        split->tops.erase(split->tops.begin() + split_top_blob_index);
        if (split->tops.empty())
        {
            split->type = "ncnnfused";
            memorydata->type = "ncnnfused";
        }

        // rescan the same MemoryData: the Split may feed more BinaryOp consumers
        i--;
    }

    return 0;
}
1932
// Convert  (a * c0) + (b * c1)  expressed as scalar-MUL BinaryOps feeding an
// ADD BinaryOp into a single Eltwise SUM layer with coefficients. Either MUL
// input may be absent, in which case its coefficient defaults to 1.
// The ADD layer object is replaced in-place (and deleted); the fused MUL
// layers are retyped to "ncnnfused". Always returns 0.
int NetOptimize::fuse_binaryop_eltwise()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "BinaryOp")
            continue;

        if (layers[i]->bottoms.size() != 2)
            continue;

        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[i];

        // only a plain two-tensor addition can become an Eltwise SUM
        if (binaryop->op_type != ncnn::BinaryOp::Operation_ADD)
            continue;

        if (binaryop->with_scalar)
            continue;

        // BinaryOp - BinaryOp - BinaryOp
        int bottom_blob_index_0 = binaryop->bottoms[0];
        int bottom_blob_index_1 = binaryop->bottoms[1];

        // look for a scalar-MUL BinaryOp (one bottom == with_scalar form)
        // producing the first addend
        size_t j0 = 0;
        for (; j0 < i; j0++)
        {
            if (layers[j0]->type != "BinaryOp")
                continue;

            if (layers[j0]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j0])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j0]->tops[0] == bottom_blob_index_0)
                break;
        }

        // same search for the second addend
        size_t j1 = 0;
        for (; j1 < i; j1++)
        {
            if (layers[j1]->type != "BinaryOp")
                continue;

            if (layers[j1]->bottoms.size() != 1)
                continue;

            if (((ncnn::BinaryOp*)layers[j1])->op_type != ncnn::BinaryOp::Operation_MUL)
                continue;

            if (layers[j1]->tops[0] == bottom_blob_index_1)
                break;
        }

        // need at least one scalar-MUL producer to make the fusion worthwhile
        if (j0 == i && j1 == i)
            continue;

        ncnn::BinaryOp* binaryop0 = (ncnn::BinaryOp*)layers[j0];
        ncnn::BinaryOp* binaryop1 = (ncnn::BinaryOp*)layers[j1];

        fprintf(stderr, "fuse_binaryop_eltwise %s %s %s\n", binaryop0->name.c_str(), binaryop1->name.c_str(), binaryop->name.c_str());

        // build the replacement Eltwise carrying the ADD layer's identity
        ncnn::Eltwise* eltwise = (ncnn::Eltwise*)ncnn::create_layer("Eltwise");

        eltwise->type = "Eltwise";
        eltwise->name = binaryop->name;
        eltwise->bottoms = binaryop->bottoms;
        eltwise->tops = binaryop->tops;

        // load default params before overriding op_type/coeffs
        ncnn::ParamDict pd;
        eltwise->load_param(pd);

        eltwise->op_type = ncnn::Eltwise::Operation_SUM;

        eltwise->coeffs = ncnn::Mat(2);

        if (j0 != i && j1 != i)
        {
            // fuse BinaryOp - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = binaryop1->b;

            // bypass both MULs: consume their inputs directly
            eltwise->bottoms[0] = binaryop0->bottoms[0];
            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop0->type = "ncnnfused";
            binaryop1->type = "ncnnfused";
        }
        if (j0 != i && j1 == i)
        {
            // fuse BinaryOp - X - BinaryOp to Eltwise
            eltwise->coeffs[0] = binaryop0->b;
            eltwise->coeffs[1] = 1.f;

            eltwise->bottoms[0] = binaryop0->bottoms[0];

            binaryop0->type = "ncnnfused";
        }
        if (j0 == i && j1 != i)
        {
            // fuse X - BinaryOp - BinaryOp to Eltwise
            eltwise->coeffs[0] = 1.f;
            eltwise->coeffs[1] = binaryop1->b;

            eltwise->bottoms[1] = binaryop1->bottoms[0];

            binaryop1->type = "ncnnfused";
        }

        // swap the ADD layer out of the graph and free it
        layers[i] = eltwise;
        delete binaryop;
    }

    return 0;
}
2049
eliminate_dropout()2050 int NetOptimize::eliminate_dropout()
2051 {
2052 const size_t layer_count = layers.size();
2053 for (size_t i = 0; i < layer_count; i++)
2054 {
2055 if (layers[i]->type != "Dropout")
2056 continue;
2057
2058 ncnn::Dropout* dropout = (ncnn::Dropout*)layers[i];
2059 if (dropout->scale != 1.f)
2060 continue;
2061
2062 // Any - Dropout
2063 int bottom_blob_index = layers[i]->bottoms[0];
2064
2065 int j = i - 1;
2066 for (; j >= 0; j--)
2067 {
2068 if (layers[j]->type == "ncnnfused")
2069 continue;
2070
2071 if (layers[j]->tops.size() != 1)
2072 continue;
2073
2074 if (layers[j]->tops[0] == bottom_blob_index)
2075 break;
2076 }
2077
2078 if (j == -1)
2079 continue;
2080
2081 ncnn::Layer* any = layers[j];
2082
2083 fprintf(stderr, "eliminate_dropout %s %s\n", any->name.c_str(), dropout->name.c_str());
2084
2085 int top_blob_index_final = dropout->tops[0];
2086 any->tops[0] = top_blob_index_final;
2087 blobs[top_blob_index_final].producer = j;
2088 dropout->type = "ncnnfused";
2089 }
2090
2091 return 0;
2092 }
2093
// Remove Pooling layers that are identity operations: 1x1 kernel, stride 1,
// no padding, not global pooling. The producer of the pooling input is rewired
// to emit the pooling output blob directly. Always returns 0.
int NetOptimize::eliminate_pooling1x1()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->pad_left != 0 || pooling->pad_right != 0 || pooling->pad_top != 0 || pooling->pad_bottom != 0)
            continue;

        if (pooling->kernel_w != 1 || pooling->kernel_h != 1 || pooling->stride_w != 1 || pooling->stride_h != 1)
            continue;

        if (pooling->global_pooling != 0)
            continue;

        // Any - Pooling
        int bottom_blob_index = layers[i]->bottoms[0];

        // walk backwards for the live producer; top_i records which of its
        // output slots feeds the pooling
        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_pooling1x1 %s %s\n", any->name.c_str(), pooling->name.c_str());

        // bypass the pooling: producer writes directly to the pooling output blob
        int top_blob_index_final = pooling->tops[0];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        pooling->type = "ncnnfused";
    }

    return 0;
}
2150
// Remove Noop layers. A Noop with no inputs is simply dropped (its output
// blobs lose their producer); a Noop with an input is bypassed by rewiring
// its producer to the Noop output blob. Always returns 0.
int NetOptimize::eliminate_noop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Noop")
            continue;

        ncnn::Layer* noop = layers[i];

        if (noop->bottoms.empty())
        {
            // Noop with no input: drop it and orphan its output blobs
            fprintf(stderr, "eliminate_noop %s\n", noop->name.c_str());

            size_t top_blob_count = noop->tops.size();
            for (size_t j = 0; j < top_blob_count; j++)
            {
                int top_blob_index_final = noop->tops[j];
                blobs[top_blob_index_final].producer = -1;
            }
            noop->type = "ncnnfused";

            continue;
        }

        // Any - Noop
        int bottom_blob_index = noop->bottoms[0];

        // walk backwards for the live producer; any_k records which of its
        // output slots feeds the Noop
        int j = i - 1;
        int any_k = -1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            bool link_noop = false;
            size_t top_blob_count = layers[j]->tops.size();
            for (size_t k = 0; k < top_blob_count; k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    link_noop = true;
                    any_k = k;
                    break;
                }
            }

            if (link_noop)
                break;
        }

        if (j == -1 || any_k == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_noop %s %s\n", any->name.c_str(), noop->name.c_str());

        // bypass the noop: producer writes directly to the noop output blob
        int top_blob_index_final = noop->tops[0];
        any->tops[any_k] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;

        noop->type = "ncnnfused";
    }

    return 0;
}
2219
// Remove Split layers whose outputs have at most one consumed blob: the
// producer of the split input is rewired to emit that blob directly.
// Always returns 0.
int NetOptimize::eliminate_split()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Split")
            continue;

        ncnn::Layer* split = layers[i];

        // count how many split outputs are actually consumed
        int real_split_output_count = 0;
        int real_split_top_blob_index = -1;
        size_t top_blob_count = split->tops.size();
        for (size_t j = 0; j < top_blob_count; j++)
        {
            int top_blob_index_final = split->tops[j];
            if (blobs[top_blob_index_final].consumer != -1)
            {
                real_split_output_count += 1;
                real_split_top_blob_index = j;
            }
        }

        // more than one live consumer: the split is still needed
        if (real_split_output_count > 1)
            continue;

        // Any - Split
        int bottom_blob_index = split->bottoms[0];

        // walk backwards for the live producer; top_i records which of its
        // output slots feeds the split
        int top_i = -1;
        int j = i - 1;
        for (; j >= 0; j--)
        {
            if (layers[j]->type == "ncnnfused")
                continue;

            for (size_t k = 0; k < layers[j]->tops.size(); k++)
            {
                if (layers[j]->tops[k] == bottom_blob_index)
                {
                    top_i = k;
                    break;
                }
            }

            if (top_i != -1)
                break;
        }

        if (j == -1)
            continue;

        ncnn::Layer* any = layers[j];

        fprintf(stderr, "eliminate_split %s %s\n", any->name.c_str(), split->name.c_str());

        // bypass the split: producer writes directly to its sole consumed output
        int top_blob_index_final = split->tops[real_split_top_blob_index];
        any->tops[top_i] = top_blob_index_final;
        blobs[top_blob_index_final].producer = j;
        split->type = "ncnnfused";
    }

    return 0;
}
2284
eliminate_orphaned_memorydata()2285 int NetOptimize::eliminate_orphaned_memorydata()
2286 {
2287 const size_t layer_count = layers.size();
2288 for (size_t i = 0; i < layer_count; i++)
2289 {
2290 if (layers[i]->type != "MemoryData")
2291 continue;
2292
2293 // MemoryData - X
2294 int top_blob_index = layers[i]->tops[0];
2295
2296 size_t j = i + 1;
2297 for (; j < layer_count; j++)
2298 {
2299 if (layers[j]->type == "ncnnfused")
2300 continue;
2301
2302 bool orphaned = true;
2303 for (size_t k = 0; k < layers[j]->bottoms.size(); k++)
2304 {
2305 if (layers[j]->bottoms[k] == top_blob_index)
2306 {
2307 orphaned = false;
2308 break;
2309 }
2310 }
2311
2312 if (!orphaned)
2313 break;
2314 }
2315
2316 if (j < layer_count)
2317 continue;
2318
2319 // assert orphaned == true
2320 fprintf(stderr, "eliminate_orphaned_memorydata %s\n", layers[i]->name.c_str());
2321
2322 layers[i]->type = "ncnnfused";
2323 }
2324
2325 return 0;
2326 }
2327
// Remove a Reshape that only flattens the output of a global Pooling
// (h == -233 and c == -233 mean "collapse those dims" in ncnn Reshape params;
// permute must be off). The pooling takes over the reshape output blob.
// Always returns 0.
int NetOptimize::eliminate_reshape_after_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->global_pooling == 0)
            continue;

        // Pooling - Reshape
        int top_blob_index = layers[i]->tops[0];

        // find the single-input Reshape consuming the pooling output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Reshape")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        // only the pure-flatten reshape form is redundant here
        ncnn::Reshape* reshape = (ncnn::Reshape*)layers[j];
        if (reshape->h != -233 || reshape->c != -233 || reshape->permute != 0)
            continue;

        fprintf(stderr, "eliminate_reshape_after_global_pooling %s %s\n", pooling->name.c_str(), reshape->name.c_str());

        // bypass the reshape: pooling writes directly to the reshape output blob
        int top_blob_index_final = reshape->tops[0];
        pooling->tops[0] = top_blob_index_final;
        blobs[top_blob_index_final].producer = i;
        reshape->type = "ncnnfused";
    }

    return 0;
}
2373
eliminate_flatten_after_global_pooling()2374 int NetOptimize::eliminate_flatten_after_global_pooling()
2375 {
2376 const size_t layer_count = layers.size();
2377 for (size_t i = 0; i < layer_count; i++)
2378 {
2379 if (layers[i]->type != "Pooling")
2380 continue;
2381
2382 ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
2383 if (pooling->global_pooling == 0)
2384 continue;
2385
2386 // Pooling - Flatten
2387 int top_blob_index = layers[i]->tops[0];
2388
2389 size_t j = i + 1;
2390 for (; j < layer_count; j++)
2391 {
2392 if (layers[j]->type != "Flatten")
2393 continue;
2394
2395 if (layers[j]->bottoms.size() != 1)
2396 continue;
2397
2398 if (layers[j]->bottoms[0] == top_blob_index)
2399 break;
2400 }
2401
2402 if (j == layer_count)
2403 continue;
2404
2405 ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2406
2407 fprintf(stderr, "eliminate_flatten_after_global_pooling %s %s\n", pooling->name.c_str(), flatten->name.c_str());
2408
2409 int top_blob_index_final = flatten->tops[0];
2410 pooling->tops[0] = top_blob_index_final;
2411 blobs[top_blob_index_final].producer = i;
2412 flatten->type = "ncnnfused";
2413 }
2414
2415 return 0;
2416 }
2417
eliminate_flatten_after_innerproduct()2418 int NetOptimize::eliminate_flatten_after_innerproduct()
2419 {
2420 const size_t layer_count = layers.size();
2421 for (size_t i = 0; i < layer_count; i++)
2422 {
2423 if (layers[i]->type != "InnerProduct")
2424 continue;
2425
2426 // InnerProduct - Flatten
2427 int top_blob_index = layers[i]->tops[0];
2428
2429 size_t j = i + 1;
2430 for (; j < layer_count; j++)
2431 {
2432 if (layers[j]->type != "Flatten")
2433 continue;
2434
2435 if (layers[j]->bottoms.size() != 1)
2436 continue;
2437
2438 if (layers[j]->bottoms[0] == top_blob_index)
2439 break;
2440 }
2441
2442 if (j == layer_count)
2443 continue;
2444
2445 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2446 ncnn::Flatten* flatten = (ncnn::Flatten*)layers[j];
2447
2448 fprintf(stderr, "eliminate_flatten_after_innerproduct %s %s\n", innerproduct->name.c_str(), flatten->name.c_str());
2449
2450 int top_blob_index_final = flatten->tops[0];
2451 innerproduct->tops[0] = top_blob_index_final;
2452 blobs[top_blob_index_final].producer = i;
2453 flatten->type = "ncnnfused";
2454 }
2455
2456 return 0;
2457 }
2458
// Drop a Reshape-to-1x1 layer feeding one side of a two-input BinaryOp.
// BinaryOp broadcasts its inputs, so reshaping an operand to w=1,h=1 first
// is redundant; the BinaryOp is rewired to consume the reshape's input blob.
int NetOptimize::eliminate_reshape_before_binaryop()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Reshape")
            continue;

        ncnn::Reshape* reshape = (ncnn::Reshape*)layers[i];
        // only a reshape to a 1x1 spatial extent without permute is removable
        if (reshape->w != 1 || reshape->h != 1 || reshape->permute != 0)
            continue;

        // Reshape - BinaryOp
        int top_blob_index = layers[i]->tops[0];

        // find a two-input BinaryOp that consumes the reshape output on either side
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "BinaryOp")
                continue;

            if (layers[j]->bottoms.size() != 2)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index || layers[j]->bottoms[1] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::BinaryOp* binaryop = (ncnn::BinaryOp*)layers[j];

        fprintf(stderr, "eliminate_reshape_before_binaryop %s %s\n", reshape->name.c_str(), binaryop->name.c_str());

        // rewire whichever operand(s) came from the reshape to its input blob
        int bottom_blob_index_final = reshape->bottoms[0];
        if (layers[j]->bottoms[0] == top_blob_index)
            binaryop->bottoms[0] = bottom_blob_index_final;
        if (layers[j]->bottoms[1] == top_blob_index)
            binaryop->bottoms[1] = bottom_blob_index_final;
        // NOTE(review): records the binaryop as *the* consumer — assumes the
        // reshape input had no other consumers; verify against eliminate_split
        blobs[bottom_blob_index_final].consumer = j;
        reshape->type = "ncnnfused";
    }

    return 0;
}
2505
// Replace a chain of two mean Reductions with a single global average pooling.
// Pattern: Reduction(mean over axis 2 or 3) followed by Reduction(mean over
// axis 2) — together they average over both spatial dimensions, which is
// exactly global average pooling (pooling_type=1, global_pooling=1).
int NetOptimize::replace_reduction_with_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Reduction")
            continue;

        ncnn::Reduction* reduction1 = (ncnn::Reduction*)layers[i];
        // operation 3 = mean; reject reduce_all and any output scaling
        if (reduction1->operation != 3 || reduction1->reduce_all != 0 || reduction1->coeff != 1.f)
            continue;

        // exactly one reduction axis
        if (reduction1->axes.w != 1)
            continue;

        // axes is stored in a Mat; read it as a raw int array
        const int* axes_ptr = reduction1->axes;
        if (axes_ptr[0] != 2 && axes_ptr[0] != 3)
            continue;

        // Reduction(2/3) - Reduction(2)
        int top_blob_index = layers[i]->tops[0];

        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Reduction")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::Reduction* reduction2 = (ncnn::Reduction*)layers[j];
        // second reduction must also be an unscaled single-axis mean
        if (reduction2->operation != 3 || reduction2->reduce_all != 0 || reduction2->coeff != 1.f)
            continue;

        if (reduction2->axes.w != 1)
            continue;

        const int* axes2_ptr = reduction2->axes;
        if (axes2_ptr[0] != 2)
            continue;

        fprintf(stderr, "replace_reduction_with_global_pooling %s %s\n", reduction1->name.c_str(), reduction2->name.c_str());

        // build the replacement layer, inheriting the second reduction's
        // position in the graph (name, bottoms, tops)
        ncnn::Pooling* pooling = (ncnn::Pooling*)ncnn::create_layer("Pooling");

        pooling->type = "Pooling";
        pooling->name = reduction2->name;
        pooling->bottoms = reduction2->bottoms;
        pooling->tops = reduction2->tops;

        // load defaults from an empty param dict before overriding
        ncnn::ParamDict pd;
        pooling->load_param(pd);

        pooling->pooling_type = 1; // average
        pooling->global_pooling = 1;

        layers[j] = pooling;
        delete reduction2;

        // bypass the first reduction entirely: pooling reads its input blob
        int bottom_blob_index_final = reduction1->bottoms[0];
        pooling->bottoms[0] = bottom_blob_index_final;
        blobs[bottom_blob_index_final].consumer = j;
        reduction1->type = "ncnnfused";
    }

    return 0;
}
2581
replace_prelu_with_leaky_relu()2582 int NetOptimize::replace_prelu_with_leaky_relu()
2583 {
2584 const size_t layer_count = layers.size();
2585 for (size_t i = 0; i < layer_count; i++)
2586 {
2587 if (layers[i]->type != "PReLU")
2588 continue;
2589
2590 ncnn::PReLU* prelu = (ncnn::PReLU*)layers[i];
2591 if (prelu->num_slope != 1)
2592 continue;
2593
2594 fprintf(stderr, "replace_prelu_with_leaky_relu %s\n", prelu->name.c_str());
2595
2596 ncnn::ReLU* relu = (ncnn::ReLU*)ncnn::create_layer("ReLU");
2597
2598 relu->type = "ReLU";
2599 relu->name = prelu->name;
2600 relu->bottoms = prelu->bottoms;
2601 relu->tops = prelu->tops;
2602
2603 ncnn::ParamDict pd;
2604 relu->load_param(pd);
2605
2606 relu->slope = prelu->slope_data[0];
2607
2608 layers[i] = relu;
2609 delete prelu;
2610 }
2611
2612 return 0;
2613 }
2614
// Replace a Convolution that follows a global pooling with an InnerProduct.
// After global pooling the spatial size is 1x1, so a convolution degenerates
// to a fully connected layer; InnerProduct is the cheaper equivalent.
// All weights and parameters are carried over unchanged.
int NetOptimize::replace_convolution_with_innerproduct_after_global_pooling()
{
    const size_t layer_count = layers.size();
    for (size_t i = 0; i < layer_count; i++)
    {
        if (layers[i]->type != "Pooling")
            continue;

        ncnn::Pooling* pooling = (ncnn::Pooling*)layers[i];
        if (pooling->global_pooling == 0)
            continue;

        // Pooling - Convolution
        int top_blob_index = layers[i]->tops[0];

        // find a single-input Convolution consuming the pooling output
        size_t j = i + 1;
        for (; j < layer_count; j++)
        {
            if (layers[j]->type != "Convolution")
                continue;

            if (layers[j]->bottoms.size() != 1)
                continue;

            if (layers[j]->bottoms[0] == top_blob_index)
                break;
        }

        if (j == layer_count)
            continue;

        ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];

        fprintf(stderr, "replace_convolution_with_innerproduct_after_global_pooling %s %s\n", pooling->name.c_str(), convolution->name.c_str());

        // build the replacement, inheriting the convolution's graph position
        ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");

        innerproduct->type = "InnerProduct";
        innerproduct->name = convolution->name;
        innerproduct->bottoms = convolution->bottoms;
        innerproduct->tops = convolution->tops;

        // load defaults from an empty param dict before copying fields over
        ncnn::ParamDict pd;
        innerproduct->load_param(pd);

        innerproduct->num_output = convolution->num_output;
        innerproduct->bias_term = convolution->bias_term;
        innerproduct->weight_data_size = convolution->weight_data_size;
        innerproduct->int8_scale_term = convolution->int8_scale_term;

        // weight layout is compatible: 1x1 conv weights are the FC matrix
        innerproduct->weight_data = convolution->weight_data;
        innerproduct->bias_data = convolution->bias_data;
#if NCNN_INT8
        innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
        innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
#endif

        innerproduct->activation_type = convolution->activation_type;
        innerproduct->activation_params = convolution->activation_params;

        layers[j] = innerproduct;
        delete convolution;
    }

    return 0;
}
2681
replace_convolution_with_innerproduct_after_innerproduct()2682 int NetOptimize::replace_convolution_with_innerproduct_after_innerproduct()
2683 {
2684 const size_t layer_count = layers.size();
2685 for (;;)
2686 {
2687 bool replaced = false;
2688
2689 for (size_t i = 0; i < layer_count; i++)
2690 {
2691 if (layers[i]->type != "InnerProduct")
2692 continue;
2693
2694 // InnerProduct - Convolution
2695 int top_blob_index = layers[i]->tops[0];
2696
2697 size_t j = i + 1;
2698 for (; j < layer_count; j++)
2699 {
2700 if (layers[j]->type != "Convolution")
2701 continue;
2702
2703 if (layers[j]->bottoms.size() != 1)
2704 continue;
2705
2706 if (layers[j]->bottoms[0] == top_blob_index)
2707 break;
2708 }
2709
2710 if (j == layer_count)
2711 continue;
2712
2713 ncnn::InnerProduct* innerproduct = (ncnn::InnerProduct*)layers[i];
2714 ncnn::Convolution* convolution = (ncnn::Convolution*)layers[j];
2715
2716 fprintf(stderr, "replace_convolution_with_innerproduct_after_innerproduct %s %s\n", innerproduct->name.c_str(), convolution->name.c_str());
2717
2718 ncnn::InnerProduct* innerproduct2 = (ncnn::InnerProduct*)ncnn::create_layer("InnerProduct");
2719
2720 innerproduct2->type = "InnerProduct";
2721 innerproduct2->name = convolution->name;
2722 innerproduct2->bottoms = convolution->bottoms;
2723 innerproduct2->tops = convolution->tops;
2724
2725 ncnn::ParamDict pd;
2726 innerproduct2->load_param(pd);
2727
2728 innerproduct2->num_output = convolution->num_output;
2729 innerproduct2->bias_term = convolution->bias_term;
2730 innerproduct2->weight_data_size = convolution->weight_data_size;
2731 innerproduct->int8_scale_term = convolution->int8_scale_term;
2732
2733 innerproduct2->weight_data = convolution->weight_data;
2734 innerproduct2->bias_data = convolution->bias_data;
2735 #if NCNN_INT8
2736 innerproduct->weight_data_int8_scales = convolution->weight_data_int8_scales;
2737 innerproduct->bottom_blob_int8_scales = convolution->bottom_blob_int8_scales;
2738 #endif
2739
2740 innerproduct2->activation_type = convolution->activation_type;
2741 innerproduct2->activation_params = convolution->activation_params;
2742
2743 layers[j] = innerproduct2;
2744 delete convolution;
2745
2746 replaced = true;
2747 }
2748
2749 if (!replaced)
2750 break;
2751 }
2752
2753 return 0;
2754 }
2755
// Entry point of the ncnnoptimize tool.
// Usage: ncnnoptimize inparam inbin outparam outbin flag [cutstart] [cutend]
// Loads a model, runs the fusion/elimination/replacement passes in a fixed
// order (ordering matters: earlier passes create the patterns later passes
// match), then writes the optimized param/bin pair.
int main(int argc, char** argv)
{
    if (argc < 6)
    {
        fprintf(stderr, "usage: %s [inparam] [inbin] [outparam] [outbin] [flag] [cutstart] [cutend]\n", argv[0]);
        return -1;
    }

    const char* inparam = argv[1];
    const char* inbin = argv[2];
    const char* outparam = argv[3];
    const char* outbin = argv[4];
    int flag = atoi(argv[5]);
    const char* cutstartname = nullptr;
    const char* cutendname = nullptr;

    // optional sub-graph extraction boundaries
    if (argc > 6)
    {
        cutstartname = argv[6];
    }

    if (argc > 7)
    {
        cutendname = argv[7];
    }

    NetOptimize optimizer;

    // flag 65536 or 1 selects storage_type 1 (presumably fp16 weight
    // storage — confirm against ModelWriter); anything else keeps fp32
    if (flag == 65536 || flag == 1)
    {
        optimizer.storage_type = 1;
    }
    else
    {
        optimizer.storage_type = 0;
    }

    optimizer.load_param(inparam);

    // "null" as the bin path means no weights: load zeros from an empty
    // reader and emit randomly generated weights at save time
    if (strcmp(inbin, "null") == 0)
    {
        DataReaderFromEmpty dr;
        optimizer.load_model(dr);
        optimizer.gen_random_weight = true;
    }
    else
        optimizer.load_model(inbin);

    if (optimizer.set_cutparam(cutstartname, cutendname) < 0)
    {
        return -1;
    }

    // --- fusion passes: fold adjacent layers into single ones ---
    optimizer.fuse_batchnorm_scale();
    optimizer.fuse_convolution_batchnorm();
    optimizer.fuse_convolution_mul();
    optimizer.fuse_convolution_add();
    optimizer.fuse_convolutiondepthwise_batchnorm();
    optimizer.fuse_convolutiondepthwise_mul();
    optimizer.fuse_convolutiondepthwise_add();
    optimizer.fuse_deconvolution_batchnorm();
    optimizer.fuse_deconvolution_mul();
    optimizer.fuse_deconvolution_add();
    optimizer.fuse_deconvolutiondepthwise_batchnorm();
    optimizer.fuse_innerproduct_batchnorm();
    optimizer.fuse_innerproduct_add();
    optimizer.fuse_innerproduct_dropout();

    // canonicalize equivalent layers before activation fusion
    optimizer.replace_reduction_with_global_pooling();
    optimizer.replace_prelu_with_leaky_relu();

    optimizer.fuse_convolution_activation();
    optimizer.fuse_convolutiondepthwise_activation();
    optimizer.fuse_deconvolution_activation();
    optimizer.fuse_deconvolutiondepthwise_activation();
    optimizer.fuse_innerproduct_activation();
    optimizer.fuse_memorydata_binaryop();
    optimizer.fuse_binaryop_eltwise();

    // --- elimination passes: drop layers that are no-ops in context ---
    optimizer.eliminate_dropout();
    optimizer.eliminate_pooling1x1();
    optimizer.eliminate_noop();
    optimizer.eliminate_split();
    optimizer.eliminate_flatten_after_global_pooling();
    optimizer.eliminate_reshape_after_global_pooling();
    optimizer.eliminate_reshape_before_binaryop();

    // --- replacement passes: swap layers for cheaper equivalents ---
    optimizer.replace_convolution_with_innerproduct_after_global_pooling();
    optimizer.replace_convolution_with_innerproduct_after_innerproduct();

    optimizer.eliminate_flatten_after_innerproduct();
    optimizer.eliminate_orphaned_memorydata();

    optimizer.shape_inference();

    optimizer.estimate_memory_footprint();

    optimizer.save(outparam, outbin);

    return 0;
}
2857