1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include <algorithm>
16 #include <assert.h>
17 #include <cctype>
18 #include <deque>
19 #include <fstream>
20 #include <iostream>
21 #include <locale>
22 #include <sstream>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string>
26 #include <unordered_map>
27 #include <vector>
28 
// Compile-time switch: set to 1 to print a darknet-style per-layer summary
// (layer index, filters, kernel/stride, input and output shape) while parsing.
#define OUTPUT_LAYER_MAP 0
30 
// Reports that the file named `s` could not be opened and terminates the
// process with EXIT_FAILURE.
//
// Marked [[noreturn]] so that callers which test a handle and then call this
// function are understood (by the compiler and by static analysers) not to
// continue with the invalid handle.
[[noreturn]] void file_error(const char* s)
{
    fprintf(stderr, "Couldn't open file: %s\n", s);
    exit(EXIT_FAILURE);
}
36 
// Reads exactly `count` items of `size` bytes from `fp` into `buffer`.
//
// On a short read the failure is reported on stderr (using `s` as the file
// name shown to the user), `fp` is closed and the process is terminated.
void fread_or_error(void* buffer, size_t size, size_t count, FILE* fp, const char* s)
{
    const size_t items_read = fread(buffer, size, count, fp);
    if (items_read == count)
        return;

    fprintf(stderr, "Couldn't read from file: %s\n", s);
    fclose(fp);
    assert(0); // stop in the debugger in debug builds before exiting
    exit(EXIT_FAILURE);
}
47 
// Prints `s` through perror() (so the current errno description is appended)
// and terminates the process with EXIT_FAILURE. In builds with assertions
// enabled the assert fires first.
//
// Marked [[noreturn]] because no path through this function returns to the
// caller.
[[noreturn]] void error(const char* s)
{
    perror(s);
    assert(0);
    exit(EXIT_FAILURE);
}
54 
// One section of a darknet cfg file and, after parsing, the ncnn layer that is
// generated from it. Sections that are synthesised by the converter (rather
// than read from the cfg) keep line_number == -1.
//
// Every scalar member has a default so that a freshly created Section never
// exposes indeterminate values.
typedef struct Section
{
    std::string name;              // text between '[' and ']' in the cfg
    int line_number = -1;          // 1-based cfg line of the header, -1 if synthetic
    int original_layer_count = -1; // index of the section in the cfg, -1 if synthetic

    // raw key/value pairs exactly as read from the cfg
    std::unordered_map<std::string, std::string> options;
    int w = 416, h = 416, c = 3, inputs = 256;
    int out_w = 0, out_h = 0, out_c = 0;
    int batch_normalize = 0, filters = 1, size = 1, groups = 1, stride = 1, padding = -1, pad = 0, dilation = 1;
    std::string activation;
    int from = 0, reverse = 0;
    std::vector<int> layers, mask, anchors;
    int group_id = -1;
    int classes = 0, num = 0;
    float ignore_thresh = 0.45f, scale_x_y = 1.f;

    // weight data read from the .weights file
    std::vector<float> weights, bias, scales, rolling_mean, rolling_variance;

    // generated layer description
    std::string layer_type, layer_name;
    std::vector<std::string> input_blobs, output_blobs;
    // blobs that following sections should consume; differs from output_blobs
    // when helper layers have been appended after this one
    std::vector<std::string> real_output_blobs;
    std::vector<std::string> param;
} Section;
79 
// Removes leading and trailing whitespace from `s` in place and returns a
// reference to the same string.
static inline std::string& trim(std::string& s)
{
    // std::isspace requires a value representable as unsigned char; taking the
    // character as unsigned char keeps bytes >= 0x80 from becoming negative.
    const auto not_space = [](unsigned char ch) {
        return !std::isspace(ch);
    };
    s.erase(s.begin(), std::find_if(s.begin(), s.end(), not_space));
    s.erase(std::find_if(s.rbegin(), s.rend(), not_space).base(), s.end());
    return s;
}
91 
// How the textual value of a cfg option is converted before it is stored in
// a Section member.
enum FIELD_TYPE
{
    INT = 0,     // single integer
    FLOAT,       // single float
    IARRAY,      // comma separated integers
    FARRAY,      // comma separated floats
    STRING,      // stored verbatim
    UNSUPPORTED  // option is recognised but cannot be converted
};
101 
102 typedef struct Section_Field
103 {
104     const char* name;
105     FIELD_TYPE type;
106     size_t offset;
107 } Section_Field;
108 
// Byte offset of member `c` inside Section, obtained from the member's address
// relative to a null Section pointer.
#define FIELD_OFFSET(c) (reinterpret_cast<size_t>(&(static_cast<Section*>(0)->c)))

// Number of [yolo] sections seen so far; incremented by parse_cfg().
int yolo_layer_count = 0;
112 
// Splits `s` at every occurrence of `delimiter`.
//
// Empty tokens between or before delimiters are kept, but a delimiter at the
// very end of the string does not produce a trailing empty token, and an empty
// input yields no tokens at all.
std::vector<std::string> split(const std::string& s, char delimiter)
{
    std::vector<std::string> pieces;
    std::string::size_type begin = 0;
    while (begin < s.size())
    {
        const std::string::size_type end = s.find(delimiter, begin);
        if (end == std::string::npos)
        {
            pieces.push_back(s.substr(begin));
            break;
        }
        pieces.push_back(s.substr(begin, end - begin));
        begin = end + 1;
    }
    return pieces;
}
124 
// printf-style formatting into a std::string.
//
// `fmt` and `args` are forwarded to snprintf unchanged, so the usual printf
// rules apply (argument types must match the conversion specifiers).
// Returns an empty string if snprintf reports an encoding error.
template<typename... Args>
std::string format(const char* fmt, Args... args)
{
    // First call only measures; snprintf returns a negative value on error,
    // which must not be converted to an (enormous) unsigned size.
    const int length = snprintf(nullptr, 0, fmt, args...);
    if (length <= 0)
        return std::string();

    std::string buf(static_cast<size_t>(length), '\0');
    // +1 lets snprintf write its terminator into the string's own null slot.
    snprintf(&buf[0], buf.size() + 1, fmt, args...);
    return buf;
}
135 
update_field(Section * section,std::string key,std::string value)136 void update_field(Section* section, std::string key, std::string value)
137 {
138     static const Section_Field fields[] = {
139         //net
140         {"width", INT, FIELD_OFFSET(w)},
141         {"height", INT, FIELD_OFFSET(h)},
142         {"channels", INT, FIELD_OFFSET(c)},
143         {"inputs", INT, FIELD_OFFSET(inputs)},
144         //convolutional, upsample, maxpool
145         {"batch_normalize", INT, FIELD_OFFSET(batch_normalize)},
146         {"filters", INT, FIELD_OFFSET(filters)},
147         {"size", INT, FIELD_OFFSET(size)},
148         {"groups", INT, FIELD_OFFSET(groups)},
149         {"stride", INT, FIELD_OFFSET(stride)},
150         {"padding", INT, FIELD_OFFSET(padding)},
151         {"pad", INT, FIELD_OFFSET(pad)},
152         {"dilation", INT, FIELD_OFFSET(dilation)},
153         {"activation", STRING, FIELD_OFFSET(activation)},
154         //shortcut
155         {"from", INT, FIELD_OFFSET(from)},
156         {"reverse", INT, FIELD_OFFSET(reverse)},
157         //route
158         {"layers", IARRAY, FIELD_OFFSET(layers)},
159         {"group_id", INT, FIELD_OFFSET(group_id)},
160         //yolo
161         {"mask", IARRAY, FIELD_OFFSET(mask)},
162         {"anchors", IARRAY, FIELD_OFFSET(anchors)},
163         {"classes", INT, FIELD_OFFSET(classes)},
164         {"num", INT, FIELD_OFFSET(num)},
165         {"ignore_thresh", FLOAT, FIELD_OFFSET(ignore_thresh)},
166         {"scale_x_y", FLOAT, FIELD_OFFSET(scale_x_y)},
167     };
168 
169     for (size_t i = 0; i < sizeof(fields) / sizeof(fields[0]); i++)
170     {
171         auto f = fields[i];
172         if (key != f.name)
173             continue;
174         char* addr = ((char*)section) + f.offset;
175         switch (f.type)
176         {
177         case INT:
178             *(int*)(addr) = std::stoi(value);
179             return;
180 
181         case FLOAT:
182             *(float*)(addr) = std::stof(value);
183             return;
184 
185         case IARRAY:
186             for (auto v : split(value, ','))
187                 reinterpret_cast<std::vector<int>*>(addr)->push_back(std::stoi(v));
188             return;
189 
190         case FARRAY:
191             for (auto v : split(value, ','))
192                 reinterpret_cast<std::vector<float>*>(addr)->push_back(std::stof(v));
193             return;
194 
195         case STRING:
196             *reinterpret_cast<std::string*>(addr) = value;
197             return;
198 
199         case UNSUPPORTED:
200             printf("unsupported option: %s\n", key.c_str());
201             exit(EXIT_FAILURE);
202         }
203     }
204 }
205 
load_cfg(const char * filename,std::deque<Section * > & dnet)206 void load_cfg(const char* filename, std::deque<Section*>& dnet)
207 {
208     std::string line;
209     std::ifstream icfg(filename, std::ifstream::in);
210     if (!icfg.good())
211     {
212         fprintf(stderr, "Couldn't cfg open file: %s\n", filename);
213         exit(EXIT_FAILURE);
214     }
215 
216     Section* section = NULL;
217     size_t pos;
218     int section_count = 0, line_count = 0;
219     while (!icfg.eof())
220     {
221         line_count++;
222         std::getline(icfg, line);
223         trim(line);
224         if (line.length() == 0 || line.at(0) == '#')
225             continue;
226         if (line.at(0) == '[' && line.at(line.length() - 1) == ']')
227         {
228             line = line.substr(1, line.length() - 2);
229             section = new Section;
230             section->name = line;
231             section->line_number = line_count;
232             section->original_layer_count = section_count++;
233             dnet.push_back(section);
234         }
235         else if ((pos = line.find_first_of('=')) != std::string::npos)
236         {
237             std::string key = line.substr(0, pos);
238             std::string value = line.substr(pos + 1, line.length() - 1);
239             section->options[trim(key)] = trim(value);
240             update_field(section, key, value);
241         }
242     }
243 
244     icfg.close();
245 }
246 
get_original_section(std::deque<Section * > & dnet,int count,int offset)247 Section* get_original_section(std::deque<Section*>& dnet, int count, int offset)
248 {
249     if (offset >= 0)
250         count = offset + 1;
251     else
252         count += offset;
253     for (auto s : dnet)
254         if (s->original_layer_count == count)
255             return s;
256     return dnet[0];
257 }
258 
259 template<typename T>
array_to_float_string(std::vector<T> vec)260 std::string array_to_float_string(std::vector<T> vec)
261 {
262     std::string ret;
263     for (size_t i = 0; i < vec.size(); i++)
264         ret.append(format(",%f", (float)vec[i]));
265     return ret;
266 }
267 
get_section_by_output_blob(std::deque<Section * > & dnet,std::string blob)268 Section* get_section_by_output_blob(std::deque<Section*>& dnet, std::string blob)
269 {
270     for (auto s : dnet)
271         for (auto b : s->output_blobs)
272             if (b == blob)
273                 return s;
274     return NULL;
275 }
276 
get_sections_by_input_blob(std::deque<Section * > & dnet,std::string blob)277 std::vector<Section*> get_sections_by_input_blob(std::deque<Section*>& dnet, std::string blob)
278 {
279     std::vector<Section*> ret;
280     for (auto s : dnet)
281         for (auto b : s->input_blobs)
282             if (b == blob)
283                 ret.push_back(s);
284     return ret;
285 }
286 
addActivationLayer(Section * s,std::deque<Section * >::iterator & it,std::deque<Section * > & dnet)287 void addActivationLayer(Section* s, std::deque<Section*>::iterator& it, std::deque<Section*>& dnet)
288 {
289     Section* act = new Section;
290 
291     if (s->activation == "relu")
292     {
293         act->layer_type = "ReLU";
294         act->param.push_back("0=0");
295     }
296     else if (s->activation == "leaky")
297     {
298         act->layer_type = "ReLU";
299         act->param.push_back("0=0.1");
300     }
301     else if (s->activation == "mish")
302         act->layer_type = "Mish";
303     else if (s->activation == "logistic")
304         act->layer_type = "Sigmoid";
305     else if (s->activation == "swish")
306         act->layer_type = "Swish";
307 
308     if (s->batch_normalize)
309         act->layer_name = s->layer_name + "_bn";
310     else
311         act->layer_name = s->layer_name;
312     act->h = s->out_h;
313     act->w = s->out_w;
314     act->c = s->out_c;
315     act->out_h = s->out_h;
316     act->out_w = s->out_w;
317     act->out_c = s->out_c;
318     act->layer_name += "_" + s->activation;
319     act->input_blobs = s->real_output_blobs;
320     act->output_blobs.push_back(act->layer_name);
321 
322     s->real_output_blobs = act->real_output_blobs = act->output_blobs;
323     it = dnet.insert(it + 1, act);
324 }
325 
parse_cfg(std::deque<Section * > & dnet,int merge_output)326 void parse_cfg(std::deque<Section*>& dnet, int merge_output)
327 {
328     int input_w = 416, input_h = 416;
329     int yolo_count = 0;
330     std::vector<Section*> yolo_layers;
331 
332 #if OUTPUT_LAYER_MAP
333     printf("   layer   filters  size/strd(dil)      input                output\n");
334 #endif
335     for (auto it = dnet.begin(); it != dnet.end(); it++)
336     {
337         auto s = *it;
338         if (s->line_number < 0)
339             continue;
340 
341         auto p = get_original_section(dnet, s->original_layer_count, -1);
342 
343 #if OUTPUT_LAYER_MAP
344         if (s->original_layer_count > 0)
345             printf("%4d ", s->original_layer_count - 1);
346 #endif
347 
348         s->layer_name = format("%d_%d", s->original_layer_count - 1, s->line_number);
349         s->input_blobs = p->real_output_blobs;
350         s->output_blobs.push_back(s->layer_name);
351         s->real_output_blobs = s->output_blobs;
352 
353         if (s->name == "net")
354         {
355             s->out_h = s->h;
356             s->out_w = s->w;
357             s->out_c = s->c;
358             input_h = s->h;
359             input_w = s->w;
360 
361             s->layer_type = "Input";
362             s->layer_name = "data";
363             s->input_blobs.clear();
364             s->output_blobs.clear();
365             s->output_blobs.push_back("data");
366             s->real_output_blobs = s->output_blobs;
367             s->param.push_back(format("0=%d", s->w));
368             s->param.push_back(format("1=%d", s->h));
369             s->param.push_back(format("2=%d", s->c));
370         }
371         else if (s->name == "convolutional")
372         {
373             if (s->padding == -1)
374                 s->padding = 0;
375             s->h = p->out_h;
376             s->w = p->out_w;
377             s->c = p->out_c;
378             s->out_h = s->h / s->stride;
379             s->out_w = s->w / s->stride;
380             s->out_c = s->filters;
381 
382 #if OUTPUT_LAYER_MAP
383             if (s->groups == 1)
384                 printf("conv %5d      %2d x%2d/%2d   ", s->filters, s->size, s->size, s->stride);
385             else
386                 printf("conv %5d/%4d %2d x%2d/%2d   ", s->filters, s->groups, s->size, s->size, s->stride);
387             printf("%4d x%4d x%4d -> %4d x%4d x%4d\n", s->h, s->w, s->c, s->out_h, s->out_w, s->out_c);
388 #endif
389 
390             if (s->groups == 1)
391                 s->layer_type = "Convolution";
392             else
393                 s->layer_type = "ConvolutionDepthWise";
394             s->param.push_back(format("0=%d", s->filters));                        //num_output
395             s->param.push_back(format("1=%d", s->size));                           //kernel_w
396             s->param.push_back(format("2=%d", s->dilation));                       //dilation_w
397             s->param.push_back(format("3=%d", s->stride));                         //stride_w
398             s->param.push_back(format("4=%d", s->pad ? s->size / 2 : s->padding)); //pad_left
399 
400             if (s->batch_normalize)
401             {
402                 s->param.push_back("5=0"); //bias_term
403 
404                 Section* bn = new Section;
405                 bn->layer_type = "BatchNorm";
406                 bn->layer_name = s->layer_name + "_bn";
407                 bn->h = s->out_h;
408                 bn->w = s->out_w;
409                 bn->c = s->out_c;
410                 bn->out_h = s->out_h;
411                 bn->out_w = s->out_w;
412                 bn->out_c = s->out_c;
413                 bn->input_blobs = s->real_output_blobs;
414                 bn->output_blobs.push_back(bn->layer_name);
415                 bn->param.push_back(format("0=%d", s->filters)); //channels
416                 bn->param.push_back("1=.00001");                 //eps
417 
418                 s->real_output_blobs = bn->real_output_blobs = bn->output_blobs;
419                 it = dnet.insert(it + 1, bn);
420             }
421             else
422             {
423                 s->param.push_back("5=1"); //bias_term
424             }
425             s->param.push_back(format("6=%d", s->c * s->size * s->size * s->filters / s->groups)); //weight_data_size
426 
427             if (s->groups > 1)
428                 s->param.push_back(format("7=%d", s->groups)); //stride_w
429 
430             if (s->activation.size() > 0)
431             {
432                 if (s->activation == "relu" || s->activation == "leaky" || s->activation == "mish" || s->activation == "logistic" || s->activation == "swish")
433                 {
434                     addActivationLayer(s, it, dnet);
435                 }
436                 else if (s->activation != "linear")
437                     error(format("Unsupported convolutional activation type: %s", s->activation.c_str()).c_str());
438             }
439         }
440         else if (s->name == "shortcut")
441         {
442             auto q = get_original_section(dnet, s->original_layer_count, s->from);
443             if (p->out_h != q->out_h || p->out_w != q->out_w)
444                 error("shortcut dim not match");
445 
446             s->h = p->out_h;
447             s->w = p->out_w;
448             s->c = p->out_c;
449             s->out_h = s->h;
450             s->out_w = s->w;
451             s->out_c = p->out_c;
452 
453 #if OUTPUT_LAYER_MAP
454             printf("Shortcut Layer: %d, ", q->original_layer_count - 1);
455             printf("outputs: %4d x%4d x%4d\n", s->out_h, s->out_w, s->out_c);
456             if (p->out_c != q->out_c)
457                 printf("(%4d x%4d x%4d) + (%4d x%4d x%4d)\n", p->out_h, p->out_w, p->out_c,
458                        q->out_h, q->out_w, q->out_c);
459 #endif
460 
461             if (s->activation.size() > 0)
462             {
463                 if (s->activation == "relu" || s->activation == "leaky" || s->activation == "mish" || s->activation == "logistic" || s->activation == "swish")
464                 {
465                     addActivationLayer(s, it, dnet);
466                 }
467                 else if (s->activation != "linear")
468                     error(format("Unsupported convolutional activation type: %s", s->activation.c_str()).c_str());
469             }
470 
471             s->layer_type = "Eltwise";
472             s->input_blobs.clear();
473             s->input_blobs.push_back(p->real_output_blobs[0]);
474             s->input_blobs.push_back(q->real_output_blobs[0]);
475 
476             s->param.push_back("0=1"); //op_type=Operation_SUM
477         }
478         else if (s->name == "maxpool")
479         {
480             if (s->padding == -1)
481                 s->padding = s->stride * int((s->size - 1) / 2);
482             s->h = p->out_h;
483             s->w = p->out_w;
484             s->c = p->out_c;
485             s->out_h = (s->h + s->padding - s->size) / s->stride + 1;
486             s->out_w = (s->w + s->padding - s->size) / s->stride + 1;
487             s->out_c = s->c;
488 
489 #if OUTPUT_LAYER_MAP
490             printf("max             %2d x%2d/%2d   ", s->size, s->size, s->stride);
491             printf("%4d x%4d x%4d -> %4d x%4d x%4d\n", s->h, s->w, s->c, s->out_h, s->out_w, s->out_c);
492 #endif
493 
494             s->layer_type = "Pooling";
495             s->param.push_back("0=0");                       //pooling_type=PoolMethod_MAX
496             s->param.push_back(format("1=%d", s->size));     //kernel_w
497             s->param.push_back(format("2=%d", s->stride));   //stride_w
498             s->param.push_back("5=1");                       //pad_mode=SAME_UPPER
499             s->param.push_back(format("3=%d", s->padding));  //pad_left
500             s->param.push_back(format("13=%d", s->padding)); //pad_top
501             s->param.push_back(format("14=%d", s->padding)); //pad_right
502             s->param.push_back(format("15=%d", s->padding)); //pad_bottom
503         }
504         else if (s->name == "avgpool")
505         {
506             if (s->padding == -1)
507                 s->padding = s->size - 1;
508             s->h = p->out_h;
509             s->w = p->out_w;
510             s->c = p->out_c;
511             s->out_h = 1;
512             s->out_w = s->out_h;
513             s->out_c = s->c;
514 
515 #if OUTPUT_LAYER_MAP
516             printf("avg                         %4d x%4d x%4d ->   %4d\n", s->h, s->w, s->c, s->out_c);
517 #endif
518 
519             s->layer_type = "Pooling";
520             s->param.push_back("0=1"); //pooling_type=PoolMethod_AVE
521             s->param.push_back("4=1"); //global_pooling
522 
523             Section* r = new Section;
524             r->layer_type = "Reshape";
525             r->layer_name = s->layer_name + "_reshape";
526             r->h = s->out_h;
527             r->w = s->out_w;
528             r->c = s->out_c;
529             r->out_h = 1;
530             r->out_w = 1;
531             r->out_c = r->h * r->w * r->c;
532             r->input_blobs.push_back(s->output_blobs[0]);
533             r->output_blobs.push_back(r->layer_name);
534             r->param.push_back("0=1");                    //w
535             r->param.push_back("1=1");                    //h
536             r->param.push_back(format("2=%d", r->out_c)); //c
537 
538             s->real_output_blobs.clear();
539             s->real_output_blobs.push_back(r->layer_name);
540 
541             it = dnet.insert(it + 1, r);
542         }
543         else if (s->name == "scale_channels")
544         {
545             auto q = get_original_section(dnet, s->original_layer_count, s->from);
546             if (p->out_c != q->out_c)
547                 error("scale channels not match");
548 
549             s->h = q->out_h;
550             s->w = q->out_w;
551             s->c = q->out_c;
552             s->out_h = s->h;
553             s->out_w = s->w;
554             s->out_c = q->out_c;
555 
556 #if OUTPUT_LAYER_MAP
557             printf("scale Layer: %d\n", q->original_layer_count - 1);
558 #endif
559 
560             if (s->activation.size() > 0 && s->activation != "linear")
561                 error(format("Unsupported scale_channels activation type: %s", s->activation.c_str()).c_str());
562 
563             s->layer_type = "BinaryOp";
564             s->input_blobs.clear();
565             s->input_blobs.push_back(q->real_output_blobs[0]);
566             s->input_blobs.push_back(p->real_output_blobs[0]);
567             s->param.push_back("0=2"); //op_type=Operation_MUL
568         }
569         else if (s->name == "route")
570         {
571 #if OUTPUT_LAYER_MAP
572             printf("route  ");
573 #endif
574             s->out_c = 0;
575             s->input_blobs.clear();
576             for (int l : s->layers)
577             {
578                 auto q = get_original_section(dnet, s->original_layer_count, l);
579 #if OUTPUT_LAYER_MAP
580                 printf("%d ", q->original_layer_count - 1);
581 #endif
582                 s->out_h = q->out_h;
583                 s->out_w = q->out_w;
584                 s->out_c += q->out_c;
585 
586                 for (auto blob : q->real_output_blobs)
587                     s->input_blobs.push_back(blob);
588             }
589             if (s->input_blobs.size() == 1)
590             {
591                 if (s->groups <= 1 || s->group_id == -1)
592                     s->layer_type = "Noop";
593                 else
594                 {
595                     s->out_c /= s->groups;
596 #if OUTPUT_LAYER_MAP
597                     printf("%31d/%d -> %4d x%4d x%4d", 1, s->groups, s->out_w, s->out_h, s->out_c);
598 #endif
599 
600                     s->layer_type = "Crop";
601                     s->param.push_back(format("2=%d", s->out_c * s->group_id));
602                     s->param.push_back(format("3=%d", s->out_w));
603                     s->param.push_back(format("4=%d", s->out_h));
604                     s->param.push_back(format("5=%d", s->out_c));
605                 }
606             }
607             else
608             {
609                 s->layer_type = "Concat";
610             }
611 #if OUTPUT_LAYER_MAP
612             printf("\n");
613 #endif
614         }
615         else if (s->name == "upsample")
616         {
617             s->h = p->out_h;
618             s->w = p->out_w;
619             s->c = p->out_c;
620             s->out_h = s->h * s->stride;
621             s->out_w = s->w * s->stride;
622             s->out_c = s->c;
623 
624 #if OUTPUT_LAYER_MAP
625             printf("upsample               %2dx  ", s->stride);
626             printf("%4d x%4d x%4d -> %4d x%4d x%4d\n", s->h, s->w, s->c, s->out_h, s->out_w, s->out_c);
627 #endif
628             s->layer_type = "Interp";
629             s->param.push_back("0=1");   //resize_type=nearest
630             s->param.push_back("1=2.f"); //height_scale
631             s->param.push_back("2=2.f"); //width_scale
632         }
633         else if (s->name == "yolo")
634         {
635 #if OUTPUT_LAYER_MAP
636             printf("yolo%d\n", yolo_count);
637 #endif
638 
639             if (s->ignore_thresh > 0.25)
640             {
641                 fprintf(stderr, "WARNING: The ignore_thresh=%f of yolo%d is too high. "
642                         "An alternative value 0.25 is written instead.\n",
643                         s->ignore_thresh, yolo_count);
644                 s->ignore_thresh = 0.25;
645             }
646 
647             s->layer_type = "Yolov3DetectionOutput";
648             s->layer_name = format("yolo%d", yolo_count++);
649             s->output_blobs[0] = s->layer_name;
650             s->h = p->out_h;
651             s->w = p->out_w;
652             s->c = p->out_c;
653             s->out_h = s->h;
654             s->out_w = s->w;
655             s->out_c = s->c * (int)s->mask.size();
656             s->param.push_back(format("0=%d", s->classes));                                                             //num_class
657             s->param.push_back(format("1=%d", s->mask.size()));                                                         //num_box
658             s->param.push_back(format("2=%f", s->ignore_thresh));                                                       //confidence_threshold
659             s->param.push_back(format("-23304=%d%s", s->anchors.size(), array_to_float_string(s->anchors).c_str()));    //biases
660             s->param.push_back(format("-23305=%d%s", s->mask.size(), array_to_float_string(s->mask).c_str()));          //mask
661             s->param.push_back(format("-23306=2,%f,%f", input_w * s->scale_x_y / s->w, input_h * s->scale_x_y / s->h)); //biases_index
662 
663             yolo_layer_count++;
664             yolo_layers.push_back(s);
665         }
666         else if (s->name == "dropout")
667         {
668 #if OUTPUT_LAYER_MAP
669             printf("dropout\n");
670 #endif
671             s->h = p->out_h;
672             s->w = p->out_w;
673             s->c = p->out_c;
674             s->out_h = s->h;
675             s->out_w = s->w;
676             s->out_c = p->out_c;
677             s->layer_type = "Noop";
678         }
679         else
680         {
681 #if OUTPUT_LAYER_MAP
682             printf("%-8s (unsupported)\n", s->name.c_str());
683 #endif
684         }
685     }
686 
687     for (auto it = dnet.begin(); it != dnet.end(); it++)
688     {
689         auto s = *it;
690         for (size_t i = 0; i < s->input_blobs.size(); i++)
691         {
692             auto p = get_section_by_output_blob(dnet, s->input_blobs[i]);
693             if (p == NULL || p->layer_type != "Noop")
694                 continue;
695             s->input_blobs[i] = p->input_blobs[0];
696         }
697     }
698 
699     for (auto it = dnet.begin(); it != dnet.end();)
700         if ((*it)->layer_type == "Noop")
701             it = dnet.erase(it);
702         else
703             it++;
704 
705     for (auto it = dnet.begin(); it != dnet.end(); it++)
706     {
707         auto s = *it;
708         for (std::string output_name : s->output_blobs)
709         {
710             auto q = get_sections_by_input_blob(dnet, output_name);
711             if (q.size() <= 1 || s->layer_type == "Split")
712                 continue;
713             Section* p = new Section;
714             p->layer_type = "Split";
715             p->layer_name = s->layer_name + "_split";
716             p->w = s->w;
717             p->h = s->h;
718             p->c = s->c;
719             p->out_w = s->out_w;
720             p->out_h = s->out_h;
721             p->out_c = s->out_c;
722             p->input_blobs.push_back(output_name);
723             for (size_t i = 0; i < q.size(); i++)
724             {
725                 std::string new_output_name = p->layer_name + "_" + std::to_string(i);
726                 p->output_blobs.push_back(new_output_name);
727 
728                 for (size_t j = 0; j < q[i]->input_blobs.size(); j++)
729                     if (q[i]->input_blobs[j] == output_name)
730                         q[i]->input_blobs[j] = new_output_name;
731             }
732             it = dnet.insert(it + 1, p);
733         }
734     }
735 
736     if (merge_output && yolo_layer_count > 0)
737     {
738         std::vector<int> masks;
739         std::vector<float> scale_x_y;
740 
741         Section* s = new Section;
742         s->classes = yolo_layers[0]->classes;
743         s->anchors = yolo_layers[0]->anchors;
744         s->mask = yolo_layers[0]->mask;
745 
746         for (auto p : yolo_layers)
747         {
748             if (s->classes != p->classes)
749                 error("yolo object classes number not match, output cannot be merged.");
750 
751             if (s->anchors.size() != p->anchors.size())
752                 error("yolo layer anchor count not match, output cannot be merged.");
753 
754             for (size_t i = 0; i < s->anchors.size(); i++)
755                 if (s->anchors[i] != p->anchors[i])
756                     error("yolo anchor size not match, output cannot be merged.");
757 
758             if (s->ignore_thresh > p->ignore_thresh)
759                 s->ignore_thresh = p->ignore_thresh;
760 
761             for (int m : p->mask)
762                 masks.push_back(m);
763 
764             scale_x_y.push_back(input_w * p->scale_x_y / p->w);
765             s->input_blobs.push_back(p->input_blobs[0]);
766         }
767 
768         for (auto it = dnet.begin(); it != dnet.end();)
769             if ((*it)->name == "yolo")
770                 it = dnet.erase(it);
771             else
772                 it++;
773 
774         s->layer_type = "Yolov3DetectionOutput";
775         s->layer_name = "detection_out";
776         s->output_blobs.push_back("output");
777         s->param.push_back(format("0=%d", s->classes));                                                          //num_class
778         s->param.push_back(format("1=%d", s->mask.size()));                                                      //num_box
779         s->param.push_back(format("2=%f", s->ignore_thresh));                                                    //confidence_threshold
780         s->param.push_back(format("-23304=%d%s", s->anchors.size(), array_to_float_string(s->anchors).c_str())); //biases
781         s->param.push_back(format("-23305=%d%s", masks.size(), array_to_float_string(masks).c_str()));           //mask
782         s->param.push_back(format("-23306=%d%s", scale_x_y.size(), array_to_float_string(scale_x_y).c_str()));   //biases_index
783 
784         dnet.push_back(s);
785     }
786 }
787 
read_to(std::vector<float> & vec,size_t size,FILE * fp)788 void read_to(std::vector<float>& vec, size_t size, FILE* fp)
789 {
790     vec.resize(size);
791     size_t read_size = fread(&vec[0], sizeof(float), size, fp);
792     if (read_size != size)
793         error("\n Warning: Unexpected end of wights-file!\n");
794 }
795 
load_weights(const char * filename,std::deque<Section * > & dnet)796 void load_weights(const char* filename, std::deque<Section*>& dnet)
797 {
798     FILE* fp = fopen(filename, "rb");
799     if (fp == NULL)
800         file_error(filename);
801 
802     int major, minor, revision;
803 
804     fread_or_error(&major, sizeof(int), 1, fp, filename);
805     fread_or_error(&minor, sizeof(int), 1, fp, filename);
806     fread_or_error(&revision, sizeof(int), 1, fp, filename);
807     if ((major * 10 + minor) >= 2)
808     {
809         uint64_t iseen = 0;
810         fread_or_error(&iseen, sizeof(uint64_t), 1, fp, filename);
811     }
812     else
813     {
814         uint32_t iseen = 0;
815         fread_or_error(&iseen, sizeof(uint32_t), 1, fp, filename);
816     }
817 
818     for (auto s : dnet)
819     {
820         if (s->name == "convolutional")
821         {
822             read_to(s->bias, s->filters, fp);
823             if (s->batch_normalize)
824             {
825                 read_to(s->scales, s->filters, fp);
826                 read_to(s->rolling_mean, s->filters, fp);
827                 read_to(s->rolling_variance, s->filters, fp);
828             }
829 
830             if (s->layer_type == "Convolution")
831                 read_to(s->weights, (size_t)(s->c) * s->filters * s->size * s->size, fp);
832             else if (s->layer_type == "ConvolutionDepthWise")
833                 read_to(s->weights, s->c * s->filters * s->size * s->size / s->groups, fp);
834         }
835     }
836 
837     fclose(fp);
838 }
839 
count_output_blob(std::deque<Section * > & dnet)840 int count_output_blob(std::deque<Section*>& dnet)
841 {
842     int count = 0;
843     for (auto s : dnet)
844         count += (int)s->output_blobs.size();
845     return count;
846 }
847 
main(int argc,char ** argv)848 int main(int argc, char** argv)
849 {
850     if (!(argc == 3 || argc == 5 || argc == 6))
851     {
852         fprintf(stderr, "Usage: %s [darknetcfg] [darknetweights] [ncnnparam] [ncnnbin] [merge_output]\n"
853                 "\t[darknetcfg]     .cfg file of input darknet model.\n"
854                 "\t[darknetweights] .weights file of input darknet model.\n"
855                 "\t[cnnparam]       .param file of output ncnn model.\n"
856                 "\t[ncnnbin]        .bin file of output ncnn model.\n"
857                 "\t[merge_output]   merge all output yolo layers into one, enabled by default.\n",
858                 argv[0]);
859         return -1;
860     }
861 
862     const char* darknetcfg = argv[1];
863     const char* darknetweights = argv[2];
864     const char* ncnn_param = argc >= 5 ? argv[3] : "ncnn.param";
865     const char* ncnn_bin = argc >= 5 ? argv[4] : "ncnn.bin";
866     int merge_output = argc >= 6 ? atoi(argv[5]) : 1;
867 
868     std::deque<Section*> dnet;
869 
870     printf("Loading cfg...\n");
871     load_cfg(darknetcfg, dnet);
872     parse_cfg(dnet, merge_output);
873 
874     printf("Loading weights...\n");
875     load_weights(darknetweights, dnet);
876 
877     FILE* pp = fopen(ncnn_param, "wb");
878     if (pp == NULL)
879         file_error(ncnn_param);
880 
881     FILE* bp = fopen(ncnn_bin, "wb");
882     if (bp == NULL)
883         file_error(ncnn_bin);
884 
885     printf("Converting model...\n");
886 
887     fprintf(pp, "7767517\n");
888     fprintf(pp, "%d %d\n", (int)dnet.size(), count_output_blob(dnet));
889 
890     for (auto s : dnet)
891     {
892         fprintf(pp, "%-22s %-20s %d %d", s->layer_type.c_str(), s->layer_name.c_str(), (int)s->input_blobs.size(), (int)s->output_blobs.size());
893         for (auto b : s->input_blobs)
894             fprintf(pp, " %s", b.c_str());
895         for (auto b : s->output_blobs)
896             fprintf(pp, " %s", b.c_str());
897         for (auto p : s->param)
898             fprintf(pp, " %s", p.c_str());
899         fprintf(pp, "\n");
900 
901         if (s->name == "convolutional")
902         {
903             fseek(bp, 4, SEEK_CUR);
904             if (s->weights.size() > 0)
905                 fwrite(&s->weights[0], sizeof(float), s->weights.size(), bp);
906             if (s->scales.size() > 0)
907                 fwrite(&s->scales[0], sizeof(float), s->scales.size(), bp);
908             if (s->rolling_mean.size() > 0)
909                 fwrite(&s->rolling_mean[0], sizeof(float), s->rolling_mean.size(), bp);
910             if (s->rolling_variance.size() > 0)
911                 fwrite(&s->rolling_variance[0], sizeof(float), s->rolling_variance.size(), bp);
912             if (s->bias.size() > 0)
913                 fwrite(&s->bias[0], sizeof(float), s->bias.size(), bp);
914         }
915     }
916     fclose(pp);
917 
918     printf("%d layers, %d blobs generated.\n", (int)dnet.size(), count_output_blob(dnet));
919     printf("NOTE: The input of darknet uses: mean_vals=0 and norm_vals=1/255.f.\n");
920     if (!merge_output)
921         printf("NOTE: There are %d unmerged yolo output layer. Make sure all outputs are processed with nms.\n", yolo_layer_count);
922     printf("NOTE: Remeber to use ncnnoptimize for better performance.\n");
923 
924     return 0;
925 }
926