1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14 
15 #include "net.h"
16 
17 #include <opencv2/core/core.hpp>
18 #include <opencv2/highgui/highgui.hpp>
19 #include <opencv2/imgproc/imgproc.hpp>
20 #include <stdio.h>
21 #include <vector>
22 
// One detected instance: its bounding box, class, score and mask information.
struct Object
{
    // Bounding box; detect_yolact stores it in source-image pixel coordinates.
    cv::Rect_<float> rect;
    // Class index; detect_yolact never emits 0, which it treats as background.
    int label;
    // Score of the selected class.
    float prob;
    // Per-detection mask coefficients copied from one row of the network's mask output.
    std::vector<float> maskdata;
    // Binary mask (CV_8UC1, values 0 or 255) at source-image size, filled by detect_yolact.
    cv::Mat mask;
};
31 
intersection_area(const Object & a,const Object & b)32 static inline float intersection_area(const Object& a, const Object& b)
33 {
34     cv::Rect_<float> inter = a.rect & b.rect;
35     return inter.area();
36 }
37 
qsort_descent_inplace(std::vector<Object> & objects,int left,int right)38 static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
39 {
40     int i = left;
41     int j = right;
42     float p = objects[(left + right) / 2].prob;
43 
44     while (i <= j)
45     {
46         while (objects[i].prob > p)
47             i++;
48 
49         while (objects[j].prob < p)
50             j--;
51 
52         if (i <= j)
53         {
54             // swap
55             std::swap(objects[i], objects[j]);
56 
57             i++;
58             j--;
59         }
60     }
61 
62     #pragma omp parallel sections
63     {
64         #pragma omp section
65         {
66             if (left < j) qsort_descent_inplace(objects, left, j);
67         }
68         #pragma omp section
69         {
70             if (i < right) qsort_descent_inplace(objects, i, right);
71         }
72     }
73 }
74 
qsort_descent_inplace(std::vector<Object> & objects)75 static void qsort_descent_inplace(std::vector<Object>& objects)
76 {
77     if (objects.empty())
78         return;
79 
80     qsort_descent_inplace(objects, 0, objects.size() - 1);
81 }
82 
nms_sorted_bboxes(const std::vector<Object> & objects,std::vector<int> & picked,float nms_threshold)83 static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold)
84 {
85     picked.clear();
86 
87     const int n = objects.size();
88 
89     std::vector<float> areas(n);
90     for (int i = 0; i < n; i++)
91     {
92         areas[i] = objects[i].rect.area();
93     }
94 
95     for (int i = 0; i < n; i++)
96     {
97         const Object& a = objects[i];
98 
99         int keep = 1;
100         for (int j = 0; j < (int)picked.size(); j++)
101         {
102             const Object& b = objects[picked[j]];
103 
104             // intersection over union
105             float inter_area = intersection_area(a, b);
106             float union_area = areas[i] + areas[picked[j]] - inter_area;
107             //             float IoU = inter_area / union_area
108             if (inter_area / union_area > nms_threshold)
109                 keep = 0;
110         }
111 
112         if (keep)
113             picked.push_back(i);
114     }
115 }
116 
detect_yolact(const cv::Mat & bgr,std::vector<Object> & objects)117 static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
118 {
119     ncnn::Net yolact;
120 
121     yolact.opt.use_vulkan_compute = true;
122 
123     // original model converted from https://github.com/dbolya/yolact
124     // yolact_resnet50_54_800000.pth
125     // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
126     yolact.load_param("yolact.param");
127     yolact.load_model("yolact.bin");
128 
129     const int target_size = 550;
130 
131     int img_w = bgr.cols;
132     int img_h = bgr.rows;
133 
134     ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
135 
136     const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
137     const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
138     in.substract_mean_normalize(mean_vals, norm_vals);
139 
140     ncnn::Extractor ex = yolact.create_extractor();
141 
142     ex.input("input.1", in);
143 
144     ncnn::Mat maskmaps;
145     ncnn::Mat location;
146     ncnn::Mat mask;
147     ncnn::Mat confidence;
148 
149     ex.extract("619", maskmaps); // 138x138 x 32
150 
151     ex.extract("816", location);   // 4 x 19248
152     ex.extract("818", mask);       // maskdim 32 x 19248
153     ex.extract("820", confidence); // 81 x 19248
154 
155     int num_class = confidence.w;
156     int num_priors = confidence.h;
157 
158     // make priorbox
159     ncnn::Mat priorbox(4, num_priors);
160     {
161         const int conv_ws[5] = {69, 35, 18, 9, 5};
162         const int conv_hs[5] = {69, 35, 18, 9, 5};
163 
164         const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
165         const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};
166 
167         float* pb = priorbox;
168 
169         for (int p = 0; p < 5; p++)
170         {
171             int conv_w = conv_ws[p];
172             int conv_h = conv_hs[p];
173 
174             float scale = scales[p];
175 
176             for (int i = 0; i < conv_h; i++)
177             {
178                 for (int j = 0; j < conv_w; j++)
179                 {
180                     // +0.5 because priors are in center-size notation
181                     float cx = (j + 0.5f) / conv_w;
182                     float cy = (i + 0.5f) / conv_h;
183 
184                     for (int k = 0; k < 3; k++)
185                     {
186                         float ar = aspect_ratios[k];
187 
188                         ar = sqrt(ar);
189 
190                         float w = scale * ar / 550;
191                         float h = scale / ar / 550;
192 
193                         // This is for backward compatability with a bug where I made everything square by accident
194                         // cfg.backbone.use_square_anchors:
195                         h = w;
196 
197                         pb[0] = cx;
198                         pb[1] = cy;
199                         pb[2] = w;
200                         pb[3] = h;
201 
202                         pb += 4;
203                     }
204                 }
205             }
206         }
207     }
208 
209     const float confidence_thresh = 0.05f;
210     const float nms_threshold = 0.5f;
211     const int keep_top_k = 200;
212 
213     std::vector<std::vector<Object> > class_candidates;
214     class_candidates.resize(num_class);
215 
216     for (int i = 0; i < num_priors; i++)
217     {
218         const float* conf = confidence.row(i);
219         const float* loc = location.row(i);
220         const float* pb = priorbox.row(i);
221         const float* maskdata = mask.row(i);
222 
223         // find class id with highest score
224         // start from 1 to skip background
225         int label = 0;
226         float score = 0.f;
227         for (int j = 1; j < num_class; j++)
228         {
229             float class_score = conf[j];
230             if (class_score > score)
231             {
232                 label = j;
233                 score = class_score;
234             }
235         }
236 
237         // ignore background or low score
238         if (label == 0 || score <= confidence_thresh)
239             continue;
240 
241         // CENTER_SIZE
242         float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
243 
244         float pb_cx = pb[0];
245         float pb_cy = pb[1];
246         float pb_w = pb[2];
247         float pb_h = pb[3];
248 
249         float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
250         float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
251         float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
252         float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);
253 
254         float obj_x1 = bbox_cx - bbox_w * 0.5f;
255         float obj_y1 = bbox_cy - bbox_h * 0.5f;
256         float obj_x2 = bbox_cx + bbox_w * 0.5f;
257         float obj_y2 = bbox_cy + bbox_h * 0.5f;
258 
259         // clip
260         obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
261         obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
262         obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
263         obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
264 
265         // append object
266         Object obj;
267         obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
268         obj.label = label;
269         obj.prob = score;
270         obj.maskdata = std::vector<float>(maskdata, maskdata + mask.w);
271 
272         class_candidates[label].push_back(obj);
273     }
274 
275     objects.clear();
276     for (int i = 0; i < (int)class_candidates.size(); i++)
277     {
278         std::vector<Object>& candidates = class_candidates[i];
279 
280         qsort_descent_inplace(candidates);
281 
282         std::vector<int> picked;
283         nms_sorted_bboxes(candidates, picked, nms_threshold);
284 
285         for (int j = 0; j < (int)picked.size(); j++)
286         {
287             int z = picked[j];
288             objects.push_back(candidates[z]);
289         }
290     }
291 
292     qsort_descent_inplace(objects);
293 
294     // keep_top_k
295     if (keep_top_k < (int)objects.size())
296     {
297         objects.resize(keep_top_k);
298     }
299 
300     // generate mask
301     for (int i = 0; i < (int)objects.size(); i++)
302     {
303         Object& obj = objects[i];
304 
305         cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1);
306         {
307             mask = cv::Scalar(0.f);
308 
309             for (int p = 0; p < maskmaps.c; p++)
310             {
311                 const float* maskmap = maskmaps.channel(p);
312                 float coeff = obj.maskdata[p];
313                 float* mp = (float*)mask.data;
314 
315                 // mask += m * coeff
316                 for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
317                 {
318                     mp[j] += maskmap[j] * coeff;
319                 }
320             }
321         }
322 
323         cv::Mat mask2;
324         cv::resize(mask, mask2, cv::Size(img_w, img_h));
325 
326         // crop obj box and binarize
327         obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
328         {
329             obj.mask = cv::Scalar(0);
330 
331             for (int y = 0; y < img_h; y++)
332             {
333                 if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
334                     continue;
335 
336                 const float* mp2 = mask2.ptr<const float>(y);
337                 uchar* bmp = obj.mask.ptr<uchar>(y);
338 
339                 for (int x = 0; x < img_w; x++)
340                 {
341                     if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
342                         continue;
343 
344                     bmp[x] = mp2[x] > 0.5f ? 255 : 0;
345                 }
346             }
347         }
348     }
349 
350     return 0;
351 }
352 
draw_objects(const cv::Mat & bgr,const std::vector<Object> & objects)353 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
354 {
355     static const char* class_names[] = {"background",
356                                         "person", "bicycle", "car", "motorcycle", "airplane", "bus",
357                                         "train", "truck", "boat", "traffic light", "fire hydrant",
358                                         "stop sign", "parking meter", "bench", "bird", "cat", "dog",
359                                         "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
360                                         "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
361                                         "skis", "snowboard", "sports ball", "kite", "baseball bat",
362                                         "baseball glove", "skateboard", "surfboard", "tennis racket",
363                                         "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
364                                         "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
365                                         "hot dog", "pizza", "donut", "cake", "chair", "couch",
366                                         "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
367                                         "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
368                                         "toaster", "sink", "refrigerator", "book", "clock", "vase",
369                                         "scissors", "teddy bear", "hair drier", "toothbrush"
370                                        };
371 
372     static const unsigned char colors[81][3] = {
373         {56, 0, 255},
374         {226, 255, 0},
375         {0, 94, 255},
376         {0, 37, 255},
377         {0, 255, 94},
378         {255, 226, 0},
379         {0, 18, 255},
380         {255, 151, 0},
381         {170, 0, 255},
382         {0, 255, 56},
383         {255, 0, 75},
384         {0, 75, 255},
385         {0, 255, 169},
386         {255, 0, 207},
387         {75, 255, 0},
388         {207, 0, 255},
389         {37, 0, 255},
390         {0, 207, 255},
391         {94, 0, 255},
392         {0, 255, 113},
393         {255, 18, 0},
394         {255, 0, 56},
395         {18, 0, 255},
396         {0, 255, 226},
397         {170, 255, 0},
398         {255, 0, 245},
399         {151, 255, 0},
400         {132, 255, 0},
401         {75, 0, 255},
402         {151, 0, 255},
403         {0, 151, 255},
404         {132, 0, 255},
405         {0, 255, 245},
406         {255, 132, 0},
407         {226, 0, 255},
408         {255, 37, 0},
409         {207, 255, 0},
410         {0, 255, 207},
411         {94, 255, 0},
412         {0, 226, 255},
413         {56, 255, 0},
414         {255, 94, 0},
415         {255, 113, 0},
416         {0, 132, 255},
417         {255, 0, 132},
418         {255, 170, 0},
419         {255, 0, 188},
420         {113, 255, 0},
421         {245, 0, 255},
422         {113, 0, 255},
423         {255, 188, 0},
424         {0, 113, 255},
425         {255, 0, 0},
426         {0, 56, 255},
427         {255, 0, 113},
428         {0, 255, 188},
429         {255, 0, 94},
430         {255, 0, 18},
431         {18, 255, 0},
432         {0, 255, 132},
433         {0, 188, 255},
434         {0, 245, 255},
435         {0, 169, 255},
436         {37, 255, 0},
437         {255, 0, 151},
438         {188, 0, 255},
439         {0, 255, 37},
440         {0, 255, 0},
441         {255, 0, 170},
442         {255, 0, 37},
443         {255, 75, 0},
444         {0, 0, 255},
445         {255, 207, 0},
446         {255, 0, 226},
447         {255, 245, 0},
448         {188, 255, 0},
449         {0, 255, 18},
450         {0, 255, 75},
451         {0, 255, 151},
452         {255, 56, 0},
453         {245, 255, 0}
454     };
455 
456     cv::Mat image = bgr.clone();
457 
458     int color_index = 0;
459 
460     for (size_t i = 0; i < objects.size(); i++)
461     {
462         const Object& obj = objects[i];
463 
464         if (obj.prob < 0.15)
465             continue;
466 
467         fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
468                 obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
469 
470         const unsigned char* color = colors[color_index++];
471 
472         cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));
473 
474         char text[256];
475         sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
476 
477         int baseLine = 0;
478         cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
479 
480         int x = obj.rect.x;
481         int y = obj.rect.y - label_size.height - baseLine;
482         if (y < 0)
483             y = 0;
484         if (x + label_size.width > image.cols)
485             x = image.cols - label_size.width;
486 
487         cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
488                       cv::Scalar(255, 255, 255), -1);
489 
490         cv::putText(image, text, cv::Point(x, y + label_size.height),
491                     cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
492 
493         // draw mask
494         for (int y = 0; y < image.rows; y++)
495         {
496             const uchar* mp = obj.mask.ptr(y);
497             uchar* p = image.ptr(y);
498             for (int x = 0; x < image.cols; x++)
499             {
500                 if (mp[x] == 255)
501                 {
502                     p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
503                     p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
504                     p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
505                 }
506                 p += 3;
507             }
508         }
509     }
510 
511     cv::imwrite("result.png", image);
512     cv::imshow("image", image);
513     cv::waitKey(0);
514 }
515 
main(int argc,char ** argv)516 int main(int argc, char** argv)
517 {
518     if (argc != 2)
519     {
520         fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
521         return -1;
522     }
523 
524     const char* imagepath = argv[1];
525 
526     cv::Mat m = cv::imread(imagepath, 1);
527     if (m.empty())
528     {
529         fprintf(stderr, "cv::imread %s failed\n", imagepath);
530         return -1;
531     }
532 
533     std::vector<Object> objects;
534     detect_yolact(m, objects);
535 
536     draw_objects(m, objects);
537 
538     return 0;
539 }
540