1 // Tencent is pleased to support the open source community by making ncnn available.
2 //
3 // Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
4 //
5 // Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
6 // in compliance with the License. You may obtain a copy of the License at
7 //
8 // https://opensource.org/licenses/BSD-3-Clause
9 //
10 // Unless required by applicable law or agreed to in writing, software distributed
11 // under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
12 // CONDITIONS OF ANY KIND, either express or implied. See the License for the
13 // specific language governing permissions and limitations under the License.
14
15 #include "net.h"
16
17 #include <opencv2/core/core.hpp>
18 #include <opencv2/highgui/highgui.hpp>
19 #include <opencv2/imgproc/imgproc.hpp>
20 #include <stdio.h>
21 #include <vector>
22
23 struct Object
24 {
25 cv::Rect_<float> rect;
26 int label;
27 float prob;
28 std::vector<float> maskdata;
29 cv::Mat mask;
30 };
31
intersection_area(const Object & a,const Object & b)32 static inline float intersection_area(const Object& a, const Object& b)
33 {
34 cv::Rect_<float> inter = a.rect & b.rect;
35 return inter.area();
36 }
37
qsort_descent_inplace(std::vector<Object> & objects,int left,int right)38 static void qsort_descent_inplace(std::vector<Object>& objects, int left, int right)
39 {
40 int i = left;
41 int j = right;
42 float p = objects[(left + right) / 2].prob;
43
44 while (i <= j)
45 {
46 while (objects[i].prob > p)
47 i++;
48
49 while (objects[j].prob < p)
50 j--;
51
52 if (i <= j)
53 {
54 // swap
55 std::swap(objects[i], objects[j]);
56
57 i++;
58 j--;
59 }
60 }
61
62 #pragma omp parallel sections
63 {
64 #pragma omp section
65 {
66 if (left < j) qsort_descent_inplace(objects, left, j);
67 }
68 #pragma omp section
69 {
70 if (i < right) qsort_descent_inplace(objects, i, right);
71 }
72 }
73 }
74
qsort_descent_inplace(std::vector<Object> & objects)75 static void qsort_descent_inplace(std::vector<Object>& objects)
76 {
77 if (objects.empty())
78 return;
79
80 qsort_descent_inplace(objects, 0, objects.size() - 1);
81 }
82
nms_sorted_bboxes(const std::vector<Object> & objects,std::vector<int> & picked,float nms_threshold)83 static void nms_sorted_bboxes(const std::vector<Object>& objects, std::vector<int>& picked, float nms_threshold)
84 {
85 picked.clear();
86
87 const int n = objects.size();
88
89 std::vector<float> areas(n);
90 for (int i = 0; i < n; i++)
91 {
92 areas[i] = objects[i].rect.area();
93 }
94
95 for (int i = 0; i < n; i++)
96 {
97 const Object& a = objects[i];
98
99 int keep = 1;
100 for (int j = 0; j < (int)picked.size(); j++)
101 {
102 const Object& b = objects[picked[j]];
103
104 // intersection over union
105 float inter_area = intersection_area(a, b);
106 float union_area = areas[i] + areas[picked[j]] - inter_area;
107 // float IoU = inter_area / union_area
108 if (inter_area / union_area > nms_threshold)
109 keep = 0;
110 }
111
112 if (keep)
113 picked.push_back(i);
114 }
115 }
116
detect_yolact(const cv::Mat & bgr,std::vector<Object> & objects)117 static int detect_yolact(const cv::Mat& bgr, std::vector<Object>& objects)
118 {
119 ncnn::Net yolact;
120
121 yolact.opt.use_vulkan_compute = true;
122
123 // original model converted from https://github.com/dbolya/yolact
124 // yolact_resnet50_54_800000.pth
125 // the ncnn model https://github.com/nihui/ncnn-assets/tree/master/models
126 yolact.load_param("yolact.param");
127 yolact.load_model("yolact.bin");
128
129 const int target_size = 550;
130
131 int img_w = bgr.cols;
132 int img_h = bgr.rows;
133
134 ncnn::Mat in = ncnn::Mat::from_pixels_resize(bgr.data, ncnn::Mat::PIXEL_BGR2RGB, img_w, img_h, target_size, target_size);
135
136 const float mean_vals[3] = {123.68f, 116.78f, 103.94f};
137 const float norm_vals[3] = {1.0 / 58.40f, 1.0 / 57.12f, 1.0 / 57.38f};
138 in.substract_mean_normalize(mean_vals, norm_vals);
139
140 ncnn::Extractor ex = yolact.create_extractor();
141
142 ex.input("input.1", in);
143
144 ncnn::Mat maskmaps;
145 ncnn::Mat location;
146 ncnn::Mat mask;
147 ncnn::Mat confidence;
148
149 ex.extract("619", maskmaps); // 138x138 x 32
150
151 ex.extract("816", location); // 4 x 19248
152 ex.extract("818", mask); // maskdim 32 x 19248
153 ex.extract("820", confidence); // 81 x 19248
154
155 int num_class = confidence.w;
156 int num_priors = confidence.h;
157
158 // make priorbox
159 ncnn::Mat priorbox(4, num_priors);
160 {
161 const int conv_ws[5] = {69, 35, 18, 9, 5};
162 const int conv_hs[5] = {69, 35, 18, 9, 5};
163
164 const float aspect_ratios[3] = {1.f, 0.5f, 2.f};
165 const float scales[5] = {24.f, 48.f, 96.f, 192.f, 384.f};
166
167 float* pb = priorbox;
168
169 for (int p = 0; p < 5; p++)
170 {
171 int conv_w = conv_ws[p];
172 int conv_h = conv_hs[p];
173
174 float scale = scales[p];
175
176 for (int i = 0; i < conv_h; i++)
177 {
178 for (int j = 0; j < conv_w; j++)
179 {
180 // +0.5 because priors are in center-size notation
181 float cx = (j + 0.5f) / conv_w;
182 float cy = (i + 0.5f) / conv_h;
183
184 for (int k = 0; k < 3; k++)
185 {
186 float ar = aspect_ratios[k];
187
188 ar = sqrt(ar);
189
190 float w = scale * ar / 550;
191 float h = scale / ar / 550;
192
193 // This is for backward compatability with a bug where I made everything square by accident
194 // cfg.backbone.use_square_anchors:
195 h = w;
196
197 pb[0] = cx;
198 pb[1] = cy;
199 pb[2] = w;
200 pb[3] = h;
201
202 pb += 4;
203 }
204 }
205 }
206 }
207 }
208
209 const float confidence_thresh = 0.05f;
210 const float nms_threshold = 0.5f;
211 const int keep_top_k = 200;
212
213 std::vector<std::vector<Object> > class_candidates;
214 class_candidates.resize(num_class);
215
216 for (int i = 0; i < num_priors; i++)
217 {
218 const float* conf = confidence.row(i);
219 const float* loc = location.row(i);
220 const float* pb = priorbox.row(i);
221 const float* maskdata = mask.row(i);
222
223 // find class id with highest score
224 // start from 1 to skip background
225 int label = 0;
226 float score = 0.f;
227 for (int j = 1; j < num_class; j++)
228 {
229 float class_score = conf[j];
230 if (class_score > score)
231 {
232 label = j;
233 score = class_score;
234 }
235 }
236
237 // ignore background or low score
238 if (label == 0 || score <= confidence_thresh)
239 continue;
240
241 // CENTER_SIZE
242 float var[4] = {0.1f, 0.1f, 0.2f, 0.2f};
243
244 float pb_cx = pb[0];
245 float pb_cy = pb[1];
246 float pb_w = pb[2];
247 float pb_h = pb[3];
248
249 float bbox_cx = var[0] * loc[0] * pb_w + pb_cx;
250 float bbox_cy = var[1] * loc[1] * pb_h + pb_cy;
251 float bbox_w = (float)(exp(var[2] * loc[2]) * pb_w);
252 float bbox_h = (float)(exp(var[3] * loc[3]) * pb_h);
253
254 float obj_x1 = bbox_cx - bbox_w * 0.5f;
255 float obj_y1 = bbox_cy - bbox_h * 0.5f;
256 float obj_x2 = bbox_cx + bbox_w * 0.5f;
257 float obj_y2 = bbox_cy + bbox_h * 0.5f;
258
259 // clip
260 obj_x1 = std::max(std::min(obj_x1 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
261 obj_y1 = std::max(std::min(obj_y1 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
262 obj_x2 = std::max(std::min(obj_x2 * bgr.cols, (float)(bgr.cols - 1)), 0.f);
263 obj_y2 = std::max(std::min(obj_y2 * bgr.rows, (float)(bgr.rows - 1)), 0.f);
264
265 // append object
266 Object obj;
267 obj.rect = cv::Rect_<float>(obj_x1, obj_y1, obj_x2 - obj_x1 + 1, obj_y2 - obj_y1 + 1);
268 obj.label = label;
269 obj.prob = score;
270 obj.maskdata = std::vector<float>(maskdata, maskdata + mask.w);
271
272 class_candidates[label].push_back(obj);
273 }
274
275 objects.clear();
276 for (int i = 0; i < (int)class_candidates.size(); i++)
277 {
278 std::vector<Object>& candidates = class_candidates[i];
279
280 qsort_descent_inplace(candidates);
281
282 std::vector<int> picked;
283 nms_sorted_bboxes(candidates, picked, nms_threshold);
284
285 for (int j = 0; j < (int)picked.size(); j++)
286 {
287 int z = picked[j];
288 objects.push_back(candidates[z]);
289 }
290 }
291
292 qsort_descent_inplace(objects);
293
294 // keep_top_k
295 if (keep_top_k < (int)objects.size())
296 {
297 objects.resize(keep_top_k);
298 }
299
300 // generate mask
301 for (int i = 0; i < (int)objects.size(); i++)
302 {
303 Object& obj = objects[i];
304
305 cv::Mat mask(maskmaps.h, maskmaps.w, CV_32FC1);
306 {
307 mask = cv::Scalar(0.f);
308
309 for (int p = 0; p < maskmaps.c; p++)
310 {
311 const float* maskmap = maskmaps.channel(p);
312 float coeff = obj.maskdata[p];
313 float* mp = (float*)mask.data;
314
315 // mask += m * coeff
316 for (int j = 0; j < maskmaps.w * maskmaps.h; j++)
317 {
318 mp[j] += maskmap[j] * coeff;
319 }
320 }
321 }
322
323 cv::Mat mask2;
324 cv::resize(mask, mask2, cv::Size(img_w, img_h));
325
326 // crop obj box and binarize
327 obj.mask = cv::Mat(img_h, img_w, CV_8UC1);
328 {
329 obj.mask = cv::Scalar(0);
330
331 for (int y = 0; y < img_h; y++)
332 {
333 if (y < obj.rect.y || y > obj.rect.y + obj.rect.height)
334 continue;
335
336 const float* mp2 = mask2.ptr<const float>(y);
337 uchar* bmp = obj.mask.ptr<uchar>(y);
338
339 for (int x = 0; x < img_w; x++)
340 {
341 if (x < obj.rect.x || x > obj.rect.x + obj.rect.width)
342 continue;
343
344 bmp[x] = mp2[x] > 0.5f ? 255 : 0;
345 }
346 }
347 }
348 }
349
350 return 0;
351 }
352
draw_objects(const cv::Mat & bgr,const std::vector<Object> & objects)353 static void draw_objects(const cv::Mat& bgr, const std::vector<Object>& objects)
354 {
355 static const char* class_names[] = {"background",
356 "person", "bicycle", "car", "motorcycle", "airplane", "bus",
357 "train", "truck", "boat", "traffic light", "fire hydrant",
358 "stop sign", "parking meter", "bench", "bird", "cat", "dog",
359 "horse", "sheep", "cow", "elephant", "bear", "zebra", "giraffe",
360 "backpack", "umbrella", "handbag", "tie", "suitcase", "frisbee",
361 "skis", "snowboard", "sports ball", "kite", "baseball bat",
362 "baseball glove", "skateboard", "surfboard", "tennis racket",
363 "bottle", "wine glass", "cup", "fork", "knife", "spoon", "bowl",
364 "banana", "apple", "sandwich", "orange", "broccoli", "carrot",
365 "hot dog", "pizza", "donut", "cake", "chair", "couch",
366 "potted plant", "bed", "dining table", "toilet", "tv", "laptop",
367 "mouse", "remote", "keyboard", "cell phone", "microwave", "oven",
368 "toaster", "sink", "refrigerator", "book", "clock", "vase",
369 "scissors", "teddy bear", "hair drier", "toothbrush"
370 };
371
372 static const unsigned char colors[81][3] = {
373 {56, 0, 255},
374 {226, 255, 0},
375 {0, 94, 255},
376 {0, 37, 255},
377 {0, 255, 94},
378 {255, 226, 0},
379 {0, 18, 255},
380 {255, 151, 0},
381 {170, 0, 255},
382 {0, 255, 56},
383 {255, 0, 75},
384 {0, 75, 255},
385 {0, 255, 169},
386 {255, 0, 207},
387 {75, 255, 0},
388 {207, 0, 255},
389 {37, 0, 255},
390 {0, 207, 255},
391 {94, 0, 255},
392 {0, 255, 113},
393 {255, 18, 0},
394 {255, 0, 56},
395 {18, 0, 255},
396 {0, 255, 226},
397 {170, 255, 0},
398 {255, 0, 245},
399 {151, 255, 0},
400 {132, 255, 0},
401 {75, 0, 255},
402 {151, 0, 255},
403 {0, 151, 255},
404 {132, 0, 255},
405 {0, 255, 245},
406 {255, 132, 0},
407 {226, 0, 255},
408 {255, 37, 0},
409 {207, 255, 0},
410 {0, 255, 207},
411 {94, 255, 0},
412 {0, 226, 255},
413 {56, 255, 0},
414 {255, 94, 0},
415 {255, 113, 0},
416 {0, 132, 255},
417 {255, 0, 132},
418 {255, 170, 0},
419 {255, 0, 188},
420 {113, 255, 0},
421 {245, 0, 255},
422 {113, 0, 255},
423 {255, 188, 0},
424 {0, 113, 255},
425 {255, 0, 0},
426 {0, 56, 255},
427 {255, 0, 113},
428 {0, 255, 188},
429 {255, 0, 94},
430 {255, 0, 18},
431 {18, 255, 0},
432 {0, 255, 132},
433 {0, 188, 255},
434 {0, 245, 255},
435 {0, 169, 255},
436 {37, 255, 0},
437 {255, 0, 151},
438 {188, 0, 255},
439 {0, 255, 37},
440 {0, 255, 0},
441 {255, 0, 170},
442 {255, 0, 37},
443 {255, 75, 0},
444 {0, 0, 255},
445 {255, 207, 0},
446 {255, 0, 226},
447 {255, 245, 0},
448 {188, 255, 0},
449 {0, 255, 18},
450 {0, 255, 75},
451 {0, 255, 151},
452 {255, 56, 0},
453 {245, 255, 0}
454 };
455
456 cv::Mat image = bgr.clone();
457
458 int color_index = 0;
459
460 for (size_t i = 0; i < objects.size(); i++)
461 {
462 const Object& obj = objects[i];
463
464 if (obj.prob < 0.15)
465 continue;
466
467 fprintf(stderr, "%d = %.5f at %.2f %.2f %.2f x %.2f\n", obj.label, obj.prob,
468 obj.rect.x, obj.rect.y, obj.rect.width, obj.rect.height);
469
470 const unsigned char* color = colors[color_index++];
471
472 cv::rectangle(image, obj.rect, cv::Scalar(color[0], color[1], color[2]));
473
474 char text[256];
475 sprintf(text, "%s %.1f%%", class_names[obj.label], obj.prob * 100);
476
477 int baseLine = 0;
478 cv::Size label_size = cv::getTextSize(text, cv::FONT_HERSHEY_SIMPLEX, 0.5, 1, &baseLine);
479
480 int x = obj.rect.x;
481 int y = obj.rect.y - label_size.height - baseLine;
482 if (y < 0)
483 y = 0;
484 if (x + label_size.width > image.cols)
485 x = image.cols - label_size.width;
486
487 cv::rectangle(image, cv::Rect(cv::Point(x, y), cv::Size(label_size.width, label_size.height + baseLine)),
488 cv::Scalar(255, 255, 255), -1);
489
490 cv::putText(image, text, cv::Point(x, y + label_size.height),
491 cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
492
493 // draw mask
494 for (int y = 0; y < image.rows; y++)
495 {
496 const uchar* mp = obj.mask.ptr(y);
497 uchar* p = image.ptr(y);
498 for (int x = 0; x < image.cols; x++)
499 {
500 if (mp[x] == 255)
501 {
502 p[0] = cv::saturate_cast<uchar>(p[0] * 0.5 + color[0] * 0.5);
503 p[1] = cv::saturate_cast<uchar>(p[1] * 0.5 + color[1] * 0.5);
504 p[2] = cv::saturate_cast<uchar>(p[2] * 0.5 + color[2] * 0.5);
505 }
506 p += 3;
507 }
508 }
509 }
510
511 cv::imwrite("result.png", image);
512 cv::imshow("image", image);
513 cv::waitKey(0);
514 }
515
main(int argc,char ** argv)516 int main(int argc, char** argv)
517 {
518 if (argc != 2)
519 {
520 fprintf(stderr, "Usage: %s [imagepath]\n", argv[0]);
521 return -1;
522 }
523
524 const char* imagepath = argv[1];
525
526 cv::Mat m = cv::imread(imagepath, 1);
527 if (m.empty())
528 {
529 fprintf(stderr, "cv::imread %s failed\n", imagepath);
530 return -1;
531 }
532
533 std::vector<Object> objects;
534 detect_yolact(m, objects);
535
536 draw_objects(m, objects);
537
538 return 0;
539 }
540