# Tencent is pleased to support the open source community by making ncnn available.
#
# Copyright (C) 2020 THL A29 Limited, a Tencent company. All rights reserved.
#
# Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
# in compliance with the License. You may obtain a copy of the License at
#
# https://opensource.org/licenses/BSD-3-Clause
#
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.

import numpy as np
import ncnn
from .model_store import get_model_file
from ..utils.objects import Detect_Object


class RFCN:
    def __init__(
        self,
        target_size=224,
        max_per_image=100,
        confidence_thresh=0.6,
        nms_threshold=0.3,
        num_threads=1,
        use_gpu=False,
    ):
        self.target_size = target_size
        self.max_per_image = max_per_image
        self.confidence_thresh = confidence_thresh
        self.nms_threshold = nms_threshold
        self.num_threads = num_threads
        self.use_gpu = use_gpu
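
        # Caffe-style per-channel BGR means; norm_vals is left empty, so
        # substract_mean_normalize() only subtracts the means without scaling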
        self.mean_vals = [102.9801, 115.9465, 122.7717]
        self.norm_vals = []

        self.net = ncnn.Net()
        self.net.opt.use_vulkan_compute = self.use_gpu

        # original pretrained model from https://github.com/YuwenXiong/py-R-FCN
        # https://github.com/YuwenXiong/py-R-FCN/blob/master/models/pascal_voc/ResNet-50/rfcn_end2end/test_agnostic.prototxt
        # https://1drv.ms/u/s!AoN7vygOjLIQqUWHpY67oaC7mopf
        # resnet50_rfcn_final.caffemodel
        # the converted ncnn model can be found at https://github.com/nihui/ncnn-assets/tree/master/models
        self.net.load_param(get_model_file("rfcn_end2end.param"))
        self.net.load_model(get_model_file("rfcn_end2end.bin"))
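
        # the 20 PASCAL VOC object classes, with "background" at index 0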
        self.class_names = [
            "background",
            "aeroplane",
            "bicycle",
            "bird",
            "boat",
            "bottle",
            "bus",
            "car",
            "cat",
            "chair",
            "cow",
            "diningtable",
            "dog",
            "horse",
            "motorbike",
            "person",
            "pottedplant",
            "sheep",
            "sofa",
            "train",
            "tvmonitor",
        ]

    def __del__(self):
        self.net = None

    def __call__(self, img):
        h = img.shape[0]
        w = img.shape[1]
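
        # scale the shorter image side to target_size, keeping the aspect ratio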
        scale = 1.0
        if w < h:
            scale = float(self.target_size) / w
            w = self.target_size
            h = h * scale
        else:
            scale = float(self.target_size) / h
            h = self.target_size
            w = w * scale

        mat_in = ncnn.Mat.from_pixels_resize(
            img,
            ncnn.Mat.PixelType.PIXEL_BGR,
            img.shape[1],
            img.shape[0],
            int(w),
            int(h),
        )
        mat_in.substract_mean_normalize(self.mean_vals, self.norm_vals)
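
        # im_info carries the (height, width, scale) of the resized input,
        # consumed by the proposal layer inside the network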
        im_info = ncnn.Mat(3)
        im_info[0] = h
        im_info[1] = w
        im_info[2] = scale

        # step 1: extract the shared feature maps and all ROIs
        ex1 = self.net.create_extractor()
        ex1.set_num_threads(self.num_threads)
        ex1.input("data", mat_in)
        ex1.input("im_info", im_info)

        ret1, rfcn_cls = ex1.extract("rfcn_cls")
        ret2, rfcn_bbox = ex1.extract("rfcn_bbox")
        ret3, rois = ex1.extract("rois")  # all ROIs proposed by the region proposal network
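
        # step 2 re-runs the position-sensitive pooling and prediction part of
        # the graph once per ROI, feeding back the score maps extracted above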
        # step 2: extract the bbox and score for each ROI
        class_candidates = []
        for i in range(rois.c):
            ex2 = self.net.create_extractor()

            roi = rois.channel(i)  # get a single ROI
            ex2.input("rfcn_cls", rfcn_cls)
            ex2.input("rfcn_bbox", rfcn_bbox)
            ex2.input("rois", roi)

            ret1, bbox_pred = ex2.extract("bbox_pred")
            ret2, cls_prob = ex2.extract("cls_prob")

            num_class = cls_prob.w
            while len(class_candidates) < num_class:
                class_candidates.append([])

            # find the class id with the highest score
            label = 0
            score = 0.0
            for j in range(num_class):
                class_score = cls_prob[j]
                if class_score > score:
                    label = j
                    score = class_score

            # ignore background or low-score detections
            if label == 0 or score <= self.confidence_thresh:
                continue
            # print("%d = %f" % (label, score))

            # unscale the ROI back to the original image size
            x1 = roi[0] / scale
            y1 = roi[1] / scale
            x2 = roi[2] / scale
            y2 = roi[3] / scale

            pb_w = x2 - x1 + 1
            pb_h = y2 - y1 + 1

            # apply the class-agnostic bbox regression
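            # bbox_pred holds 4 deltas for the background class followed by 4
            # for the foreground box, hence the offset of 4; the deltas shift
            # the box center by a fraction of its size and rescale it by
            # exp(dw) and exp(dh)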
            dx = bbox_pred[4]
            dy = bbox_pred[4 + 1]
            dw = bbox_pred[4 + 2]
            dh = bbox_pred[4 + 3]

            cx = x1 + pb_w * 0.5
            cy = y1 + pb_h * 0.5

            obj_cx = cx + pb_w * dx
            obj_cy = cy + pb_h * dy

            obj_w = pb_w * np.exp(dw)
            obj_h = pb_h * np.exp(dh)

            obj_x1 = obj_cx - obj_w * 0.5
            obj_y1 = obj_cy - obj_h * 0.5
            obj_x2 = obj_cx + obj_w * 0.5
            obj_y2 = obj_cy + obj_h * 0.5

            # clip to the image boundary
            obj_x1 = np.maximum(np.minimum(obj_x1, float(img.shape[1] - 1)), 0.0)
            obj_y1 = np.maximum(np.minimum(obj_y1, float(img.shape[0] - 1)), 0.0)
            obj_x2 = np.maximum(np.minimum(obj_x2, float(img.shape[1] - 1)), 0.0)
            obj_y2 = np.maximum(np.minimum(obj_y2, float(img.shape[0] - 1)), 0.0)

            # append the object
            obj = Detect_Object()
            obj.rect.x = obj_x1
            obj.rect.y = obj_y1
            obj.rect.w = obj_x2 - obj_x1 + 1
            obj.rect.h = obj_y2 - obj_y1 + 1
            obj.label = label
            obj.prob = score

            class_candidates[label].append(obj)

        # post process: per-class NMS, then keep the top max_per_image
        # detections across all classes
        objects = []
        for candidates in class_candidates:
            if len(candidates) == 0:
                continue

            candidates.sort(key=lambda obj: obj.prob, reverse=True)

            picked = self.nms_sorted_bboxes(candidates, self.nms_threshold)

            for z in picked:
                objects.append(candidates[z])

        objects.sort(key=lambda obj: obj.prob, reverse=True)

        objects = objects[: self.max_per_image]

        return objects
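
    # greedy non-maximum suppression over boxes already sorted by descending
    # score; returns the indices of the boxes to keep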
    def nms_sorted_bboxes(self, objects, nms_threshold):
        picked = []

        n = len(objects)

        areas = np.zeros((n,), dtype=np.float32)
        for i in range(n):
            areas[i] = objects[i].rect.area()

        for i in range(n):
            a = objects[i]

            keep = True
            for j in picked:
                b = objects[j]

                # intersection over union
                inter_area = a.rect.intersection_area(b.rect)
                union_area = areas[i] + areas[j] - inter_area
                if inter_area / union_area > nms_threshold:
                    keep = False
                    break

            if keep:
                picked.append(i)

        return picked
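
# Minimal usage sketch, not part of the original example. It assumes OpenCV
# (cv2) is installed, that the model files can be fetched by get_model_file(),
# and that this module sits at ncnn.model_zoo.rfcn as in the upstream layout,
# so the relative imports resolve when run as
# `python -m ncnn.model_zoo.rfcn path/to/image.jpg`.
if __name__ == "__main__":
    import sys

    import cv2

    image = cv2.imread(sys.argv[1])  # BGR, as expected by PIXEL_BGR above
    detector = RFCN(num_threads=4)
    for det in detector(image):
        print(
            "%s %.3f at x=%.1f y=%.1f w=%.1f h=%.1f"
            % (
                detector.class_names[int(det.label)],
                det.prob,
                det.rect.x,
                det.rect.y,
                det.rect.w,
                det.rect.h,
            )
        )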