// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/permute.hpp"
#include "../kernels/detection_output.hpp"
#include "../kernels/grid_nms.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

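    /* parameters passed down from the DetectionOutput layer to the CUDA primitive */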
    struct DetectionOutputConfiguration {
        std::size_t batch_size;

        enum class CodeType {
            CORNER,
            CENTER_SIZE
        };
        CodeType code_type;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

    template <class T>
    class DetectionOutputOp final : public CUDABackendNode {
    private:
        /* We have a block level NMS kernel where each block handles one class of one batch item.
         * If the number of classes and batch size together is very low, the blockwise NMS kernel
         * won't be able to fully saturate the GPU with work.
         *
         * We also have a grid level NMS kernel where multiple blocks handle each class of every batch item.
         * This performs better in the worst case and utilizes resources better when the block level kernel
         * isn't able to saturate the GPU with enough work. However, it is not efficient in the average case,
         * where the block level kernel is able to saturate the GPU. It does better when the blockwise NMS
         * barely saturates the GPU.
         *
         * `GRID_NMS_CUTOFF` is the cutoff for `num_classes * batch_size` above which we will switch from grid
         * level NMS to block level NMS.
         */
        static constexpr int GRID_NMS_CUTOFF = 32;
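        /* note: the `batch_size * num_classes <= GRID_NMS_CUTOFF` check is made both in the constructor
         * (to reserve grid NMS scratch space) and in forward() (to pick the kernel); they must stay in sync
         */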

    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        DetectionOutputOp(csl::Stream stream_, const DetectionOutputConfiguration& config)
            : stream(std::move(stream_))
        {
            corner_true_or_center_false = (config.code_type == DetectionOutputConfiguration::CodeType::CORNER);

            share_location = config.share_location;
            num_priors = config.num_priors;
            num_classes = config.num_classes;
            background_class_id = config.background_class_id;

            transpose_location = config.transpose_location;
            variance_encoded_in_target = config.variance_encoded_in_target;
            normalized_bbox = config.normalized_bbox;
            clip_box = config.clip_box;

            classwise_topK = config.classwise_topK;
            confidence_threshold = config.confidence_threshold;
            nms_threshold = config.nms_threshold;

            keepTopK = config.keepTopK;
            CV_Assert(keepTopK > 0);

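            /* a classwise_topK of -1 acts as an "unset" sentinel; since classwise_topK is std::size_t,
             * the -1 wraps to SIZE_MAX and the comparison below relies on the same implicit conversion
             */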
            if (classwise_topK == -1)
            {
                classwise_topK = num_priors;
                if (keepTopK > 0 && keepTopK < num_priors)
                    classwise_topK = keepTopK;
            }

            auto batch_size = config.batch_size;
            auto num_loc_classes = (share_location ? 1 : num_classes);

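            /* reserve scratch memory for every intermediate buffer used by forward(); the spans carved
             * out of the workspace there must fit within the total requested here
             */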
            csl::WorkspaceBuilder builder;
            builder.require<T>(batch_size * num_priors * num_loc_classes * 4); /* decoded boxes */
            builder.require<T>(batch_size * num_classes * num_priors); /* transposed scores */
            builder.require<int>(batch_size * num_classes * classwise_topK); /* indices */
            builder.require<int>(batch_size * num_classes); /* classwise topK count */
            builder.require<T>(batch_size * num_classes * classwise_topK * 4); /* topK decoded boxes */

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                builder.require(batch_size * workspace_per_batch_item);
            }

            builder.require<int>(batch_size * keepTopK); /* final kept indices */
            builder.require<int>(batch_size); /* kept indices count */
            builder.require<int>(1); /* total number of detections */

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* the first three inputs are the locations, scores and priors, in that order */
            /* the optional fourth input provides the image shape used for clipping */
            CV_Assert((inputs.size() == 3 || inputs.size() == 4) && outputs.size() == 1);

            // locations: [batch_size, num_priors, num_loc_classes, 4]
            auto locations_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto locations = locations_wrapper->getView();

            // scores: [batch_size, num_priors, num_classes]
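            // (the view is unsqueezed and reshaped below so its axes match this layout regardless of the incoming shape)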
            auto scores_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto scores = scores_wrapper->getView();
            scores.unsqueeze();
            scores.reshape(-1, num_priors, num_classes);

            // priors: [1, 2, num_priors, 4]
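            // the first of the two "channels" holds the prior box coordinates and the second the corresponding variances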
            auto priors_wrapper = inputs[2].dynamicCast<wrapper_type>();
            auto priors = priors_wrapper->getView();

            // output: [1, 1, batch_size * keepTopK, 7]
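            // each detection row holds (image_id, label, confidence, xmin, ymin, xmax, ymax)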
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto batch_size = locations.get_axis_size(0);
            auto num_loc_classes = (share_location ? 1 : num_classes);
            while (locations.rank() < 4)
                locations.unsqueeze();
            locations.reshape(batch_size, num_priors, num_loc_classes, 4);

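            /* when clipping is enabled, normalized boxes are clipped to the unit square; otherwise the
             * fourth input supplies the image dimensions to clip against
             */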
            float clip_width = 0.0f, clip_height = 0.0f;
            if (clip_box)
            {
                if (normalized_bbox)
                {
                    clip_width = clip_height = 1.0f;
                }
                else
                {
                    auto image_wrapper = inputs[3].dynamicCast<wrapper_type>();
                    auto image_shape = image_wrapper->getShape();

                    CV_Assert(image_shape.size() == 4);
                    clip_width = image_shape[3] - 1;
                    clip_height = image_shape[2] - 1;
                }
            }

            csl::WorkspaceAllocator allocator(workspace);

            // decoded_boxes: [batch_size, num_priors, num_loc_classes, 4]
            csl::TensorSpan<T> decoded_boxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_priors, num_loc_classes, 4};
                decoded_boxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                CV_Assert(is_shape_same(decoded_boxes, locations));
            }

            kernels::decode_bboxes<T>(stream, decoded_boxes, locations, priors,
                num_loc_classes, share_location, background_class_id,
                transpose_location, variance_encoded_in_target,
                corner_true_or_center_false, normalized_bbox,
                clip_box, clip_width, clip_height);

            // scores_permuted: [batch_size, num_classes, num_priors]
            csl::TensorSpan<T> scores_permuted;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, num_priors};
                scores_permuted = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

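            // transpose the scores so that each class's scores across all priors are contiguous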
            kernels::permute<T>(stream, scores_permuted, scores, {0, 2, 1});

            // indices: [batch_size, num_classes, classwise_topK]
            csl::TensorSpan<int> indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK};
                indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // count: [batch_size, num_classes]
            csl::TensorSpan<int> count;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes};
                count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

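            // for every class of every batch item, select up to classwise_topK priors scoring above confidence_threshold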
            kernels::findTopK<T>(stream, indices, count, scores_permuted, background_class_id, confidence_threshold);

            // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
            csl::TensorSpan<T> collected_bboxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK, 4};
                collected_bboxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

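            // gather the decoded boxes picked by the top-K stage into a compact per-class layout for NMS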
            kernels::box_collect<T>(stream, collected_bboxes, decoded_boxes, indices, count, share_location, background_class_id);

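            /* small workloads go through the grid level NMS kernel, everything else through the block
             * level kernel (see the discussion next to GRID_NMS_CUTOFF)
             */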
            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                auto nms_workspace = allocator.get_span<unsigned int>(batch_size * workspace_per_batch_item / sizeof(unsigned int));
                kernels::grid_nms<T>(stream, nms_workspace, indices, count, collected_bboxes, background_class_id, normalized_bbox, nms_threshold);
            }
            else
            {
                kernels::blockwise_class_nms<T>(stream, indices, count, collected_bboxes, normalized_bbox, background_class_id, nms_threshold);
            }

            // kept_indices: [batch_size, keepTopK]
            csl::TensorSpan<int> kept_indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, static_cast<std::size_t>(keepTopK)};
                kept_indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // kept_count: [batch_size]
            csl::TensorSpan<int> kept_count;
            {
                auto shape = std::vector<std::size_t>{batch_size};
                kept_count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

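            // for each batch item, keep the keepTopK highest scoring detections that survived NMS across all classes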
            kernels::nms_collect<T>(stream, kept_indices, kept_count, indices, count, scores_permuted, confidence_threshold, background_class_id);

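            // zero the output and the detection counter, then write out the surviving detections in the seven-value format noted above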
            auto num_detections = allocator.get_span<int>(1);
            kernels::fill<int>(stream, num_detections, 0);
            kernels::fill<T>(stream, output, 0.0);
            kernels::consolidate_detections<T>(stream, output, kept_indices, kept_count, decoded_boxes, scores_permuted, share_location, num_detections.data());
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        std::size_t scratch_mem_in_bytes;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool corner_true_or_center_false;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP */