// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/permute.hpp"
#include "../kernels/detection_output.hpp"
#include "../kernels/grid_nms.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct DetectionOutputConfiguration {
        std::size_t batch_size;

        enum class CodeType {
            CORNER,
            CENTER_SIZE
        };
        CodeType code_type;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

    template <class T>
    class DetectionOutputOp final : public CUDABackendNode {
    private:
        /* We have a block-level NMS kernel where each block handles one class of one batch item.
         * If the number of classes and the batch size together are very low, the blockwise NMS kernel
         * won't be able to fully saturate the GPU with work.
         *
         * We also have a grid-level NMS kernel where multiple blocks handle each class of every batch item.
         * This performs better in the worst case and utilizes resources better when the block-level kernel
         * isn't able to saturate the GPU with enough work. However, it is not efficient in the average case,
         * where the block-level kernel is able to saturate the GPU. It does better when the blockwise NMS
         * barely saturates the GPU.
         *
         * `GRID_NMS_CUTOFF` is the cutoff for `num_classes * batch_size` above which we switch from
         * grid-level NMS to block-level NMS.
         */
        static constexpr int GRID_NMS_CUTOFF = 32;

    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        DetectionOutputOp(csl::Stream stream_, const DetectionOutputConfiguration& config)
            : stream(std::move(stream_))
        {
            corner_true_or_center_false = (config.code_type == DetectionOutputConfiguration::CodeType::CORNER);

            share_location = config.share_location;
            num_priors = config.num_priors;
            num_classes = config.num_classes;
            background_class_id = config.background_class_id;

            transpose_location = config.transpose_location;
            variance_encoded_in_target = config.variance_encoded_in_target;
            normalized_bbox = config.normalized_bbox;
            clip_box = config.clip_box;

            classwise_topK = config.classwise_topK;
            confidence_threshold = config.confidence_threshold;
            nms_threshold = config.nms_threshold;

            keepTopK = config.keepTopK;
            CV_Assert(keepTopK > 0);

            if (classwise_topK == -1)
            {
                classwise_topK = num_priors;
                if (keepTopK > 0 && keepTopK < num_priors)
                    classwise_topK = keepTopK;
            }

            auto batch_size = config.batch_size;
            auto num_loc_classes = (share_location ? 1 : num_classes);

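            /* The scratch memory requirement is computed up front by mirroring, in the same
             * order and with the same sizes, the allocations that forward() later requests
             * from the WorkspaceAllocator; get_workspace_memory_in_bytes() reports the
             * resulting total so the backend can provide a sufficiently large workspace
             * at inference time.
             */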
            csl::WorkspaceBuilder builder;
            builder.require<T>(batch_size * num_priors * num_loc_classes * 4); /* decoded boxes */
            builder.require<T>(batch_size * num_classes * num_priors); /* transposed scores */
            builder.require<int>(batch_size * num_classes * classwise_topK); /* indices */
            builder.require<int>(batch_size * num_classes); /* classwise topK count */
            builder.require<T>(batch_size * num_classes * classwise_topK * 4); /* topK decoded boxes */

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                builder.require(batch_size * workspace_per_batch_item);
            }

            builder.require<int>(batch_size * keepTopK); /* final kept indices */
            builder.require<int>(batch_size); /* kept indices count */
            builder.require<int>(1); /* total number of detections */

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* locations, scores and priors make up the first three inputs, in that order */
            /* the 4th input is used to obtain the shape for clipping */
            CV_Assert((inputs.size() == 3 || inputs.size() == 4) && outputs.size() == 1);

            // locations: [batch_size, num_priors, num_loc_classes, 4]
            auto locations_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto locations = locations_wrapper->getView();

            // scores: [batch_size, num_priors, num_classes]
            auto scores_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto scores = scores_wrapper->getView();
            scores.unsqueeze();
            scores.reshape(-1, num_priors, num_classes);

            // priors: [1, 2, num_priors, 4]
            auto priors_wrapper = inputs[2].dynamicCast<wrapper_type>();
            auto priors = priors_wrapper->getView();

            // output: [1, 1, batch_size * keepTopK, 7]
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();

            auto batch_size = locations.get_axis_size(0);
            auto num_loc_classes = (share_location ? 1 : num_classes);
            while (locations.rank() < 4)
                locations.unsqueeze();
            locations.reshape(batch_size, num_priors, num_loc_classes, 4);

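            /* Decoded boxes may optionally be clipped: for normalized boxes the clip bounds
             * are simply [0, 1]; otherwise they are derived from the spatial dimensions of
             * the image tensor passed as the optional fourth input.
             */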
            float clip_width = 0.0, clip_height = 0.0;
            if (clip_box)
            {
                if (normalized_bbox)
                {
                    clip_width = clip_height = 1.0f;
                }
                else
                {
                    auto image_wrapper = inputs[3].dynamicCast<wrapper_type>();
                    auto image_shape = image_wrapper->getShape();

                    CV_Assert(image_shape.size() == 4);
                    clip_width = image_shape[3] - 1;
                    clip_height = image_shape[2] - 1;
                }
            }

            csl::WorkspaceAllocator allocator(workspace);

            // decoded_boxes: [batch_size, num_priors, num_loc_classes, 4]
            csl::TensorSpan<T> decoded_boxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_priors, num_loc_classes, 4};
                decoded_boxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                CV_Assert(is_shape_same(decoded_boxes, locations));
            }

            kernels::decode_bboxes<T>(stream, decoded_boxes, locations, priors,
                num_loc_classes, share_location, background_class_id,
                transpose_location, variance_encoded_in_target,
                corner_true_or_center_false, normalized_bbox,
                clip_box, clip_width, clip_height);

            // scores_permuted: [batch_size, num_classes, num_priors]
            csl::TensorSpan<T> scores_permuted;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, num_priors};
                scores_permuted = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::permute<T>(stream, scores_permuted, scores, {0, 2, 1});

            // indices: [batch_size, num_classes, classwise_topK]
            csl::TensorSpan<int> indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK};
                indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // count: [batch_size, num_classes]
            csl::TensorSpan<int> count;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes};
                count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::findTopK<T>(stream, indices, count, scores_permuted, background_class_id, confidence_threshold);

            // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
            csl::TensorSpan<T> collected_bboxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK, 4};
                collected_bboxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::box_collect<T>(stream, collected_bboxes, decoded_boxes, indices, count, share_location, background_class_id);

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                auto workspace = allocator.get_span<unsigned int>(batch_size * workspace_per_batch_item / sizeof(unsigned int));
                kernels::grid_nms<T>(stream, workspace, indices, count, collected_bboxes, background_class_id, normalized_bbox, nms_threshold);
            }
            else
            {
                kernels::blockwise_class_nms<T>(stream, indices, count, collected_bboxes, normalized_bbox, background_class_id, nms_threshold);
            }

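            /* NMS prunes the per-class candidate lists in place (indices/count). What remains
             * is merged per batch item below: kept_indices/kept_count receive, for each image,
             * up to keepTopK surviving detections ranked by score, which are then written to
             * the output tensor.
             */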
            // kept_indices: [batch_size, keepTopK]
            csl::TensorSpan<int> kept_indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, static_cast<std::size_t>(keepTopK)};
                kept_indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // kept_count: [batch_size]
            csl::TensorSpan<int> kept_count;
            {
                auto shape = std::vector<std::size_t>{batch_size};
                kept_count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::nms_collect<T>(stream, kept_indices, kept_count, indices, count, scores_permuted, confidence_threshold, background_class_id);

            auto num_detections = allocator.get_span<int>(1);
            kernels::fill<int>(stream, num_detections, 0);
            kernels::fill<T>(stream, output, 0.0);
            kernels::consolidate_detections<T>(stream, output, kept_indices, kept_count, decoded_boxes, scores_permuted, share_location, num_detections.data());
        }

        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;
        std::size_t scratch_mem_in_bytes;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool corner_true_or_center_false;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP */