// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP
#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP

#include "../../op_cuda.hpp"

#include "../csl/stream.hpp"
#include "../csl/tensor.hpp"

#include "../kernels/fill_copy.hpp"
#include "../kernels/permute.hpp"
#include "../kernels/detection_output.hpp"
#include "../kernels/grid_nms.hpp"

#include <cstddef>
#include <utility>

namespace cv { namespace dnn { namespace cuda4dnn {

    struct DetectionOutputConfiguration {
        std::size_t batch_size;

        enum class CodeType {
            CORNER,
            CENTER_SIZE
        };
        CodeType code_type;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;

        bool transpose_location;
        bool variance_encoded_in_target;
        bool normalized_bbox;
        bool clip_box;

        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;

        int keepTopK;
    };

    template <class T>
    class DetectionOutputOp final : public CUDABackendNode {
    private:
        /* We have a block level NMS kernel where each block handles one class of one batch item.
         * If the number of classes and the batch size together is very low, the blockwise NMS kernel
         * won't be able to fully saturate the GPU with work.
         *
         * We also have a grid level NMS kernel where multiple blocks handle each class of every batch item.
         * This performs better in the worst case and utilizes resources better when the block level kernel
         * isn't able to saturate the GPU with enough work. However, it is not efficient in the average case,
         * where the block level kernel is able to saturate the GPU. It does better when the blockwise NMS
         * barely saturates the GPU.
         *
         * `GRID_NMS_CUTOFF` is the cutoff for `num_classes * batch_size` above which we switch from grid
         * level NMS to block level NMS.
         */
        static constexpr int GRID_NMS_CUTOFF = 32;

    public:
        using wrapper_type = GetCUDABackendWrapperType<T>;

        DetectionOutputOp(csl::Stream stream_, const DetectionOutputConfiguration& config)
            : stream(std::move(stream_))
        {
            corner_true_or_center_false = (config.code_type == DetectionOutputConfiguration::CodeType::CORNER);
            share_location = config.share_location;
            num_priors = config.num_priors;
            num_classes = config.num_classes;
            background_class_id = config.background_class_id;
            transpose_location = config.transpose_location;
            variance_encoded_in_target = config.variance_encoded_in_target;
            normalized_bbox = config.normalized_bbox;
            clip_box = config.clip_box;
            classwise_topK = config.classwise_topK;
            confidence_threshold = config.confidence_threshold;
            nms_threshold = config.nms_threshold;

            keepTopK = config.keepTopK;
            CV_Assert(keepTopK > 0);

            if (classwise_topK == -1)
            {
                classwise_topK = num_priors;
                if (keepTopK > 0 && keepTopK < num_priors)
                    classwise_topK = keepTopK;
            }

            auto batch_size = config.batch_size;
            auto num_loc_classes = (share_location ? 1 : num_classes);
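
            /* Note: the scratch requests below mirror, in the same order, the allocations that
             * forward() later carves out of the shared workspace through csl::WorkspaceAllocator;
             * the two sequences have to be kept in sync.
             */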
            csl::WorkspaceBuilder builder;
            builder.require<T>(batch_size * num_priors * num_loc_classes * 4); /* decoded boxes */
            builder.require<T>(batch_size * num_classes * num_priors); /* transposed scores */
            builder.require<int>(batch_size * num_classes * classwise_topK); /* indices */
            builder.require<int>(batch_size * num_classes); /* classwise topK count */
            builder.require<T>(batch_size * num_classes * classwise_topK * 4); /* topK decoded boxes */

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                builder.require(batch_size * workspace_per_batch_item);
            }

            builder.require<int>(batch_size * keepTopK); /* final kept indices */
            builder.require<int>(batch_size); /* kept indices count */
            builder.require<int>(1); /* total number of detections */

            scratch_mem_in_bytes = builder.required_workspace_size();
        }

        void forward(
            const std::vector<cv::Ptr<BackendWrapper>>& inputs,
            const std::vector<cv::Ptr<BackendWrapper>>& outputs,
            csl::Workspace& workspace) override
        {
            /* locations, scores and priors make up the first three inputs, in that order */
            /* the 4th input, when present, is used to obtain the image shape for clipping */
            CV_Assert((inputs.size() == 3 || inputs.size() == 4) && outputs.size() == 1);

            // locations: [batch_size, num_priors, num_loc_classes, 4]
            auto locations_wrapper = inputs[0].dynamicCast<wrapper_type>();
            auto locations = locations_wrapper->getView();

            // scores: [batch_size, num_priors, num_classes]
            auto scores_wrapper = inputs[1].dynamicCast<wrapper_type>();
            auto scores = scores_wrapper->getView();
            scores.unsqueeze();
            scores.reshape(-1, num_priors, num_classes);

            // priors: [1, 2, num_priors, 4]
            auto priors_wrapper = inputs[2].dynamicCast<wrapper_type>();
            auto priors = priors_wrapper->getView();

            // output: [1, 1, batch_size * keepTopK, 7]
            auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
            auto output = output_wrapper->getSpan();
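
            /* Each detection in the output occupies seven consecutive values which, following the
             * DetectionOutput convention, are laid out as:
             * [batch_id, class_id, confidence, xmin, ymin, xmax, ymax]
             */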

            auto batch_size = locations.get_axis_size(0);
            auto num_loc_classes = (share_location ? 1 : num_classes);
            while (locations.rank() < 4)
                locations.unsqueeze();
            locations.reshape(batch_size, num_priors, num_loc_classes, 4);

            float clip_width = 0.0, clip_height = 0.0;
            if (clip_box)
            {
                if (normalized_bbox)
                {
                    clip_width = clip_height = 1.0f;
                }
                else
                {
                    auto image_wrapper = inputs[3].dynamicCast<wrapper_type>();
                    auto image_shape = image_wrapper->getShape();

                    CV_Assert(image_shape.size() == 4);
                    clip_width = image_shape[3] - 1;
                    clip_height = image_shape[2] - 1;
                }
            }

            csl::WorkspaceAllocator allocator(workspace);

            // decoded_boxes: [batch_size, num_priors, num_loc_classes, 4]
            csl::TensorSpan<T> decoded_boxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_priors, num_loc_classes, 4};
                decoded_boxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
                CV_Assert(is_shape_same(decoded_boxes, locations));
            }

            kernels::decode_bboxes<T>(stream, decoded_boxes, locations, priors,
                num_loc_classes, share_location, background_class_id,
                transpose_location, variance_encoded_in_target,
                corner_true_or_center_false, normalized_bbox,
                clip_box, clip_width, clip_height);

            // scores_permuted: [batch_size, num_classes, num_priors]
            csl::TensorSpan<T> scores_permuted;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, num_priors};
                scores_permuted = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::permute<T>(stream, scores_permuted, scores, {0, 2, 1});

            // indices: [batch_size, num_classes, classwise_topK]
            csl::TensorSpan<int> indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK};
                indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // count: [batch_size, num_classes]
            csl::TensorSpan<int> count;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes};
                count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::findTopK<T>(stream, indices, count, scores_permuted, background_class_id, confidence_threshold);

            // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
            csl::TensorSpan<T> collected_bboxes;
            {
                auto shape = std::vector<std::size_t>{batch_size, num_classes, classwise_topK, 4};
                collected_bboxes = allocator.get_tensor_span<T>(std::begin(shape), std::end(shape));
            }

            kernels::box_collect<T>(stream, collected_bboxes, decoded_boxes, indices, count, share_location, background_class_id);

            if (batch_size * num_classes <= GRID_NMS_CUTOFF)
            {
                auto workspace_per_batch_item = kernels::getGridNMSWorkspaceSizePerBatchItem(num_classes, classwise_topK);
                auto nms_workspace = allocator.get_span<unsigned int>(batch_size * workspace_per_batch_item / sizeof(unsigned int));
                kernels::grid_nms<T>(stream, nms_workspace, indices, count, collected_bboxes, background_class_id, normalized_bbox, nms_threshold);
            }
            else
            {
                kernels::blockwise_class_nms<T>(stream, indices, count, collected_bboxes, normalized_bbox, background_class_id, nms_threshold);
            }

            // kept_indices: [batch_size, keepTopK]
            csl::TensorSpan<int> kept_indices;
            {
                auto shape = std::vector<std::size_t>{batch_size, static_cast<std::size_t>(keepTopK)};
                kept_indices = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            // kept_count: [batch_size]
            csl::TensorSpan<int> kept_count;
            {
                auto shape = std::vector<std::size_t>{batch_size};
                kept_count = allocator.get_tensor_span<int>(std::begin(shape), std::end(shape));
            }

            kernels::nms_collect<T>(stream, kept_indices, kept_count, indices, count, scores_permuted, confidence_threshold, background_class_id);

            auto num_detections = allocator.get_span<int>(1);
            kernels::fill<int>(stream, num_detections, 0);
            kernels::fill<T>(stream, output, 0.0);
            kernels::consolidate_detections<T>(stream, output, kept_indices, kept_count, decoded_boxes, scores_permuted, share_location, num_detections.data());
        }
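
        /* Reports the scratch requirement computed in the constructor; the CUDA backend is
         * expected to provide forward() with a workspace at least this large.
         */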
        std::size_t get_workspace_memory_in_bytes() const noexcept override { return scratch_mem_in_bytes; }

    private:
        csl::Stream stream;

        std::size_t scratch_mem_in_bytes;

        bool share_location;
        std::size_t num_priors;
        std::size_t num_classes;
        std::size_t background_class_id;
        bool transpose_location;
        bool variance_encoded_in_target;
        bool corner_true_or_center_false;
        bool normalized_bbox;
        bool clip_box;
        std::size_t classwise_topK;
        float confidence_threshold;
        float nms_threshold;
        int keepTopK;
    };

}}} /* namespace cv::dnn::cuda4dnn */

#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_DETECTION_OUTPUT_HPP */