// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
    __global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
        bool transpose_location, bool normalized_bbox,
        size_type num_loc_classes, index_type background_class_id,
        float clip_width, float clip_height)
    {
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // locations: [batch_size, num_priors, num_loc_classes, 4]
        // priors: [1, C, num_priors, 4]
        // C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1

        /* 4 bbox values + 4 variance values per prior */
        constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
        const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;
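        /* A sketch of the prior memory layout implied above: when the variances are not encoded
         * in the target, the priors blob is [1, 2, num_priors, 4]. Viewed through the 4-element
         * vector pointers created below, the first `num_priors` entries are the prior boxes and
         * the next `num_priors` entries are the per-prior variances; hence priors_vPtr[p] reads
         * the box of prior `p` and priors_vPtr[num_priors + p] reads its variance.
         */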

        using vector_type = get_vector_type_t<T, 4>;
        auto locations_vPtr = vector_type::get_pointer(locations.data());
        auto priors_vPtr = vector_type::get_pointer(priors.data());
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());

        const auto boxes_per_batch = num_priors * num_loc_classes;
        for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
        {
            index_type p;
            index_type c;

            if (SHARE_LOCATION)
            {
                // locations are shared across all classes => num_loc_classes = 1
                p = idx % boxes_per_batch;
                c = 0;
            }
            else
            {
                p = (idx % boxes_per_batch) / num_loc_classes;
                c = idx % num_loc_classes;
            }
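            /* A small worked example of the index decomposition above (illustrative values):
             * with num_priors = 2 and num_loc_classes = 3, boxes_per_batch = 6 and idx = 4 maps
             * to prior p = 1 and class c = 1, which matches the flattened
             * [batch_size, num_priors, num_loc_classes, 4] layout of `locations`.
             */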

            if (!SHARE_LOCATION && c == background_class_id)
                continue;

            BoundingBox bbox;
            {
                vector_type location;
                v_load(location, locations_vPtr[idx]);

                if (transpose_location)
                {
                    bbox.ymin = location.data[0];
                    bbox.xmin = location.data[1];
                    bbox.ymax = location.data[2];
                    bbox.xmax = location.data[3];
                }
                else
                {
                    bbox.xmin = location.data[0];
                    bbox.ymin = location.data[1];
                    bbox.xmax = location.data[2];
                    bbox.ymax = location.data[3];
                }
            }

            if (!VARIANCE_ENCODED_IN_TARGET)
            {
                vector_type prior_variance;
                v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);

                bbox.xmin *= static_cast<float>(prior_variance.data[0]);
                bbox.ymin *= static_cast<float>(prior_variance.data[1]);
                bbox.xmax *= static_cast<float>(prior_variance.data[2]);
                bbox.ymax *= static_cast<float>(prior_variance.data[3]);
            }

            BoundingBox prior;
            {
                vector_type prior_box;
                v_load_ldg(prior_box, priors_vPtr[p]);

                prior.xmin = prior_box.data[0];
                prior.ymin = prior_box.data[1];
                prior.xmax = prior_box.data[2];
                prior.ymax = prior_box.data[3];
            }

            BoundingBox decoded_bbox;
            if (CORNER_TRUE_CENTER_FALSE)
            {
                decoded_bbox.xmin = prior.xmin + bbox.xmin;
                decoded_bbox.ymin = prior.ymin + bbox.ymin;
                decoded_bbox.xmax = prior.xmax + bbox.xmax;
                decoded_bbox.ymax = prior.ymax + bbox.ymax;
            }
            else
            {
                auto prior_width = prior.xmax - prior.xmin;
                auto prior_height = prior.ymax - prior.ymin;
                if (!normalized_bbox)
                {
                    prior_width += 1;
                    prior_height += 1;
                }

                auto prior_center_x = prior.xmin + prior_width * 0.5f;
                auto prior_center_y = prior.ymin + prior_height * 0.5f;

                auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
                auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;

                using device::exp;
                float decode_bbox_width = exp(bbox.xmax) * prior_width;
                float decode_bbox_height = exp(bbox.ymax) * prior_height;

                decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
                decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
                decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
                decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
            }
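            /* Purely illustrative numbers for the center-size decode above: a normalized prior
             * (0.2, 0.2, 0.6, 0.6) with a zero location offset gives prior_width = prior_height = 0.4,
             * center (0.4, 0.4) and decoded width/height exp(0) * 0.4 = 0.4, i.e. the decoded box
             * equals the prior itself.
             */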

            vector_type decoded_bbox_vec;
            if (CLIP_BBOX)
            {
                decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
                decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
                decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
                decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
            }
            else
            {
                decoded_bbox_vec.data[0] = decoded_bbox.xmin;
                decoded_bbox_vec.data[1] = decoded_bbox.ymin;
                decoded_bbox_vec.data[2] = decoded_bbox.xmax;
                decoded_bbox_vec.data[3] = decoded_bbox.ymax;
            }

            v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
        }
    }

    template <class T, int BINS, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
    {
        /* We need to sort boxes based on their confidence scores. The confidence scores fall in
         * the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
         * approximate algorithm.
         *
         * Each block handles a particular class of a particular batch item.
         */
        const auto c = blockIdx.x;
        const auto b = blockIdx.y;

        if (c == background_class_id)
            return;

        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]
        // scores: [batch_size, num_classes, num_priors]

        auto count = count_.data() + b * num_classes + c;
        auto scores = scores_.data() + (b * num_classes + c) * num_priors;
        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;

        /* We do not require a large number of bins to find the top K confidence scores. We will use
         * a reasonable number of bins which will fit in the shared memory.
         *
         * Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
         * ascending order.
         */

        __shared__ int bins[BINS];

        #pragma unroll
        for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
            bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;

        __syncthreads();

        for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
        {
            const float confidence = load_ldg(scores[i]);
            if (confidence > threshold)
            {
                using device::fast_divide_ftz;
                auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                using device::clamp;
                int bin_index = conf_scaled * BINS;

                /* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
                 * of the `classwise_topK` confidence values in the `indices` array.
                 *
                 * We use a little trick to parallelize the process of filling up the `indices` array.
                 * We want every thread in the block to participate in the process. To do so, we want the
                 * bins array to be shifted by one place to the left. We will be computing the suffix sum
                 * of the bins array later. Details and reasons for doing so will be explained later.
                 */
                bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one
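                /* For example, with threshold = 0.3 and BINS = 2048 (the instantiation used by the
                 * host wrapper below), a confidence of 0.65 is rescaled to 0.5, falls into bin 1024
                 * and, after the shift, increments bins[1023].
                 */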

                if (bin_index >= 0)
                    atomicAdd(&bins[bin_index], 1);
            }
        }

        __syncthreads();

        constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
        // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE

        if (threadIdx.x < WARP_SIZE)
        {
            /* We can compute suffix sum of an array in groups of N numbers.
             * Let N be 4 for this example.
             *
             * 1) Last 4 numbers
             *                      1   2   3   4   |   5   6   7   8   |   9   10  11  12
             * group suffix sum:                                            42  33  23  12
             *
             * 2) Middle 4 numbers
             *                      1   2   3   4   |   5   6   7   8   |   9   10  11  12
             * group suffix sum:                    |   26  21  15  8   |
             *
             * We add `42` (first element in the previous group) to each element to get:
             *
             *                      1   2   3   4   |   5   6   7   8   |   9   10  11  12
             *                                      |   68  63  57  50  |   42  33  23  12
             * 3) First 4 numbers
             *
             *                      1   2   3   4   |   5   6   7   8   |   9   10  11  12
             * group suffix sum:    10  9   7   4   |
             *
             * We add `68` (first element in the previous group) to each element to get:
             *
             *                      1   2   3   4   |   5   6   7   8   |   9   10  11  12
             * group suffix sum:    78  77  75  72  |   68  63  57  50  |   42  33  23  12
             *
             * What we are left with now is the suffix sum of the entire array.
             *
             * We use the aforementioned logic in the code below but work in groups of `warpSize`.
             */

            /* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
             * Hence, we will need BINS / WARP_SIZE number of iterations.
             *
             * Each iteration uses shuffle instructions to exchange data between threads. Shuffle
             * instructions cannot be used in warp-divergent code. If the bins are a multiple of
             * the warpSize, all the threads in the warp will participate.
             */
            static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");
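            /* With the BINS = 2048 instantiation used below and a warp size of 32, the loop below
             * runs 64 iterations; each one computes the suffix sum of a 32-element group and carries
             * the running total into the group to its left.
             */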

            const int thread_id = threadIdx.x;
            const int inverse_lane_id = WARP_SIZE - thread_id - 1;

            int previous_group_first_element = 0;
            for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
            {
                const index_type idx = iter * WARP_SIZE + thread_id;
                auto value = bins[idx];

                for (int i = 1; i < WARP_SIZE; i *= 2)
                {
                    auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
                    if (inverse_lane_id >= i)
                        value += n;
                }

                value += previous_group_first_element;
                bins[idx] = value;

                previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
            }
        }

        if (threadIdx.x == 0)
            *count = 0;

        __syncthreads();

        for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
        {
            const float confidence = load_ldg(scores[i]);
            if (confidence > threshold)
            {
                using device::fast_divide_ftz;
                auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                int bin_index = conf_scaled * BINS;
                bin_index = clamp<int>(bin_index, 0, BINS - 1);

                /* This bounding box is selected only if it falls within the `classwise_topK` best
                 * boxes for this class. If it does, we have to compute the location in the `indices`
                 * array where it must be stored.
                 *
                 * Suppose we had just 4 bins and say the following were the counts:
                 * BIN0 2
                 * BIN1 1
                 * BIN2 3
                 * BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
                 *
                 * We will try our best to store the boxes in a sorted order in the `indices` array.
                 * This requires that the boxes in later bins (higher confidence scores) must be
                 * stored earlier.
                 *
                 * We compute the suffix sum of the array. This gives us:
                 * BIN0 6
                 * BIN1 4
                 * BIN2 3
                 * BIN3 0
                 *
                 * The bins now give us the location in the `indices` array from which the indices of the
                 * scores corresponding to that bin would be stored. We atomically increment the bin count
                 * every time we store a box corresponding to that bin. Therefore, the value in the bins
                 * gives the index in the `indices` array where the next box corresponding to that bin must
                 * be put.
                 */

                const index_type idx = atomicAdd(&bins[bin_index], 1);
                if (idx < classwise_topK)
                {
                    indices[idx] = i;
                    atomicAdd(&count[0], 1);
                }
            }
        }
    }

    template <class T>
    __global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
    {
        const index_type c = blockIdx.x;
        if (c == background_class_id)
            return;

        const index_type b = blockIdx.y;

        // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]

        const auto num_loc_classes = share_location ? 1 : num_classes;

        auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
        auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
        auto count = count_.data() + b * num_classes + c;

        const auto boxes = load_ldg(&count[0]);
        if (boxes == 0)
            return;

        using vector_type = get_vector_type_t<T, 4>;
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
        auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);

        for (auto i : block_stride_range<>(boxes))
        {
            const auto prior_id = indices[i];
            const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);

            vector_type box;
            v_load(box, decoded_bboxes_vPtr[idx]);
            v_store(collected_bboxes_vPtr[i], box);
        }
    }

    template <class T, bool NORMALIZED_BBOX>
    __global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
    {
        const index_type b = blockIdx.x / num_classes;
        const index_type c = blockIdx.x % num_classes;
        if (c == background_class_id)
            return;

        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]
        // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]

        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
        auto count = count_.data() + b * num_classes + c;
        auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;

        const auto boxes = count[0];
        if (boxes == 0)
            return;

        using vector_type = get_vector_type_t<T, 4>;
        auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);

        for (int i = 0; i < boxes; i++)
        {
            auto prior_id = indices[i];
            if (prior_id != -1)
            {
                BoundingBox bbox1;
                {
                    vector_type box;
                    v_load(box, collected_bboxes_vPtr[i]);

                    bbox1.xmin = box.data[0];
                    bbox1.ymin = box.data[1];
                    bbox1.xmax = box.data[2];
                    bbox1.ymax = box.data[3];
                }

                for (auto j : block_stride_range<>(i + 1, boxes))
                {
                    prior_id = indices[j];
                    if (prior_id == -1)
                        continue;

                    BoundingBox bbox2;
                    {
                        vector_type box;
                        v_load_ldg(box, collected_bboxes_vPtr[j]);

                        bbox2.xmin = box.data[0];
                        bbox2.ymin = box.data[1];
                        bbox2.xmax = box.data[2];
                        bbox2.ymax = box.data[3];
                    }

                    using device::min;
                    using device::max;

                    BoundingBox intersect_bbox;
                    intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
                    intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
                    intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
                    intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);

                    float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
                    float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
                    float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);

                    using device::fast_divide_ftz;
                    float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
                    if (iou > nms_threshold)
                        indices[j] = -1;
                }
            }

            __syncthreads();
        }

        if (threadIdx.x == 0)
            count[0] = 0;

        __syncthreads();

        for (auto i : block_stride_range<>(boxes))
        {
            auto prior_id = indices[i];
            if (prior_id != -1)
            {
                const index_type idx = atomicAdd(&count[0], 1);
                indices[idx] = prior_id;
            }
        }
    }

    template <class T, std::size_t BINS, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void nms_collect(
        Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
        size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
    {
        // sorting algorithm is documented in detail in findTopK kernel comments
        // no explanations are provided here

        // kept_indices: [batch_size, keepTopK]
        // kept_count: [batch_size]

        const auto b = blockIdx.x;

        __shared__ int bins[BINS];

        #pragma unroll
        for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
            bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;

        __syncthreads();

        for (int c = 0; c < num_classes; c++)
        {
            if (c == background_class_id)
                continue;

            // indices: [batch_size, num_classes, classwise_topK]
            // count: [batch_size, num_classes]
            // scores: [batch_size, num_classes, num_priors]

            const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
            const auto scores = scores_.data() + (b * num_classes + c) * num_priors;

            auto boxes = count[b * num_classes + c];

            for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
            {
                auto prior_id = indices[i];
                const float confidence = load_ldg(scores[prior_id]);
                if (confidence > threshold)
                {
                    using device::fast_divide_ftz;
                    auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                    using device::clamp;
                    int bin_index = conf_scaled * BINS;
                    bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one

                    if (bin_index >= 0)
                        atomicAdd(&bins[bin_index], 1);
                }
            }
        }

        __syncthreads();

        constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
        // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE

        if (threadIdx.x < WARP_SIZE)
        {
            static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");

            const int thread_id = threadIdx.x;
            const int inverse_lane_id = WARP_SIZE - thread_id - 1;

            int previous_group_first_element = 0;
            for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
            {
                const index_type idx = iter * WARP_SIZE + thread_id;
                auto value = bins[idx];

                for (int i = 1; i < WARP_SIZE; i *= 2)
                {
                    auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
                    if (inverse_lane_id >= i)
                        value += n;
                }

                value += previous_group_first_element;
                bins[idx] = value;

                previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
            }
        }

        if (threadIdx.x == 0)
            kept_count[b] = 0;

        __syncthreads();

        for (int c = 0; c < num_classes; c++)
        {
            if (c == background_class_id)
                continue;

            const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
            const auto scores = scores_.data() + (b * num_classes + c) * num_priors;

            auto boxes = count[b * num_classes + c];

            for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
            {
                auto prior_id = indices[i];
                const float confidence = load_ldg(scores[prior_id]);
                if (confidence > threshold)
                {
                    using device::fast_divide_ftz;
                    auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                    using device::clamp;
                    int bin_index = conf_scaled * BINS;
                    bin_index = clamp<int>(bin_index, 0, BINS - 1);

                    const index_type idx = atomicAdd(&bins[bin_index], 1);
                    if (idx < keepTopK)
                    {
                        kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
                        atomicAdd(&kept_count[b], 1);
                    }
                }
            }
        }
    }

    template <class T>
    __global__ void consolidate_detections(Span<T> output,
        View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
        size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
    {
        using vector_type = get_vector_type_t<T, 4>;
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());

        // output: [1, 1, batch_size * keepTopK, 7]
        // kept_indices: [batch_size, keepTopK]
        // kept_count: [batch_size]
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // scores: [batch_size, num_classes, num_priors]

        for (int b = 0; b < batch_size; b++)
        {
            for (auto i : grid_stride_range(kept_count[b]))
            {
                auto score_id = kept_indices[b * keepTopK + i];
                auto c = score_id / num_priors;
                auto prior_id = score_id % num_priors;
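                /* kept_indices stores the class and prior fused as c * num_priors + prior_id
                 * (see nms_collect), so the division and modulo above recover both. The same fused
                 * value is also the offset of the score within the batch item, which is why it can
                 * index `scores` directly below.
                 */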

                const auto confidence = scores[b * num_classes * num_priors + score_id];

                index_type bbox_id;
                if (share_location)
                {
                    // decoded_bboxes: [batch_size, num_priors, 1, 4]
                    bbox_id = b * num_priors + prior_id;
                }
                else
                {
                    // decoded_bboxes: [batch_size, num_priors, num_classes, 4]
                    bbox_id = (b * num_priors + prior_id) * num_classes + c;
                }

                vector_type bbox;
                v_load(bbox, decoded_bboxes_vPtr[bbox_id]);

                auto output_id = atomicAdd(num_detections.get(), 1);
                output[output_id * 7 + 0] = b;
                output[output_id * 7 + 1] = c;
                output[output_id * 7 + 2] = confidence;
                output[output_id * 7 + 3] = bbox.data[0];
                output[output_id * 7 + 4] = bbox.data[1];
                output[output_id * 7 + 5] = bbox.data[2];
                output[output_id * 7 + 6] = bbox.data[3];
            }
        }
    }
}

template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
    bool transpose_location, bool normalized_bbox,
    size_type num_loc_classes, index_type background_class_id,
    float clip_width, float clip_height)
{
    auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
    auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
    launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}

template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current == 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
    if(selector == 0)
        launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
}

template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current != 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
    if(selector == current)
        launch_decode_boxes_kernel<T,
                static_cast<bool>(current & 8),
                static_cast<bool>(current & 4),
                static_cast<bool>(current & 2),
                static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
    else
        dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
}

template <class T>
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
    std::size_t num_loc_classes,
    bool share_location, std::size_t background_class_id,
    bool transpose_location, bool variance_encoded_in_target,
    bool corner_true_or_center_false, bool normalized_bbox,
    bool clip_box, float clip_width, float clip_height)
{
    /* `config` packs the four boolean kernel template options into one number; a bit of TMP code
     * can then run through all possible combinations and instantiate the correct template.
     */
    unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
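    /* For example, share_location = true, variance_encoded_in_target = false,
     * corner_true_or_center_false = true and clip_box = false give config = 0b1010 = 10, which
     * dispatch_decode_bboxes matches against `current` and instantiates
     * launch_decode_boxes_kernel<T, true, false, true, false>.
     */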
    dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}

template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);

template <class T>
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
{
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // scores: [batch_size, num_classes, num_priors]

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(scores.get_axis_size(1) == num_classes);

    const auto classwise_topK = indices.get_axis_size(2);
    const auto num_priors = scores.get_axis_size(2);

    /* each block processes one class from each batch */
    constexpr auto BLOCK_SIZE = 256;

    dim3 grid_size(num_classes, batch_size);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
    launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
}

template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);

template <class T>
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
{
    // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
    // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]

    const auto batch_size = collected_bboxes.get_axis_size(0);
    CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
    CV_Assert(indices.get_axis_size(0) == batch_size);
    CV_Assert(count.get_axis_size(0) == batch_size);

    const auto num_classes = collected_bboxes.get_axis_size(1);
    CV_Assert(indices.get_axis_size(1) == num_classes);
    CV_Assert(count.get_axis_size(1) == num_classes);

    const auto classwise_topK = collected_bboxes.get_axis_size(2);
    CV_Assert(indices.get_axis_size(2) == classwise_topK);

    const auto num_priors = decoded_bboxes.get_axis_size(1);

    CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);

    constexpr int BLOCK_SIZE = 256;

    /* each block processes one class from each batch */
    dim3 grid_size(num_classes, batch_size);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::box_collect<T>;
    launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
}

template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);

template <class T>
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
    bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
{
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);

    const auto classwise_topK = indices.get_axis_size(2);
    CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);

    /* each block processes one class from each batch */
    auto num_blocks = batch_size * num_classes;
    auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);

    dim3 grid_size(num_blocks);
    dim3 block_size(num_threads);
    auto policy = execution_policy(grid_size, block_size, stream);

    if (normalized_bbox)
    {
        auto kernel = raw::blockwise_class_nms<T, true>;
        launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
    }
    else
    {
        auto kernel = raw::blockwise_class_nms<T, false>;
        launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
    }
}

template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);

template <class T>
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
    TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
{
    // kept_indices: [batch_size, keepTopK]
    // kept_count: [batch_size]

    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // scores: [batch_size, num_classes, num_priors]

    auto batch_size = kept_indices.get_axis_size(0);
    CV_Assert(kept_count.get_axis_size(0) == batch_size);
    CV_Assert(indices.get_axis_size(0) == batch_size);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    auto keepTopK = kept_indices.get_axis_size(1);

    auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(scores.get_axis_size(1) == num_classes);

    auto classwise_topK = indices.get_axis_size(2);
    auto num_priors = scores.get_axis_size(2);

    auto num_blocks = batch_size;
    constexpr int BLOCK_SIZE = 1024;

    dim3 grid_size(num_blocks);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
    launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
}

template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);

template <class T>
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
    TensorView<int> kept_indices, TensorView<int> kept_count,
    TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
{
    // output: [1, 1, batch_size * keepTopK, 7]
    // kept_indices: [batch_size, keepTopK]
    // kept_count: [batch_size]
    // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
    // scores: [batch_size, num_classes, num_priors]

    auto batch_size = kept_indices.get_axis_size(0);
    CV_Assert(kept_count.get_axis_size(0) == batch_size);
    CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    auto keepTopK = kept_indices.get_axis_size(1);

    auto num_classes = scores.get_axis_size(1);
    auto num_priors = scores.get_axis_size(2);

    CV_Assert(batch_size * keepTopK * 7 == output.size());

    auto kernel = raw::consolidate_detections<T>;
    auto policy = make_policy(kernel, keepTopK, 0, stream);
    launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
}

template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */