// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX>
    __global__ void decode_bbox(Span<T> decoded_bboxes, View<T> locations, View<T> priors,
                                bool transpose_location, bool normalized_bbox,
                                size_type num_loc_classes, index_type background_class_id,
                                float clip_width, float clip_height)
    {
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // locations: [batch_size, num_priors, num_loc_classes, 4]
        // priors: [1, C, num_priors, 4]
        // C = 2 if !VARIANCE_ENCODED_IN_TARGET; otherwise, 1

        /* 4 bbox values + 4 variance values per prior */
        constexpr int PRIOR_BOX_SIZE = VARIANCE_ENCODED_IN_TARGET ? 4 : 8;
        const size_type num_priors = priors.size() / PRIOR_BOX_SIZE;

        using vector_type = get_vector_type_t<T, 4>;
        auto locations_vPtr = vector_type::get_pointer(locations.data());
        auto priors_vPtr = vector_type::get_pointer(priors.data());
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());

        const auto boxes_per_batch = num_priors * num_loc_classes;
        for (auto idx : grid_stride_range(decoded_bboxes.size() / 4))
        {
            index_type p;
            index_type c;

            if (SHARE_LOCATION)
            {
                // locations are shared across all classes => num_loc_classes = 1
                p = idx % boxes_per_batch;
                c = 0;
            }
            else
            {
                p = (idx % boxes_per_batch) / num_loc_classes;
                c = idx % num_loc_classes;
            }

            if (!SHARE_LOCATION && c == background_class_id)
                continue;

            BoundingBox bbox;
            {
                vector_type location;
                v_load(location, locations_vPtr[idx]);

                if (transpose_location)
                {
                    bbox.ymin = location.data[0];
                    bbox.xmin = location.data[1];
                    bbox.ymax = location.data[2];
                    bbox.xmax = location.data[3];
                }
                else
                {
                    bbox.xmin = location.data[0];
                    bbox.ymin = location.data[1];
                    bbox.xmax = location.data[2];
                    bbox.ymax = location.data[3];
                }
            }

            if (!VARIANCE_ENCODED_IN_TARGET)
            {
                vector_type prior_variance;
                v_load_ldg(prior_variance, priors_vPtr[num_priors + p]);

                bbox.xmin *= static_cast<float>(prior_variance.data[0]);
                bbox.ymin *= static_cast<float>(prior_variance.data[1]);
                bbox.xmax *= static_cast<float>(prior_variance.data[2]);
                bbox.ymax *= static_cast<float>(prior_variance.data[3]);
            }

            BoundingBox prior;
            {
                vector_type prior_box;
                v_load_ldg(prior_box, priors_vPtr[p]);

                prior.xmin = prior_box.data[0];
                prior.ymin = prior_box.data[1];
                prior.xmax = prior_box.data[2];
                prior.ymax = prior_box.data[3];
            }

            BoundingBox decoded_bbox;
            if (CORNER_TRUE_CENTER_FALSE)
            {
                decoded_bbox.xmin = prior.xmin + bbox.xmin;
                decoded_bbox.ymin = prior.ymin + bbox.ymin;
                decoded_bbox.xmax = prior.xmax + bbox.xmax;
                decoded_bbox.ymax = prior.ymax + bbox.ymax;
            }
            else
            {
                auto prior_width = prior.xmax - prior.xmin;
                auto prior_height = prior.ymax - prior.ymin;
                if (!normalized_bbox)
                {
                    prior_width += 1;
                    prior_height += 1;
                }

                auto prior_center_x = prior.xmin + prior_width * 0.5f;
                auto prior_center_y = prior.ymin + prior_height * 0.5f;

                auto decode_bbox_center_x = bbox.xmin * prior_width + prior_center_x;
                auto decode_bbox_center_y = bbox.ymin * prior_height + prior_center_y;

                using device::exp;
                float decode_bbox_width = exp(bbox.xmax) * prior_width;
                float decode_bbox_height = exp(bbox.ymax) * prior_height;

                decoded_bbox.xmin = decode_bbox_center_x - decode_bbox_width * 0.5f;
                decoded_bbox.ymin = decode_bbox_center_y - decode_bbox_height * 0.5f;
                decoded_bbox.xmax = decode_bbox_center_x + decode_bbox_width * 0.5f;
                decoded_bbox.ymax = decode_bbox_center_y + decode_bbox_height * 0.5f;
            }
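            /* The branch above is the usual SSD-style "CENTER_SIZE" decoding (a sketch of the math,
             * with the variance factors already folded into `bbox` when !VARIANCE_ENCODED_IN_TARGET):
             *     cx = loc_x * prior_w + prior_cx        w = exp(loc_w) * prior_w
             *     cy = loc_y * prior_h + prior_cy        h = exp(loc_h) * prior_h
             * followed by the conversion from center/size back to corner coordinates.
             */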

            vector_type decoded_bbox_vec;
            if (CLIP_BBOX)
            {
                decoded_bbox_vec.data[0] = clamp(decoded_bbox.xmin, 0.0f, clip_width);
                decoded_bbox_vec.data[1] = clamp(decoded_bbox.ymin, 0.0f, clip_height);
                decoded_bbox_vec.data[2] = clamp(decoded_bbox.xmax, 0.0f, clip_width);
                decoded_bbox_vec.data[3] = clamp(decoded_bbox.ymax, 0.0f, clip_height);
            }
            else
            {
                decoded_bbox_vec.data[0] = decoded_bbox.xmin;
                decoded_bbox_vec.data[1] = decoded_bbox.ymin;
                decoded_bbox_vec.data[2] = decoded_bbox.xmax;
                decoded_bbox_vec.data[3] = decoded_bbox.ymax;
            }

            v_store(decoded_bboxes_vPtr[idx], decoded_bbox_vec);
        }
    }

    template <class T, int BINS, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void findTopK(Span<int> indices_, Span<int> count_, View<T> scores_, float threshold, size_type classwise_topK, size_type num_classes, size_type num_priors, index_type background_class_id)
    {
        /* We need to sort boxes based on their confidence scores. The confidence scores fall in
         * the range [0.0, 1.0]. We break the range into bins and perform count sort. This is an
         * approximate algorithm.
         *
         * Each block handles a particular class of a particular batch item.
         */
        const auto c = blockIdx.x;
        const auto b = blockIdx.y;

        if (c == background_class_id)
            return;

        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]
        // scores: [batch_size, num_classes, num_priors]

        auto count = count_.data() + b * num_classes + c;
        auto scores = scores_.data() + (b * num_classes + c) * num_priors;
        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;

        /* We do not require a large number of bins to find the top K confidence scores. We will use
         * a reasonable number of bins which will fit in the shared memory.
         *
         * Note that smaller scores will have a smaller index, i.e. the `bins` are ordered in
         * ascending order.
         */

        __shared__ int bins[BINS];

        #pragma unroll
        for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
            bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;

        __syncthreads();

        for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
        {
            const float confidence = load_ldg(scores[i]);
            if (confidence > threshold)
            {
                using device::fast_divide_ftz;
                auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                using device::clamp;
                int bin_index = conf_scaled * BINS;

                /* We store counts of confidence scores in the bins. Our ultimate goal is to store the indices
                 * of the `classwise_topK` confidence values in the `indices` array.
                 *
                 * We use a little trick to parallelize the process of filling up the `indices` array.
                 * We want every thread in the block to participate in the process. To do so, we want the
                 * bins array to be shifted by one place to the left. We will be computing the suffix sum
                 * of the bins array later. Details and reasons for doing so will be explained later.
                 */
                bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one

                if (bin_index >= 0)
                    atomicAdd(&bins[bin_index], 1);
            }
        }
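
        /* Worked example of the binning above: with threshold = 0.2 and BINS = 2048, a confidence
         * of 0.6 gives conf_scaled = (0.6 - 0.2) / (1 - 0.2) = 0.5, hence bin_index = 1024, and
         * after the left shift the thread increments bins[1023]. A confidence just above the
         * threshold maps to bin 0 and is dropped from the histogram by the shift; it can still be
         * stored in the second pass below if there is room left in `indices`.
         */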

        __syncthreads();

        constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
        // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE

        if (threadIdx.x < WARP_SIZE)
        {
            /* We can compute suffix sum of an array in groups of N numbers.
             * Let N be 4 for this example.
             *
             * 1) Last 4 numbers
             *                      1   2   3   4  |  5   6   7   8  |  9   10  11  12
             * group suffix sum:                                        42  33  23  12
             *
             * 2) Middle 4 numbers
             *                      1   2   3   4  |  5   6   7   8  |  9   10  11  12
             * group suffix sum:                   |  26  21  15  8  |
             *
             * We add `42` (first element in the previous group) to each element to get:
             *
             *                      1   2   3   4  |  5   6   7   8  |  9   10  11  12
             *                                     |  68  63  57  50 |  42  33  23  12
             * 3) First 4 numbers
             *
             *                      1   2   3   4  |  5   6   7   8  |  9   10  11  12
             * group suffix sum:    10  9   7   4  |
             *
             * We add `68` (first element in the previous group) to each element to get:
             *
             *                      1   2   3   4  |  5   6   7   8  |  9   10  11  12
             * group suffix sum:    78  77  75  72 |  68  63  57  50 |  42  33  23  12
             *
             * What we are left with now is the suffix sum of the entire array.
             *
             * We use the aforementioned logic in the code below but work in groups of `warpSize`.
             */

            /* We calculate suffix sums WARP_SIZE elements at a time starting from the right end.
             * Hence, we will need BINS / WARP_SIZE number of iterations.
             *
             * Each iteration uses shuffle instructions to exchange data between threads. Shuffle
             * instructions cannot be used in warp-divergent code. If the bins are a multiple of
             * the warpSize, all the threads in the warp will participate.
             */
            static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");

            const int thread_id = threadIdx.x;
            const int inverse_lane_id = WARP_SIZE - thread_id - 1;

            int previous_group_first_element = 0;
            for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
            {
                const index_type idx = iter * WARP_SIZE + thread_id;
                auto value = bins[idx];

                for (int i = 1; i < WARP_SIZE; i *= 2)
                {
                    auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
                    if (inverse_lane_id >= i)
                        value += n;
                }

                value += previous_group_first_element;
                bins[idx] = value;

                previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
            }
        }

        if (threadIdx.x == 0)
            *count = 0;

        __syncthreads();

        for (auto i : block_stride_range<BLOCK_SIZE>(num_priors))
        {
            const float confidence = load_ldg(scores[i]);
            if (confidence > threshold)
            {
                using device::fast_divide_ftz;
                auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                int bin_index = conf_scaled * BINS;
                bin_index = clamp<int>(bin_index, 0, BINS - 1);

                /* This bounding box is eligible for selection unless it falls outside the
                 * `classwise_topK` highest-scoring boxes for this class. If it is selected, we need
                 * to compute the location in the `indices` array where it must be stored.
                 *
                 * Suppose we had just 4 bins and say the following were the counts:
                 * BIN0 2
                 * BIN1 1
                 * BIN2 3
                 * BIN3 0 (last bin is always zero as we shift left by one while populating the bins)
                 *
                 * We will try our best to store the boxes in a sorted order in the `indices` array.
                 * This requires that the boxes in later bins (higher confidence scores) must be
                 * stored earlier.
                 *
                 * We compute the suffix sum of the array. This gives us:
                 * BIN0 6
                 * BIN1 4
                 * BIN2 3
                 * BIN3 0
                 *
                 * The bins now give us the location in the `indices` array starting from which the
                 * indices of the scores corresponding to that bin will be stored. We atomically
                 * increment the bin count every time we store a box corresponding to that bin.
                 * Therefore, the value in the bins gives the index in the `indices` array where the
                 * next box corresponding to that bin must be put.
                 */

                const index_type idx = atomicAdd(&bins[bin_index], 1);
                if (idx < classwise_topK)
                {
                    indices[idx] = i;
                    atomicAdd(&count[0], 1);
                }
            }
        }
    }

    template <class T>
    __global__ void box_collect(Span<T> collected_bboxes_, View<T> decoded_bboxes_, View<int> indices_, View<int> count_, bool share_location, size_type num_priors, size_type num_classes, size_type classwise_topK, index_type background_class_id)
    {
        const index_type c = blockIdx.x;
        if (c == background_class_id)
            return;

        const index_type b = blockIdx.y;

        // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]

        const auto num_loc_classes = share_location ? 1 : num_classes;

        auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;
        auto decoded_bboxes = decoded_bboxes_.data() + b * num_priors * num_loc_classes * 4;
        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
        auto count = count_.data() + b * num_classes + c;

        const auto boxes = load_ldg(&count[0]);
        if (boxes == 0)
            return;

        using vector_type = get_vector_type_t<T, 4>;
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes);
        auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);

        for (auto i : block_stride_range<>(boxes))
        {
            const auto prior_id = indices[i];
            const index_type idx = share_location ? prior_id : (prior_id * num_classes + c);

            vector_type box;
            v_load(box, decoded_bboxes_vPtr[idx]);
            v_store(collected_bboxes_vPtr[i], box);
        }
    }

    template <class T, bool NORMALIZED_BBOX>
    __global__ void blockwise_class_nms(Span<int> indices_, Span<int> count_, View<T> collected_bboxes_, size_type num_classes, size_type classwise_topK, index_type background_class_id, float nms_threshold)
    {
        const index_type b = blockIdx.x / num_classes;
        const index_type c = blockIdx.x % num_classes;
        if (c == background_class_id)
            return;

        // indices: [batch_size, num_classes, classwise_topK]
        // count: [batch_size, num_classes]
        // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]

        auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
        auto count = count_.data() + b * num_classes + c;
        auto collected_bboxes = collected_bboxes_.data() + (b * num_classes + c) * classwise_topK * 4;

        const auto boxes = count[0];
        if (boxes == 0)
            return;

        using vector_type = get_vector_type_t<T, 4>;
        auto collected_bboxes_vPtr = vector_type::get_pointer(collected_bboxes);

        for (int i = 0; i < boxes; i++)
        {
            auto prior_id = indices[i];
            if (prior_id != -1)
            {
                BoundingBox bbox1;
                {
                    vector_type box;
                    v_load(box, collected_bboxes_vPtr[i]);

                    bbox1.xmin = box.data[0];
                    bbox1.ymin = box.data[1];
                    bbox1.xmax = box.data[2];
                    bbox1.ymax = box.data[3];
                }

                for (auto j : block_stride_range<>(i + 1, boxes))
                {
                    prior_id = indices[j];
                    if (prior_id == -1)
                        continue;

                    BoundingBox bbox2;
                    {
                        vector_type box;
                        v_load_ldg(box, collected_bboxes_vPtr[j]);

                        bbox2.xmin = box.data[0];
                        bbox2.ymin = box.data[1];
                        bbox2.xmax = box.data[2];
                        bbox2.ymax = box.data[3];
                    }

                    using device::min;
                    using device::max;

                    BoundingBox intersect_bbox;
                    intersect_bbox.xmin = max(bbox1.xmin, bbox2.xmin);
                    intersect_bbox.ymin = max(bbox1.ymin, bbox2.ymin);
                    intersect_bbox.xmax = min(bbox1.xmax, bbox2.xmax);
                    intersect_bbox.ymax = min(bbox1.ymax, bbox2.ymax);

                    float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);
                    float bbox1_size = compute_bbox_size<NORMALIZED_BBOX>(bbox1);
                    float bbox2_size = compute_bbox_size<NORMALIZED_BBOX>(bbox2);

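                    /* Overlap criterion: IoU (Jaccard index) of the two candidate boxes,
                     *     IoU = area(A ∩ B) / (area(A) + area(B) - area(A ∩ B))
                     * where compute_bbox_size is expected to return zero for an empty intersection.
                     */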
                    using device::fast_divide_ftz;
                    float iou = fast_divide_ftz(intersect_size, bbox1_size + bbox2_size - intersect_size);
                    if (iou > nms_threshold)
                        indices[j] = -1;
                }
            }

            __syncthreads();
        }

        if (threadIdx.x == 0)
            count[0] = 0;

        __syncthreads();

        for (auto i : block_stride_range<>(boxes))
        {
            auto prior_id = indices[i];
            if (prior_id != -1)
            {
                const index_type idx = atomicAdd(&count[0], 1);
                indices[idx] = prior_id;
            }
        }
    }

    template <class T, std::size_t BINS, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void nms_collect(
        Span<int> kept_indices, Span<int> kept_count, View<int> indices_, View<int> count, View<T> scores_, float threshold,
        size_type num_classes, size_type num_priors, size_type classwise_topK, size_type keepTopK, index_type background_class_id)
    {
        // sorting algorithm is documented in detail in findTopK kernel comments
        // no explanations are provided here

        // kept_indices: [batch_size, keepTopK]
        // kept_count: [batch_size]

        const auto b = blockIdx.x;

        __shared__ int bins[BINS];

        #pragma unroll
        for (int unroll = 0; unroll < BINS / BLOCK_SIZE; unroll++)
            bins[unroll * BLOCK_SIZE + threadIdx.x] = 0;

        __syncthreads();

        for (int c = 0; c < num_classes; c++)
        {
            if (c == background_class_id)
                continue;

            // indices: [batch_size, num_classes, classwise_topK]
            // count: [batch_size, num_classes]
            // scores: [batch_size, num_classes, num_priors]

            const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
            const auto scores = scores_.data() + (b * num_classes + c) * num_priors;

            auto boxes = count[b * num_classes + c];

            for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
            {
                auto prior_id = indices[i];
                const float confidence = load_ldg(scores[prior_id]);
                if (confidence > threshold)
                {
                    using device::fast_divide_ftz;
                    auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                    using device::clamp;
                    int bin_index = conf_scaled * BINS;
                    bin_index = clamp<int>(bin_index, 0, BINS - 1) - 1; // shift left by one

                    if (bin_index >= 0)
                        atomicAdd(&bins[bin_index], 1);
                }
            }
        }

        __syncthreads();

        constexpr int WARP_SIZE = 32; /* must be equal to warpSize */
        // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE

        if (threadIdx.x < WARP_SIZE)
        {
            static_assert(BINS % WARP_SIZE == 0, "number of bins must be a multiple of warp size");

            const int thread_id = threadIdx.x;
            const int inverse_lane_id = WARP_SIZE - thread_id - 1;

            int previous_group_first_element = 0;
            for (int iter = BINS / WARP_SIZE - 1; iter >= 0; iter--)
            {
                const index_type idx = iter * WARP_SIZE + thread_id;
                auto value = bins[idx];

                for (int i = 1; i < WARP_SIZE; i *= 2)
                {
                    auto n = __shfl_down_sync(0xFFFFFFFF, value, i);
                    if (inverse_lane_id >= i)
                        value += n;
                }

                value += previous_group_first_element;
                bins[idx] = value;

                previous_group_first_element = __shfl_sync(0xFFFFFFFF, value, 0);
            }
        }

        if (threadIdx.x == 0)
            kept_count[b] = 0;

        __syncthreads();

        for (int c = 0; c < num_classes; c++)
        {
            if (c == background_class_id)
                continue;

            const auto indices = indices_.data() + (b * num_classes + c) * classwise_topK;
            const auto scores = scores_.data() + (b * num_classes + c) * num_priors;

            auto boxes = count[b * num_classes + c];

            for (auto i : block_stride_range<BLOCK_SIZE>(boxes))
            {
                auto prior_id = indices[i];
                const float confidence = load_ldg(scores[prior_id]);
                if (confidence > threshold)
                {
                    using device::fast_divide_ftz;
                    auto conf_scaled = fast_divide_ftz(confidence - threshold, 1 - threshold);

                    using device::clamp;
                    int bin_index = conf_scaled * BINS;
                    bin_index = clamp<int>(bin_index, 0, BINS - 1);

                    const index_type idx = atomicAdd(&bins[bin_index], 1);
                    if (idx < keepTopK)
                    {
                        kept_indices[b * keepTopK + idx] = c * num_priors + prior_id;
                        atomicAdd(&kept_count[b], 1);
                    }
                }
            }
        }
    }

    template <class T>
    __global__ void consolidate_detections(Span<T> output,
        View<int> kept_indices, View<int> kept_count, View<T> decoded_bboxes, View<T> scores, bool share_location,
        size_type batch_size, size_type num_classes, size_type num_priors, size_type keepTopK, DevicePtr<int> num_detections)
    {
        using vector_type = get_vector_type_t<T, 4>;
        auto decoded_bboxes_vPtr = vector_type::get_pointer(decoded_bboxes.data());

        // output: [1, 1, batch_size * keepTopK, 7]
        // kept_indices: [batch_size, keepTopK]
        // kept_count: [batch_size]
        // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
        // scores: [batch_size, num_classes, num_priors]

        for (int b = 0; b < batch_size; b++)
        {
            for (auto i : grid_stride_range(kept_count[b]))
            {
                auto score_id = kept_indices[b * keepTopK + i];
                auto c = score_id / num_priors;
                auto prior_id = score_id % num_priors;

                const auto confidence = scores[b * num_classes * num_priors + score_id];

                index_type bbox_id;
                if (share_location)
                {
                    // decoded_bboxes: [batch_size, num_priors, 1, 4]
                    bbox_id = b * num_priors + prior_id;
                }
                else
                {
                    // decoded_bboxes: [batch_size, num_priors, num_classes, 4]
                    bbox_id = (b * num_priors + prior_id) * num_classes + c;
                }

                vector_type bbox;
                v_load(bbox, decoded_bboxes_vPtr[bbox_id]);

                auto output_id = atomicAdd(num_detections.get(), 1);
                output[output_id * 7 + 0] = b;
                output[output_id * 7 + 1] = c;
                output[output_id * 7 + 2] = confidence;
                output[output_id * 7 + 3] = bbox.data[0];
                output[output_id * 7 + 4] = bbox.data[1];
                output[output_id * 7 + 5] = bbox.data[2];
                output[output_id * 7 + 6] = bbox.data[3];
            }
        }
    }
}

template <class T, bool SHARE_LOCATION, bool VARIANCE_ENCODED_IN_TARGET, bool CORNER_TRUE_CENTER_FALSE, bool CLIP_BBOX> static
void launch_decode_boxes_kernel(const Stream& stream, Span<T> decoded_bboxes, View<T> locations, View<T> priors,
                                bool transpose_location, bool normalized_bbox,
                                size_type num_loc_classes, index_type background_class_id,
                                float clip_width, float clip_height)
{
    auto kernel = raw::decode_bbox<T, SHARE_LOCATION, VARIANCE_ENCODED_IN_TARGET, CORNER_TRUE_CENTER_FALSE, CLIP_BBOX>;
    auto policy = make_policy(kernel, decoded_bboxes.size() / 4, 0, stream);
    launch_kernel(kernel, policy, decoded_bboxes, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}

template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current == 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
    if (selector == 0)
        launch_decode_boxes_kernel<T, 0, 0, 0, 0>(std::forward<Args>(args)...);
}

template <class T, unsigned int current, class ...Args> static
typename std::enable_if<current != 0, void>
::type dispatch_decode_bboxes(int selector, Args&& ...args) {
    if (selector == current)
        launch_decode_boxes_kernel<T,
            static_cast<bool>(current & 8),
            static_cast<bool>(current & 4),
            static_cast<bool>(current & 2),
            static_cast<bool>(current & 1)>(std::forward<Args>(args)...);
    else
        dispatch_decode_bboxes<T, current - 1, Args...>(selector, std::forward<Args>(args)...);
}

template <class T>
void decode_bboxes(const Stream& stream, Span<T> output, View<T> locations, View<T> priors,
                   std::size_t num_loc_classes,
                   bool share_location, std::size_t background_class_id,
                   bool transpose_location, bool variance_encoded_in_target,
                   bool corner_true_or_center_false, bool normalized_bbox,
                   bool clip_box, float clip_width, float clip_height)
{
    /* `config` combines the four boolean kernel template options into one number using which a bit
     * of TMP code can run through all possible combinations and instantiate the correct template
     */
    unsigned int config = (share_location << 3 | variance_encoded_in_target << 2 | corner_true_or_center_false << 1 | clip_box);
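    /* For example, share_location = true, variance_encoded_in_target = false,
     * corner_true_or_center_false = true and clip_box = false give
     * config = (1 << 3) | (0 << 2) | (1 << 1) | 0 = 10, so the dispatcher below instantiates
     * launch_decode_boxes_kernel<T, true, false, true, false>.
     */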
    dispatch_decode_bboxes<T, 15>(config, stream, output, locations, priors, transpose_location, normalized_bbox, num_loc_classes, background_class_id, clip_width, clip_height);
}

template void decode_bboxes(const Stream&, Span<__half>, View<__half>, View<__half>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);
template void decode_bboxes(const Stream&, Span<float>, View<float>, View<float>, std::size_t, bool, std::size_t, bool, bool, bool, bool, bool, float, float);

template <class T>
void findTopK(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> scores, std::size_t background_class_id, float threshold)
{
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // scores: [batch_size, num_classes, num_priors]

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(scores.get_axis_size(1) == num_classes);

    const auto classwise_topK = indices.get_axis_size(2);
    const auto num_priors = scores.get_axis_size(2);

    /* each block processes one class from each batch */
    constexpr auto BLOCK_SIZE = 256;

    dim3 grid_size(num_classes, batch_size);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::findTopK<T, 2048, BLOCK_SIZE>;
    launch_kernel(kernel, policy, indices, count, scores, threshold, classwise_topK, num_classes, num_priors, background_class_id);
}

template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, std::size_t, float);
template void findTopK(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, std::size_t, float);

template <class T>
void box_collect(const Stream& stream, TensorSpan<T> collected_bboxes, TensorView<T> decoded_bboxes, TensorView<int> indices, TensorView<int> count, bool share_location, std::size_t background_class_id)
{
    // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]
    // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]

    const auto batch_size = collected_bboxes.get_axis_size(0);
    CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
    CV_Assert(indices.get_axis_size(0) == batch_size);
    CV_Assert(count.get_axis_size(0) == batch_size);

    const auto num_classes = collected_bboxes.get_axis_size(1);
    CV_Assert(indices.get_axis_size(1) == num_classes);
    CV_Assert(count.get_axis_size(1) == num_classes);

    const auto classwise_topK = collected_bboxes.get_axis_size(2);
    CV_Assert(indices.get_axis_size(2) == classwise_topK);

    const auto num_priors = decoded_bboxes.get_axis_size(1);

    CV_Assert(!share_location || decoded_bboxes.get_axis_size(2) == 1);

    constexpr int BLOCK_SIZE = 256;

    /* each block processes one class from each batch */
    dim3 grid_size(num_classes, batch_size);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::box_collect<T>;
    launch_kernel(kernel, policy, collected_bboxes, decoded_bboxes, indices, count, share_location, num_priors, num_classes, classwise_topK, background_class_id);
}

template void box_collect(const Stream&, TensorSpan<float>, TensorView<float>, TensorView<int>, TensorView<int>, bool, std::size_t);
template void box_collect(const Stream&, TensorSpan<__half>, TensorView<__half>, TensorView<int>, TensorView<int>, bool, std::size_t);

template <class T>
void blockwise_class_nms(const Stream& stream, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> collected_bboxes,
                         bool normalized_bbox, std::size_t background_class_id, float nms_threshold)
{
    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // collected_bboxes: [batch_size, num_classes, classwise_topK, 4]

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(collected_bboxes.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(collected_bboxes.get_axis_size(1) == num_classes);

    const auto classwise_topK = indices.get_axis_size(2);
    CV_Assert(collected_bboxes.get_axis_size(2) == classwise_topK);

    /* each block processes one class from each batch */
    auto num_blocks = batch_size * num_classes;
    auto num_threads = std::max<std::size_t>(std::min<std::size_t>(1024, classwise_topK), 32);
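    /* e.g. classwise_topK = 200 yields 200 threads per block; the thread count is clamped to the
     * range [32, 1024] (at least one full warp, at most the CUDA block-size limit).
     */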

    dim3 grid_size(num_blocks);
    dim3 block_size(num_threads);
    auto policy = execution_policy(grid_size, block_size, stream);

    if (normalized_bbox)
    {
        auto kernel = raw::blockwise_class_nms<T, true>;
        launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
    }
    else
    {
        auto kernel = raw::blockwise_class_nms<T, false>;
        launch_kernel(kernel, policy, indices, count, collected_bboxes, num_classes, classwise_topK, background_class_id, nms_threshold);
    }
}

template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<__half>, bool, std::size_t, float);
template void blockwise_class_nms(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<float>, bool, std::size_t, float);

template <class T>
void nms_collect(const Stream& stream, TensorSpan<int> kept_indices, TensorSpan<int> kept_count,
                 TensorView<int> indices, TensorView<int> count, TensorView<T> scores, float threshold, std::size_t background_class_id)
{
    // kept_indices: [batch_size, keepTopK]
    // kept_count: [batch_size]

    // indices: [batch_size, num_classes, classwise_topK]
    // count: [batch_size, num_classes]
    // scores: [batch_size, num_classes, num_priors]

    auto batch_size = kept_indices.get_axis_size(0);
    CV_Assert(kept_count.get_axis_size(0) == batch_size);
    CV_Assert(indices.get_axis_size(0) == batch_size);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    auto keepTopK = kept_indices.get_axis_size(1);

    auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(scores.get_axis_size(1) == num_classes);

    auto classwise_topK = indices.get_axis_size(2);
    auto num_priors = scores.get_axis_size(2);

    auto num_blocks = batch_size;
    constexpr int BLOCK_SIZE = 1024;

    dim3 grid_size(num_blocks);
    dim3 block_size(BLOCK_SIZE);
    auto policy = execution_policy(grid_size, block_size, stream);

    auto kernel = raw::nms_collect<T, 1024, BLOCK_SIZE>;
    launch_kernel(kernel, policy, kept_indices, kept_count, indices, count, scores, threshold, num_classes, num_priors, classwise_topK, keepTopK, background_class_id);
}

template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<__half>, float, std::size_t);
template void nms_collect(const Stream&, TensorSpan<int>, TensorSpan<int>, TensorView<int>, TensorView<int>, TensorView<float>, float, std::size_t);

template <class T>
void consolidate_detections(const Stream& stream, TensorSpan<T> output,
                            TensorView<int> kept_indices, TensorView<int> kept_count,
                            TensorView<T> decoded_bboxes, TensorView<T> scores, bool share_location, DevicePtr<int> num_detections)
{
    // output: [1, 1, batch_size * keepTopK, 7]
    // kept_indices: [batch_size, keepTopK]
    // kept_count: [batch_size]
    // decoded_bboxes: [batch_size, num_priors, num_loc_classes, 4]
    // scores: [batch_size, num_classes, num_priors]

    auto batch_size = kept_indices.get_axis_size(0);
    CV_Assert(kept_count.get_axis_size(0) == batch_size);
    CV_Assert(decoded_bboxes.get_axis_size(0) == batch_size);
    CV_Assert(scores.get_axis_size(0) == batch_size);

    auto keepTopK = kept_indices.get_axis_size(1);

    auto num_classes = scores.get_axis_size(1);
    auto num_priors = scores.get_axis_size(2);

    CV_Assert(batch_size * keepTopK * 7 == output.size());

    auto kernel = raw::consolidate_detections<T>;
    auto policy = make_policy(kernel, keepTopK, 0, stream);
    launch_kernel(kernel, policy, output, kept_indices, kept_count, decoded_bboxes, scores, share_location, batch_size, num_classes, num_priors, keepTopK, num_detections);
}

template void consolidate_detections(const Stream&, TensorSpan<__half>, TensorView<int>, TensorView<int>, TensorView<__half>, TensorView<__half>, bool, DevicePtr<int>);
template void consolidate_detections(const Stream&, TensorSpan<float>, TensorView<int>, TensorView<int>, TensorView<float>, TensorView<float>, bool, DevicePtr<int>);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */