// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
    {
        // topK_gs is topK rounded upwards to a multiple of GROUP_SIZE (see getAlignedTopK below)

        // mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
        // bboxes: [batch_size, num_classes, topK, 4]
        // count: [batch_size, num_classes]

        const index_type c = blockIdx.y;
        const index_type b = blockIdx.z;

        if (c == background_class_id)
            return;

        auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
        auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
        auto count = count_.data() + b * num_classes + c;

        const auto boxes = *count;
        if (boxes == 0)
            return;

        /* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
        const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;
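        /* For example (illustrative numbers), boxes = 300 with BLOCK_SIZE = 128 gives num_groups = 3;
         * the last group is only partially filled with valid boxes.
         */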

        /* We need to calculate IOUs for every pair of boxes. We can generalize and say that
         * we need to compute IOUs of every group with every other group including itself.
         */
        // Each block processes a pair of groups.
        const index_type group_i = blockIdx.x % num_groups;
        const index_type group_j = blockIdx.x / num_groups;
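        /* For example, with num_groups = 3, blockIdx.x = 7 maps to group_i = 1 and group_j = 2. */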

        /* we use __syncthreads() later but note that the following condition will cause all threads
         * in the block to exit; hence, no thread will execute a divergent __syncthreads()
         */
        if (group_i >= num_groups || group_j >= num_groups)
            return;

        /* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
         * to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
         * each group has many boxes and we need IOUs amongst boxes within a group.
         *
         * We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
         * compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
         * early on and the working blocks are compute-heavy.
         */
        if (group_i > group_j)
            return;
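        /* For example, with num_groups = 3, only the blocks handling (group_i, group_j) in
         * {(0,0), (0,1), (0,2), (1,1), (1,2), (2,2)} proceed past this point; the other three exit here.
         */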

        /* the following variables contain the absolute box number of the first box of their respective groups */
        const auto group_i_offset = group_i * BLOCK_SIZE;
        const auto group_j_offset = group_j * BLOCK_SIZE;

        /* MAIN LOOP LOGIC:
         * We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
         * for each thread. The `j` maps exactly to the thread index; hence, `j` is a loop invariant. Each
         * thread of the block computes the overlap between box `i` and its box `j`.
         *
         * for (int i = 0; i < BLOCK_SIZE; i++)
         * {
         *    // i = box 1
         *    // j = threadIdx.x = box 2
         * }
         */

        /* The `j` box is fixed for each thread. All `i` boxes will be required by every thread.
         * We store the `i` boxes in shared memory to allow global memory coalescing.
         */
        using vector_type = get_vector_type_t<T, 4>;
        __shared__ vector_type group_i_boxes[BLOCK_SIZE];

        /* We will precompute the sizes of the `i` boxes in the code where we load them. The size computation
         * is distributed across the block. Otherwise, all threads would have to compute the size of the same
         * box simultaneously in the main loop. The size is computed while the memory subsystem is busy
         * servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
         */
        /* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
        __shared__ float group_i_size[BLOCK_SIZE];

        const auto bboxes_vPtr = vector_type::get_pointer(bboxes);

        // load `i` boxes and precompute their sizes
        {
            int i = threadIdx.x;
            if (group_i_offset + i < boxes)
            {
                vector_type box;
                v_load(box, bboxes_vPtr[group_i_offset + i]);
                v_store(group_i_boxes[i], box);

                BoundingBox bbox;
                bbox.xmin = box.data[0];
                bbox.ymin = box.data[1];
                bbox.xmax = box.data[2];
                bbox.ymax = box.data[3];

                group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
            }
        }

        __syncthreads();

        /* We compute the overlap between boxes and check whether the IOU exceeds the nms threshold.
         * We store the result (exceeds or is below nms_threshold) in a two-dimensional matrix.
         * (i, j) is set to one if the overlap between i and j is within the nms threshold.
         * We pack 32 results into one 32-bit integer. The effective memory layout of the
         * matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
         */
        __shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
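        /* With BLOCK_SIZE = 128, for instance, this is 128 rows of 4 words each (512 words, 2 KiB);
         * row `i` holds the 128 predicates of box `i` of group_i against the boxes of group_j.
         */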

        // load box `j` and precompute its size (fixed per thread)
        BoundingBox bbox_j;
        float bbox_j_size = 0;
        if (group_j_offset + threadIdx.x < boxes)
        {
            vector_type box;
            v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);

            bbox_j.xmin = box.data[0];
            bbox_j.ymin = box.data[1];
            bbox_j.xmax = box.data[2];
            bbox_j.ymax = box.data[3];

            bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
        }

        /* Each thread computes a predicate which is broadcast across the warp to obtain a 32-bit mask.
         * The lane zero thread of each warp saves the mask. We store the offset into the mask array beforehand
         * to save cycles in the compute-intensive main loop.
         */
        auto mask_offset = threadIdx.x / 32;
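        /* mask_offset starts at the warp index; the main loop advances it by BLOCK_SIZE / 32 per box `i`,
         * so when box `i` is processed, lane zero of warp `w` writes mask_shared[i * (BLOCK_SIZE / 32) + w].
         */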

        /* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
         * this loop has been highly tuned. Please profile and verify carefully before making changes.
         */
        /* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
         * the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
         * of instructions forcing warp synchronization.
         */
        constexpr int UNROLL_SIZE = 4;

        #pragma unroll 8
        for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
        {
            bool do_not_reject_j[UNROLL_SIZE];

            #pragma unroll
            for (int k = 0; k < UNROLL_SIZE; k++)
            {
                int i = s + k;

                /* The number of boxes need not be a multiple of BLOCK_SIZE.
                 * However, the shared memory allocated can hold BLOCK_SIZE boxes from
                 * each group. Accessing the undefined regions of shared memory is
                 * a valid memory operation as long as the memory has been allocated.
                 *
                 * The condition below is only required when one of the groups is not
                 * fully filled with valid boxes. Such situations are relatively rare; it's
                 * more common to see both groups completely filled.
                 *
                 * We comment out this condition to improve the performance of the common case.
                 * This leads to a net improvement.
                 */
                // if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
                {
                    BoundingBox bbox_i;
                    float bbox_i_size;
                    {
                        vector_type box;
                        v_load(box, group_i_boxes[i]);
                        bbox_i.xmin = box.data[0];
                        bbox_i.ymin = box.data[1];
                        bbox_i.xmax = box.data[2];
                        bbox_i.ymax = box.data[3];

                        bbox_i_size = group_i_size[i];
                    }

                    using device::min;
                    using device::max;

                    BoundingBox intersect_bbox;
                    intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
                    intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
                    intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
                    intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);

                    float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);

                    using device::fast_divide_ftz;
                    float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
                    do_not_reject_j[k] = iou <= nms_threshold;
                }
            }

            #pragma unroll
            for (int k = 0; k < UNROLL_SIZE; k++)
            {
                // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
                auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
                if (threadIdx.x % 32 == 0)
                    mask_shared[mask_offset] = predicate;

                /* The following operation should logically be inside the previous if branch. Note that `mask_offset`
                 * is only used by the lane zero threads. Hence, there is no harm in executing it in the other threads
                 * since the result is unused there.
                 *
                 * Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
                 * successive unrolled iterations. A register is used and instructions are emitted to multiply the
                 * addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
                 * addition on the address in successive unrolled iterations.
                 */
                mask_offset += BLOCK_SIZE / 32;
            }
        }

        __syncthreads();

        /* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
         * (i, j) is set to true if the overlap between `i` and `j` does not exceed the nms threshold,
         * i.e. box `j` is not suppressed by box `i`. We pack 32 results into one 32-bit integer,
         * so the effective memory layout is topK_gs * topK_gs / 32.
         */

        /* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
         * 32-bit integers per box `i`.
         */
        using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;

        const int i = threadIdx.x;

        auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
        mask_vector_type temp;
        v_load(temp, mask_shared_vPtr[i]);
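        /* __ballot_sync packs lane k's predicate into bit k (LSB first), but grid_nms_collect consumes the
         * mask MSB first (__clz and `0x80000000 >> bit`), so each word is bit-reversed before being stored:
         * after __brev, bit 31 corresponds to the lowest-numbered box of the 32-box span.
         */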
        for (int i = 0; i < mask_vector_type::size(); i++)
            temp.data[i] = __brev(temp.data[i]);

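        /* Row (group_i_offset + i) of the global mask begins at word (group_i_offset + i) * topK_gs / 32;
         * adding group_j_offset / 32 selects the BLOCK_SIZE / 32 words for this group_j. Dividing by
         * mask_vector_type::size() converts the word index into an index for the vectorized store below.
         */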
        auto mask_vPtr = mask_vector_type::get_pointer(mask);
        v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
    }

    template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
    {
        const index_type c = blockIdx.x;
        if (c == background_class_id)
            return;

        const index_type b = blockIdx.y;

        // topK_gs is topK rounded upwards to a multiple of GROUP_SIZE (see getAlignedTopK below)

        // indices: [batch_size, num_classes, topK]
        // count: [batch_size, num_classes]
        // mask: [batch_size, num_classes, topK_gs, topK_gs / 32]

        auto indices = indices_.data() + (b * num_classes + c) * topK;
        auto count = count_.data() + (b * num_classes + c);
        auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;

        const auto boxes = *count;
        if (boxes == 0)
            return;

        /* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
         * bits to store which boxes haven't been eliminated and which are still active. We organize
         * the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
         * is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
         * `num_rows` is the minimum number of rows required to cover all the boxes.
         *
         * Each thread handles a specific column in the matrix. To improve performance, we process
         * `ITEMS_PER_THREAD` number of elements per thread. This changes the shape to (num_rows,
         * ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
         */
        constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;

        const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
        const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
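        /* For example, boxes = 200 gives num_32b_masks = 7; with BLOCK_SIZE = 64 and ITEMS_PER_THREAD = 4
         * (ROW_WIDTH = 256), a single row covers all the mask words, so num_rows = 1.
         */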

        extern __shared__ unsigned int active_boxes[]; // the matrix described earlier

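        /* Initialize the bitmap: every valid box starts out active. Bits are MSB-first within each word
         * (matching the __brev in grid_nms); the last word masks off the slots beyond `boxes`.
         */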
        #pragma unroll 1
        for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
            active_boxes[idx] = (idx == num_32b_masks - 1 && boxes % 32 != 0) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;

        __syncthreads();

        using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
        auto mask_vPtr = vector_type::get_pointer(mask);
        auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));

        int index_temp;
        int thread0_count = 0;
        int thread_id = threadIdx.x;

        for (int step = 0; step < num_32b_masks; step++)
        {
            auto current_active = active_boxes[step];
            while (current_active)
            {
                const index_type bit = __clz(current_active);
                const index_type i = step * 32 + bit;
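                /* The candidate boxes are assumed to arrive sorted by decreasing confidence (as produced by
                 * the preceding top-K stage), so the lowest-indexed active box is the next box to keep; its
                 * mask row is then used to deactivate the remaining boxes that overlap it too much.
                 */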

                const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
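                /* i * topK_gs_by32 is the word index where row `i` of the mask begins; dividing by
                 * ITEMS_PER_THREAD turns it into an index into the vectorized mask pointer.
                 */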

                /* We fetch the index from memory and store it in a register. We will not use it until
                 * much later. This helps avoid a long scoreboard stall.
                 */
                if (thread_id == 0)
                    index_temp = indices[i];

                __syncthreads();

                if (threadIdx.x == 0)
                    active_boxes[step] = current_active ^ (0x80000000 >> bit);

                __syncthreads();

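                /* Only the vector chunks at or after the one containing the current `step` word are updated;
                 * earlier words have already been consumed by the outer loop and are never read again.
                 */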
                #pragma unroll 1
                for (int r = 0; r < num_rows; r++)
                {
                    const int idx = r * BLOCK_SIZE + thread_id;
                    if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
                    {
                        auto active_boxes_vec = shared_vPtr[idx];
                        auto mask_vec = mask_vPtr[mask_offset + idx];
                        for (int i = 0; i < vector_type::size(); i++)
                            active_boxes_vec.data[i] &= mask_vec.data[i];
                        shared_vPtr[idx] = active_boxes_vec;
                    }
                }

                __syncthreads();

                if (thread_id == 0)
                {
                    indices[thread0_count] = index_temp;
                    thread0_count++;
                }

                current_active = active_boxes[step];
            }
        }

        if (threadIdx.x == 0)
            *count = thread0_count;
    }
}

constexpr int GROUP_SIZE = 128;

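/* Rounds topK up to the next multiple of GROUP_SIZE. For example, with GROUP_SIZE = 128,
 * getAlignedTopK(200) returns 256 and getAlignedTopK(256) returns 256.
 */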
static std::size_t getAlignedTopK(std::size_t topK)
{
    auto remainder = topK % GROUP_SIZE;
    if (remainder == 0)
        return topK;
    return topK + (GROUP_SIZE - remainder);
}

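/* Returns the per-batch-item workspace size in bytes for the pairwise suppression masks:
 * one bit per (box, box) slot pair per class. For example, num_classes = 21 and classwise_topK = 200
 * give topK_gs = 256 and 21 * 256 * 256 / 32 * 4 = 172032 bytes (168 KiB) per batch item.
 */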
std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
{
    auto topK_gs = getAlignedTopK(classwise_topK);
    return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
}

template <class T>
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
{
    // workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
    // indices: [batch_size, num_classes, topK]
    // count: [batch_size, num_classes]
    // bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(bboxes.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(bboxes.get_axis_size(1) == num_classes);

    const auto topK = indices.get_axis_size(2);
    CV_Assert(bboxes.get_axis_size(2) == topK);

    CV_Assert(bboxes.get_axis_size(3) == 4);

    const auto topK_gs = getAlignedTopK(topK);
    CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);

    const auto boxes = topK;
    const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;

    {
        // grid = (num_groups * num_groups, num_classes, batch_size)
        // if the background class is the last class, we can reduce grid y dim by one
        auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;

        constexpr int BLOCK_SIZE = GROUP_SIZE;

        dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
        dim3 block_size(BLOCK_SIZE);
        auto policy = execution_policy(grid_size, block_size, stream);

        if (normalized_bbox)
        {
            auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
            launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
        }
        else
        {
            auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
            launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
        }
    }

    {
        // grid = (num_classes, batch_size)
        // if the background class is the last class, we can reduce grid x dim by one
        auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;

        constexpr int BLOCK_SIZE = 64;

        constexpr int ITEMS_PER_THREAD = 4;
        auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;

        dim3 grid_size(grid_num_classes, batch_size);

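        // dynamic shared memory for the active-box bitmap: topK_gs / 32 words of 4 bytes each
        // (e.g. 32 bytes for topK_gs = 256)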
        auto sharedMem = topK_gs / 32 * 4;
        auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
        launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
    }
}
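
/* Example usage (hypothetical shapes and values): with batch_size = 1, num_classes = 21 and
 * classwise_topK = 200, the caller allocates a workspace of at least
 * getGridNMSWorkspaceSizePerBatchItem(21, 200) bytes per batch item, fills `indices`, `count` and
 * `bboxes` in the preceding top-K stage, and might then call, for instance:
 *
 *     kernels::grid_nms<float>(stream, workspace_span, indices, count, bboxes, 0, true, 0.45f);
 *
 * where background_class_id = 0, normalized_bbox = true and nms_threshold = 0.45f are illustrative values.
 */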

std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);

template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */