// This file is part of OpenCV project.
// It is subject to the license terms in the LICENSE file found in the top-level directory
// of this distribution and at http://opencv.org/license.html.

#include <cuda_runtime.h>
#include <cuda_fp16.h>

#include "math.hpp"
#include "bbox_utils.hpp"
#include "grid_stride_range.hpp"
#include "block_stride_range.hpp"
#include "execution.hpp"
#include "vector_traits.hpp"
#include "memory.hpp"

#include "../cuda4dnn/csl/stream.hpp"
#include "../cuda4dnn/csl/span.hpp"
#include "../cuda4dnn/csl/tensor.hpp"

using namespace cv::dnn::cuda4dnn::csl;
using namespace cv::dnn::cuda4dnn::csl::device;

namespace cv { namespace dnn { namespace cuda4dnn { namespace kernels {

namespace raw {

    template <class T, bool NORMALIZED_BBOX, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void grid_nms(Span<unsigned int> mask_, Span<int> count_, View<T> bboxes_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs, float nms_threshold)
    {
        // topK_gs is topK rounded up to a multiple of the group size (see getAlignedTopK on the host side)

        // mask: [batch_size, num_classes, topK_gs, topK_gs / 32]
        // bboxes: [batch_size, num_classes, topK, 4]
        // count: [batch_size, num_classes]
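        /* Illustrative example (assuming the default group size of 128 used by the host launcher):
         * topK = 200 gives topK_gs = 256, so each (batch, class) slice of `mask` spans
         * 256 * 256 / 32 = 2048 unsigned ints, while `bboxes` holds up to 200 boxes of 4 coordinates each.
         */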

        const index_type c = blockIdx.y;
        const index_type b = blockIdx.z;

        if (c == background_class_id)
            return;

        auto mask = mask_.data() + (b * num_classes + c) * topK_gs * topK_gs / 32;
        auto bboxes = bboxes_.data() + (b * num_classes + c) * topK * 4;
        auto count = count_.data() + b * num_classes + c;

        const auto boxes = *count;
        if (boxes == 0)
            return;

        /* We divide the set of boxes into groups containing BLOCK_SIZE boxes */
        const auto num_groups = (boxes + BLOCK_SIZE - 1) / BLOCK_SIZE;

        /* We need to calculate IOUs for every pair of boxes. We can generalize and say that
         * we need to compute IOUs of every group with every other group including itself.
         */
        // Each block processes a pair of groups.
        const index_type group_i = blockIdx.x % num_groups;
        const index_type group_j = blockIdx.x / num_groups;

        /* we use __syncthreads() later but note that the following condition will cause all threads
         * in the block to exit; hence, no thread will execute a divergent __syncthreads()
         */
        if (group_i >= num_groups || group_j >= num_groups)
            return;

        /* Note that IOU(A, B) = IOU(B, A). Hence, if we compute IOU(GROUP_A, GROUP_B), we do not need
         * to compute IOU(GROUP_B, GROUP_A). We still have to compute IOU(GROUP_A, GROUP_A) though since
         * each group has many boxes and we need IOUs amongst boxes within a group.
         *
         * We arbitrarily choose a scheme to exit: exit if group_i is greater than group_j. This way we only
         * compute IOUs between groups once. While nearly half the blocks are wasted, it's ok since they exit
         * early on and the working blocks are compute-heavy.
         */
        if (group_i > group_j)
            return;
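        /* Illustrative example: with num_groups = 3, only the blocks mapping to the group pairs
         * (0, 0), (0, 1), (0, 2), (1, 1), (1, 2) and (2, 2) survive past this point; the remaining
         * blocks of the num_groups * num_groups grid exit immediately.
         */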

        /* the following variables contain the absolute box number of the first box of their respective groups */
        const auto group_i_offset = group_i * BLOCK_SIZE;
        const auto group_j_offset = group_j * BLOCK_SIZE;

        /* MAIN LOOP LOGIC:
         * We compare a box `i` from group_i with all boxes in group_j in each iteration. The box `j` is fixed
         * for each thread and maps directly to the thread index; hence, `j` is a loop invariant. Each
         * thread of the block computes the overlap between box `i` and its box `j`.
         *
         * for (int i = 0; i < BLOCK_SIZE; i++)
         * {
         *    // i = box 1
         *    // j = threadIdx.x = box 2
         * }
         */

        /* The `j` box is fixed for each thread. All `i` boxes will be required for every thread.
         * We store the `i` boxes in shared memory to allow global memory coalescing.
         */
        using vector_type = get_vector_type_t<T, 4>;
        __shared__ vector_type group_i_boxes[BLOCK_SIZE];

        /* We will precompute the sizes of `i` boxes in the code where we load them. The size computation
         * is distributed across the block. Otherwise, all threads would have to compute the size of the same
         * box simultaneously in the main loop. The size is computed while the memory subsystem is busy
         * servicing requests for box coordinates; the compute resources would otherwise be idle in this phase.
         */
        /* we store the size as a float since the size can exceed fp16 limits for unnormalized boxes */
        __shared__ float group_i_size[BLOCK_SIZE];
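
        /* Shared memory footprint for these two buffers is small; e.g. with BLOCK_SIZE = 128 and T = float,
         * group_i_boxes takes 128 * sizeof(float4) = 2 KiB and group_i_size takes another 512 bytes.
         */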

        const auto bboxes_vPtr = vector_type::get_pointer(bboxes);

        // load `i` boxes and precompute their sizes
        {
            int i = threadIdx.x;
            if (group_i_offset + i < boxes)
            {
                vector_type box;
                v_load(box, bboxes_vPtr[group_i_offset + i]);
                v_store(group_i_boxes[i], box);

                BoundingBox bbox;
                bbox.xmin = box.data[0];
                bbox.ymin = box.data[1];
                bbox.xmax = box.data[2];
                bbox.ymax = box.data[3];

                group_i_size[i] = compute_bbox_size<NORMALIZED_BBOX>(bbox);
            }
        }

        __syncthreads();

        /* We compute the overlap between boxes and check if the IOU exceeds the nms threshold.
         * We store the result (exceeds or below nms_threshold) in a two-dimensional matrix.
         * (i, j) is set to one if the IOU between i and j does not exceed the nms threshold.
         * We pack 32 results into one 32-bit integer. The effective memory layout of the
         * matrix hence is (BLOCK_SIZE, BLOCK_SIZE / 32).
         */
        __shared__ unsigned int mask_shared[BLOCK_SIZE * BLOCK_SIZE / 32];
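
        /* For instance, with BLOCK_SIZE = 128 this matrix is 128 * 4 = 512 unsigned ints (2 KiB of shared
         * memory), i.e. four 32-bit words of packed results per `i` box.
         */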

        // load box `j` and precompute its size (fixed per thread)
        BoundingBox bbox_j;
        float bbox_j_size = 0;
        if (group_j_offset + threadIdx.x < boxes)
        {
            vector_type box;
            v_load(box, bboxes_vPtr[group_j_offset + threadIdx.x]);

            bbox_j.xmin = box.data[0];
            bbox_j.ymin = box.data[1];
            bbox_j.xmax = box.data[2];
            bbox_j.ymax = box.data[3];

            bbox_j_size = compute_bbox_size<NORMALIZED_BBOX>(bbox_j);
        }

        /* Each thread computes a predicate which is broadcast across the warp to obtain a 32-bit mask.
         * The lane zero thread of each warp saves the mask. We store the offset into the mask array beforehand
         * to save cycles in the compute-intensive main loop.
         */
        auto mask_offset = threadIdx.x / 32;
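
        /* Illustrative layout: with BLOCK_SIZE = 128, warp `w` owns the w-th packed word of each row, so it
         * writes mask_shared[i * 4 + w] for box `i`; `mask_offset` starts at `w` and advances by
         * BLOCK_SIZE / 32 per processed box in the loop below.
         */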

        /* The main loop is compute intensive and causes the kernel to be overall compute-bound. Hence,
         * this loop has been highly tuned. Please profile and verify carefully before making changes.
         */
        /* UNROLL_SIZE is the number of boxes that must be processed per iteration. We manually unroll
         * the loop since the compiler cannot effectively unroll on its own, presumably due to the presence
         * of instructions forcing warp synchronization.
         */
        constexpr int UNROLL_SIZE = 4;

        #pragma unroll 8
        for (int s = 0; s < BLOCK_SIZE; s += UNROLL_SIZE)
        {
            bool do_not_reject_j[UNROLL_SIZE];

            #pragma unroll
            for (int k = 0; k < UNROLL_SIZE; k++)
            {
                int i = s + k;

                /* The number of boxes need not necessarily be a multiple of BLOCK_SIZE.
                 * However, the shared memory allocated can hold BLOCK_SIZE boxes from
                 * each group. Accessing the undefined regions of shared memory is
                 * a valid memory operation as long as the memory has been allocated.
                 *
                 * The condition below is only required when one of the groups is not
                 * completely filled with valid boxes. Such situations are relatively rare; it's
                 * more common to see both groups completely filled.
                 *
                 * We comment out this condition to improve the performance of the common case.
                 * This leads to a net improvement.
                 */
                // if (group_i_offset + i < boxes && group_j_offset + threadIdx.x < boxes)
                {
                    BoundingBox bbox_i;
                    float bbox_i_size;
                    {
                        vector_type box;
                        v_load(box, group_i_boxes[i]);
                        bbox_i.xmin = box.data[0];
                        bbox_i.ymin = box.data[1];
                        bbox_i.xmax = box.data[2];
                        bbox_i.ymax = box.data[3];

                        bbox_i_size = group_i_size[i];
                    }

                    using device::min;
                    using device::max;

                    BoundingBox intersect_bbox;
                    intersect_bbox.xmin = max(bbox_i.xmin, bbox_j.xmin);
                    intersect_bbox.ymin = max(bbox_i.ymin, bbox_j.ymin);
                    intersect_bbox.xmax = min(bbox_i.xmax, bbox_j.xmax);
                    intersect_bbox.ymax = min(bbox_i.ymax, bbox_j.ymax);

                    float intersect_size = compute_bbox_size<NORMALIZED_BBOX>(intersect_bbox);

                    using device::fast_divide_ftz;
                    float iou = fast_divide_ftz(intersect_size, bbox_i_size + bbox_j_size - intersect_size);
                    do_not_reject_j[k] = iou <= nms_threshold;
                }
            }

            #pragma unroll
            for (int k = 0; k < UNROLL_SIZE; k++)
            {
                // FORWARD_COMPATIBILITY_TAG: WARP_SIZE_DEPENDENT_CODE
                auto predicate = __ballot_sync(0xFFFFFFFF, do_not_reject_j[k]);
                if (threadIdx.x % 32 == 0)
                    mask_shared[mask_offset] = predicate;

                /* The following operation should logically be inside the previous if branch. Note that `mask_offset`
                 * is only used by the lane zero threads. Hence, there is no harm in executing it in the other threads
                 * as it is unused there.
                 *
                 * Keeping it inside prevents the compiler from treating it as a constexpr addition to the address in
                 * successive unrolled iterations. A register is used and instructions are emitted to multiply the
                 * addend by four to obtain the byte offset. Pulling it out of the branch makes the compiler do constexpr
                 * addition on the address in successive unrolled iterations.
                 */
                mask_offset += BLOCK_SIZE / 32;
            }
        }

        __syncthreads();

        /* The mask data is organized as a two-dimensional bit matrix of size topK_gs * topK_gs.
         * (i, j) is set to true if the IOU between `i` and `j` does not exceed the nms threshold,
         * i.e. box `j` need not be suppressed on account of box `i`.
         * We pack 32 results into one 32-bit integer. So the effective memory layout is topK_gs * topK_gs / 32.
         */

        /* Each box `i` was compared with BLOCK_SIZE `j` boxes. This amounts to BLOCK_SIZE / 32
         * 32-bit integers per box `i`.
         */
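        /* For instance, with BLOCK_SIZE = 128 the vector type below packs four unsigned ints, so each thread
         * writes its box's entire 128-bit row of results with a single vectorized store.
         */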
        using mask_vector_type = get_vector_type_t<unsigned int, BLOCK_SIZE / 32>;

        const int i = threadIdx.x;

        auto mask_shared_vPtr = mask_vector_type::get_pointer(DevicePtr<unsigned>(mask_shared));
        mask_vector_type temp;
        v_load(temp, mask_shared_vPtr[i]);
        for (int i = 0; i < mask_vector_type::size(); i++)
            temp.data[i] = __brev(temp.data[i]);

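        /* __ballot_sync reports lane k in bit k (LSB first), whereas the collect kernel below scans the mask
         * with __clz and clears bits with (0x80000000 >> bit), i.e. MSB first; the bit reversal above converts
         * between the two conventions.
         *
         * Index math for the store (illustrative): with topK_gs = 256 and BLOCK_SIZE = 128, the row of box
         * (group_i_offset + i) for group pair (0, 1) with i = 5 starts at bit 5 * 256 + 128 = 1408, which is
         * unsigned-int index 44 and hence vector index 11.
         */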
        auto mask_vPtr = mask_vector_type::get_pointer(mask);
        v_store(mask_vPtr[((group_i_offset + i) * topK_gs + group_j_offset) / 32 / mask_vector_type::size()], temp);
    }

    template <int ITEMS_PER_THREAD, int BLOCK_SIZE>
    __launch_bounds__(BLOCK_SIZE)
    __global__ void grid_nms_collect(Span<int> indices_, Span<int> count_, View<unsigned int> mask_, size_type num_classes, index_type background_class_id, size_type topK, size_type topK_gs_by32)
    {
        const index_type c = blockIdx.x;
        if (c == background_class_id)
            return;

        const index_type b = blockIdx.y;

        // topK_gs is topK rounded up to a multiple of the group size (see getAlignedTopK on the host side)

        // indices: [batch_size, num_classes, topK]
        // count: [batch_size, num_classes]
        // mask: [batch_size, num_classes, topK_gs, topK_gs / 32]

        auto indices = indices_.data() + (b * num_classes + c) * topK;
        auto count = count_.data() + (b * num_classes + c);
        auto mask = mask_.data() + (b * num_classes + c) * topK_gs_by32 * 32 * topK_gs_by32;

        const auto boxes = *count;
        if (boxes == 0)
            return;

        /* We have a fixed number of threads and an arbitrary number of boxes. We use an array of
         * bits to store which boxes haven't been eliminated and which are still active. We organize
         * the array of bits into a matrix of bits of the shape (num_rows, BLOCK_SIZE, 32) which
         * is equivalent to (num_rows, BLOCK_SIZE) where the type is a 32-bit unsigned integer.
         * `num_rows` is the minimum number of rows required to cover all the boxes.
         *
         * Each thread handles a specific column in the matrix. To improve performance, we process
         * `ITEMS_PER_THREAD` number of elements per thread. This changes the shape to (num_rows,
         * ROW_WIDTH) where ROW_WIDTH is BLOCK_SIZE * ITEMS_PER_THREAD.
         */
        constexpr int ROW_WIDTH = BLOCK_SIZE * ITEMS_PER_THREAD;

        const index_type num_32b_masks = static_cast<unsigned>(boxes + 31) / 32;
        const index_type num_rows = static_cast<unsigned>(num_32b_masks + ROW_WIDTH - 1) / ROW_WIDTH;
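
        /* Worked example (using the host launch parameters BLOCK_SIZE = 64 and ITEMS_PER_THREAD = 4):
         * ROW_WIDTH = 256 words, i.e. one row covers 256 * 32 = 8192 boxes. With boxes = 1000,
         * num_32b_masks = 32 and num_rows = 1.
         */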

        extern __shared__ unsigned int active_boxes[]; // the matrix described earlier

        #pragma unroll 1
        for (auto idx : block_stride_range<BLOCK_SIZE>(num_32b_masks))
            active_boxes[idx] = (idx == num_32b_masks - 1) ? __brev((1u << (boxes % 32)) - 1) : 0xFFFFFFFF;
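
        /* Illustrative example: with boxes = 70, num_32b_masks = 3; words 0 and 1 are 0xFFFFFFFF while the
         * last word becomes __brev((1u << 6) - 1) = 0xFC000000, i.e. only the first six boxes of the final
         * word (boxes 64..69) are marked active in the MSB-first convention used below.
         */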

        __syncthreads();

        using vector_type = get_vector_type_t<unsigned int, ITEMS_PER_THREAD>;
        auto mask_vPtr = vector_type::get_pointer(mask);
        auto shared_vPtr = vector_type::get_pointer(DevicePtr<unsigned>(active_boxes));

        int index_temp;
        int thread0_count = 0;
        int thread_id = threadIdx.x;

        for (int step = 0; step < num_32b_masks; step++)
        {
            auto current_active = active_boxes[step];
            while (current_active)
            {
                const index_type bit = __clz(current_active);
                const index_type i = step * 32 + bit;

                const int mask_offset = static_cast<unsigned>(i * topK_gs_by32) / ITEMS_PER_THREAD;
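
                /* The mask row of box `i` starts at unsigned-int index i * topK_gs_by32; dividing by
                 * ITEMS_PER_THREAD converts this to an index into vectors of ITEMS_PER_THREAD words.
                 * For instance, with topK_gs_by32 = 8 and i = 10, the row starts at word 80, i.e. vector index 20.
                 */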

                /* We fetch the index from memory and store it in a register. We will not use it until
                 * much later. This helps avoid a long scoreboard stall.
                 */
                if (thread_id == 0)
                    index_temp = indices[i];

                __syncthreads();

                if (threadIdx.x == 0)
                    active_boxes[step] = current_active ^ (0x80000000 >> bit);

                __syncthreads();

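                /* Only the words from the vector chunk containing `step` onwards need to be updated: the outer
                 * loop never revisits earlier words, so suppressing already-processed boxes would have no effect.
                 * The first clause of the condition below skips those stale chunks; the second is a bounds check.
                 */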
                #pragma unroll 1
                for (int r = 0; r < num_rows; r++)
                {
                    const int idx = r * BLOCK_SIZE + thread_id;
                    if ((step & ~(ITEMS_PER_THREAD - 1)) <= idx * ITEMS_PER_THREAD && idx * ITEMS_PER_THREAD < num_32b_masks)
                    {
                        auto active_boxes_vec = shared_vPtr[idx];
                        auto mask_vec = mask_vPtr[mask_offset + idx];
                        for (int i = 0; i < vector_type::size(); i++)
                            active_boxes_vec.data[i] &= mask_vec.data[i];
                        shared_vPtr[idx] = active_boxes_vec;
                    }
                }

                __syncthreads();

                if (thread_id == 0)
                {
                    indices[thread0_count] = index_temp;
                    thread0_count++;
                }

                current_active = active_boxes[step];
            }
        }

        if (threadIdx.x == 0)
            *count = thread0_count;
    }
}

constexpr int GROUP_SIZE = 128;

static std::size_t getAlignedTopK(std::size_t topK)
{
    auto remainder = topK % GROUP_SIZE;
    if (remainder == 0)
        return topK;
    return topK + (GROUP_SIZE - remainder);
}
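
/* For example: getAlignedTopK(200) == 256 and getAlignedTopK(256) == 256, i.e. topK is rounded up to the
 * next multiple of GROUP_SIZE.
 */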

std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK)
{
    auto topK_gs = getAlignedTopK(classwise_topK);
    return num_classes * topK_gs * topK_gs / 32 * sizeof(unsigned int);
}
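
/* Illustrative example: num_classes = 21 and classwise_topK = 200 give topK_gs = 256, so the per-batch-item
 * workspace is 21 * 256 * 256 / 32 * sizeof(unsigned int) = 172032 bytes.
 */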

template <class T>
void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<T> bboxes, int background_class_id, bool normalized_bbox, float nms_threshold)
{
    // workspace: [batch_size, num_classes, topK_gs, topK_gs / 32]
    // indices: [batch_size, num_classes, topK]
    // count: [batch_size, num_classes]
    // bboxes: [batch_size, num_classes, topK, 4] (only first count[b][c] boxes are read)

    const auto batch_size = indices.get_axis_size(0);
    CV_Assert(count.get_axis_size(0) == batch_size);
    CV_Assert(bboxes.get_axis_size(0) == batch_size);

    const auto num_classes = indices.get_axis_size(1);
    CV_Assert(count.get_axis_size(1) == num_classes);
    CV_Assert(bboxes.get_axis_size(1) == num_classes);

    const auto topK = indices.get_axis_size(2);
    CV_Assert(bboxes.get_axis_size(2) == topK);

    CV_Assert(bboxes.get_axis_size(3) == 4);

    const auto topK_gs = getAlignedTopK(topK);
    CV_Assert(workspace.size() >= topK_gs * topK_gs / 32);

    const auto boxes = topK;
    const auto num_groups = (boxes + GROUP_SIZE - 1) / GROUP_SIZE;

    {
        // grid = (num_groups * num_groups, num_classes, batch_size)
        // if the background class is the last class, we can reduce grid y dim by one
        auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;

        constexpr int BLOCK_SIZE = GROUP_SIZE;

        dim3 grid_size(num_groups * num_groups, grid_num_classes, batch_size);
        dim3 block_size(BLOCK_SIZE);
        auto policy = execution_policy(grid_size, block_size, stream);

        if (normalized_bbox)
        {
            auto kernel = raw::grid_nms<T, true, BLOCK_SIZE>;
            launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
        }
        else
        {
            auto kernel = raw::grid_nms<T, false, BLOCK_SIZE>;
            launch_kernel(kernel, policy, workspace, count, bboxes, num_classes, background_class_id, topK, topK_gs, nms_threshold);
        }
    }

    {
        // grid = (num_classes, batch_size)
        // if the background class is the last class, we can reduce grid x dim by one
        auto grid_num_classes = num_classes; //(background_class_id == num_classes - 1) ? num_classes - 1 : num_classes;

        constexpr int BLOCK_SIZE = 64;

        constexpr int ITEMS_PER_THREAD = 4;
        auto kernel = raw::grid_nms_collect<ITEMS_PER_THREAD, BLOCK_SIZE>;

        dim3 grid_size(grid_num_classes, batch_size);

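        // dynamic shared memory for the active_boxes bitmask: one unsigned int (4 bytes) per 32 boxes,
        // e.g. topK_gs = 256 needs 8 words = 32 bytes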
        auto sharedMem = topK_gs / 32 * 4;
        auto policy = execution_policy(grid_size, BLOCK_SIZE, sharedMem, stream);
        launch_kernel(kernel, policy, indices, count, workspace, num_classes, background_class_id, topK, topK_gs / 32);
    }
}

std::size_t getGridNMSWorkspaceSizePerBatchItem(std::size_t num_classes, std::size_t classwise_topK);

template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<__half> bboxes, int, bool normalized_bbox, float nms_threshold);
template void grid_nms(const Stream& stream, Span<unsigned int> workspace, TensorSpan<int> indices, TensorSpan<int> count, TensorView<float> bboxes, int, bool normalized_bbox, float nms_threshold);

}}}} /* namespace cv::dnn::cuda4dnn::kernels */