1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include <opencv2/core/cuda/common.hpp>
46 #include <opencv2/core/cuda/vec_traits.hpp>
47 #include <opencv2/core/cuda/vec_math.hpp>
48 #include <opencv2/core/cuda/emulation.hpp>
49 
50 #include <iostream>
51 #include <stdio.h>
52 
53 namespace cv { namespace cuda { namespace device
54 {
55     namespace ccl
56     {
57         enum
58         {
59             WARP_SIZE  = 32,
60             WARP_LOG   = 5,
61 
62             CTA_SIZE_X = 32,
63             CTA_SIZE_Y = 8,
64 
65             STA_SIZE_MERGE_Y = 4,
66             STA_SIZE_MERGE_X = 32,
67 
68             TPB_X = 1,
69             TPB_Y = 4,
70 
71             TILE_COLS = CTA_SIZE_X * TPB_X,
72             TILE_ROWS = CTA_SIZE_Y * TPB_Y
73         };
74 
75         template<typename T> struct IntervalsTraits
76         {
77             typedef T elem_type;
78         };
79 
80         template<> struct IntervalsTraits<unsigned char>
81         {
82             typedef int dist_type;
83             enum {ch = 1};
84         };
85 
86         template<> struct IntervalsTraits<uchar3>
87         {
88             typedef int3 dist_type;
89             enum {ch = 3};
90         };
91 
92         template<> struct IntervalsTraits<uchar4>
93         {
94             typedef int4 dist_type;
95             enum {ch = 4};
96         };
97 
98         template<> struct IntervalsTraits<unsigned short>
99         {
100             typedef int dist_type;
101             enum {ch = 1};
102         };
103 
104         template<> struct IntervalsTraits<ushort3>
105         {
106             typedef int3 dist_type;
107             enum {ch = 3};
108         };
109 
110         template<> struct IntervalsTraits<ushort4>
111         {
112             typedef int4 dist_type;
113             enum {ch = 4};
114         };
115 
116         template<> struct IntervalsTraits<float>
117         {
118             typedef float dist_type;
119             enum {ch = 1};
120         };
121 
122         template<> struct IntervalsTraits<int>
123         {
124             typedef int dist_type;
125             enum {ch = 1};
126         };
127 
128         typedef unsigned char component;
129         enum Edges { UP = 1, DOWN = 2, LEFT = 4, RIGHT = 8, EMPTY = 0xF0 };
130 
131         template<typename T, int CH> struct InInterval {};
132 
133         template<typename T> struct InInterval<T, 1>
134         {
135             typedef typename VecTraits<T>::elem_type E;
InIntervalcv::cuda::device::ccl::InInterval136             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi) : lo((E)(-_lo.x)), hi((E)_hi.x) { }
137             T lo, hi;
138 
operator ()cv::cuda::device::ccl::InInterval139             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
140             {
141                 I d = a - b;
142                 return lo <= d && d <= hi;
143             }
144         };
145 
146 
147         template<typename T> struct InInterval<T, 3>
148         {
149             typedef typename VecTraits<T>::elem_type E;
InIntervalcv::cuda::device::ccl::InInterval150             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
151             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z)) { }
152             T lo, hi;
153 
operator ()cv::cuda::device::ccl::InInterval154             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
155             {
156                 I d = saturate_cast<I>(a - b);
157                 return lo.x <= d.x && d.x <= hi.x &&
158                        lo.y <= d.y && d.y <= hi.y &&
159                        lo.z <= d.z && d.z <= hi.z;
160             }
161         };
162 
163         template<typename T> struct InInterval<T, 4>
164         {
165             typedef typename VecTraits<T>::elem_type E;
InIntervalcv::cuda::device::ccl::InInterval166             __host__ __device__ __forceinline__ InInterval(const float4& _lo, const float4& _hi)
167             : lo (VecTraits<T>::make((E)(-_lo.x), (E)(-_lo.y), (E)(-_lo.z), (E)(-_lo.w))), hi (VecTraits<T>::make((E)_hi.x, (E)_hi.y, (E)_hi.z, (E)_hi.w)) { }
168             T lo, hi;
169 
operator ()cv::cuda::device::ccl::InInterval170             template<typename I> __device__ __forceinline__ bool operator() (const I& a, const I& b) const
171             {
172                 I d = saturate_cast<I>(a - b);
173                 return lo.x <= d.x && d.x <= hi.x &&
174                        lo.y <= d.y && d.y <= hi.y &&
175                        lo.z <= d.z && d.z <= hi.z &&
176                        lo.w <= d.w && d.w <= hi.w;
177             }
178         };
179 
180 
181         template<typename T, typename F>
computeConnectivity(const PtrStepSz<T> image,PtrStepSzb components,F connected)182         __global__ void computeConnectivity(const PtrStepSz<T> image, PtrStepSzb components, F connected)
183         {
184             int x = threadIdx.x + blockIdx.x * blockDim.x;
185             int y = threadIdx.y + blockIdx.y * blockDim.y;
186 
187             if (x >= image.cols || y >= image.rows) return;
188 
189             T intensity = image(y, x);
190             component c = 0;
191 
192             if ( x > 0 && connected(intensity, image(y, x - 1)))
193                 c |= LEFT;
194 
195             if ( y > 0 && connected(intensity, image(y - 1, x)))
196                 c |= UP;
197 
198             if ( x + 1 < image.cols && connected(intensity, image(y, x + 1)))
199                 c |= RIGHT;
200 
201             if ( y + 1 < image.rows && connected(intensity, image(y + 1, x)))
202                 c |= DOWN;
203 
204             components(y, x) = c;
205         }
206 
207         template< typename T>
computeEdges(const PtrStepSzb & image,PtrStepSzb edges,const float4 & lo,const float4 & hi,cudaStream_t stream)208         void computeEdges(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream)
209         {
210             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
211             dim3 grid(divUp(image.cols, block.x), divUp(image.rows, block.y));
212 
213             typedef InInterval<typename IntervalsTraits<T>::dist_type, IntervalsTraits<T>::ch> Int_t;
214 
215             Int_t inInt(lo, hi);
216             computeConnectivity<T, Int_t><<<grid, block, 0, stream>>>(static_cast<const PtrStepSz<T> >(image), edges, inInt);
217 
218             cudaSafeCall( cudaGetLastError() );
219             if (stream == 0)
220                 cudaSafeCall( cudaDeviceSynchronize() );
221         }
222 
223         template void computeEdges<uchar>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
224         template void computeEdges<uchar3> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
225         template void computeEdges<uchar4> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
226         template void computeEdges<ushort> (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
227         template void computeEdges<ushort3>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
228         template void computeEdges<ushort4>(const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
229         template void computeEdges<int>    (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
230         template void computeEdges<float>  (const PtrStepSzb& image, PtrStepSzb edges, const float4& lo, const float4& hi, cudaStream_t stream);
231 
lableTiles(const PtrStepSzb edges,PtrStepSzi comps)232         __global__ void lableTiles(const PtrStepSzb edges, PtrStepSzi comps)
233         {
234             int x = threadIdx.x + blockIdx.x * TILE_COLS;
235             int y = threadIdx.y + blockIdx.y * TILE_ROWS;
236 
237             if (x >= edges.cols || y >= edges.rows) return;
238 
239             //currently x is 1
240             int bounds = ((y + TPB_Y) < edges.rows);
241 
242             __shared__ int labelsTile[TILE_ROWS][TILE_COLS];
243             __shared__ int  edgesTile[TILE_ROWS][TILE_COLS];
244 
245             int new_labels[TPB_Y][TPB_X];
246             int old_labels[TPB_Y][TPB_X];
247 
248             #pragma unroll
249             for (int i = 0; i < TPB_Y; ++i)
250                 #pragma unroll
251                 for (int j = 0; j < TPB_X; ++j)
252                 {
253                     int yloc = threadIdx.y + CTA_SIZE_Y * i;
254                     int xloc = threadIdx.x + CTA_SIZE_X * j;
255                     component c = edges(bounds * (y + CTA_SIZE_Y * i), x + CTA_SIZE_X * j);
256 
257                     if (!xloc) c &= ~LEFT;
258                     if (!yloc) c &= ~UP;
259 
260                     if (xloc == TILE_COLS -1) c &= ~RIGHT;
261                     if (yloc == TILE_ROWS -1) c &= ~DOWN;
262 
263                     new_labels[i][j] = yloc * TILE_COLS + xloc;
264                     edgesTile[yloc][xloc] = c;
265                 }
266 
267             for (int k = 0; ;++k)
268             {
269                 //1. backup
270                 #pragma unroll
271                 for (int i = 0; i < TPB_Y; ++i)
272                     #pragma unroll
273                     for (int j = 0; j < TPB_X; ++j)
274                     {
275                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
276                         int xloc = threadIdx.x + CTA_SIZE_X * j;
277 
278                         old_labels[i][j]       = new_labels[i][j];
279                         labelsTile[yloc][xloc] = new_labels[i][j];
280                     }
281 
282                 __syncthreads();
283 
284                 //2. compare local arrays
285                 #pragma unroll
286                 for (int i = 0; i < TPB_Y; ++i)
287                     #pragma unroll
288                     for (int j = 0; j < TPB_X; ++j)
289                     {
290                         int yloc = threadIdx.y + CTA_SIZE_Y * i;
291                         int xloc = threadIdx.x + CTA_SIZE_X * j;
292 
293                         component c = edgesTile[yloc][xloc];
294                         int label = new_labels[i][j];
295 
296                         if (c & UP)
297                            label = ::min(label, labelsTile[yloc - 1][xloc]);
298 
299                         if (c &  DOWN)
300                            label = ::min(label, labelsTile[yloc + 1][xloc]);
301 
302                         if (c & LEFT)
303                            label = ::min(label, labelsTile[yloc][xloc - 1]);
304 
305                         if (c & RIGHT)
306                            label = ::min(label, labelsTile[yloc][xloc + 1]);
307 
308                        new_labels[i][j] = label;
309                     }
310 
311                 __syncthreads();
312 
313                 //3. determine: Is any value changed?
314                 int changed = 0;
315                 #pragma unroll
316                 for (int i = 0; i < TPB_Y; ++i)
317                     #pragma unroll
318                     for (int j = 0; j < TPB_X; ++j)
319                     {
320                         if (new_labels[i][j] < old_labels[i][j])
321                         {
322                             changed = 1;
323                             Emulation::smem::atomicMin(&labelsTile[0][0] + old_labels[i][j], new_labels[i][j]);
324                         }
325                     }
326 
327                 changed = Emulation::syncthreadsOr(changed);
328 
329                 if (!changed)
330                     break;
331 
332                 //4. Compact paths
333                 const int *labels = &labelsTile[0][0];
334                 #pragma unroll
335                 for (int i = 0; i < TPB_Y; ++i)
336                     #pragma unroll
337                     for (int j = 0; j < TPB_X; ++j)
338                     {
339                         int label = new_labels[i][j];
340 
341                         while( labels[label] < label ) label = labels[label];
342 
343                         new_labels[i][j] = label;
344                     }
345                 __syncthreads();
346             }
347 
348             #pragma unroll
349             for (int i = 0; i < TPB_Y; ++i)
350             #pragma unroll
351                 for (int j = 0; j < TPB_X; ++j)
352                 {
353                     int label = new_labels[i][j];
354                     int yloc = label / TILE_COLS;
355                     int xloc = label - yloc * TILE_COLS;
356 
357                     xloc += blockIdx.x * TILE_COLS;
358                     yloc += blockIdx.y * TILE_ROWS;
359 
360                     label = yloc * edges.cols + xloc;
361                     // do it for x too.
362                     if (y + CTA_SIZE_Y * i < comps.rows) comps(y + CTA_SIZE_Y * i, x + CTA_SIZE_X * j) = label;
363                 }
364         }
365 
root(const PtrStepSzi & comps,int label)366         __device__ __forceinline__ int root(const PtrStepSzi& comps, int label)
367         {
368             while(1)
369             {
370                 int y = label / comps.cols;
371                 int x = label - y * comps.cols;
372 
373                 int parent = comps(y, x);
374 
375                 if (label == parent) break;
376 
377                 label = parent;
378             }
379             return label;
380         }
381 
isConnected(PtrStepSzi & comps,int l1,int l2,bool & changed)382         __device__ __forceinline__ void isConnected(PtrStepSzi& comps, int l1, int l2, bool& changed)
383         {
384             int r1 = root(comps, l1);
385             int r2 = root(comps, l2);
386 
387             if (r1 == r2) return;
388 
389             int mi = ::min(r1, r2);
390             int ma = ::max(r1, r2);
391 
392             int y = ma / comps.cols;
393             int x = ma - y * comps.cols;
394 
395             atomicMin(&comps.ptr(y)[x], mi);
396             changed = true;
397         }
398 
crossMerge(const int tilesNumY,const int tilesNumX,int tileSizeY,int tileSizeX,const PtrStepSzb edges,PtrStepSzi comps,const int yIncomplete,int xIncomplete)399         __global__ void crossMerge(const int tilesNumY, const int tilesNumX, int tileSizeY, int tileSizeX,
400             const PtrStepSzb edges, PtrStepSzi comps, const int yIncomplete, int xIncomplete)
401         {
402             int tid = threadIdx.y * blockDim.x + threadIdx.x;
403             int stride = blockDim.y * blockDim.x;
404 
405             int ybegin = blockIdx.y * (tilesNumY * tileSizeY);
406             int yend   = ybegin + tilesNumY * tileSizeY;
407 
408             if (blockIdx.y == gridDim.y - 1)
409             {
410                 yend -= yIncomplete * tileSizeY;
411                 yend -= tileSizeY;
412                 tileSizeY = (edges.rows % tileSizeY);
413 
414                 yend += tileSizeY;
415             }
416 
417             int xbegin = blockIdx.x * tilesNumX * tileSizeX;
418             int xend   = xbegin + tilesNumX * tileSizeX;
419 
420             if (blockIdx.x == gridDim.x - 1)
421             {
422                 if (xIncomplete) yend = ybegin;
423                 xend -= xIncomplete * tileSizeX;
424                 xend -= tileSizeX;
425                 tileSizeX = (edges.cols % tileSizeX);
426 
427                 xend += tileSizeX;
428             }
429 
430             if (blockIdx.y == (gridDim.y - 1) && yIncomplete)
431             {
432                 xend = xbegin;
433             }
434 
435             int tasksV = (tilesNumX - 1) * (yend - ybegin);
436             int tasksH = (tilesNumY - 1) * (xend - xbegin);
437 
438             int total = tasksH + tasksV;
439 
440             bool changed;
441             do
442             {
443                 changed = false;
444                 for (int taskIdx = tid; taskIdx < total; taskIdx += stride)
445                 {
446                     if (taskIdx < tasksH)
447                     {
448                         int indexH = taskIdx;
449 
450                         int row = indexH / (xend - xbegin);
451                         int col = indexH - row * (xend - xbegin);
452 
453                         int y = ybegin + (row + 1) * tileSizeY;
454                         int x = xbegin + col;
455 
456                         component e = edges( x, y);
457                         if (e & UP)
458                         {
459                             int lc = comps(y,x);
460                             int lu = comps(y - 1, x);
461 
462                             isConnected(comps, lc, lu, changed);
463                         }
464                     }
465                     else
466                     {
467                         int indexV = taskIdx - tasksH;
468 
469                         int col = indexV / (yend - ybegin);
470                         int row = indexV - col * (yend - ybegin);
471 
472                         int x = xbegin + (col + 1) * tileSizeX;
473                         int y = ybegin + row;
474 
475                         component e = edges(x, y);
476                         if (e & LEFT)
477                         {
478                             int lc = comps(y, x);
479                             int ll = comps(y, x - 1);
480 
481                             isConnected(comps, lc, ll, changed);
482                         }
483                     }
484                 }
485             } while (Emulation::syncthreadsOr(changed));
486         }
487 
flatten(const PtrStepSzb edges,PtrStepSzi comps)488         __global__ void flatten(const PtrStepSzb edges, PtrStepSzi comps)
489         {
490             int x = threadIdx.x + blockIdx.x * blockDim.x;
491             int y = threadIdx.y + blockIdx.y * blockDim.y;
492 
493             if( x < comps.cols && y < comps.rows)
494                 comps(y, x) = root(comps, comps(y, x));
495         }
496 
497         enum {CC_NO_COMPACT = 0, CC_COMPACT_LABELS = 1};
498 
labelComponents(const PtrStepSzb & edges,PtrStepSzi comps,int flags,cudaStream_t stream)499         void labelComponents(const PtrStepSzb& edges, PtrStepSzi comps, int flags, cudaStream_t stream)
500         {
501             CV_UNUSED(flags);
502             dim3 block(CTA_SIZE_X, CTA_SIZE_Y);
503             dim3 grid(divUp(edges.cols, TILE_COLS), divUp(edges.rows, TILE_ROWS));
504 
505             lableTiles<<<grid, block, 0, stream>>>(edges, comps);
506             cudaSafeCall( cudaGetLastError() );
507 
508             int tileSizeX = TILE_COLS, tileSizeY = TILE_ROWS;
509             while (grid.x > 1 || grid.y > 1)
510             {
511                 dim3 mergeGrid((int)ceilf(grid.x / 2.f), (int)ceilf(grid.y / 2.f));
512                 dim3 mergeBlock(STA_SIZE_MERGE_X, STA_SIZE_MERGE_Y);
513                 // debug log
514                 // std::cout << "merging: " << grid.y  << " x " << grid.x << " ---> " << mergeGrid.y <<  " x " << mergeGrid.x << " for tiles: " << tileSizeY << " x " << tileSizeX << std::endl;
515                 crossMerge<<<mergeGrid, mergeBlock, 0, stream>>>(2, 2, tileSizeY, tileSizeX, edges, comps, (int)ceilf(grid.y / 2.f) - grid.y / 2, (int)ceilf(grid.x / 2.f) - grid.x / 2);
516                 tileSizeX <<= 1;
517                 tileSizeY <<= 1;
518                 grid = mergeGrid;
519 
520                 cudaSafeCall( cudaGetLastError() );
521             }
522 
523             grid.x = divUp(edges.cols, block.x);
524             grid.y = divUp(edges.rows, block.y);
525             flatten<<<grid, block, 0, stream>>>(edges, comps);
526             cudaSafeCall( cudaGetLastError() );
527 
528             if (stream == 0)
529                 cudaSafeCall( cudaDeviceSynchronize() );
530         }
531     }
532 } } }
533 
534 #endif /* CUDA_DISABLER */
535