/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <type_traits>  // std::enable_if, std::is_base_of (used by calcMagnitudeKernel)

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda.hpp"

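// The host-side Canny implementation drives the helpers in this file in roughly the following
// order: calcMagnitude (Sobel derivatives + gradient magnitude), calcMap (non-maximum
// suppression and double thresholding), edgesHysteresisLocal / edgesHysteresisGlobal
// (hysteresis edge tracking), and finally getEdges (conversion to the 8-bit edge mask).
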
using namespace cv::cuda;
using namespace cv::cuda::device;

namespace canny
{
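    // Gradient-magnitude functors: L1 returns |dx| + |dy|, L2 returns sqrt(dx*dx + dy*dy).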
    struct L1 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::abs(x) + ::abs(y);
        }

        __host__ __device__ __forceinline__ L1() {}
        __host__ __device__ __forceinline__ L1(const L1&) {}
    };
    struct L2 : binary_function<int, int, float>
    {
        __device__ __forceinline__ float operator ()(int x, int y) const
        {
            return ::sqrtf(x * x + y * y);
        }

        __host__ __device__ __forceinline__ L2() {}
        __host__ __device__ __forceinline__ L2(const L2&) {}
    };
}

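// Tuning hint for the device::transform() framework: smart_shift allows each thread to
// process several elements in the vectorized path (performance only, no change in results).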
namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1>
    {
        enum { smart_shift = 4 };
    };
    template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
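    // Source-image accessors; both apply the (xoff, yoff) ROI offset. SrcTexRef reads through
    // the legacy texture reference, SrcTexObj through a CUDA texture object (compute capability >= 3.0).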
    struct SrcTex
    {
        virtual ~SrcTex() {}

        __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {}

        __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0;

        int xoff;
        int yoff;
    };

    texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp);
    struct SrcTexRef : SrcTex
    {
        __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {}

        __device__ __forceinline__ int operator ()(int y, int x) const override
        {
            return tex2D(tex_src, x + xoff, y + yoff);
        }
    };

    struct SrcTexObj : SrcTex
    {
        __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { }

        __device__ __forceinline__ int operator ()(int y, int x) const override
        {
            return tex2D<uchar>(tex_src_object, x + xoff, y + yoff);
        }

        cudaTextureObject_t tex_src_object;
    };

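    // Computes the 3x3 Sobel derivatives dx, dy and the gradient magnitude mag = norm(dx, dy).
    // Out-of-range reads at the image border are handled by the texture's clamp addressing mode.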
    template <
        class T,
        class Norm,
        typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type
    >
    __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm)
    {
        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (y >= mag.rows || x >= mag.cols)
            return;

        int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1));
        int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1));

        dx(y, x) = dxVal;
        dy(y, x) = dyVal;

        mag(y, x) = norm(dxVal, dyVal);
    }

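    // Host entry point: uses a texture object on devices with compute capability >= 3.0,
    // otherwise falls back to the legacy texture reference, then launches calcMagnitudeKernel.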
    void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y));

        bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30);

        if (cc30)
        {
            cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeClamp;
            texDesc.addressMode[1] = cudaAddressModeClamp;
            texDesc.addressMode[2] = cudaAddressModeClamp;

            cudaTextureObject_t tex = 0;
            createTextureObjectPitch2D(&tex, srcWhole, texDesc);

            SrcTexObj src(xoff, yoff, tex);

            if (L2Grad)
            {
                L2 norm;
                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
            }
            else
            {
                L1 norm;
                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );
            else
                cudaSafeCall( cudaStreamSynchronize(stream) );

            cudaSafeCall( cudaDestroyTextureObject(tex) );
        }
        else
        {
            bindTexture(&tex_src, srcWhole);
            SrcTexRef src(xoff, yoff);

            if (L2Grad)
            {
                L2 norm;
                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
            }
            else
            {
                L1 norm;
                calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm);
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    }

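    // Overload used when the caller supplies precomputed dx/dy: only the magnitude is evaluated,
    // element-wise, via the device::transform() framework.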
    void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream)
    {
        if (L2Grad)
        {
            L2 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
        else
        {
            L1 norm;
            transform(dx, dy, mag, norm, WithOutMask(), stream);
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp);
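    // Non-maximum suppression combined with double thresholding: each pixel is classified into
    // `map` as 0 (not an edge), 1 (weak edge candidate) or 2 (strong edge). The gradient
    // direction test uses Q15 fixed point: TG22 = tan(22.5 deg) * 2^15, and the upper bound
    // follows from tan(67.5 deg) = tan(22.5 deg) + 2.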
    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if(dyVal > tg67x)
            {
                if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }

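    // Texture-object variant of the kernel above; used on devices with compute capability >= 3.0.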
    __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag)
    {
        const int CANNY_SHIFT = 15;
        const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5);

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1)
            return;

        int dxVal = dx(y, x);
        int dyVal = dy(y, x);

        const int s = (dxVal ^ dyVal) < 0 ? -1 : 1;
        const float m = tex2D<float>(tex_mag, x, y);

        dxVal = ::abs(dxVal);
        dyVal = ::abs(dyVal);

        // 0 - the pixel can not belong to an edge
        // 1 - the pixel might belong to an edge
        // 2 - the pixel does belong to an edge
        int edge_type = 0;

        if (m > low_thresh)
        {
            const int tg22x = dxVal * TG22;
            const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT);

            dyVal <<= CANNY_SHIFT;

            if (dyVal < tg22x)
            {
                if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else if(dyVal > tg67x)
            {
                if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
            else
            {
                if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1))
                    edge_type = 1 + (int)(m > high_thresh);
            }
        }

        map(y, x) = edge_type;
    }

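    // Host entry point: wraps `mag` in a texture (object or legacy reference) and launches calcMapKernel.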
    void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream)
    {
        const dim3 block(16, 16);
        const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y));

        if (deviceSupports(FEATURE_SET_COMPUTE_30))
        {
            // Use the texture object
            cudaResourceDesc resDesc;
            memset(&resDesc, 0, sizeof(resDesc));
            resDesc.resType = cudaResourceTypePitch2D;
            resDesc.res.pitch2D.devPtr = mag.ptr();
            resDesc.res.pitch2D.height = mag.rows;
            resDesc.res.pitch2D.width = mag.cols;
            resDesc.res.pitch2D.pitchInBytes = mag.step;
            resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>();

            cudaTextureDesc texDesc;
            memset(&texDesc, 0, sizeof(texDesc));
            texDesc.addressMode[0] = cudaAddressModeClamp;
            texDesc.addressMode[1] = cudaAddressModeClamp;
            texDesc.addressMode[2] = cudaAddressModeClamp;

            cudaTextureObject_t tex = 0;
            cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL);
            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );
            else
                cudaSafeCall( cudaStreamSynchronize(stream) );

            cudaSafeCall( cudaDestroyTextureObject(tex) );
        }
        else
        {
            // Use the texture reference
            bindTexture(&tex_mag, mag);
            calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols)
    {
        return (y >= 0) && (y < rows) && (x >= 0) && (x < cols);
    }

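    // First hysteresis pass, local to each 16x16 block: the block's pixels plus a one-pixel
    // apron are cached in an 18x18 shared-memory tile, and weak pixels (1) adjacent to strong
    // pixels (2) are iteratively promoted. Strong pixels that still have weak neighbours after
    // the sweep are pushed onto the global queue `st` for the block-crossing pass.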
    __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter)
    {
        __shared__ volatile int smem[18][18];

        const int x = blockIdx.x * blockDim.x + threadIdx.x;
        const int y = blockIdx.y * blockDim.y + threadIdx.y;

        smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0;
        if (threadIdx.y == 0)
            smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0;
        if (threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0;
        if (threadIdx.x == 0)
            smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1)
            smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == 0)
            smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0)
            smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0;
        if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0;
        if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1)
            smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0;

        __syncthreads();

        if (x >= map.cols || y >= map.rows)
            return;

        int n;

        #pragma unroll
        for (int k = 0; k < 16; ++k)
        {
            n = 0;

            if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1)
            {
                n += smem[threadIdx.y    ][threadIdx.x    ] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y    ][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 1][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2;

                n += smem[threadIdx.y + 2][threadIdx.x    ] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2;
                n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2;
            }

            __syncthreads();

            if (n > 0)
                smem[threadIdx.y + 1][threadIdx.x + 1] = 2;

            __syncthreads();
        }

        const int e = smem[threadIdx.y + 1][threadIdx.x + 1];

        map(y, x) = e;

        n = 0;

        if (e == 2)
        {
            n += smem[threadIdx.y    ][threadIdx.x    ] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y    ][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 1][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1;

            n += smem[threadIdx.y + 2][threadIdx.x    ] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1;
            n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1;
        }

        if (n > 0)
        {
            const int ind = ::atomicAdd(d_counter, 1);
            st[ind] = make_short2(x, y);
        }
    }

    void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream)
    {
        cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );

        const dim3 block(16, 16);
        const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y));

        edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter);
        cudaSafeCall( cudaGetLastError() );

        if (stream == NULL)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
    __constant__ int c_dx[8] = {-1,  0,  1, -1, 1, -1, 0, 1};
    __constant__ int c_dy[8] = {-1, -1, -1,  0, 0,  1, 1, 1};

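    // Second hysteresis pass, across block boundaries: each CUDA block takes one seed point
    // from st1 and grows the connected region of weak pixels using a shared-memory work stack;
    // groups of 8 threads each pop and expand one point per iteration. Points still pending
    // when the stack would overflow (or remains non-empty) are appended to st2 for another launch.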
    __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count)
    {
        const int stack_size = 512;

        __shared__ int s_counter;
        __shared__ int s_ind;
        __shared__ short2 s_st[stack_size];

        if (threadIdx.x == 0)
            s_counter = 0;

        __syncthreads();

        int ind = blockIdx.y * gridDim.x + blockIdx.x;

        if (ind >= count)
            return;

        short2 pos = st1[ind];

        if (threadIdx.x < 8)
        {
            pos.x += c_dx[threadIdx.x];
            pos.y += c_dy[threadIdx.x];

            if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
            {
                map(pos.y, pos.x) = 2;

                ind = Emulation::smem::atomicAdd(&s_counter, 1);

                s_st[ind] = pos;
            }
        }

        __syncthreads();

        while (s_counter > 0 && s_counter <= stack_size - blockDim.x)
        {
            const int subTaskIdx = threadIdx.x >> 3;
            const int portion = ::min(s_counter, blockDim.x >> 3);

            if (subTaskIdx < portion)
                pos = s_st[s_counter - 1 - subTaskIdx];

            __syncthreads();

            if (threadIdx.x == 0)
                s_counter -= portion;

            __syncthreads();

            if (subTaskIdx < portion)
            {
                pos.x += c_dx[threadIdx.x & 7];
                pos.y += c_dy[threadIdx.x & 7];

                if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1)
                {
                    map(pos.y, pos.x) = 2;

                    ind = Emulation::smem::atomicAdd(&s_counter, 1);

                    s_st[ind] = pos;
                }
            }

            __syncthreads();
        }

        if (s_counter > 0)
        {
            if (threadIdx.x == 0)
            {
                s_ind = ::atomicAdd(d_counter, s_counter);

                if (s_ind + s_counter > map.cols * map.rows)
                    s_counter = 0;
            }

            __syncthreads();

            ind = s_ind;

            for (int i = threadIdx.x; i < s_counter; i += blockDim.x)
                st2[ind + i] = s_st[i];
        }
    }

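    // Host driver: repeatedly launches the kernel, swapping the st1/st2 queues, until a pass
    // produces no new work (d_counter stays at zero).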
    void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream)
    {
        int count;
        cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
        cudaSafeCall( cudaStreamSynchronize(stream) );

        while (count > 0)
        {
            cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) );

            const dim3 block(128);
            const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1);

            edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count);
            cudaSafeCall( cudaGetLastError() );

            if (stream == NULL)
                cudaSafeCall( cudaDeviceSynchronize() );

            cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) );
            cudaSafeCall( cudaStreamSynchronize(stream) );

            count = min(count, map.cols * map.rows);

            //std::swap(st1, st2);
            short2* tmp = st1;
            st1 = st2;
            st2 = tmp;
        }
    }
}

//////////////////////////////////////////////////////////////////////////////////////////

namespace canny
{
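    // Final conversion: map values are 0 (non-edge), 1 (unconfirmed weak) or 2 (edge);
    // `-(e >> 1)` maps 0 and 1 to 0 and 2 to 0xFF, producing the 8-bit edge mask.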
    struct GetEdges : unary_function<int, uchar>
    {
        __device__ __forceinline__ uchar operator ()(int e) const
        {
            return (uchar)(-(e >> 1));
        }

        __host__ __device__ __forceinline__ GetEdges() {}
        __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {}
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges>
    {
        enum { smart_shift = 4 };
    };
}}}

namespace canny
{
    void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream)
    {
        transform(map, dst, GetEdges(), WithOutMask(), stream);
    }
}

#endif /* CUDA_DISABLER */