1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 33 // In no event shall the Intel Corporation or contributors be liable for any direct, 34 // indirect, incidental, special, exemplary, or consequential damages 35 // (including, but not limited to, procurement of substitute goods or services; 36 // loss of use, data, or profits; or business interruption) however caused 37 // and on any theory of liability, whether in contract, strict liability, 38 // or tort (including negligence or otherwise) arising in any way out of 39 // the use of this software, even if advised of the possibility of such damage. 40 // 41 //M*/ 42 43 #if !defined CUDA_DISABLER 44 45 #include "opencv2/core/cuda/common.hpp" 46 #include "opencv2/core/cuda/emulation.hpp" 47 #include "opencv2/core/cuda/transform.hpp" 48 #include "opencv2/core/cuda/functional.hpp" 49 #include "opencv2/core/cuda/utility.hpp" 50 #include "opencv2/core/cuda.hpp" 51 52 using namespace cv::cuda; 53 using namespace cv::cuda::device; 54 55 namespace canny 56 { 57 struct L1 : binary_function<int, int, float> 58 { operator ()canny::L159 __device__ __forceinline__ float operator ()(int x, int y) const 60 { 61 return ::abs(x) + ::abs(y); 62 } 63 L1canny::L164 __host__ __device__ __forceinline__ L1() {} L1canny::L165 __host__ __device__ __forceinline__ L1(const L1&) {} 66 }; 67 struct L2 : binary_function<int, int, float> 68 { operator ()canny::L269 __device__ __forceinline__ float operator ()(int x, int y) const 70 { 71 return ::sqrtf(x * x + y * y); 72 } 73 L2canny::L274 __host__ __device__ __forceinline__ L2() {} L2canny::L275 __host__ __device__ __forceinline__ L2(const L2&) {} 76 }; 77 } 78 79 namespace cv { namespace cuda { namespace device 80 { 81 template <> struct TransformFunctorTraits<canny::L1> : DefaultTransformFunctorTraits<canny::L1> 82 { 83 enum { smart_shift = 4 }; 84 }; 85 template <> struct TransformFunctorTraits<canny::L2> : DefaultTransformFunctorTraits<canny::L2> 86 { 87 enum { smart_shift = 4 }; 88 }; 89 }}} 90 91 namespace canny 92 { 93 struct SrcTex 94 { ~SrcTexcanny::SrcTex95 virtual ~SrcTex() {} 96 SrcTexcanny::SrcTex97 __host__ SrcTex(int _xoff, int _yoff) : xoff(_xoff), yoff(_yoff) {} 98 99 __device__ __forceinline__ virtual int operator ()(int y, int x) const = 0; 100 101 int xoff; 102 int yoff; 103 }; 104 105 texture<uchar, cudaTextureType2D, cudaReadModeElementType> tex_src(false, cudaFilterModePoint, cudaAddressModeClamp); 106 struct SrcTexRef : SrcTex 107 { SrcTexRefcanny::SrcTexRef108 __host__ SrcTexRef(int _xoff, int _yoff) : SrcTex(_xoff, _yoff) {} 109 operator ()canny::SrcTexRef110 __device__ __forceinline__ int operator ()(int y, int x) const override 111 { 112 return tex2D(tex_src, x + xoff, y + yoff); 113 } 114 }; 115 116 struct SrcTexObj : SrcTex 117 { SrcTexObjcanny::SrcTexObj118 __host__ SrcTexObj(int _xoff, int _yoff, cudaTextureObject_t _tex_src_object) : SrcTex(_xoff, _yoff), tex_src_object(_tex_src_object) { } 119 operator ()canny::SrcTexObj120 __device__ __forceinline__ int operator ()(int y, int x) const override 121 { 122 return tex2D<uchar>(tex_src_object, x + xoff, y + yoff); 123 } 124 125 cudaTextureObject_t tex_src_object; 126 }; 127 128 template < 129 class T, 130 class Norm, 131 typename = typename std::enable_if<std::is_base_of<SrcTex, T>::value>::type 132 > calcMagnitudeKernel(const T src,PtrStepi dx,PtrStepi dy,PtrStepSzf mag,const Norm norm)133 __global__ void calcMagnitudeKernel(const T src, PtrStepi dx, PtrStepi dy, PtrStepSzf mag, const Norm norm) 134 { 135 const int x = blockIdx.x * blockDim.x + threadIdx.x; 136 const int y = blockIdx.y * blockDim.y + threadIdx.y; 137 138 if (y >= mag.rows || x >= mag.cols) 139 return; 140 141 int dxVal = (src(y - 1, x + 1) + 2 * src(y, x + 1) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y, x - 1) + src(y + 1, x - 1)); 142 int dyVal = (src(y + 1, x - 1) + 2 * src(y + 1, x) + src(y + 1, x + 1)) - (src(y - 1, x - 1) + 2 * src(y - 1, x) + src(y - 1, x + 1)); 143 144 dx(y, x) = dxVal; 145 dy(y, x) = dyVal; 146 147 mag(y, x) = norm(dxVal, dyVal); 148 } 149 calcMagnitude(PtrStepSzb srcWhole,int xoff,int yoff,PtrStepSzi dx,PtrStepSzi dy,PtrStepSzf mag,bool L2Grad,cudaStream_t stream)150 void calcMagnitude(PtrStepSzb srcWhole, int xoff, int yoff, PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) 151 { 152 const dim3 block(16, 16); 153 const dim3 grid(divUp(mag.cols, block.x), divUp(mag.rows, block.y)); 154 155 bool cc30 = deviceSupports(FEATURE_SET_COMPUTE_30); 156 157 if (cc30) 158 { 159 cudaTextureDesc texDesc; 160 memset(&texDesc, 0, sizeof(texDesc)); 161 texDesc.addressMode[0] = cudaAddressModeClamp; 162 texDesc.addressMode[1] = cudaAddressModeClamp; 163 texDesc.addressMode[2] = cudaAddressModeClamp; 164 165 cudaTextureObject_t tex = 0; 166 createTextureObjectPitch2D(&tex, srcWhole, texDesc); 167 168 SrcTexObj src(xoff, yoff, tex); 169 170 if (L2Grad) 171 { 172 L2 norm; 173 calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm); 174 } 175 else 176 { 177 L1 norm; 178 calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm); 179 } 180 181 cudaSafeCall( cudaGetLastError() ); 182 183 if (stream == NULL) 184 cudaSafeCall( cudaDeviceSynchronize() ); 185 else 186 cudaSafeCall( cudaStreamSynchronize(stream) ); 187 188 cudaSafeCall( cudaDestroyTextureObject(tex) ); 189 } 190 else 191 { 192 bindTexture(&tex_src, srcWhole); 193 SrcTexRef src(xoff, yoff); 194 195 if (L2Grad) 196 { 197 L2 norm; 198 calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm); 199 } 200 else 201 { 202 L1 norm; 203 calcMagnitudeKernel<<<grid, block, 0, stream>>>(src, dx, dy, mag, norm); 204 } 205 206 cudaSafeCall( cudaGetLastError() ); 207 208 if (stream == NULL) 209 cudaSafeCall( cudaDeviceSynchronize() ); 210 } 211 } 212 calcMagnitude(PtrStepSzi dx,PtrStepSzi dy,PtrStepSzf mag,bool L2Grad,cudaStream_t stream)213 void calcMagnitude(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, bool L2Grad, cudaStream_t stream) 214 { 215 if (L2Grad) 216 { 217 L2 norm; 218 transform(dx, dy, mag, norm, WithOutMask(), stream); 219 } 220 else 221 { 222 L1 norm; 223 transform(dx, dy, mag, norm, WithOutMask(), stream); 224 } 225 } 226 } 227 228 ////////////////////////////////////////////////////////////////////////////////////////// 229 230 namespace canny 231 { 232 texture<float, cudaTextureType2D, cudaReadModeElementType> tex_mag(false, cudaFilterModePoint, cudaAddressModeClamp); calcMapKernel(const PtrStepSzi dx,const PtrStepi dy,PtrStepi map,const float low_thresh,const float high_thresh)233 __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh) 234 { 235 const int CANNY_SHIFT = 15; 236 const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); 237 238 const int x = blockIdx.x * blockDim.x + threadIdx.x; 239 const int y = blockIdx.y * blockDim.y + threadIdx.y; 240 241 if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1) 242 return; 243 244 int dxVal = dx(y, x); 245 int dyVal = dy(y, x); 246 247 const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; 248 const float m = tex2D(tex_mag, x, y); 249 250 dxVal = ::abs(dxVal); 251 dyVal = ::abs(dyVal); 252 253 // 0 - the pixel can not belong to an edge 254 // 1 - the pixel might belong to an edge 255 // 2 - the pixel does belong to an edge 256 int edge_type = 0; 257 258 if (m > low_thresh) 259 { 260 const int tg22x = dxVal * TG22; 261 const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); 262 263 dyVal <<= CANNY_SHIFT; 264 265 if (dyVal < tg22x) 266 { 267 if (m > tex2D(tex_mag, x - 1, y) && m >= tex2D(tex_mag, x + 1, y)) 268 edge_type = 1 + (int)(m > high_thresh); 269 } 270 else if(dyVal > tg67x) 271 { 272 if (m > tex2D(tex_mag, x, y - 1) && m >= tex2D(tex_mag, x, y + 1)) 273 edge_type = 1 + (int)(m > high_thresh); 274 } 275 else 276 { 277 if (m > tex2D(tex_mag, x - s, y - 1) && m >= tex2D(tex_mag, x + s, y + 1)) 278 edge_type = 1 + (int)(m > high_thresh); 279 } 280 } 281 282 map(y, x) = edge_type; 283 } 284 calcMapKernel(const PtrStepSzi dx,const PtrStepi dy,PtrStepi map,const float low_thresh,const float high_thresh,cudaTextureObject_t tex_mag)285 __global__ void calcMapKernel(const PtrStepSzi dx, const PtrStepi dy, PtrStepi map, const float low_thresh, const float high_thresh, cudaTextureObject_t tex_mag) 286 { 287 const int CANNY_SHIFT = 15; 288 const int TG22 = (int)(0.4142135623730950488016887242097*(1<<CANNY_SHIFT) + 0.5); 289 290 const int x = blockIdx.x * blockDim.x + threadIdx.x; 291 const int y = blockIdx.y * blockDim.y + threadIdx.y; 292 293 if (x == 0 || x >= dx.cols - 1 || y == 0 || y >= dx.rows - 1) 294 return; 295 296 int dxVal = dx(y, x); 297 int dyVal = dy(y, x); 298 299 const int s = (dxVal ^ dyVal) < 0 ? -1 : 1; 300 const float m = tex2D<float>(tex_mag, x, y); 301 302 dxVal = ::abs(dxVal); 303 dyVal = ::abs(dyVal); 304 305 // 0 - the pixel can not belong to an edge 306 // 1 - the pixel might belong to an edge 307 // 2 - the pixel does belong to an edge 308 int edge_type = 0; 309 310 if (m > low_thresh) 311 { 312 const int tg22x = dxVal * TG22; 313 const int tg67x = tg22x + ((dxVal + dxVal) << CANNY_SHIFT); 314 315 dyVal <<= CANNY_SHIFT; 316 317 if (dyVal < tg22x) 318 { 319 if (m > tex2D<float>(tex_mag, x - 1, y) && m >= tex2D<float>(tex_mag, x + 1, y)) 320 edge_type = 1 + (int)(m > high_thresh); 321 } 322 else if(dyVal > tg67x) 323 { 324 if (m > tex2D<float>(tex_mag, x, y - 1) && m >= tex2D<float>(tex_mag, x, y + 1)) 325 edge_type = 1 + (int)(m > high_thresh); 326 } 327 else 328 { 329 if (m > tex2D<float>(tex_mag, x - s, y - 1) && m >= tex2D<float>(tex_mag, x + s, y + 1)) 330 edge_type = 1 + (int)(m > high_thresh); 331 } 332 } 333 334 map(y, x) = edge_type; 335 } 336 calcMap(PtrStepSzi dx,PtrStepSzi dy,PtrStepSzf mag,PtrStepSzi map,float low_thresh,float high_thresh,cudaStream_t stream)337 void calcMap(PtrStepSzi dx, PtrStepSzi dy, PtrStepSzf mag, PtrStepSzi map, float low_thresh, float high_thresh, cudaStream_t stream) 338 { 339 const dim3 block(16, 16); 340 const dim3 grid(divUp(dx.cols, block.x), divUp(dx.rows, block.y)); 341 342 if (deviceSupports(FEATURE_SET_COMPUTE_30)) 343 { 344 // Use the texture object 345 cudaResourceDesc resDesc; 346 memset(&resDesc, 0, sizeof(resDesc)); 347 resDesc.resType = cudaResourceTypePitch2D; 348 resDesc.res.pitch2D.devPtr = mag.ptr(); 349 resDesc.res.pitch2D.height = mag.rows; 350 resDesc.res.pitch2D.width = mag.cols; 351 resDesc.res.pitch2D.pitchInBytes = mag.step; 352 resDesc.res.pitch2D.desc = cudaCreateChannelDesc<float>(); 353 354 cudaTextureDesc texDesc; 355 memset(&texDesc, 0, sizeof(texDesc)); 356 texDesc.addressMode[0] = cudaAddressModeClamp; 357 texDesc.addressMode[1] = cudaAddressModeClamp; 358 texDesc.addressMode[2] = cudaAddressModeClamp; 359 360 cudaTextureObject_t tex=0; 361 cudaCreateTextureObject(&tex, &resDesc, &texDesc, NULL); 362 calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh, tex); 363 cudaSafeCall( cudaGetLastError() ); 364 365 if (stream == NULL) 366 cudaSafeCall( cudaDeviceSynchronize() ); 367 else 368 cudaSafeCall( cudaStreamSynchronize(stream) ); 369 370 cudaSafeCall( cudaDestroyTextureObject(tex) ); 371 } 372 else 373 { 374 // Use the texture reference 375 bindTexture(&tex_mag, mag); 376 calcMapKernel<<<grid, block, 0, stream>>>(dx, dy, map, low_thresh, high_thresh); 377 cudaSafeCall( cudaGetLastError() ); 378 379 if (stream == NULL) 380 cudaSafeCall( cudaDeviceSynchronize() ); 381 } 382 } 383 } 384 385 ////////////////////////////////////////////////////////////////////////////////////////// 386 387 namespace canny 388 { checkIdx(int y,int x,int rows,int cols)389 __device__ __forceinline__ bool checkIdx(int y, int x, int rows, int cols) 390 { 391 return (y >= 0) && (y < rows) && (x >= 0) && (x < cols); 392 } 393 edgesHysteresisLocalKernel(PtrStepSzi map,short2 * st,int * d_counter)394 __global__ void edgesHysteresisLocalKernel(PtrStepSzi map, short2* st, int* d_counter) 395 { 396 __shared__ volatile int smem[18][18]; 397 398 const int x = blockIdx.x * blockDim.x + threadIdx.x; 399 const int y = blockIdx.y * blockDim.y + threadIdx.y; 400 401 smem[threadIdx.y + 1][threadIdx.x + 1] = checkIdx(y, x, map.rows, map.cols) ? map(y, x) : 0; 402 if (threadIdx.y == 0) 403 smem[0][threadIdx.x + 1] = checkIdx(y - 1, x, map.rows, map.cols) ? map(y - 1, x) : 0; 404 if (threadIdx.y == blockDim.y - 1) 405 smem[blockDim.y + 1][threadIdx.x + 1] = checkIdx(y + 1, x, map.rows, map.cols) ? map(y + 1, x) : 0; 406 if (threadIdx.x == 0) 407 smem[threadIdx.y + 1][0] = checkIdx(y, x - 1, map.rows, map.cols) ? map(y, x - 1) : 0; 408 if (threadIdx.x == blockDim.x - 1) 409 smem[threadIdx.y + 1][blockDim.x + 1] = checkIdx(y, x + 1, map.rows, map.cols) ? map(y, x + 1) : 0; 410 if (threadIdx.x == 0 && threadIdx.y == 0) 411 smem[0][0] = checkIdx(y - 1, x - 1, map.rows, map.cols) ? map(y - 1, x - 1) : 0; 412 if (threadIdx.x == blockDim.x - 1 && threadIdx.y == 0) 413 smem[0][blockDim.x + 1] = checkIdx(y - 1, x + 1, map.rows, map.cols) ? map(y - 1, x + 1) : 0; 414 if (threadIdx.x == 0 && threadIdx.y == blockDim.y - 1) 415 smem[blockDim.y + 1][0] = checkIdx(y + 1, x - 1, map.rows, map.cols) ? map(y + 1, x - 1) : 0; 416 if (threadIdx.x == blockDim.x - 1 && threadIdx.y == blockDim.y - 1) 417 smem[blockDim.y + 1][blockDim.x + 1] = checkIdx(y + 1, x + 1, map.rows, map.cols) ? map(y + 1, x + 1) : 0; 418 419 __syncthreads(); 420 421 if (x >= map.cols || y >= map.rows) 422 return; 423 424 int n; 425 426 #pragma unroll 427 for (int k = 0; k < 16; ++k) 428 { 429 n = 0; 430 431 if (smem[threadIdx.y + 1][threadIdx.x + 1] == 1) 432 { 433 n += smem[threadIdx.y ][threadIdx.x ] == 2; 434 n += smem[threadIdx.y ][threadIdx.x + 1] == 2; 435 n += smem[threadIdx.y ][threadIdx.x + 2] == 2; 436 437 n += smem[threadIdx.y + 1][threadIdx.x ] == 2; 438 n += smem[threadIdx.y + 1][threadIdx.x + 2] == 2; 439 440 n += smem[threadIdx.y + 2][threadIdx.x ] == 2; 441 n += smem[threadIdx.y + 2][threadIdx.x + 1] == 2; 442 n += smem[threadIdx.y + 2][threadIdx.x + 2] == 2; 443 } 444 445 __syncthreads(); 446 447 if (n > 0) 448 smem[threadIdx.y + 1][threadIdx.x + 1] = 2; 449 450 __syncthreads(); 451 } 452 453 const int e = smem[threadIdx.y + 1][threadIdx.x + 1]; 454 455 map(y, x) = e; 456 457 n = 0; 458 459 if (e == 2) 460 { 461 n += smem[threadIdx.y ][threadIdx.x ] == 1; 462 n += smem[threadIdx.y ][threadIdx.x + 1] == 1; 463 n += smem[threadIdx.y ][threadIdx.x + 2] == 1; 464 465 n += smem[threadIdx.y + 1][threadIdx.x ] == 1; 466 n += smem[threadIdx.y + 1][threadIdx.x + 2] == 1; 467 468 n += smem[threadIdx.y + 2][threadIdx.x ] == 1; 469 n += smem[threadIdx.y + 2][threadIdx.x + 1] == 1; 470 n += smem[threadIdx.y + 2][threadIdx.x + 2] == 1; 471 } 472 473 if (n > 0) 474 { 475 const int ind = ::atomicAdd(d_counter, 1); 476 st[ind] = make_short2(x, y); 477 } 478 } 479 edgesHysteresisLocal(PtrStepSzi map,short2 * st1,int * d_counter,cudaStream_t stream)480 void edgesHysteresisLocal(PtrStepSzi map, short2* st1, int* d_counter, cudaStream_t stream) 481 { 482 cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) ); 483 484 const dim3 block(16, 16); 485 const dim3 grid(divUp(map.cols, block.x), divUp(map.rows, block.y)); 486 487 edgesHysteresisLocalKernel<<<grid, block, 0, stream>>>(map, st1, d_counter); 488 cudaSafeCall( cudaGetLastError() ); 489 490 if (stream == NULL) 491 cudaSafeCall( cudaDeviceSynchronize() ); 492 } 493 } 494 495 ////////////////////////////////////////////////////////////////////////////////////////// 496 497 namespace canny 498 { 499 __constant__ int c_dx[8] = {-1, 0, 1, -1, 1, -1, 0, 1}; 500 __constant__ int c_dy[8] = {-1, -1, -1, 0, 0, 1, 1, 1}; 501 edgesHysteresisGlobalKernel(PtrStepSzi map,short2 * st1,short2 * st2,int * d_counter,const int count)502 __global__ void edgesHysteresisGlobalKernel(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, const int count) 503 { 504 const int stack_size = 512; 505 506 __shared__ int s_counter; 507 __shared__ int s_ind; 508 __shared__ short2 s_st[stack_size]; 509 510 if (threadIdx.x == 0) 511 s_counter = 0; 512 513 __syncthreads(); 514 515 int ind = blockIdx.y * gridDim.x + blockIdx.x; 516 517 if (ind >= count) 518 return; 519 520 short2 pos = st1[ind]; 521 522 if (threadIdx.x < 8) 523 { 524 pos.x += c_dx[threadIdx.x]; 525 pos.y += c_dy[threadIdx.x]; 526 527 if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1) 528 { 529 map(pos.y, pos.x) = 2; 530 531 ind = Emulation::smem::atomicAdd(&s_counter, 1); 532 533 s_st[ind] = pos; 534 } 535 } 536 537 __syncthreads(); 538 539 while (s_counter > 0 && s_counter <= stack_size - blockDim.x) 540 { 541 const int subTaskIdx = threadIdx.x >> 3; 542 const int portion = ::min(s_counter, blockDim.x >> 3); 543 544 if (subTaskIdx < portion) 545 pos = s_st[s_counter - 1 - subTaskIdx]; 546 547 __syncthreads(); 548 549 if (threadIdx.x == 0) 550 s_counter -= portion; 551 552 __syncthreads(); 553 554 if (subTaskIdx < portion) 555 { 556 pos.x += c_dx[threadIdx.x & 7]; 557 pos.y += c_dy[threadIdx.x & 7]; 558 559 if (pos.x > 0 && pos.x < map.cols - 1 && pos.y > 0 && pos.y < map.rows - 1 && map(pos.y, pos.x) == 1) 560 { 561 map(pos.y, pos.x) = 2; 562 563 ind = Emulation::smem::atomicAdd(&s_counter, 1); 564 565 s_st[ind] = pos; 566 } 567 } 568 569 __syncthreads(); 570 } 571 572 if (s_counter > 0) 573 { 574 if (threadIdx.x == 0) 575 { 576 s_ind = ::atomicAdd(d_counter, s_counter); 577 578 if (s_ind + s_counter > map.cols * map.rows) 579 s_counter = 0; 580 } 581 582 __syncthreads(); 583 584 ind = s_ind; 585 586 for (int i = threadIdx.x; i < s_counter; i += blockDim.x) 587 st2[ind + i] = s_st[i]; 588 } 589 } 590 edgesHysteresisGlobal(PtrStepSzi map,short2 * st1,short2 * st2,int * d_counter,cudaStream_t stream)591 void edgesHysteresisGlobal(PtrStepSzi map, short2* st1, short2* st2, int* d_counter, cudaStream_t stream) 592 { 593 int count; 594 cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) ); 595 cudaSafeCall( cudaStreamSynchronize(stream) ); 596 597 while (count > 0) 598 { 599 cudaSafeCall( cudaMemsetAsync(d_counter, 0, sizeof(int), stream) ); 600 601 const dim3 block(128); 602 const dim3 grid(::min(count, 65535u), divUp(count, 65535), 1); 603 604 edgesHysteresisGlobalKernel<<<grid, block, 0, stream>>>(map, st1, st2, d_counter, count); 605 cudaSafeCall( cudaGetLastError() ); 606 607 if (stream == NULL) 608 cudaSafeCall( cudaDeviceSynchronize() ); 609 610 cudaSafeCall( cudaMemcpyAsync(&count, d_counter, sizeof(int), cudaMemcpyDeviceToHost, stream) ); 611 cudaSafeCall( cudaStreamSynchronize(stream) ); 612 613 count = min(count, map.cols * map.rows); 614 615 //std::swap(st1, st2); 616 short2* tmp = st1; 617 st1 = st2; 618 st2 = tmp; 619 } 620 } 621 } 622 623 ////////////////////////////////////////////////////////////////////////////////////////// 624 625 namespace canny 626 { 627 struct GetEdges : unary_function<int, uchar> 628 { operator ()canny::GetEdges629 __device__ __forceinline__ uchar operator ()(int e) const 630 { 631 return (uchar)(-(e >> 1)); 632 } 633 GetEdgescanny::GetEdges634 __host__ __device__ __forceinline__ GetEdges() {} GetEdgescanny::GetEdges635 __host__ __device__ __forceinline__ GetEdges(const GetEdges&) {} 636 }; 637 } 638 639 namespace cv { namespace cuda { namespace device 640 { 641 template <> struct TransformFunctorTraits<canny::GetEdges> : DefaultTransformFunctorTraits<canny::GetEdges> 642 { 643 enum { smart_shift = 4 }; 644 }; 645 }}} 646 647 namespace canny 648 { getEdges(PtrStepSzi map,PtrStepSzb dst,cudaStream_t stream)649 void getEdges(PtrStepSzi map, PtrStepSzb dst, cudaStream_t stream) 650 { 651 transform(map, dst, GetEdges(), WithOutMask(), stream); 652 } 653 } 654 655 #endif /* CUDA_DISABLER */ 656