1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include <thrust/device_ptr.h>
#include <thrust/sort.h>
#include <thrust/system/cuda/execution_policy.h>
#include <thrust/version.h>

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/utility.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace orb
    {
        ////////////////////////////////////////////////////////////////////////////////////////////////////////
        // cull

        // Sort the keypoint index array `loc` by descending `response` (strongest
        // keypoints first) so that the best n_points can be kept by the caller.
        // `size` is the number of valid entries; returns n_points unchanged.
        // When a non-NULL stream is given (and Thrust is recent enough) the sort
        // is enqueued on that stream; otherwise it runs on the default stream.
        int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream)
        {
            thrust::device_ptr<int> loc_ptr(loc);
            thrust::device_ptr<float> response_ptr(response);
#if THRUST_VERSION >= 100800
#if THRUST_VERSION >= 100802
            // Thrust >= 1.8.2: route temporary allocations through OpenCV's allocator.
            if (stream)
            {
                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
            }
            else
            {
                thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
            }
#else
            // Thrust 1.8.0/1.8.1: execution policies exist but no custom allocator hook.
            if (stream)
            {
                thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
            }
            else
            {
                thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
            }
#endif
#else
            // Pre-1.8 Thrust: no execution policies at all; synchronous default-stream sort.
            thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
#endif
            return n_points;
        }

        ////////////////////////////////////////////////////////////////////////////////////////////////////////
        // HarrisResponses

        // Compute the Harris corner response for each keypoint.
        // Launch layout: block = (32, 8) — one warp (x) cooperates per keypoint,
        // 8 keypoints per block (y); grid.x = ceil(npoints / 8).
        // Shared memory: three 8x32 int arrays for the per-warp reductions of the
        // gradient products a = sum(Ix*Ix), b = sum(Iy*Iy), c = sum(Ix*Iy).
        __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
        {
            __shared__ int smem0[8 * 32];
            __shared__ int smem1[8 * 32];
            __shared__ int smem2[8 * 32];

            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

            if (ptidx < npoints)
            {
                const short2 loc = loc_[ptidx];

                // Top-left corner of the blockSize x blockSize patch around the keypoint.
                const int r = blockSize / 2;
                const int x0 = loc.x - r;
                const int y0 = loc.y - r;

                int a = 0, b = 0, c = 0;

                // Each lane strides over the patch pixels; gradients via 3x3 Sobel.
                for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
                {
                    const int i = ind / blockSize;
                    const int j = ind % blockSize;

                    int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
                        (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));

                    int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
                        (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
                        (img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));

                    a += Ix * Ix;
                    b += Iy * Iy;
                    c += Ix * Iy;
                }

                int* srow0 = smem0 + threadIdx.y * blockDim.x;
                int* srow1 = smem1 + threadIdx.y * blockDim.x;
                int* srow2 = smem2 + threadIdx.y * blockDim.x;

                // Warp-wide sum of (a, b, c); result lands in lane 0.
                plus<int> op;
                reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));

                if (threadIdx.x == 0)
                {
                    // Normalize so the response is independent of patch size and
                    // the 8-bit pixel range; det(M) - k * trace(M)^2.
                    float scale = (1 << 2) * blockSize * 255.0f;
                    scale = 1.0f / scale;
                    const float scale_sq_sq = scale * scale * scale * scale;

                    response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
                }
            }
        }

        // Host wrapper: launch HarrisResponses on `stream`; synchronizes when
        // stream == 0 so results are ready on return.
        void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
        {
            dim3 block(32, 8);

            dim3 grid;
            grid.x = divUp(npoints, block.y);

            HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////////////////////////////////////
        // IC_Angle

        // Per-row half-widths of the circular patch used by the intensity-centroid
        // angle computation; filled from the host before launching IC_Angle.
        __constant__ int c_u_max[32];

        // Upload `count` entries of the u_max table into constant memory.
        // Synchronous copy on the default stream.
        void loadUMax(const int* u_max, int count)
        {
            cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
        }

        // Compute the intensity-centroid orientation (degrees in [0, 360)) of each
        // keypoint over a circular patch of radius half_k.
        // Launch layout: block = (32, 8) — one warp per keypoint, 8 keypoints per
        // block; grid.x = ceil(npoints / 8).
        // Shared memory: two 8x32 int arrays for the warp reductions of the image
        // moments m_10 and m_01.
        __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
        {
            __shared__ int smem0[8 * 32];
            __shared__ int smem1[8 * 32];

            int* srow0 = smem0 + threadIdx.y * blockDim.x;
            int* srow1 = smem1 + threadIdx.y * blockDim.x;

            plus<int> op;

            const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;

            if (ptidx < npoints)
            {
                int m_01 = 0, m_10 = 0;

                const short2 loc = loc_[ptidx];

                // Treat the center line differently, v=0
                for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
                    m_10 += u * image(loc.y, loc.x + u);

                reduce<32>(srow0, m_10, threadIdx.x, op);

                for (int v = 1; v <= half_k; ++v)
                {
                    // Proceed over the two symmetric lines +v and -v at once.
                    int v_sum = 0;
                    int m_sum = 0;
                    const int d = c_u_max[v];

                    for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
                    {
                        int val_plus = image(loc.y + v, loc.x + u);
                        int val_minus = image(loc.y - v, loc.x + u);

                        v_sum += (val_plus - val_minus);
                        m_sum += u * (val_plus + val_minus);
                    }

                    reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));

                    m_10 += m_sum;
                    m_01 += v * v_sum;
                }

                if (threadIdx.x == 0)
                {
                    // atan2 of the centroid, shifted into [0, 2*pi) then degrees.
                    float kp_dir = ::atan2f((float)m_01, (float)m_10);
                    kp_dir += (kp_dir < 0) * (2.0f * CV_PI_F);
                    kp_dir *= 180.0f / CV_PI_F;

                    angle[ptidx] = kp_dir;
                }
            }
        }

        // Host wrapper: launch IC_Angle on `stream`; synchronizes when stream == 0.
        // Requires loadUMax() to have been called with at least half_k + 1 entries.
        void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
        {
            dim3 block(32, 8);

            dim3 grid;
            grid.x = divUp(npoints, block.y);

            IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////////////////////////////////////
        // computeOrbDescriptor

        // One specialization per WTA_K value: how many points are compared per
        // descriptor element and how the comparison result is packed into a byte.
        template <int WTA_K> struct OrbDescriptor;

        // Sample the image at pattern point `idx`, rotated by the keypoint angle
        // (sina/cosa) around the keypoint location `loc`.
        #define GET_VALUE(idx) \
            img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
                loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))

        // WTA_K == 2: 8 pairwise brightness tests -> 8 bits per descriptor byte.
        template <> struct OrbDescriptor<2>
        {
            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
            {
                pattern_x += 16 * i;
                pattern_y += 16 * i;

                int t0, t1, val;

                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
                val = t0 < t1;

                t0 = GET_VALUE(2); t1 = GET_VALUE(3);
                val |= (t0 < t1) << 1;

                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
                val |= (t0 < t1) << 2;

                t0 = GET_VALUE(6); t1 = GET_VALUE(7);
                val |= (t0 < t1) << 3;

                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
                val |= (t0 < t1) << 4;

                t0 = GET_VALUE(10); t1 = GET_VALUE(11);
                val |= (t0 < t1) << 5;

                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
                val |= (t0 < t1) << 6;

                t0 = GET_VALUE(14); t1 = GET_VALUE(15);
                val |= (t0 < t1) << 7;

                return val;
            }
        };

        // WTA_K == 3: 4 triplet tests, each encoding the index of the brightest
        // point (0..2) in 2 bits -> 8 bits per descriptor byte.
        template <> struct OrbDescriptor<3>
        {
            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
            {
                pattern_x += 12 * i;
                pattern_y += 12 * i;

                int t0, t1, t2, val;

                t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
                val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);

                t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;

                t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;

                t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
                val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;

                return val;
            }
        };

        // WTA_K == 4: 4 quadruplet tests, each encoding the index of the brightest
        // point (0..3) in 2 bits -> 8 bits per descriptor byte.
        template <> struct OrbDescriptor<4>
        {
            __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
            {
                pattern_x += 16 * i;
                pattern_y += 16 * i;

                int t0, t1, t2, t3, k, val;
                int a, b;

                t0 = GET_VALUE(0); t1 = GET_VALUE(1);
                t2 = GET_VALUE(2); t3 = GET_VALUE(3);
                a = 0, b = 2;
                if( t1 > t0 ) t0 = t1, a = 1;
                if( t3 > t2 ) t2 = t3, b = 3;
                k = t0 > t2 ? a : b;
                val = k;

                t0 = GET_VALUE(4); t1 = GET_VALUE(5);
                t2 = GET_VALUE(6); t3 = GET_VALUE(7);
                a = 0, b = 2;
                if( t1 > t0 ) t0 = t1, a = 1;
                if( t3 > t2 ) t2 = t3, b = 3;
                k = t0 > t2 ? a : b;
                val |= k << 2;

                t0 = GET_VALUE(8); t1 = GET_VALUE(9);
                t2 = GET_VALUE(10); t3 = GET_VALUE(11);
                a = 0, b = 2;
                if( t1 > t0 ) t0 = t1, a = 1;
                if( t3 > t2 ) t2 = t3, b = 3;
                k = t0 > t2 ? a : b;
                val |= k << 4;

                t0 = GET_VALUE(12); t1 = GET_VALUE(13);
                t2 = GET_VALUE(14); t3 = GET_VALUE(15);
                a = 0, b = 2;
                if( t1 > t0 ) t0 = t1, a = 1;
                if( t3 > t2 ) t2 = t3, b = 3;
                k = t0 > t2 ? a : b;
                val |= k << 6;

                return val;
            }
        };

        #undef GET_VALUE

        // Fill one descriptor byte per thread: thread (descidx, ptidx) computes
        // byte `descidx` of keypoint `ptidx`'s descriptor, rotating the sampling
        // pattern by the keypoint's precomputed orientation (angle_ in degrees).
        // Launch layout: 2D grid, block = (32, 8); grid.x covers dsize, grid.y
        // covers npoints.
        template <int WTA_K>
        __global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
        {
            const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
            const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;

            if (ptidx < npoints && descidx < dsize)
            {
                float angle = angle_[ptidx];
                angle *= (float)(CV_PI_F / 180.f);

                float sina, cosa;
                ::sincosf(angle, &sina, &cosa);

                desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
            }
        }

        // Host wrapper: dispatch on the runtime WTA_K value (2, 3 or 4; any other
        // value launches nothing) and launch the matching kernel instantiation on
        // `stream`; synchronizes when stream == 0.
        void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
            const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
        {
            dim3 block(32, 8);

            dim3 grid;
            grid.x = divUp(dsize, block.x);
            grid.y = divUp(npoints, block.y);

            switch (WTA_K)
            {
            case 2:
                computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
                break;

            case 3:
                computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
                break;

            case 4:
                computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
                break;
            }

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ////////////////////////////////////////////////////////////////////////////////////////////////////////
        // mergeLocation

        // Convert packed short2 keypoint coordinates to separate float x/y arrays,
        // scaling by `scale` (undoes the pyramid-level downsampling).
        // Launch layout: 1D, block = 256, one thread per keypoint.
        __global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
        {
            const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;

            if (ptidx < npoints)
            {
                short2 loc = loc_[ptidx];

                x[ptidx] = loc.x * scale;
                y[ptidx] = loc.y * scale;
            }
        }

        // Host wrapper: launch mergeLocation on `stream`; synchronizes when
        // stream == 0.
        void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
        {
            dim3 block(256);

            dim3 grid;
            grid.x = divUp(npoints, block.x);

            mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);

            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }
    }
}}}

#endif /* CUDA_DISABLER */