1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include <thrust/device_ptr.h>
46 #include <thrust/sort.h>
47 #include <thrust/system/cuda/execution_policy.h>
48 #include <thrust/version.h>
49 
50 
51 #include "opencv2/core/cuda/common.hpp"
52 #include "opencv2/core/cuda/reduce.hpp"
53 #include "opencv2/core/cuda/functional.hpp"
54 #include "opencv2/core/cuda/utility.hpp"
55 namespace cv { namespace cuda { namespace device
56 {
57     namespace orb
58     {
59         ////////////////////////////////////////////////////////////////////////////////////////////////////////
60         // cull
61 
cull_gpu(int * loc,float * response,int size,int n_points,cudaStream_t stream)62         int cull_gpu(int* loc, float* response, int size, int n_points, cudaStream_t stream)
63         {
64             thrust::device_ptr<int> loc_ptr(loc);
65             thrust::device_ptr<float> response_ptr(response);
66 #if THRUST_VERSION >= 100800
67 #if THRUST_VERSION >= 100802
68             if (stream)
69             {
70                 thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()).on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
71             }
72             else
73             {
74                 thrust::sort_by_key(thrust::cuda::par(ThrustAllocator::getAllocator()), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
75             }
76 #else
77             if(stream)
78             {
79                 thrust::sort_by_key(thrust::cuda::par.on(stream), response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
80             }else
81             {
82                 thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
83             }
84 #endif
85 #else
86             thrust::sort_by_key(response_ptr, response_ptr + size, loc_ptr, thrust::greater<float>());
87 #endif
88             return n_points;
89         }
90 
91         ////////////////////////////////////////////////////////////////////////////////////////////////////////
92         // HarrisResponses
93 
HarrisResponses(const PtrStepb img,const short2 * loc_,float * response,const int npoints,const int blockSize,const float harris_k)94         __global__ void HarrisResponses(const PtrStepb img, const short2* loc_, float* response, const int npoints, const int blockSize, const float harris_k)
95         {
96             __shared__ int smem0[8 * 32];
97             __shared__ int smem1[8 * 32];
98             __shared__ int smem2[8 * 32];
99 
100             const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
101 
102             if (ptidx < npoints)
103             {
104                 const short2 loc = loc_[ptidx];
105 
106                 const int r = blockSize / 2;
107                 const int x0 = loc.x - r;
108                 const int y0 = loc.y - r;
109 
110                 int a = 0, b = 0, c = 0;
111 
112                 for (int ind = threadIdx.x; ind < blockSize * blockSize; ind += blockDim.x)
113                 {
114                     const int i = ind / blockSize;
115                     const int j = ind % blockSize;
116 
117                     int Ix = (img(y0 + i, x0 + j + 1) - img(y0 + i, x0 + j - 1)) * 2 +
118                         (img(y0 + i - 1, x0 + j + 1) - img(y0 + i - 1, x0 + j - 1)) +
119                         (img(y0 + i + 1, x0 + j + 1) - img(y0 + i + 1, x0 + j - 1));
120 
121                     int Iy = (img(y0 + i + 1, x0 + j) - img(y0 + i - 1, x0 + j)) * 2 +
122                         (img(y0 + i + 1, x0 + j - 1) - img(y0 + i - 1, x0 + j - 1)) +
123                         (img(y0 + i + 1, x0 + j + 1) - img(y0 + i - 1, x0 + j + 1));
124 
125                     a += Ix * Ix;
126                     b += Iy * Iy;
127                     c += Ix * Iy;
128                 }
129 
130                 int* srow0 = smem0 + threadIdx.y * blockDim.x;
131                 int* srow1 = smem1 + threadIdx.y * blockDim.x;
132                 int* srow2 = smem2 + threadIdx.y * blockDim.x;
133 
134                 plus<int> op;
135                 reduce<32>(smem_tuple(srow0, srow1, srow2), thrust::tie(a, b, c), threadIdx.x, thrust::make_tuple(op, op, op));
136 
137                 if (threadIdx.x == 0)
138                 {
139                     float scale = (1 << 2) * blockSize * 255.0f;
140                     scale = 1.0f / scale;
141                     const float scale_sq_sq = scale * scale * scale * scale;
142 
143                     response[ptidx] = ((float)a * b - (float)c * c - harris_k * ((float)a + b) * ((float)a + b)) * scale_sq_sq;
144                 }
145             }
146         }
147 
HarrisResponses_gpu(PtrStepSzb img,const short2 * loc,float * response,const int npoints,int blockSize,float harris_k,cudaStream_t stream)148         void HarrisResponses_gpu(PtrStepSzb img, const short2* loc, float* response, const int npoints, int blockSize, float harris_k, cudaStream_t stream)
149         {
150             dim3 block(32, 8);
151 
152             dim3 grid;
153             grid.x = divUp(npoints, block.y);
154 
155             HarrisResponses<<<grid, block, 0, stream>>>(img, loc, response, npoints, blockSize, harris_k);
156 
157             cudaSafeCall( cudaGetLastError() );
158 
159             if (stream == 0)
160                 cudaSafeCall( cudaDeviceSynchronize() );
161         }
162 
163         ////////////////////////////////////////////////////////////////////////////////////////////////////////
164         // IC_Angle
165 
166         __constant__ int c_u_max[32];
167 
loadUMax(const int * u_max,int count)168         void loadUMax(const int* u_max, int count)
169         {
170             cudaSafeCall( cudaMemcpyToSymbol(c_u_max, u_max, count * sizeof(int)) );
171         }
172 
IC_Angle(const PtrStepb image,const short2 * loc_,float * angle,const int npoints,const int half_k)173         __global__ void IC_Angle(const PtrStepb image, const short2* loc_, float* angle, const int npoints, const int half_k)
174         {
175             __shared__ int smem0[8 * 32];
176             __shared__ int smem1[8 * 32];
177 
178             int* srow0 = smem0 + threadIdx.y * blockDim.x;
179             int* srow1 = smem1 + threadIdx.y * blockDim.x;
180 
181             plus<int> op;
182 
183             const int ptidx = blockIdx.x * blockDim.y + threadIdx.y;
184 
185             if (ptidx < npoints)
186             {
187                 int m_01 = 0, m_10 = 0;
188 
189                 const short2 loc = loc_[ptidx];
190 
191                 // Treat the center line differently, v=0
192                 for (int u = threadIdx.x - half_k; u <= half_k; u += blockDim.x)
193                     m_10 += u * image(loc.y, loc.x + u);
194 
195                 reduce<32>(srow0, m_10, threadIdx.x, op);
196 
197                 for (int v = 1; v <= half_k; ++v)
198                 {
199                     // Proceed over the two lines
200                     int v_sum = 0;
201                     int m_sum = 0;
202                     const int d = c_u_max[v];
203 
204                     for (int u = threadIdx.x - d; u <= d; u += blockDim.x)
205                     {
206                         int val_plus = image(loc.y + v, loc.x + u);
207                         int val_minus = image(loc.y - v, loc.x + u);
208 
209                         v_sum += (val_plus - val_minus);
210                         m_sum += u * (val_plus + val_minus);
211                     }
212 
213                     reduce<32>(smem_tuple(srow0, srow1), thrust::tie(v_sum, m_sum), threadIdx.x, thrust::make_tuple(op, op));
214 
215                     m_10 += m_sum;
216                     m_01 += v * v_sum;
217                 }
218 
219                 if (threadIdx.x == 0)
220                 {
221                     float kp_dir = ::atan2f((float)m_01, (float)m_10);
222                     kp_dir += (kp_dir < 0) * (2.0f * CV_PI_F);
223                     kp_dir *= 180.0f / CV_PI_F;
224 
225                     angle[ptidx] = kp_dir;
226                 }
227             }
228         }
229 
IC_Angle_gpu(PtrStepSzb image,const short2 * loc,float * angle,int npoints,int half_k,cudaStream_t stream)230         void IC_Angle_gpu(PtrStepSzb image, const short2* loc, float* angle, int npoints, int half_k, cudaStream_t stream)
231         {
232             dim3 block(32, 8);
233 
234             dim3 grid;
235             grid.x = divUp(npoints, block.y);
236 
237             IC_Angle<<<grid, block, 0, stream>>>(image, loc, angle, npoints, half_k);
238 
239             cudaSafeCall( cudaGetLastError() );
240 
241             if (stream == 0)
242                 cudaSafeCall( cudaDeviceSynchronize() );
243         }
244 
245         ////////////////////////////////////////////////////////////////////////////////////////////////////////
246         // computeOrbDescriptor
247 
248         template <int WTA_K> struct OrbDescriptor;
249 
250         #define GET_VALUE(idx) \
251             img(loc.y + __float2int_rn(pattern_x[idx] * sina + pattern_y[idx] * cosa), \
252                 loc.x + __float2int_rn(pattern_x[idx] * cosa - pattern_y[idx] * sina))
253 
254         template <> struct OrbDescriptor<2>
255         {
calccv::cuda::device::orb::OrbDescriptor256             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
257             {
258                 pattern_x += 16 * i;
259                 pattern_y += 16 * i;
260 
261                 int t0, t1, val;
262 
263                 t0 = GET_VALUE(0); t1 = GET_VALUE(1);
264                 val = t0 < t1;
265 
266                 t0 = GET_VALUE(2); t1 = GET_VALUE(3);
267                 val |= (t0 < t1) << 1;
268 
269                 t0 = GET_VALUE(4); t1 = GET_VALUE(5);
270                 val |= (t0 < t1) << 2;
271 
272                 t0 = GET_VALUE(6); t1 = GET_VALUE(7);
273                 val |= (t0 < t1) << 3;
274 
275                 t0 = GET_VALUE(8); t1 = GET_VALUE(9);
276                 val |= (t0 < t1) << 4;
277 
278                 t0 = GET_VALUE(10); t1 = GET_VALUE(11);
279                 val |= (t0 < t1) << 5;
280 
281                 t0 = GET_VALUE(12); t1 = GET_VALUE(13);
282                 val |= (t0 < t1) << 6;
283 
284                 t0 = GET_VALUE(14); t1 = GET_VALUE(15);
285                 val |= (t0 < t1) << 7;
286 
287                 return val;
288             }
289         };
290 
291         template <> struct OrbDescriptor<3>
292         {
calccv::cuda::device::orb::OrbDescriptor293             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
294             {
295                 pattern_x += 12 * i;
296                 pattern_y += 12 * i;
297 
298                 int t0, t1, t2, val;
299 
300                 t0 = GET_VALUE(0); t1 = GET_VALUE(1); t2 = GET_VALUE(2);
301                 val = t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0);
302 
303                 t0 = GET_VALUE(3); t1 = GET_VALUE(4); t2 = GET_VALUE(5);
304                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 2;
305 
306                 t0 = GET_VALUE(6); t1 = GET_VALUE(7); t2 = GET_VALUE(8);
307                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 4;
308 
309                 t0 = GET_VALUE(9); t1 = GET_VALUE(10); t2 = GET_VALUE(11);
310                 val |= (t2 > t1 ? (t2 > t0 ? 2 : 0) : (t1 > t0)) << 6;
311 
312                 return val;
313             }
314         };
315 
316         template <> struct OrbDescriptor<4>
317         {
calccv::cuda::device::orb::OrbDescriptor318             __device__ static int calc(const PtrStepb& img, short2 loc, const int* pattern_x, const int* pattern_y, float sina, float cosa, int i)
319             {
320                 pattern_x += 16 * i;
321                 pattern_y += 16 * i;
322 
323                 int t0, t1, t2, t3, k, val;
324                 int a, b;
325 
326                 t0 = GET_VALUE(0); t1 = GET_VALUE(1);
327                 t2 = GET_VALUE(2); t3 = GET_VALUE(3);
328                 a = 0, b = 2;
329                 if( t1 > t0 ) t0 = t1, a = 1;
330                 if( t3 > t2 ) t2 = t3, b = 3;
331                 k = t0 > t2 ? a : b;
332                 val = k;
333 
334                 t0 = GET_VALUE(4); t1 = GET_VALUE(5);
335                 t2 = GET_VALUE(6); t3 = GET_VALUE(7);
336                 a = 0, b = 2;
337                 if( t1 > t0 ) t0 = t1, a = 1;
338                 if( t3 > t2 ) t2 = t3, b = 3;
339                 k = t0 > t2 ? a : b;
340                 val |= k << 2;
341 
342                 t0 = GET_VALUE(8); t1 = GET_VALUE(9);
343                 t2 = GET_VALUE(10); t3 = GET_VALUE(11);
344                 a = 0, b = 2;
345                 if( t1 > t0 ) t0 = t1, a = 1;
346                 if( t3 > t2 ) t2 = t3, b = 3;
347                 k = t0 > t2 ? a : b;
348                 val |= k << 4;
349 
350                 t0 = GET_VALUE(12); t1 = GET_VALUE(13);
351                 t2 = GET_VALUE(14); t3 = GET_VALUE(15);
352                 a = 0, b = 2;
353                 if( t1 > t0 ) t0 = t1, a = 1;
354                 if( t3 > t2 ) t2 = t3, b = 3;
355                 k = t0 > t2 ? a : b;
356                 val |= k << 6;
357 
358                 return val;
359             }
360         };
361 
362         #undef GET_VALUE
363 
364         template <int WTA_K>
computeOrbDescriptor(const PtrStepb img,const short2 * loc,const float * angle_,const int npoints,const int * pattern_x,const int * pattern_y,PtrStepb desc,int dsize)365         __global__ void computeOrbDescriptor(const PtrStepb img, const short2* loc, const float* angle_, const int npoints,
366             const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize)
367         {
368             const int descidx = blockIdx.x * blockDim.x + threadIdx.x;
369             const int ptidx = blockIdx.y * blockDim.y + threadIdx.y;
370 
371             if (ptidx < npoints && descidx < dsize)
372             {
373                 float angle = angle_[ptidx];
374                 angle *= (float)(CV_PI_F / 180.f);
375 
376                 float sina, cosa;
377                 ::sincosf(angle, &sina, &cosa);
378 
379                 desc.ptr(ptidx)[descidx] = OrbDescriptor<WTA_K>::calc(img, loc[ptidx], pattern_x, pattern_y, sina, cosa, descidx);
380             }
381         }
382 
computeOrbDescriptor_gpu(PtrStepb img,const short2 * loc,const float * angle,const int npoints,const int * pattern_x,const int * pattern_y,PtrStepb desc,int dsize,int WTA_K,cudaStream_t stream)383         void computeOrbDescriptor_gpu(PtrStepb img, const short2* loc, const float* angle, const int npoints,
384             const int* pattern_x, const int* pattern_y, PtrStepb desc, int dsize, int WTA_K, cudaStream_t stream)
385         {
386             dim3 block(32, 8);
387 
388             dim3 grid;
389             grid.x = divUp(dsize, block.x);
390             grid.y = divUp(npoints, block.y);
391 
392             switch (WTA_K)
393             {
394             case 2:
395                 computeOrbDescriptor<2><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
396                 break;
397 
398             case 3:
399                 computeOrbDescriptor<3><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
400                 break;
401 
402             case 4:
403                 computeOrbDescriptor<4><<<grid, block, 0, stream>>>(img, loc, angle, npoints, pattern_x, pattern_y, desc, dsize);
404                 break;
405             }
406 
407             cudaSafeCall( cudaGetLastError() );
408 
409             if (stream == 0)
410                 cudaSafeCall( cudaDeviceSynchronize() );
411         }
412 
413         ////////////////////////////////////////////////////////////////////////////////////////////////////////
414         // mergeLocation
415 
mergeLocation(const short2 * loc_,float * x,float * y,const int npoints,float scale)416         __global__ void mergeLocation(const short2* loc_, float* x, float* y, const int npoints, float scale)
417         {
418             const int ptidx = blockIdx.x * blockDim.x + threadIdx.x;
419 
420             if (ptidx < npoints)
421             {
422                 short2 loc = loc_[ptidx];
423 
424                 x[ptidx] = loc.x * scale;
425                 y[ptidx] = loc.y * scale;
426             }
427         }
428 
mergeLocation_gpu(const short2 * loc,float * x,float * y,int npoints,float scale,cudaStream_t stream)429         void mergeLocation_gpu(const short2* loc, float* x, float* y, int npoints, float scale, cudaStream_t stream)
430         {
431             dim3 block(256);
432 
433             dim3 grid;
434             grid.x = divUp(npoints, block.x);
435 
436             mergeLocation<<<grid, block, 0, stream>>>(loc, x, y, npoints, scale);
437 
438             cudaSafeCall( cudaGetLastError() );
439 
440             if (stream == 0)
441                 cudaSafeCall( cudaDeviceSynchronize() );
442         }
443     }
444 }}}
445 
446 #endif /* CUDA_DISABLER */
447