1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 #if !defined CUDA_DISABLER
44 
45 #include "opencv2/core/cuda/common.hpp"
46 #include "opencv2/core/cuda/utility.hpp"
47 #include "opencv2/core/cuda/reduce.hpp"
48 #include "opencv2/core/cuda/limits.hpp"
49 #include "opencv2/core/cuda/vec_distance.hpp"
50 #include "opencv2/core/cuda/datamov_utils.hpp"
51 
52 namespace cv { namespace cuda { namespace device
53 {
54     namespace bf_match
55     {
56         ///////////////////////////////////////////////////////////////////////////////
57         // Reduction
58 
59         template <int BLOCK_SIZE>
findBestMatch(float & bestDistance,int & bestTrainIdx,float * s_distance,int * s_trainIdx)60         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
61         {
62             s_distance += threadIdx.y * BLOCK_SIZE;
63             s_trainIdx += threadIdx.y * BLOCK_SIZE;
64 
65             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
66         }
67 
68         template <int BLOCK_SIZE>
findBestMatch(float & bestDistance,int & bestTrainIdx,int & bestImgIdx,float * s_distance,int * s_trainIdx,int * s_imgIdx)69         __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
70         {
71             s_distance += threadIdx.y * BLOCK_SIZE;
72             s_trainIdx += threadIdx.y * BLOCK_SIZE;
73             s_imgIdx   += threadIdx.y * BLOCK_SIZE;
74 
75             reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
76         }
77 
78         ///////////////////////////////////////////////////////////////////////////////
79         // Match Unrolled Cached
80 
81         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
loadQueryToSmem(int queryIdx,const PtrStepSz<T> & query,U * s_query)82         __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
83         {
84             #pragma unroll
85             for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
86             {
87                 const int loadX = threadIdx.x + i * BLOCK_SIZE;
88                 s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
89             }
90         }
91 
        // Accumulates, for one query row, the best match over all rows of a single
        // train matrix.  The query descriptor is already cached in s_query (see
        // loadQueryToSmem); train rows are streamed through s_train one
        // BLOCK_SIZE x BLOCK_SIZE tile at a time, stored transposed so the inner
        // loop below reads one train column per thread.
        // Updates bestDistance/bestTrainIdx/bestImgIdx in place on improvement.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // One tile of BLOCK_SIZE train rows per outer iteration (ceil-div).
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero first so columns beyond train.cols contribute nothing.
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        // Row index clamped: threads past the last train row load a
                        // valid row; their result is discarded by the guard below.
                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    // All writes to the tile must land before anyone reads it.
                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    // Tile fully consumed; safe to overwrite next iteration.
                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // NOTE: bestImgIdx is written BEFORE bestTrainIdx on purpose —
                // single-train callers alias the two references to the same
                // variable, and this order lets the trainIdx value win.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
137 
        // One-to-one matcher kernel with the query descriptor cached in shared
        // memory.  Launch: block = (BLOCK_SIZE, BLOCK_SIZE), one query per
        // thread row; grid.x = ceil(query.rows / BLOCK_SIZE).  Dynamic shared
        // memory size is computed by the host wrapper below.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            // Phase-1 smem layout: [query cache | train tile].
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // Single-train variant: imgIdx is unused, so myBestTrainIdx is
            // deliberately passed for BOTH bestTrainIdx and bestImgIdx; the
            // write order inside loopUnrolledCached makes the trainIdx stick.
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
168 
169         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
matchUnrolledCached(const PtrStepSz<T> & query,const PtrStepSz<T> & train,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)170         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
171                                  const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
172                                  cudaStream_t stream)
173         {
174             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
175             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
176 
177             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
178 
179             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
180             cudaSafeCall( cudaGetLastError() );
181 
182             if (stream == 0)
183                 cudaSafeCall( cudaDeviceSynchronize() );
184         }
185 
        // One-to-many matcher kernel (query against n train matrices), with the
        // query descriptor cached in shared memory.  Same launch geometry as the
        // single-train variant; additionally records which train image won.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            // Phase-1 smem layout: [query cache | train tile].
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            // Local copy so the mask can advance its per-image state via next().
            Mask m = mask;

            // Scan every train image, keeping the best match across all of them.
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices | image indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
227 
228         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
matchUnrolledCached(const PtrStepSz<T> & query,const PtrStepSz<T> * trains,int n,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)229         void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
230                                  const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
231                                  cudaStream_t stream)
232         {
233             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
234             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
235 
236             const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
237 
238             matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
239             cudaSafeCall( cudaGetLastError() );
240 
241             if (stream == 0)
242                 cudaSafeCall( cudaDeviceSynchronize() );
243         }
244 
245         ///////////////////////////////////////////////////////////////////////////////
246         // Match Unrolled
247 
        // Like loopUnrolledCached, but the query descriptor is NOT pre-cached:
        // both query and train data are streamed through shared memory one
        // BLOCK_SIZE x BLOCK_SIZE tile at a time (train stored transposed).
        // Updates bestDistance/bestTrainIdx/bestImgIdx in place on improvement.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query,volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // One tile of BLOCK_SIZE train rows per outer iteration (ceil-div).
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped: out-of-range threads load a valid
                        // row; results are discarded by the guard below.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    // All writes to the tiles must land before anyone reads them.
                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    // Tiles fully consumed; safe to overwrite next iteration.
                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // NOTE: bestImgIdx is written BEFORE bestTrainIdx on purpose —
                // single-train callers alias the two references to the same
                // variable, and this order lets the trainIdx value win.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
297 
        // One-to-one matcher kernel (uncached query, unrolled inner loops).
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), one query per thread row;
        // grid.x = ceil(query.rows / BLOCK_SIZE).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // Phase-1 smem layout: [query tile | train tile], each BLOCK_SIZE^2.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Single-train variant: imgIdx is unused, so myBestTrainIdx is
            // deliberately passed for BOTH bestTrainIdx and bestImgIdx; the
            // write order inside loopUnrolled makes the trainIdx stick.
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
326 
327         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
matchUnrolled(const PtrStepSz<T> & query,const PtrStepSz<T> & train,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)328         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
329                            const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
330                            cudaStream_t stream)
331         {
332             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
333             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
334 
335             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
336 
337             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
338             cudaSafeCall( cudaGetLastError() );
339 
340             if (stream == 0)
341                 cudaSafeCall( cudaDeviceSynchronize() );
342         }
343 
        // One-to-many matcher kernel (uncached query, unrolled inner loops):
        // matches each query row against n train matrices and records which
        // train image produced the best distance.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            // Phase-1 smem layout: [query tile | train tile], each BLOCK_SIZE^2.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the mask can advance its per-image state via next().
            Mask m = mask;

            // Scan every train image, keeping the best match across all of them.
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices | image indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
383 
384         template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
matchUnrolled(const PtrStepSz<T> & query,const PtrStepSz<T> * trains,int n,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)385         void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
386                            const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
387                            cudaStream_t stream)
388         {
389             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
390             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
391 
392             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
393 
394             matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
395             cudaSafeCall( cudaGetLastError() );
396 
397             if (stream == 0)
398                 cudaSafeCall( cudaDeviceSynchronize() );
399         }
400 
401         ///////////////////////////////////////////////////////////////////////////////
402         // Match
403 
        // Generic (non-unrolled) matching loop for descriptors of arbitrary
        // length: both query and train data are streamed through shared memory
        // one BLOCK_SIZE x BLOCK_SIZE tile at a time, with a runtime-bounded
        // inner loop over query.cols instead of a compile-time MAX_DESC_LEN.
        // Updates bestDistance/bestTrainIdx/bestImgIdx in place on improvement.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            // One tile of BLOCK_SIZE train rows per outer iteration (ceil-div).
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                // Runtime trip count: ceil(query.cols / BLOCK_SIZE) chunks.
                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    // Zero both tiles so padded columns contribute nothing.
                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        // Row indices clamped: out-of-range threads load a valid
                        // row; results are discarded by the guard below.
                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    // All writes to the tiles must land before anyone reads them.
                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    // Tiles fully consumed; safe to overwrite next iteration.
                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                // NOTE: bestImgIdx is written BEFORE bestTrainIdx on purpose —
                // single-train callers alias the two references to the same
                // variable, and this order lets the trainIdx value win.
                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }
452 
        // Generic one-to-one matcher kernel (arbitrary descriptor length).
        // Launch: block = (BLOCK_SIZE, BLOCK_SIZE), one query per thread row;
        // grid.x = ceil(query.rows / BLOCK_SIZE).
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // Phase-1 smem layout: [query tile | train tile], each BLOCK_SIZE^2.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Single-train variant: imgIdx is unused, so myBestTrainIdx is
            // deliberately passed for BOTH bestTrainIdx and bestImgIdx; the
            // write order inside loop() makes the trainIdx stick.
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
481 
482         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
match(const PtrStepSz<T> & query,const PtrStepSz<T> & train,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)483         void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
484                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
485                    cudaStream_t stream)
486         {
487             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
488             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
489 
490             const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
491 
492             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
493             cudaSafeCall( cudaGetLastError() );
494 
495             if (stream == 0)
496                 cudaSafeCall( cudaDeviceSynchronize() );
497         }
498 
        // Generic one-to-many matcher kernel (arbitrary descriptor length):
        // matches each query row against n train matrices and records which
        // train image produced the best distance.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            // Phase-1 smem layout: [query tile | train tile], each BLOCK_SIZE^2.
            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // Local copy so the mask can advance its per-image state via next().
            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            // Barrier before re-purposing smem as the reduction buffers.
            __syncthreads();

            // Phase-2 smem layout: [distances | train indices | image indices].
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            // Lane 0 of each row publishes the result for its query row.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }
537 
538         template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
match(const PtrStepSz<T> & query,const PtrStepSz<T> * trains,int n,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)539         void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
540                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
541                    cudaStream_t stream)
542         {
543             const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
544             const dim3 grid(divUp(query.rows, BLOCK_SIZE));
545 
546             const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);
547 
548             match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
549             cudaSafeCall( cudaGetLastError() );
550 
551             if (stream == 0)
552                 cudaSafeCall( cudaDeviceSynchronize() );
553         }
554 
555         ///////////////////////////////////////////////////////////////////////////////
556         // Match dispatcher
557 
558         template <typename Dist, typename T, typename Mask>
matchDispatcher(const PtrStepSz<T> & query,const PtrStepSz<T> & train,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)559         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
560                              const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
561                              cudaStream_t stream)
562         {
563             if (query.cols <= 64)
564             {
565                 matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
566             }
567             else if (query.cols <= 128)
568             {
569                 matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
570             }
571             /*else if (query.cols <= 256)
572             {
573                 matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
574             }
575             else if (query.cols <= 512)
576             {
577                 matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
578             }
579             else if (query.cols <= 1024)
580             {
581                 matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
582             }*/
583             else
584             {
585                 match<16, Dist>(query, train, mask, trainIdx, distance, stream);
586             }
587         }
588 
589         template <typename Dist, typename T, typename Mask>
matchDispatcher(const PtrStepSz<T> & query,const PtrStepSz<T> * trains,int n,const Mask & mask,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)590         void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
591                              const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
592                              cudaStream_t stream)
593         {
594             if (query.cols <= 64)
595             {
596                 matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
597             }
598             else if (query.cols <= 128)
599             {
600                 matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
601             }
602             /*else if (query.cols <= 256)
603             {
604                 matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
605             }
606             else if (query.cols <= 512)
607             {
608                 matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
609             }
610             else if (query.cols <= 1024)
611             {
612                 matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
613             }*/
614             else
615             {
616                 match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
617             }
618         }
619 
620         ///////////////////////////////////////////////////////////////////////////////
621         // Match caller
622 
matchL1_gpu(const PtrStepSzb & query,const PtrStepSzb & train,const PtrStepSzb & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)623         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
624                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
625                                                cudaStream_t stream)
626         {
627             if (mask.data)
628             {
629                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
630                     trainIdx, distance,
631                     stream);
632             }
633             else
634             {
635                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
636                     trainIdx, distance,
637                     stream);
638             }
639         }
640 
641         template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
642         //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
643         template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
644         template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
645         template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
646         template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
647 
matchL2_gpu(const PtrStepSzb & query,const PtrStepSzb & train,const PtrStepSzb & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)648         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
649                                                const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
650                                                cudaStream_t stream)
651         {
652             if (mask.data)
653             {
654                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
655                     trainIdx, distance,
656                     stream);
657             }
658             else
659             {
660                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
661                     trainIdx, distance,
662                     stream);
663             }
664         }
665 
666         //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
667         //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
668         //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
669         //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
670         //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
671         template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
672 
matchHamming_gpu(const PtrStepSzb & query,const PtrStepSzb & train,const PtrStepSzb & mask,const PtrStepSzi & trainIdx,const PtrStepSzf & distance,cudaStream_t stream)673         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
674                                                     const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
675                                                     cudaStream_t stream)
676         {
677             if (mask.data)
678             {
679                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
680                     trainIdx, distance,
681                     stream);
682             }
683             else
684             {
685                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
686                     trainIdx, distance,
687                     stream);
688             }
689         }
690 
691         template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
692         //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
693         template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
694         //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
695         template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
696 
matchL1_gpu(const PtrStepSzb & query,const PtrStepSzb & trains,const PtrStepSz<PtrStepb> & masks,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)697         template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
698                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
699                                                 cudaStream_t stream)
700         {
701             if (masks.data)
702             {
703                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
704                     trainIdx, imgIdx, distance,
705                     stream);
706             }
707             else
708             {
709                 matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
710                     trainIdx, imgIdx, distance,
711                     stream);
712             }
713         }
714 
715         template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
716         //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
717         template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
718         template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
719         template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
720         template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
721 
matchL2_gpu(const PtrStepSzb & query,const PtrStepSzb & trains,const PtrStepSz<PtrStepb> & masks,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)722         template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
723                                                const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
724                                                cudaStream_t stream)
725         {
726             if (masks.data)
727             {
728                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
729                     trainIdx, imgIdx, distance,
730                     stream);
731             }
732             else
733             {
734                 matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
735                     trainIdx, imgIdx, distance,
736                     stream);
737             }
738         }
739 
740         //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
741         //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
742         //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
743         //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
744         //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
745         template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
746 
matchHamming_gpu(const PtrStepSzb & query,const PtrStepSzb & trains,const PtrStepSz<PtrStepb> & masks,const PtrStepSzi & trainIdx,const PtrStepSzi & imgIdx,const PtrStepSzf & distance,cudaStream_t stream)747         template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
748                                                     const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
749                                                     cudaStream_t stream)
750         {
751             if (masks.data)
752             {
753                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
754                     trainIdx, imgIdx, distance,
755                     stream);
756             }
757             else
758             {
759                 matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
760                     trainIdx, imgIdx, distance,
761                     stream);
762             }
763         }
764 
765         template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
766         //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
767         template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
768         //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
769         template void matchHamming_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
770     } // namespace bf_match
771 }}} // namespace cv { namespace cuda { namespace cudev {
772 
773 
774 #endif /* CUDA_DISABLER */
775