1 /*M/////////////////////////////////////////////////////////////////////////////////////// 2 // 3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING. 4 // 5 // By downloading, copying, installing or using the software you agree to this license. 6 // If you do not agree to this license, do not download, install, 7 // copy or use the software. 8 // 9 // 10 // License Agreement 11 // For Open Source Computer Vision Library 12 // 13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved. 14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved. 15 // Third party copyrights are property of their respective owners. 16 // 17 // Redistribution and use in source and binary forms, with or without modification, 18 // are permitted provided that the following conditions are met: 19 // 20 // * Redistribution's of source code must retain the above copyright notice, 21 // this list of conditions and the following disclaimer. 22 // 23 // * Redistribution's in binary form must reproduce the above copyright notice, 24 // this list of conditions and the following disclaimer in the documentation 25 // and/or other materials provided with the distribution. 26 // 27 // * The name of the copyright holders may not be used to endorse or promote products 28 // derived from this software without specific prior written permission. 29 // 30 // This software is provided by the copyright holders and contributors "as is" and 31 // any express or implied warranties, including, but not limited to, the implied 32 // warranties of merchantability and fitness for a particular purpose are disclaimed. 
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/utility.hpp"
#include "opencv2/core/cuda/reduce.hpp"
#include "opencv2/core/cuda/limits.hpp"
#include "opencv2/core/cuda/vec_distance.hpp"
#include "opencv2/core/cuda/datamov_utils.hpp"

namespace cv { namespace cuda { namespace device
{
    namespace bf_match
    {
        ///////////////////////////////////////////////////////////////////////////////
        // Reduction
        //
        // All kernels below launch BLOCK_SIZE x BLOCK_SIZE thread blocks where each
        // block row (fixed threadIdx.y) processes one query descriptor. The reduction
        // helpers combine the per-thread best candidate of one such row into a single
        // (distance, index) result, using the less<float> predicate — i.e. the entry
        // with the smallest distance wins.

        // Block-row reduction of (bestDistance, bestTrainIdx).
        // s_distance / s_trainIdx are shared scratch buffers holding BLOCK_SIZE
        // entries per block row; this thread's row slice is selected via threadIdx.y.
        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, float* s_distance, int* s_trainIdx)
        {
            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;

            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, s_trainIdx, bestTrainIdx, threadIdx.x, less<float>());
        }

        // Same reduction, multi-image variant: the image index travels with the
        // train index as a tied value pair (smem_tuple / thrust::tie).
        template <int BLOCK_SIZE>
        __device__ void findBestMatch(float& bestDistance, int& bestTrainIdx, int& bestImgIdx, float* s_distance, int* s_trainIdx, int* s_imgIdx)
        {
            s_distance += threadIdx.y * BLOCK_SIZE;
            s_trainIdx += threadIdx.y * BLOCK_SIZE;
            s_imgIdx   += threadIdx.y * BLOCK_SIZE;

            reduceKeyVal<BLOCK_SIZE>(s_distance, bestDistance, smem_tuple(s_trainIdx, s_imgIdx), thrust::tie(bestTrainIdx, bestImgIdx), threadIdx.x, less<float>());
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled Cached

        // Cooperatively load one full query descriptor (up to MAX_DESC_LEN elements)
        // into shared memory, one row per block row. The row index is clamped to the
        // last valid query row and elements past query.cols are zero-filled, so
        // out-of-range block rows still execute the same code path safely.
        // No barrier here — callers synchronize before the cached data is consumed
        // by other threads.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename T, typename U>
        __device__ void loadQueryToSmem(int queryIdx, const PtrStepSz<T>& query, U* s_query)
        {
            #pragma unroll
            for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
            {
                const int loadX = threadIdx.x + i * BLOCK_SIZE;
                s_query[threadIdx.y * MAX_DESC_LEN + loadX] = loadX < query.cols ? query.ptr(::min(queryIdx, query.rows - 1))[loadX] : 0;
            }
        }

        // Core matching loop, query cached in shared memory (s_query filled by
        // loadQueryToSmem). Tiles over the train descriptors BLOCK_SIZE rows at a
        // time; the train tile is stored transposed (s_train[x * BLOCK_SIZE + y])
        // so the inner product reads s_train[j * BLOCK_SIZE + threadIdx.x].
        // The descriptor-length loop is fully unrolled (MAX_DESC_LEN / BLOCK_SIZE
        // iterations). Barriers guard the shared tile between load and use, and
        // before the next iteration overwrites it.
        // On improvement, bestImgIdx is written BEFORE bestTrainIdx — single-train
        // callers rely on this order (they alias the two output references).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolledCached(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                           typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                           float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < train.cols)
                    {
                        T val;

                        // Row index clamped so out-of-range threads still load a valid row.
                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * MAX_DESC_LEN + i * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        // Kernel: single train image, query cached in smem.
        // Dynamic shared memory is used twice: first as the query cache (BLOCK_SIZE *
        // MAX_DESC_LEN values) plus train tile, then — after the barrier — the same
        // memory is re-partitioned as the reduction buffers (float distances at
        // offset 0, int train indices at BLOCK_SIZE * BLOCK_SIZE).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            // NOTE: myBestTrainIdx is deliberately passed for the bestImgIdx slot too
            // (there is no image index in the single-train case). This is safe only
            // because loopUnrolledCached assigns bestImgIdx before bestTrainIdx, so
            // the aliased variable ends up holding the train index.
            loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            // Only lane 0 of each block row writes the row's result.
            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the single-train cached kernel.
        // smemSize: the query-cache region must also be large enough to serve as the
        // BLOCK_SIZE*BLOCK_SIZE reduction buffer, hence max(MAX_DESC_LEN, BLOCK_SIZE);
        // the extra BLOCK_SIZE*BLOCK_SIZE ints hold the train tile / index buffer.
        // Synchronizes only when launched on the default stream.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                                 const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= BLOCK_SIZE ? MAX_DESC_LEN : BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Kernel: multiple train images, query cached in smem. Iterates over the
        // train image array, advancing the mask (m.next()) per image, and tracks
        // the best (distance, trainIdx, imgIdx) triple across all images.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolledCached(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                            int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * MAX_DESC_LEN);

            loadQueryToSmem<BLOCK_SIZE, MAX_DESC_LEN>(queryIdx, query, s_query);

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            // Re-partition smem as three reduction buffers: distances, train indices,
            // image indices (BLOCK_SIZE*BLOCK_SIZE entries each).
            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the multi-train cached kernel. The reduction phase needs
        // 3 * BLOCK_SIZE*BLOCK_SIZE ints, hence max(MAX_DESC_LEN, 2*BLOCK_SIZE) for
        // the first region plus one more BLOCK_SIZE*BLOCK_SIZE region.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolledCached(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                                 const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                 cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (BLOCK_SIZE * (MAX_DESC_LEN >= 2 * BLOCK_SIZE ? MAX_DESC_LEN : 2 * BLOCK_SIZE) + BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolledCached<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match Unrolled

        // Matching loop without the persistent query cache: both the query tile and
        // the (transposed) train tile are re-loaded into shared memory on every
        // unrolled iteration. Otherwise identical in structure to loopUnrolledCached,
        // including the bestImgIdx-before-bestTrainIdx write order that single-train
        // callers rely on.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __device__ void loopUnrolled(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                                     typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                                     float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                #pragma unroll
                for (int i = 0; i < MAX_DESC_LEN / BLOCK_SIZE; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        // Kernel: single train image, unrolled descriptor loop, no query cache.
        // smem layout: query tile at 0, train tile at BLOCK_SIZE*BLOCK_SIZE;
        // re-partitioned after the barrier as the reduction buffers.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx aliased into the bestImgIdx slot — see matchUnrolledCached.
            loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the single-train unrolled kernel (2 tiles of
        // BLOCK_SIZE*BLOCK_SIZE ints of dynamic smem).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                           const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Kernel: multiple train images, unrolled descriptor loop.
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        __global__ void matchUnrolled(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                                      int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Mask m = mask;

            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loopUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the multi-train unrolled kernel (3 reduction buffers of
        // BLOCK_SIZE*BLOCK_SIZE ints of dynamic smem).
        template <int BLOCK_SIZE, int MAX_DESC_LEN, typename Dist, typename T, typename Mask>
        void matchUnrolled(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                           const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                           cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            matchUnrolled<BLOCK_SIZE, MAX_DESC_LEN, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match

        // Generic matching loop for arbitrary descriptor length: same tiling scheme
        // as loopUnrolled but with a runtime bound on the descriptor-length loop
        // (used when query.cols exceeds the compile-time specializations).
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __device__ void loop(int queryIdx, const PtrStepSz<T>& query, volatile int imgIdx, const PtrStepSz<T>& train, const Mask& mask,
                             typename Dist::value_type* s_query, typename Dist::value_type* s_train,
                             float& bestDistance, int& bestTrainIdx, int& bestImgIdx)
        {
            for (int t = 0, endt = (train.rows + BLOCK_SIZE - 1) / BLOCK_SIZE; t < endt; ++t)
            {
                Dist dist;

                for (int i = 0, endi = (query.cols + BLOCK_SIZE - 1) / BLOCK_SIZE; i < endi; ++i)
                {
                    const int loadX = threadIdx.x + i * BLOCK_SIZE;

                    s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = 0;
                    s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = 0;

                    if (loadX < query.cols)
                    {
                        T val;

                        ForceGlob<T>::Load(query.ptr(::min(queryIdx, query.rows - 1)), loadX, val);
                        s_query[threadIdx.y * BLOCK_SIZE + threadIdx.x] = val;

                        ForceGlob<T>::Load(train.ptr(::min(t * BLOCK_SIZE + threadIdx.y, train.rows - 1)), loadX, val);
                        s_train[threadIdx.x * BLOCK_SIZE + threadIdx.y] = val;
                    }

                    __syncthreads();

                    #pragma unroll
                    for (int j = 0; j < BLOCK_SIZE; ++j)
                        dist.reduceIter(s_query[threadIdx.y * BLOCK_SIZE + j], s_train[j * BLOCK_SIZE + threadIdx.x]);

                    __syncthreads();
                }

                typename Dist::result_type distVal = dist;

                const int trainIdx = t * BLOCK_SIZE + threadIdx.x;

                if (queryIdx < query.rows && trainIdx < train.rows && distVal < bestDistance && mask(queryIdx, trainIdx))
                {
                    bestImgIdx = imgIdx;
                    bestDistance = distVal;
                    bestTrainIdx = trainIdx;
                }
            }
        }

        // Kernel: single train image, generic descriptor length.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T> train, const Mask mask, int* bestTrainIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            // myBestTrainIdx aliased into the bestImgIdx slot — see matchUnrolledCached.
            loop<BLOCK_SIZE, Dist>(queryIdx, query, 0, train, mask, s_query, s_train, myBestDistance, myBestTrainIdx, myBestTrainIdx);

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, s_distance, s_trainIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the single-train generic kernel.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (2 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, train, mask, trainIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        // Kernel: multiple train images, generic descriptor length.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        __global__ void match(const PtrStepSz<T> query, const PtrStepSz<T>* trains, int n, const Mask mask,
                              int* bestTrainIdx, int* bestImgIdx, float* bestDistance)
        {
            extern __shared__ int smem[];

            const int queryIdx = blockIdx.x * BLOCK_SIZE + threadIdx.y;

            float myBestDistance = numeric_limits<float>::max();
            int myBestTrainIdx = -1;
            int myBestImgIdx = -1;

            typename Dist::value_type* s_query = (typename Dist::value_type*)(smem);
            typename Dist::value_type* s_train = (typename Dist::value_type*)(smem + BLOCK_SIZE * BLOCK_SIZE);

            Mask m = mask;
            for (int imgIdx = 0; imgIdx < n; ++imgIdx)
            {
                const PtrStepSz<T> train = trains[imgIdx];
                m.next();
                loop<BLOCK_SIZE, Dist>(queryIdx, query, imgIdx, train, m, s_query, s_train, myBestDistance, myBestTrainIdx, myBestImgIdx);
            }

            __syncthreads();

            float* s_distance = (float*)(smem);
            int* s_trainIdx = (int*)(smem + BLOCK_SIZE * BLOCK_SIZE);
            int* s_imgIdxIdx = (int*)(smem + 2 * BLOCK_SIZE * BLOCK_SIZE);

            findBestMatch<BLOCK_SIZE>(myBestDistance, myBestTrainIdx, myBestImgIdx, s_distance, s_trainIdx, s_imgIdxIdx);

            if (queryIdx < query.rows && threadIdx.x == 0)
            {
                bestTrainIdx[queryIdx] = myBestTrainIdx;
                bestImgIdx[queryIdx] = myBestImgIdx;
                bestDistance[queryIdx] = myBestDistance;
            }
        }

        // Host launcher for the multi-train generic kernel.
        template <int BLOCK_SIZE, typename Dist, typename T, typename Mask>
        void match(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                   const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                   cudaStream_t stream)
        {
            const dim3 block(BLOCK_SIZE, BLOCK_SIZE);
            const dim3 grid(divUp(query.rows, BLOCK_SIZE));

            const size_t smemSize = (3 * BLOCK_SIZE * BLOCK_SIZE) * sizeof(int);

            match<BLOCK_SIZE, Dist><<<grid, block, smemSize, stream>>>(query, trains, n, mask, trainIdx.data, imgIdx.data, distance.data);
            cudaSafeCall( cudaGetLastError() );

            if (stream == 0)
                cudaSafeCall( cudaDeviceSynchronize() );
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match dispatcher

        // Pick a kernel specialization from the runtime descriptor length:
        // cached+unrolled for <= 64 / <= 128 columns, generic loop otherwise.
        // (The 256/512/1024 matchUnrolled branches are disabled upstream.)
        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>& train, const Mask& mask,
                             const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                             cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, train, mask, trainIdx, distance, stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, train, mask, trainIdx, distance, stream);
            }*/
            else
            {
                match<16, Dist>(query, train, mask, trainIdx, distance, stream);
            }
        }

        // Multi-train counterpart of the dispatcher above.
        template <typename Dist, typename T, typename Mask>
        void matchDispatcher(const PtrStepSz<T>& query, const PtrStepSz<T>* trains, int n, const Mask& mask,
                             const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                             cudaStream_t stream)
        {
            if (query.cols <= 64)
            {
                matchUnrolledCached<16, 64, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 128)
            {
                matchUnrolledCached<16, 128, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            /*else if (query.cols <= 256)
            {
                matchUnrolled<16, 256, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 512)
            {
                matchUnrolled<16, 512, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
            else if (query.cols <= 1024)
            {
                matchUnrolled<16, 1024, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }*/
            else
            {
                match<16, Dist>(query, trains, n, mask, trainIdx, imgIdx, distance, stream);
            }
        }

        ///////////////////////////////////////////////////////////////////////////////
        // Match caller
        //
        // Public entry points: reinterpret the untyped byte matrices as PtrStepSz<T>
        // and dispatch with either a SingleMask wrapper (mask provided) or
        // WithOutMask (no mask). Explicit instantiations below define the supported
        // element types per distance; commented-out ones are intentionally disabled.

        // L1 distance, single train image.
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

        // L2 distance, single train image (instantiated for float only).
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                               const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

        // Hamming distance, single train image (integer element types only).
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& train, const PtrStepSzb& mask,
                                                    const PtrStepSzi& trainIdx, const PtrStepSzf& distance,
                                                    cudaStream_t stream)
        {
            if (mask.data)
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), SingleMask(mask),
                    trainIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), static_cast< PtrStepSz<T> >(train), WithOutMask(),
                    trainIdx, distance,
                    stream);
            }
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<short >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<int   >(const PtrStepSzb& queryDescs, const PtrStepSzb& trainDescs, const PtrStepSzb& mask, const PtrStepSzi& trainIdx, const PtrStepSzf& distance, cudaStream_t stream);

        // L1 distance, train-collection variant. `trains` is a device row of
        // PtrStepSz<T> headers (one per train image; trains.cols is the count);
        // `masks.data` points at a per-image mask collection when non-null.
        template <typename T> void matchL1_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher< L1Dist<T> >(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        template void matchL1_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL1_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL1_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);

        // L2 distance, train-collection variant (instantiated for float only).
        template <typename T> void matchL2_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                               const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                               cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<L2Dist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        //template void matchL2_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchL2_gpu<int   >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchL2_gpu<float >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& maskCollection, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);

        // Hamming distance, train-collection variant.
        template <typename T> void matchHamming_gpu(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks,
                                                    const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance,
                                                    cudaStream_t stream)
        {
            if (masks.data)
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, MaskCollection(masks.data),
                    trainIdx, imgIdx, distance,
                    stream);
            }
            else
            {
                matchDispatcher<HammingDist>(static_cast< PtrStepSz<T> >(query), (const PtrStepSz<T>*)trains.ptr(), trains.cols, WithOutMask(),
                    trainIdx, imgIdx, distance,
                    stream);
            }
        }

        template void matchHamming_gpu<uchar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        //template void matchHamming_gpu<schar >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream);
        template void matchHamming_gpu<ushort>(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi&
imgIdx, const PtrStepSzf& distance, cudaStream_t stream); 768 //template void matchHamming_gpu<short >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); 769 template void matchHamming_gpu<int >(const PtrStepSzb& query, const PtrStepSzb& trains, const PtrStepSz<PtrStepb>& masks, const PtrStepSzi& trainIdx, const PtrStepSzi& imgIdx, const PtrStepSzf& distance, cudaStream_t stream); 770 } // namespace bf_match 771 }}} // namespace cv { namespace cuda { namespace cudev { 772 773 774 #endif /* CUDA_DISABLER */ 775