/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP
#define OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP

#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"

namespace cv { namespace cudev {

namespace integral_detail
{
    // horizontal_pass

    template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
    __global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        if (threadIdx.x == 0)
            carryElem = 0;

        __syncthreads();

        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src(blockIdx.x, curElemOffs);

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <int NUM_SCAN_THREADS, typename T, typename D>
    __global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        if (threadIdx.x == 0)
            carryElem = 0;

        __syncthreads();

        const T* src_row = src.row(blockIdx.x);
        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src_row[curElemOffs];

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <class SrcPtr, typename D>
    __host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        const int NUM_SCAN_THREADS = 256;

        const dim3 block(NUM_SCAN_THREADS);
        const dim3 grid(rows);

        horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // horisontal_pass_8u_shfl

    __device__ static uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >> 0;
        bytes.y = (in & 0x0000ff00) >> 8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }

    __global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ int sums[128];
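        // One block processes one image row; each thread loads a single uint4,
        // i.e. 16 packed 8-bit pixels, so the block holds cols / 16 threads.
        // sums[] keeps one running total per warp; with at most 1024 threads
        // (32 warps) per block, 128 entries is more than enough.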
        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);

        int result[16];

        result[0] = a.x;
        result[1] = result[0] + a.y;
        result[2] = result[1] + a.z;
        result[3] = result[2] + a.w;

        result[4] = result[3] + b.x;
        result[5] = result[4] + b.y;
        result[6] = result[5] + b.z;
        result[7] = result[6] + b.w;

        result[8] = result[7] + c.x;
        result[9] = result[8] + c.y;
        result[10] = result[9] + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];

        // the prefix sum over each thread's 16 values has been computed;
        // now the per-thread totals (result[15]) need to be shared
        // with the other threads and added in. To do this,
        // the shfl_up() instruction is used and a shuffle scan
        // operation is performed to distribute the sums to the correct
        // threads.
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = compatible_shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                    result[k] += n;

                sum += n;
            }
        }

        // Now the final sum for the warp must be shared
        // between warps. This is done by having the last
        // thread in each warp store the warp's sum to shared
        // memory; one warp then loads those values and
        // computes a prefix sum over them, again using shfl_up.
        // The results are uniformly added back to the warps.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            #pragma unroll
            for (int i = 1; i < 32; i *= 2)
            {
                const int n = compatible_shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in the preceding warps' totals
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            #pragma unroll
            for (int k = 0; k < 16; ++k)
                result[k] += blockSum;
        }

        // assemble result
        // Each thread has 16 values to write, which are
        // now integer data (to avoid overflow). Instead of
        // each thread writing consecutive uint4s, the
        // approach used here relies on the shuffle
        // instruction to reformat the data inside the
        // registers so that each thread holds consecutive
        // data to be written, allowing larger contiguous
        // segments to be assembled for writing.

        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

            but is stored in registers (r0..r3), in four threads (0..3) as:

            threadId   0  1  2  3
              r0       x0 y0 z0 w0
              r1       x1 y1 z1 w1
              r2       x2 y2 z2 w2
              r3       x3 y3 z3 w3

            after applying shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

            and now x0..x3 and z0..z3 can be written out in order by all threads.

            In the current code, each register above actually represents
            four integers to be written as uint4's to GMEM.
        */
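        // shfl_xor(v, mask, 32) pairs each lane with lane (lane_id ^ mask):
        // mask 1 swaps lanes 0<->1, 2<->3, ...; mask 2 swaps 0<->2, 1<->3, ...;
        // mask 3 swaps 0<->3, 1<->2, ... The three groups of exchanges below
        // move registers r1..r3 exactly as in the diagram above.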
        result[4] = shfl_xor(result[4] , 1, 32);
        result[5] = shfl_xor(result[5] , 1, 32);
        result[6] = shfl_xor(result[6] , 1, 32);
        result[7] = shfl_xor(result[7] , 1, 32);

        result[8] = shfl_xor(result[8] , 2, 32);
        result[9] = shfl_xor(result[9] , 2, 32);
        result[10] = shfl_xor(result[10], 2, 32);
        result[11] = shfl_xor(result[11], 2, 32);

        result[12] = shfl_xor(result[12], 3, 32);
        result[13] = shfl_xor(result[13], 3, 32);
        result[14] = shfl_xor(result[14], 3, 32);
        result[15] = shfl_xor(result[15], 3, 32);

        uint4* integral_row = integral.row(blockIdx.x);
        uint4 output;

        ///////

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;
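        // Each group of 4 consecutive threads owns a span of 16 consecutive
        // uint4s in the output row; the four stores fill that span at offsets
        // 0, 8, 4 and 12, so every store writes a contiguous 64-byte segment
        // per group.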
        // continuing from the above example,
        // this use of shfl_xor() places the y0..y3 and w0..w3 data
        // in order.
        #pragma unroll
        for (int i = 0; i < 16; ++i)
            result[i] = shfl_xor(result[i], 1, 32);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }

    __host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
    {
        // each thread handles 16 values, use 1 block/row
        // (safe, because the row step can't actually be less than 512 bytes)
        const int block = cols / 16;

        // launch 1 block / row
        const int grid = rows;

        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );

        GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
        GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);

        horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // vertical

    template <typename T>
    __global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ T sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        T stepSum = 0;

        int numBuckets = divUp(rows, blockDim.y);
        int y = threadIdx.y;

        while (numBuckets--)
        {
            T* p = integral.row(y) + tidx;

            T sum = (tidx < cols) && (y < rows) ? *p : 0;
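            // sum is this thread's element of the current 8-row strip; the scan
            // below turns the strip's per-row values into running column totals,
            // which are then combined with the carry from earlier strips
            // (stepSum).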
            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // re-index the SMEM tile so that each warp holds whole columns,
            // scan the column sums with shfl_up within the warp,
            // then read the results back out
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            T partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                T n = compatible_shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            if ((tidx < cols) && (y < rows))
            {
                *p = sum;
            }

            y += blockDim.y;
        }
    #else
        __shared__ T smem[32][32];
        __shared__ T prevVals[32];

        volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;

        if (threadIdx.y == 0)
            prevVals[threadIdx.x] = 0;

        __syncthreads();

        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        int numBuckets = divUp(rows, 8 * 4);
        int offsetY = 0;

        while (numBuckets--)
        {
            const int curRowOffs = offsetY + threadIdx.y;

            T curElems[4];
            T temp[4];

            // load patch

            smem[threadIdx.y     ][threadIdx.x] = 0.0f;
            smem[threadIdx.y +  8][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 24][threadIdx.x] = 0.0f;

            if (x < cols)
            {
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
                }
            }

            __syncthreads();

            // reduce

            curElems[0] = smem[threadIdx.x][threadIdx.y     ];
            curElems[1] = smem[threadIdx.x][threadIdx.y +  8];
            curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
            curElems[3] = smem[threadIdx.x][threadIdx.y + 24];

            __syncthreads();

            temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
            temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
            temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
            temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);

            curElems[0] += prevVals[threadIdx.y     ];
            curElems[1] += prevVals[threadIdx.y +  8];
            curElems[2] += prevVals[threadIdx.y + 16];
            curElems[3] += prevVals[threadIdx.y + 24];

            __syncthreads();

            if (threadIdx.x == 31)
            {
                prevVals[threadIdx.y     ] += temp[0];
                prevVals[threadIdx.y +  8] += temp[1];
                prevVals[threadIdx.y + 16] += temp[2];
                prevVals[threadIdx.y + 24] += temp[3];
            }

            smem[threadIdx.y     ][threadIdx.x] = curElems[0];
            smem[threadIdx.y +  8][threadIdx.x] = curElems[1];
            smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
            smem[threadIdx.y + 24][threadIdx.x] = curElems[3];

            __syncthreads();

            // store patch

            if (x < cols)
            {
                // write the 4 values back to the integral image
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
                }
            }

            __syncthreads();

            offsetY += 8 * 4;
        }
    #endif
    }
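    // A single 32x8 block covers a 32-column strip and loops down all the rows
    // itself, carrying running column totals in stepSum (or prevVals on pre-3.0
    // devices), so the grid only needs enough blocks to span the columns.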
    template <typename T>
    __host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x));

        vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }

    // integral

    template <class SrcPtr, typename D>
    __host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        horizontal_pass(src, dst, rows, cols, stream);
        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        // the shuffle-based horizontal pass needs a compute 3.0 device, a width
        // that is a multiple of 64 pixels (so the row splits into whole 4-thread
        // write groups), and data aligned for the vectorized uint4 accesses
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 64 == 0)
            && reinterpret_cast<intptr_t>(src.data) % 32 == 0
            && reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
        {
            horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
        }
        else
        {
            horizontal_pass(src, dst, rows, cols, stream);
        }

        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
    }
}

}}

#endif