/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                          License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Copyright (C) 2013, OpenCV Foundation, all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#pragma once

#ifndef OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP
#define OPENCV_CUDEV_GRID_INTEGRAL_DETAIL_HPP

#include "../../common.hpp"
#include "../../warp/shuffle.hpp"
#include "../../block/scan.hpp"
#include "../../ptr2d/glob.hpp"

namespace cv { namespace cudev {

namespace integral_detail
{
    // horizontal_pass

    template <int NUM_SCAN_THREADS, class SrcPtr, typename D>
    __global__ void horizontal_pass(const SrcPtr src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        if (threadIdx.x == 0)
            carryElem = 0;

        __syncthreads();

        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src(blockIdx.x, curElemOffs);

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }
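
    // Worked example of the bucket scan above (values are illustrative, not
    // taken from the library): with NUM_SCAN_THREADS = 4 and a source row
    // [1 2 3 4 | 5 6], the first bucket's inclusive block scan yields
    // [1 3 6 10], the row output becomes [1 3 6 10], and carryElem advances
    // to 10. The second bucket scans [5 6 0 0] (out-of-range lanes contribute
    // 0) to [5 11 11 11]; adding the carry writes 15 and 21 to the remaining
    // valid columns, i.e. the full inclusive prefix sums of the row.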

    template <int NUM_SCAN_THREADS, typename T, typename D>
    __global__ void horizontal_pass(const GlobPtr<T> src, GlobPtr<D> dst, const int cols)
    {
        __shared__ D smem[NUM_SCAN_THREADS * 2];
        __shared__ D carryElem;

        if (threadIdx.x == 0)
            carryElem = 0;

        __syncthreads();

        const T* src_row = src.row(blockIdx.x);
        D* dst_row = dst.row(blockIdx.x);

        int numBuckets = divUp(cols, NUM_SCAN_THREADS);
        int offsetX = 0;

        while (numBuckets--)
        {
            const int curElemOffs = offsetX + threadIdx.x;

            D curElem = 0.0f;

            if (curElemOffs < cols)
                curElem = src_row[curElemOffs];

            const D curScanElem = blockScanInclusive<NUM_SCAN_THREADS>(curElem, smem, threadIdx.x);

            if (curElemOffs < cols)
                dst_row[curElemOffs] = carryElem + curScanElem;

            offsetX += NUM_SCAN_THREADS;

            __syncthreads();

            if (threadIdx.x == NUM_SCAN_THREADS - 1)
            {
                carryElem += curScanElem;
            }

            __syncthreads();
        }
    }

    template <class SrcPtr, typename D>
    __host__ void horizontal_pass(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        const int NUM_SCAN_THREADS = 256;

        const dim3 block(NUM_SCAN_THREADS);
        const dim3 grid(rows);

        horizontal_pass<NUM_SCAN_THREADS><<<grid, block, 0, stream>>>(src, dst, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }
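
    // Usage sketch (illustrative; d_src, d_sum and the step values below are
    // hypothetical buffers supplied by the caller, not part of this header):
    //
    //   GlobPtr<uchar> src = globPtr(d_src, src_step_in_bytes);
    //   GlobPtr<int>   sum = globPtr(d_sum, sum_step_in_bytes);
    //   horizontal_pass(src, sum, rows, cols, stream);
    //
    // After the launch, sum(y, x) holds the inclusive prefix sum of row y of
    // src up to column x; vertical_pass() below then completes the integral
    // image by scanning the columns in place.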

    // horisontal_pass_8u_shfl

    __device__ static uchar4 int_to_uchar4(unsigned int in)
    {
        uchar4 bytes;
        bytes.x = (in & 0x000000ff) >>  0;
        bytes.y = (in & 0x0000ff00) >>  8;
        bytes.z = (in & 0x00ff0000) >> 16;
        bytes.w = (in & 0xff000000) >> 24;
        return bytes;
    }

    __global__ static void horisontal_pass_8u_shfl_kernel(const GlobPtr<uint4> img, GlobPtr<uint4> integral)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ int sums[128];

        const int id = threadIdx.x;
        const int lane_id = id % warpSize;
        const int warp_id = id / warpSize;

        const uint4 data = img(blockIdx.x, id);

        const uchar4 a = int_to_uchar4(data.x);
        const uchar4 b = int_to_uchar4(data.y);
        const uchar4 c = int_to_uchar4(data.z);
        const uchar4 d = int_to_uchar4(data.w);

        int result[16];

        result[0]  =              a.x;
        result[1]  = result[0]  + a.y;
        result[2]  = result[1]  + a.z;
        result[3]  = result[2]  + a.w;

        result[4]  = result[3]  + b.x;
        result[5]  = result[4]  + b.y;
        result[6]  = result[5]  + b.z;
        result[7]  = result[6]  + b.w;

        result[8]  = result[7]  + c.x;
        result[9]  = result[8]  + c.y;
        result[10] = result[9]  + c.z;
        result[11] = result[10] + c.w;

        result[12] = result[11] + d.x;
        result[13] = result[12] + d.y;
        result[14] = result[13] + d.z;
        result[15] = result[14] + d.w;

        int sum = result[15];

        // The prefix sum over each thread's 16 values has been computed;
        // now the per-thread totals (result[15]) need to be shared with the
        // other threads and added in. To do this, the shfl_up() instruction
        // is used and a shuffle scan operation is performed to distribute
        // the sums to the correct threads.
        #pragma unroll
        for (int i = 1; i < 32; i *= 2)
        {
            const int n = compatible_shfl_up(sum, i, 32);

            if (lane_id >= i)
            {
                #pragma unroll
                for (int k = 0; k < 16; ++k)
                    result[k] += n;

                sum += n;
            }
        }
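
        // Illustrative trace of the shuffle scan above: if every lane starts
        // with sum = 1, the i = 1 step produces 1 2 2 2 ..., the i = 2 step
        // 1 2 3 4 4 4 ..., and after the final i = 16 step lane k holds k + 1,
        // i.e. the inclusive prefix sum of the per-thread totals across the warp.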

        // Now the final sum for the warp must be shared
        // between warps. This is done by each warp
        // having a thread store to shared memory, then
        // having some other warp load the values and
        // compute a prefix sum, again by using shfl_up.
        // The results are uniformly added back to the warps.
        // The last thread in each warp holds that warp's total,
        // so it places the total in shared memory.
        if (threadIdx.x % warpSize == warpSize - 1)
            sums[warp_id] = result[15];

        __syncthreads();

        if (warp_id == 0)
        {
            int warp_sum = sums[lane_id];

            #pragma unroll
            for (int i = 1; i < 32; i *= 2)
            {
                const int n = compatible_shfl_up(warp_sum, i, 32);

                if (lane_id >= i)
                    warp_sum += n;
            }

            sums[lane_id] = warp_sum;
        }

        __syncthreads();

        int blockSum = 0;

        // fold in unused warp
        if (warp_id > 0)
        {
            blockSum = sums[warp_id - 1];

            #pragma unroll
            for (int k = 0; k < 16; ++k)
                result[k] += blockSum;
        }

        // assemble result
        // Each thread has 16 values to write, which are
        // now integer data (to avoid overflow). Instead of
        // each thread writing consecutive uint4s, the
        // approach shown here uses the shuffle instruction to
        // reformat the data inside the registers so that each
        // thread holds consecutive data to be written, allowing
        // larger contiguous segments to be assembled for writing.

        /*
            For example, data that needs to be written as

            GMEM[16] <- x0 x1 x2 x3 y0 y1 y2 y3 z0 z1 z2 z3 w0 w1 w2 w3

            is stored in registers (r0..r3), in four threads (0..3), as:

            threadId   0  1  2  3
              r0      x0 y0 z0 w0
              r1      x1 y1 z1 w1
              r2      x2 y2 z2 w2
              r3      x3 y3 z3 w3

            After applying shfl_xor operations to move data between registers r1..r3:

            threadId  00 01 10 11
                      x0 y0 z0 w0
             xor(01)->y1 x1 w1 z1
             xor(10)->z2 w2 x2 y2
             xor(11)->w3 z3 y3 x3

            Now x0..x3 and z0..z3 can be written out in order by all threads.

            In the current code, each register above actually represents
            four integers to be written as uint4's to GMEM.
        */

        result[4]  = shfl_xor(result[4] , 1, 32);
        result[5]  = shfl_xor(result[5] , 1, 32);
        result[6]  = shfl_xor(result[6] , 1, 32);
        result[7]  = shfl_xor(result[7] , 1, 32);

        result[8]  = shfl_xor(result[8] , 2, 32);
        result[9]  = shfl_xor(result[9] , 2, 32);
        result[10] = shfl_xor(result[10], 2, 32);
        result[11] = shfl_xor(result[11], 2, 32);

        result[12] = shfl_xor(result[12], 3, 32);
        result[13] = shfl_xor(result[13], 3, 32);
        result[14] = shfl_xor(result[14], 3, 32);
        result[15] = shfl_xor(result[15], 3, 32);

        uint4* integral_row = integral.row(blockIdx.x);
        uint4 output;

        ///////

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 8] = output;

        // Continuing from the above example,
        // this use of shfl_xor() places the y0..y3 and w0..w3 data
        // in order.

        #pragma unroll
        for (int i = 0; i < 16; ++i)
            result[i] = shfl_xor(result[i], 1, 32);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[threadIdx.x % 4 + (threadIdx.x / 4) * 16 + 4] = output;

        ///////

        if (threadIdx.x % 4 == 2)
            output = make_uint4(result[0], result[1], result[2], result[3]);

        if (threadIdx.x % 4 == 3)
            output = make_uint4(result[4], result[5], result[6], result[7]);

        if (threadIdx.x % 4 == 0)
            output = make_uint4(result[8], result[9], result[10], result[11]);

        if (threadIdx.x % 4 == 1)
            output = make_uint4(result[12], result[13], result[14], result[15]);

        integral_row[(threadIdx.x + 2) % 4 + (threadIdx.x / 4) * 16 + 12] = output;
    #endif
    }

    __host__ static void horisontal_pass_8u_shfl(const GlobPtr<uchar> src, GlobPtr<uint> integral, int rows, int cols, cudaStream_t stream)
    {
        // each thread handles 16 values, so use 1 block per row;
        // this is safe because the row step can never be less than 512 bytes
        const int block = cols / 16;

        // launch 1 block / row
        const int grid = rows;

        CV_CUDEV_SAFE_CALL( cudaFuncSetCacheConfig(horisontal_pass_8u_shfl_kernel, cudaFuncCachePreferL1) );

        GlobPtr<uint4> src4 = globPtr((uint4*) src.data, src.step);
        GlobPtr<uint4> integral4 = globPtr((uint4*) integral.data, integral.step);

        horisontal_pass_8u_shfl_kernel<<<grid, block, 0, stream>>>(src4, integral4);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }
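
    // Sizing note (illustrative arithmetic, not additional launch logic): each
    // thread consumes one uint4, i.e. 16 source bytes, so a 640-column row is
    // covered by 640 / 16 = 40 threads in its single block, with one block per
    // row. The caller below only takes this path when cols % 64 == 0, so the
    // per-row thread count is always a whole number of the 4-thread groups
    // used by the shfl_xor write stage of the kernel.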

    // vertical

    template <typename T>
    __global__ void vertical_pass(GlobPtr<T> integral, const int rows, const int cols)
    {
    #if CV_CUDEV_ARCH >= 300
        __shared__ T sums[32][9];

        const int tidx = blockIdx.x * blockDim.x + threadIdx.x;
        const int lane_id = tidx % 8;

        sums[threadIdx.x][threadIdx.y] = 0;
        __syncthreads();

        T stepSum = 0;

        int numBuckets = divUp(rows, blockDim.y);
        int y = threadIdx.y;

        while (numBuckets--)
        {
            T* p = integral.row(y) + tidx;

            T sum = (tidx < cols) && (y < rows) ? *p : 0;

            sums[threadIdx.x][threadIdx.y] = sum;
            __syncthreads();

            // The values have been placed into SMEM; a shfl scan reduces the
            // SMEM contents, reformatting them so the column sums are computed
            // within a warp and can then be read out properly.
            const int j = threadIdx.x % 8;
            const int k = threadIdx.x / 8 + threadIdx.y * 4;

            T partial_sum = sums[k][j];

            for (int i = 1; i <= 8; i *= 2)
            {
                T n = compatible_shfl_up(partial_sum, i, 32);

                if (lane_id >= i)
                    partial_sum += n;
            }

            sums[k][j] = partial_sum;
            __syncthreads();

            if (threadIdx.y > 0)
                sum += sums[threadIdx.x][threadIdx.y - 1];

            sum += stepSum;
            stepSum += sums[threadIdx.x][blockDim.y - 1];

            __syncthreads();

            if ((tidx < cols) && (y < rows))
            {
                *p = sum;
            }

            y += blockDim.y;
        }
    #else
        __shared__ T smem[32][32];
        __shared__ T prevVals[32];

        volatile T* smem_row = &smem[0][0] + 64 * threadIdx.y;

        if (threadIdx.y == 0)
            prevVals[threadIdx.x] = 0;

        __syncthreads();

        const int x = blockIdx.x * blockDim.x + threadIdx.x;

        int numBuckets = divUp(rows, 8 * 4);
        int offsetY = 0;

        while (numBuckets--)
        {
            const int curRowOffs = offsetY + threadIdx.y;

            T curElems[4];
            T temp[4];

            // load patch

            smem[threadIdx.y +  0][threadIdx.x] = 0.0f;
            smem[threadIdx.y +  8][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 16][threadIdx.x] = 0.0f;
            smem[threadIdx.y + 24][threadIdx.x] = 0.0f;

            if (x < cols)
            {
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        smem[threadIdx.y + i * 8][threadIdx.x] = integral(curRowOffs + i * 8, x);
                }
            }

            __syncthreads();

            // reduce

            curElems[0] = smem[threadIdx.x][threadIdx.y     ];
            curElems[1] = smem[threadIdx.x][threadIdx.y +  8];
            curElems[2] = smem[threadIdx.x][threadIdx.y + 16];
            curElems[3] = smem[threadIdx.x][threadIdx.y + 24];

            __syncthreads();

            temp[0] = curElems[0] = warpScanInclusive(curElems[0], smem_row, threadIdx.x);
            temp[1] = curElems[1] = warpScanInclusive(curElems[1], smem_row, threadIdx.x);
            temp[2] = curElems[2] = warpScanInclusive(curElems[2], smem_row, threadIdx.x);
            temp[3] = curElems[3] = warpScanInclusive(curElems[3], smem_row, threadIdx.x);

            curElems[0] += prevVals[threadIdx.y     ];
            curElems[1] += prevVals[threadIdx.y +  8];
            curElems[2] += prevVals[threadIdx.y + 16];
            curElems[3] += prevVals[threadIdx.y + 24];

            __syncthreads();

            if (threadIdx.x == 31)
            {
                prevVals[threadIdx.y     ] += temp[0];
                prevVals[threadIdx.y +  8] += temp[1];
                prevVals[threadIdx.y + 16] += temp[2];
                prevVals[threadIdx.y + 24] += temp[3];
            }

            smem[threadIdx.y     ][threadIdx.x] = curElems[0];
            smem[threadIdx.y +  8][threadIdx.x] = curElems[1];
            smem[threadIdx.y + 16][threadIdx.x] = curElems[2];
            smem[threadIdx.y + 24][threadIdx.x] = curElems[3];

            __syncthreads();

            // store patch

            if (x < cols)
            {
                // write the 4 scanned values back to the integral image
                for (int i = 0; i < 4; ++i)
                {
                    if (curRowOffs + i * 8 < rows)
                        integral(curRowOffs + i * 8, x) = smem[threadIdx.x][threadIdx.y + i * 8];
                }
            }

            __syncthreads();

            offsetY += 8 * 4;
        }
    #endif
    }

    template <typename T>
    __host__ void vertical_pass(const GlobPtr<T>& integral, int rows, int cols, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(cols, block.x));

        vertical_pass<<<grid, block, 0, stream>>>(integral, rows, cols);
        CV_CUDEV_SAFE_CALL( cudaGetLastError() );
    }
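
    // Launch-shape example (illustrative): for a 1000 x 1000 integral buffer
    // this starts divUp(1000, 32) = 32 blocks of 32 x 8 threads. Each block
    // owns 32 columns and iterates over the rows in buckets (divUp(1000, 8) =
    // 125 buckets of 8 rows on SM 3.0+, divUp(1000, 32) = 32 buckets of 32
    // rows otherwise), carrying the running column sums between buckets, so a
    // single launch covers any image height.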

    // integral

    template <class SrcPtr, typename D>
    __host__ void integral(const SrcPtr& src, const GlobPtr<D>& dst, int rows, int cols, cudaStream_t stream)
    {
        horizontal_pass(src, dst, rows, cols, stream);
        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }
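
    // Minimal usage sketch (illustrative; d_src, d_sum and the step values are
    // hypothetical buffers supplied by the caller):
    //
    //   GlobPtr<float> srcp = globPtr(d_src, src_step_in_bytes);
    //   GlobPtr<float> sump = globPtr(d_sum, sum_step_in_bytes);
    //   integral(srcp, sump, rows, cols, stream);
    //
    // dst must provide at least rows x cols elements of D; afterwards
    // dst(y, x) equals the inclusive sum of src over the rectangle
    // [0, y] x [0, x]. With the default stream the call synchronizes before
    // returning.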

    __host__ static void integral(const GlobPtr<uchar>& src, const GlobPtr<uint>& dst, int rows, int cols, cudaStream_t stream)
    {
        if (deviceSupports(FEATURE_SET_COMPUTE_30)
            && (cols % 64 == 0)
            && reinterpret_cast<intptr_t>(src.data) % 32 == 0
            && reinterpret_cast<intptr_t>(dst.data) % 32 == 0)
        {
            horisontal_pass_8u_shfl(src, dst, rows, cols, stream);
        }
        else
        {
            horizontal_pass(src, dst, rows, cols, stream);
        }

        vertical_pass(dst, rows, cols, stream);

        if (stream == 0)
            CV_CUDEV_SAFE_CALL( cudaDeviceSynchronize() );
    }

    __host__ __forceinline__ void integral(const GlobPtr<uchar>& src, const GlobPtr<int>& dst, int rows, int cols, cudaStream_t stream)
    {
        GlobPtr<uint> dstui = globPtr((uint*) dst.data, dst.step);
        integral(src, dstui, rows, cols, stream);
    }
}

}}

#endif