/*M///////////////////////////////////////////////////////////////////////////////////////
//
//  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
//
//  By downloading, copying, installing or using the software you agree to this license.
//  If you do not agree to this license, do not download, install,
//  copy or use the software.
//
//
//                           License Agreement
//                For Open Source Computer Vision Library
//
// Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
// Copyright (C) 2009, Willow Garage Inc., all rights reserved.
// Third party copyrights are property of their respective owners.
//
// Redistribution and use in source and binary forms, with or without modification,
// are permitted provided that the following conditions are met:
//
//   * Redistribution's of source code must retain the above copyright notice,
//     this list of conditions and the following disclaimer.
//
//   * Redistribution's in binary form must reproduce the above copyright notice,
//     this list of conditions and the following disclaimer in the documentation
//     and/or other materials provided with the distribution.
//
//   * The name of the copyright holders may not be used to endorse or promote products
//     derived from this software without specific prior written permission.
//
// This software is provided by the copyright holders and contributors "as is" and
// any express or implied warranties, including, but not limited to, the implied
// warranties of merchantability and fitness for a particular purpose are disclaimed.
// In no event shall the Intel Corporation or contributors be liable for any direct,
// indirect, incidental, special, exemplary, or consequential damages
// (including, but not limited to, procurement of substitute goods or services;
// loss of use, data, or profits; or business interruption) however caused
// and on any theory of liability, whether in contract, strict liability,
// or tort (including negligence or otherwise) arising in any way out of
// the use of this software, even if advised of the possibility of such damage.
//
//M*/

#if !defined CUDA_DISABLER

#include "opencv2/core/cuda/common.hpp"
#include "opencv2/core/cuda/functional.hpp"
#include "opencv2/core/cuda/emulation.hpp"
#include "opencv2/core/cuda/transform.hpp"

using namespace cv::cuda;
using namespace cv::cuda::device;

namespace hist
{
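    // 256-bin histogram of an 8-bit single-channel image.
    // Launch shape: 32x8 = 256 threads per block and one block per group of blockDim.y rows,
    // so every thread owns exactly one bin of the per-block shared-memory histogram.
    // Each row is consumed as 4-byte words (this assumes the row pitch keeps rows
    // word-aligned, which holds for pitched GpuMat allocations); the four packed pixels
    // are counted with shared-memory atomics, any leftover tail pixels are handled by
    // thread 0, and the per-block result is merged into the global histogram with one
    // global atomicAdd per non-empty bin.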
    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t step, int* hist)
    {
        __shared__ int shist[256];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        shist[tid] = 0;
        __syncthreads();

        if (y < rows)
        {
            const unsigned int* rowPtr = (const unsigned int*) (src + y * step);

            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                unsigned int data = rowPtr[x];

                Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);
                Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }

            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    unsigned int data = ((const uchar*)rowPtr)[x];
                    Emulation::smem::atomicAdd(&shist[data], 1);
                }
            }
        }

        __syncthreads();

        const int histVal = shist[tid];
        if (histVal > 0)
            ::atomicAdd(hist + tid, histVal);
    }

    void histogram256(PtrStepSzb src, int* hist, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, hist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }

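    // Masked variant: identical strategy, but the mask row is read in lock-step with the
    // image row and a pixel is counted only when its mask byte is non-zero.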
    __global__ void histogram256Kernel(const uchar* src, int cols, int rows, size_t srcStep, const uchar* mask, size_t maskStep, int* hist)
    {
        __shared__ int shist[256];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        shist[tid] = 0;
        __syncthreads();

        if (y < rows)
        {
            const unsigned int* rowPtr = (const unsigned int*) (src + y * srcStep);
            const unsigned int* maskRowPtr = (const unsigned int*) (mask + y * maskStep);

            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                unsigned int data = rowPtr[x];
                unsigned int m = maskRowPtr[x];

                if ((m >>  0) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >>  0) & 0xFFU], 1);

                if ((m >>  8) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >>  8) & 0xFFU], 1);

                if ((m >> 16) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 16) & 0xFFU], 1);

                if ((m >> 24) & 0xFFU)
                    Emulation::smem::atomicAdd(&shist[(data >> 24) & 0xFFU], 1);
            }

            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    unsigned int data = ((const uchar*)rowPtr)[x];
                    unsigned int m = ((const uchar*)maskRowPtr)[x];

                    if (m)
                        Emulation::smem::atomicAdd(&shist[data], 1);
                }
            }
        }

        __syncthreads();

        const int histVal = shist[tid];
        if (histVal > 0)
            ::atomicAdd(hist + tid, histVal);
    }

    void histogram256(PtrStepSzb src, PtrStepSzb mask, int* hist, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        histogram256Kernel<<<grid, block, 0, stream>>>(src.data, src.cols, src.rows, src.step, mask.data, mask.step, hist);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

/////////////////////////////////////////////////////////////////////////

namespace hist
{
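    // Increments the shared-memory bin for a pixel value that falls inside
    // [lowerLevel, upperLevel]; consecutive bins are binSize levels wide.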
    __device__ __forceinline__ void histEvenInc(int* shist, uint data, int binSize, int lowerLevel, int upperLevel)
    {
        if (data >= lowerLevel && data <= upperLevel)
        {
            const uint ind = (data - lowerLevel) / binSize;
            Emulation::smem::atomicAdd(shist + ind, 1);
        }
    }

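    // Evenly spaced histogram of an 8-bit single-channel image.
    // The per-block histogram lives in dynamically sized shared memory (binCount ints,
    // passed as the third launch parameter); otherwise the traversal mirrors
    // histogram256Kernel: 4-byte word reads plus a scalar tail handled by thread 0.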
    __global__ void histEven8u(const uchar* src, const size_t step, const int rows, const int cols,
                               int* hist, const int binCount, const int binSize, const int lowerLevel, const int upperLevel)
    {
        extern __shared__ int shist[];

        const int y = blockIdx.x * blockDim.y + threadIdx.y;
        const int tid = threadIdx.y * blockDim.x + threadIdx.x;

        if (tid < binCount)
            shist[tid] = 0;

        __syncthreads();

        if (y < rows)
        {
            const uchar* rowPtr = src + y * step;
            const uint* rowPtr4 = (uint*) rowPtr;

            const int cols_4 = cols / 4;
            for (int x = threadIdx.x; x < cols_4; x += blockDim.x)
            {
                const uint data = rowPtr4[x];

                histEvenInc(shist, (data >>  0) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >>  8) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >> 16) & 0xFFU, binSize, lowerLevel, upperLevel);
                histEvenInc(shist, (data >> 24) & 0xFFU, binSize, lowerLevel, upperLevel);
            }

            if (cols % 4 != 0 && threadIdx.x == 0)
            {
                for (int x = cols_4 * 4; x < cols; ++x)
                {
                    const uchar data = rowPtr[x];
                    histEvenInc(shist, data, binSize, lowerLevel, upperLevel);
                }
            }
        }

        __syncthreads();

        if (tid < binCount)
        {
            const int histVal = shist[tid];

            if (histVal > 0)
                ::atomicAdd(hist + tid, histVal);
        }
    }

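    // Host-side launcher: the bin width is computed with a rounding-up division (divUp)
    // so the binCount bins span the requested [lowerLevel, upperLevel] range, and the
    // dynamic shared memory for the per-block histogram is sized to binCount ints.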
    void histEven8u(PtrStepSzb src, int* hist, int binCount, int lowerLevel, int upperLevel, cudaStream_t stream)
    {
        const dim3 block(32, 8);
        const dim3 grid(divUp(src.rows, block.y));

        const int binSize = divUp(upperLevel - lowerLevel, binCount);

        const size_t smem_size = binCount * sizeof(int);

        histEven8u<<<grid, block, smem_size, stream>>>(src.data, src.step, src.rows, src.cols, hist, binCount, binSize, lowerLevel, upperLevel);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}

/////////////////////////////////////////////////////////////////////////

namespace hist
{
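    // Per-pixel lookup-table functor used by equalizeHist below: dst(x, y) = lut[src(x, y)].
    // The LUT itself is built on the device by buildLutKernel.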
    struct EqualizeHist : unary_function<uchar, uchar>
    {
        const uchar* lut;

        __host__ EqualizeHist(const uchar* _lut) : lut(_lut) {}

        __device__ __forceinline__ uchar operator ()(uchar val) const
        {
            return lut[val];
        }
    };
}

namespace cv { namespace cuda { namespace device
{
    template <> struct TransformFunctorTraits<hist::EqualizeHist> : DefaultTransformFunctorTraits<hist::EqualizeHist>
    {
        enum { smart_shift = 4 };
    };
}}}
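
// Note: smart_shift = 4 tunes the generic device::transform kernel for this functor; the
// intent (per the traits in transform.hpp) is to let each thread handle four packed 8-bit
// pixels per iteration, which suits a simple byte-wise LUT.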

namespace hist
{
    void equalizeHist(PtrStepSzb src, PtrStepSzb dst, const uchar* lut, cudaStream_t stream)
    {
        device::transform(src, dst, EqualizeHist(lut), WithOutMask(), stream);
    }

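    // Builds the 256-entry equalization LUT from a precomputed 256-bin histogram, using a
    // single block of 256 threads (one thread per bin):
    //   Step 1: find the first non-zero bin (its count is cdf_min) and zero it out in
    //           shared memory.
    //   Step 2: compute the inclusive prefix sum of the histogram in shared memory
    //           (Blelloch up-sweep/down-sweep scan, see the GPU Gems 3 reference below).
    //   Step 3: scale so that lut[i] = round((cdf(i) - cdf_min) * 255 / (size - cdf_min)),
    //           the standard histogram-equalization mapping, where size is the total pixel
    //           count and cdf_min is the smallest non-zero CDF value.
    // A fully flat histogram (every pixel has the same value) is handled as a special case.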
    __global__ void buildLutKernel(int* hist, unsigned char* lut, int size)
    {
        __shared__ int warp_smem[8];
        __shared__ int hist_smem[8][33];

#define HIST_SMEM_NO_BANK_CONFLICT(idx) hist_smem[(idx) >> 5][(idx) & 31]

        const int tId = threadIdx.x;
        const int warpId = threadIdx.x / 32;
        const int laneId = threadIdx.x % 32;

        // Step1 - Find minimum non-zero value in hist and make it zero
        HIST_SMEM_NO_BANK_CONFLICT(tId) = hist[tId];
        int nonZeroIdx = HIST_SMEM_NO_BANK_CONFLICT(tId) > 0 ? tId : 256;

        __syncthreads();

        for (int delta = 16; delta > 0; delta /= 2)
        {
#if __CUDACC_VER_MAJOR__ >= 9
            int shflVal = __shfl_down_sync(0xFFFFFFFF, nonZeroIdx, delta);
#else
            int shflVal = __shfl_down(nonZeroIdx, delta);
#endif
            if (laneId < delta)
                nonZeroIdx = min(nonZeroIdx, shflVal);
        }

        if (laneId == 0)
            warp_smem[warpId] = nonZeroIdx;

        __syncthreads();

        if (tId < 8)
        {
            int warpVal = warp_smem[tId];
            for (int delta = 4; delta > 0; delta /= 2)
            {
#if __CUDACC_VER_MAJOR__ >= 9
                int shflVal = __shfl_down_sync(0x000000FF, warpVal, delta);
#else
                int shflVal = __shfl_down(warpVal, delta);
#endif
                if (tId < delta)
                    warpVal = min(warpVal, shflVal);
            }
            if (tId == 0)
            {
                warp_smem[0] = warpVal; // warpVal - minimum index
            }
        }

        __syncthreads();

        const int minNonZeroIdx = warp_smem[0];
        const int minNonZeroVal = HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx);
        if (minNonZeroVal == size)
        {
            // This is a special case: the whole image has the same color

            lut[tId] = 0;
            if (tId == minNonZeroIdx)
                lut[tId] = minNonZeroIdx;
            return;
        }

        if (tId == 0)
            HIST_SMEM_NO_BANK_CONFLICT(minNonZeroIdx) = 0;

        __syncthreads();

        // Step2 - Inclusive sum
        // Algorithm from GPU Gems 3 (A Work-Efficient Parallel Scan)
        // https://developer.nvidia.com/gpugems/gpugems3/part-vi-gpu-computing/chapter-39-parallel-prefix-sum-scan-cuda

        // Step2 Phase1 - The Up-Sweep Phase
        for (int delta = 1; delta < 256; delta *= 2)
        {
            if (tId < 128 / delta)
            {
                int idx = 255 - 2 * tId * delta;
                HIST_SMEM_NO_BANK_CONFLICT(idx) += HIST_SMEM_NO_BANK_CONFLICT(idx - delta);
            }
            __syncthreads();
        }

        // Step2 Phase2 - The Down-Sweep Phase
        if (tId == 0)
            HIST_SMEM_NO_BANK_CONFLICT(255) = 0;

        for (int delta = 128; delta >= 1; delta /= 2)
        {
            if (tId < 128 / delta)
            {
                int rootIdx = 255 - tId * delta * 2;
                int leftIdx = rootIdx - delta;
                int tmp = HIST_SMEM_NO_BANK_CONFLICT(leftIdx);
                HIST_SMEM_NO_BANK_CONFLICT(leftIdx) = HIST_SMEM_NO_BANK_CONFLICT(rootIdx);
                HIST_SMEM_NO_BANK_CONFLICT(rootIdx) += tmp;
            }
            __syncthreads();
        }

        // Step2 Phase3 - Convert exclusive sum to inclusive sum
        int tmp = HIST_SMEM_NO_BANK_CONFLICT(tId);
        __syncthreads();
        if (tId >= 1)
            HIST_SMEM_NO_BANK_CONFLICT(tId - 1) = tmp;
        if (tId == 255)
            HIST_SMEM_NO_BANK_CONFLICT(tId) = tmp + hist[tId];
        __syncthreads();

        // Step3 - Scale values to build lut

        lut[tId] = saturate_cast<unsigned char>(HIST_SMEM_NO_BANK_CONFLICT(tId) * (255.0f / (size - minNonZeroVal)));

#undef HIST_SMEM_NO_BANK_CONFLICT
    }

    void buildLut(PtrStepSzi hist, PtrStepSzb lut, int size, cudaStream_t stream)
    {
        buildLutKernel<<<1, 256, 0, stream>>>(hist.data, lut.data, size);
        cudaSafeCall( cudaGetLastError() );

        if (stream == 0)
            cudaSafeCall( cudaDeviceSynchronize() );
    }
}
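
/*
   Minimal host-side sketch of how these launchers are typically driven (illustrative only;
   the public cv::cuda::calcHist / cv::cuda::equalizeHist wrappers are implemented on top of
   these in the host sources). `srcGpu`, `dstGpu`, `histGpu` and `lutGpu` are hypothetical
   buffers: an 8-bit single-channel GpuMat plus device allocations for the 256 int bins and
   the 256-byte LUT.

       cv::cuda::GpuMat srcGpu;                                    // CV_8UC1 input image
       cv::cuda::GpuMat dstGpu(srcGpu.size(), CV_8UC1);            // equalized output
       cv::cuda::GpuMat histGpu(1, 256, CV_32SC1);                 // 256 histogram bins
       cv::cuda::GpuMat lutGpu(1, 256, CV_8UC1);                   // equalization LUT
       cudaStream_t stream = 0;

       histGpu.setTo(cv::Scalar::all(0));                                        // bins must start at zero
       hist::histogram256(srcGpu, histGpu.ptr<int>(), stream);                   // fill the 256 bins
       hist::buildLut(histGpu, lutGpu, srcGpu.rows * srcGpu.cols, stream);       // histogram -> LUT
       hist::equalizeHist(srcGpu, dstGpu, lutGpu.data, stream);                  // apply the LUT per pixel
*/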

#endif /* CUDA_DISABLER */