1 //M*//////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                          License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000, Intel Corporation, all rights reserved.
14 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 //   * Redistribution's of source code must retain the above copyright notice,
21 //     this list of conditions and the following disclaimer.
22 //
23 //   * Redistribution's in binary form must reproduce the above copyright notice,
24 //     this list of conditions and the following disclaimer in the documentation
25 //     and/or other materials provided with the distribution.
26 //
27 //   * The name of the copyright holders may not be used to endorse or promote products
28 //     derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42 
43 /****************************************************************************************\
*    Very fast SAD-based (Sum-of-Absolute-Differences) stereo correspondence algorithm.  *
45 *    Contributed by Kurt Konolige                                                        *
46 \****************************************************************************************/
47 
48 #include "precomp.hpp"
49 #include <stdio.h>
50 #include <limits>
51 #include <vector>
52 #include "opencl_kernels_calib3d.hpp"
53 #include "opencv2/core/hal/intrin.hpp"
54 #include "opencv2/core/utils/buffer_area.private.hpp"
55 
56 namespace cv
57 {
58 
59 struct StereoBMParams
60 {
StereoBMParamscv::StereoBMParams61     StereoBMParams(int _numDisparities=64, int _SADWindowSize=21)
62     {
63         preFilterType = StereoBM::PREFILTER_XSOBEL;
64         preFilterSize = 9;
65         preFilterCap = 31;
66         SADWindowSize = _SADWindowSize;
67         minDisparity = 0;
68         numDisparities = _numDisparities > 0 ? _numDisparities : 64;
69         textureThreshold = 10;
70         uniquenessRatio = 15;
71         speckleRange = speckleWindowSize = 0;
72         roi1 = roi2 = Rect(0,0,0,0);
73         disp12MaxDiff = -1;
74         dispType = CV_16S;
75     }
76 
77     int preFilterType;
78     int preFilterSize;
79     int preFilterCap;
80     int SADWindowSize;
81     int minDisparity;
82     int numDisparities;
83     int textureThreshold;
84     int uniquenessRatio;
85     int speckleRange;
86     int speckleWindowSize;
87     Rect roi1, roi2;
88     int disp12MaxDiff;
89     int dispType;
90 
useShortscv::StereoBMParams91     inline bool useShorts() const
92     {
93         return preFilterCap <= 31 && SADWindowSize <= 21;
94     }
useFilterSpecklescv::StereoBMParams95     inline bool useFilterSpeckles() const
96     {
97         return speckleRange >= 0 && speckleWindowSize > 0;
98     }
useNormPrefiltercv::StereoBMParams99     inline bool useNormPrefilter() const
100     {
101         return preFilterType == StereoBM::PREFILTER_NORMALIZED_RESPONSE;
102     }
103 };
104 
105 #ifdef HAVE_OPENCL
ocl_prefilter_norm(InputArray _input,OutputArray _output,int winsize,int prefilterCap)106 static bool ocl_prefilter_norm(InputArray _input, OutputArray _output, int winsize, int prefilterCap)
107 {
108     ocl::Kernel k("prefilter_norm", ocl::calib3d::stereobm_oclsrc, cv::format("-D WSZ=%d", winsize));
109     if(k.empty())
110         return false;
111 
112     int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
113     scale_g *= scale_s;
114 
115     UMat input = _input.getUMat(), output;
116     _output.create(input.size(), input.type());
117     output = _output.getUMat();
118 
119     size_t globalThreads[3] = { (size_t)input.cols, (size_t)input.rows, 1 };
120 
121     k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols,
122         prefilterCap, scale_g, scale_s);
123 
124     return k.run(2, globalThreads, NULL, false);
125 }
126 #endif
127 
prefilterNorm(const Mat & src,Mat & dst,int winsize,int ftzero,int * buf)128 static void prefilterNorm( const Mat& src, Mat& dst, int winsize, int ftzero, int *buf )
129 {
130     int x, y, wsz2 = winsize/2;
131     int* vsum = buf + (wsz2 + 1);
132     int scale_g = winsize*winsize/8, scale_s = (1024 + scale_g)/(scale_g*2);
133     const int OFS = 256*5, TABSZ = OFS*2 + 256;
134     uchar tab[TABSZ];
135     const uchar* sptr = src.ptr();
136     int srcstep = (int)src.step;
137     Size size = src.size();
138 
139     scale_g *= scale_s;
140 
141     for( x = 0; x < TABSZ; x++ )
142         tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
143 
144     for( x = 0; x < size.width; x++ )
145         vsum[x] = (ushort)(sptr[x]*(wsz2 + 2));
146 
147     for( y = 1; y < wsz2; y++ )
148     {
149         for( x = 0; x < size.width; x++ )
150             vsum[x] = (ushort)(vsum[x] + sptr[srcstep*y + x]);
151     }
152 
153     for( y = 0; y < size.height; y++ )
154     {
155         const uchar* top = sptr + srcstep*MAX(y-wsz2-1,0);
156         const uchar* bottom = sptr + srcstep*MIN(y+wsz2,size.height-1);
157         const uchar* prev = sptr + srcstep*MAX(y-1,0);
158         const uchar* curr = sptr + srcstep*y;
159         const uchar* next = sptr + srcstep*MIN(y+1,size.height-1);
160         uchar* dptr = dst.ptr<uchar>(y);
161 
162         for( x = 0; x < size.width; x++ )
163             vsum[x] = (ushort)(vsum[x] + bottom[x] - top[x]);
164 
165         for( x = 0; x <= wsz2; x++ )
166         {
167             vsum[-x-1] = vsum[0];
168             vsum[size.width+x] = vsum[size.width-1];
169         }
170 
171         int sum = vsum[0]*(wsz2 + 1);
172         for( x = 1; x <= wsz2; x++ )
173             sum += vsum[x];
174 
175         int val = ((curr[0]*5 + curr[1] + prev[0] + next[0])*scale_g - sum*scale_s) >> 10;
176         dptr[0] = tab[val + OFS];
177 
178         for( x = 1; x < size.width-1; x++ )
179         {
180             sum += vsum[x+wsz2] - vsum[x-wsz2-1];
181             val = ((curr[x]*4 + curr[x-1] + curr[x+1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
182             dptr[x] = tab[val + OFS];
183         }
184 
185         sum += vsum[x+wsz2] - vsum[x-wsz2-1];
186         val = ((curr[x]*5 + curr[x-1] + prev[x] + next[x])*scale_g - sum*scale_s) >> 10;
187         dptr[x] = tab[val + OFS];
188     }
189 }
190 
191 #ifdef HAVE_OPENCL
ocl_prefilter_xsobel(InputArray _input,OutputArray _output,int prefilterCap)192 static bool ocl_prefilter_xsobel(InputArray _input, OutputArray _output, int prefilterCap)
193 {
194     ocl::Kernel k("prefilter_xsobel", ocl::calib3d::stereobm_oclsrc);
195     if(k.empty())
196         return false;
197 
198     UMat input = _input.getUMat(), output;
199     _output.create(input.size(), input.type());
200     output = _output.getUMat();
201 
202     size_t globalThreads[3] = { (size_t)input.cols, (size_t)input.rows, 1 };
203 
204     k.args(ocl::KernelArg::PtrReadOnly(input), ocl::KernelArg::PtrWriteOnly(output), input.rows, input.cols, prefilterCap);
205 
206     return k.run(2, globalThreads, NULL, false);
207 }
208 #endif
209 
210 static void
prefilterXSobel(const Mat & src,Mat & dst,int ftzero)211 prefilterXSobel( const Mat& src, Mat& dst, int ftzero )
212 {
213     int x, y;
214     const int OFS = 256*4, TABSZ = OFS*2 + 256;
215     uchar tab[TABSZ] = { 0 };
216     Size size = src.size();
217 
218     for( x = 0; x < TABSZ; x++ )
219         tab[x] = (uchar)(x - OFS < -ftzero ? 0 : x - OFS > ftzero ? ftzero*2 : x - OFS + ftzero);
220     uchar val0 = tab[0 + OFS];
221 
222     for( y = 0; y < size.height-1; y += 2 )
223     {
224         const uchar* srow1 = src.ptr<uchar>(y);
225         const uchar* srow0 = y > 0 ? srow1 - src.step : size.height > 1 ? srow1 + src.step : srow1;
226         const uchar* srow2 = y < size.height-1 ? srow1 + src.step : size.height > 1 ? srow1 - src.step : srow1;
227         const uchar* srow3 = y < size.height-2 ? srow1 + src.step*2 : srow1;
228         uchar* dptr0 = dst.ptr<uchar>(y);
229         uchar* dptr1 = dptr0 + dst.step;
230 
231         dptr0[0] = dptr0[size.width-1] = dptr1[0] = dptr1[size.width-1] = val0;
232         x = 1;
233 
234 #if CV_SIMD
235         {
236             v_int16 ftz = vx_setall_s16((short) ftzero);
237             v_int16 ftz2 = vx_setall_s16((short)(ftzero*2));
238             v_int16 z = vx_setzero_s16();
239 
240             for(; x <= (size.width - 1) - v_int16::nlanes; x += v_int16::nlanes)
241             {
242                 v_int16 s00 = v_reinterpret_as_s16(vx_load_expand(srow0 + x + 1));
243                 v_int16 s01 = v_reinterpret_as_s16(vx_load_expand(srow0 + x - 1));
244                 v_int16 s10 = v_reinterpret_as_s16(vx_load_expand(srow1 + x + 1));
245                 v_int16 s11 = v_reinterpret_as_s16(vx_load_expand(srow1 + x - 1));
246                 v_int16 s20 = v_reinterpret_as_s16(vx_load_expand(srow2 + x + 1));
247                 v_int16 s21 = v_reinterpret_as_s16(vx_load_expand(srow2 + x - 1));
248                 v_int16 s30 = v_reinterpret_as_s16(vx_load_expand(srow3 + x + 1));
249                 v_int16 s31 = v_reinterpret_as_s16(vx_load_expand(srow3 + x - 1));
250 
251                 v_int16 d0 = s00 - s01;
252                 v_int16 d1 = s10 - s11;
253                 v_int16 d2 = s20 - s21;
254                 v_int16 d3 = s30 - s31;
255 
256                 v_uint16 v0 = v_reinterpret_as_u16(v_max(v_min(d0 + d1 + d1 + d2 + ftz, ftz2), z));
257                 v_uint16 v1 = v_reinterpret_as_u16(v_max(v_min(d1 + d2 + d2 + d3 + ftz, ftz2), z));
258 
259                 v_pack_store(dptr0 + x, v0);
260                 v_pack_store(dptr1 + x, v1);
261             }
262         }
263 #endif
264 
265         for( ; x < size.width-1; x++ )
266         {
267             int d0 = srow0[x+1] - srow0[x-1], d1 = srow1[x+1] - srow1[x-1],
268             d2 = srow2[x+1] - srow2[x-1], d3 = srow3[x+1] - srow3[x-1];
269             int v0 = tab[d0 + d1*2 + d2 + OFS];
270             int v1 = tab[d1 + d2*2 + d3 + OFS];
271             dptr0[x] = (uchar)v0;
272             dptr1[x] = (uchar)v1;
273         }
274     }
275 
276     for( ; y < size.height; y++ )
277     {
278         uchar* dptr = dst.ptr<uchar>(y);
279         x = 0;
280 #if CV_SIMD
281         {
282             v_uint8 val0_16 = vx_setall_u8(val0);
283             for(; x <= size.width-v_uint8::nlanes; x+=v_uint8::nlanes)
284                 v_store(dptr + x, val0_16);
285         }
286 #endif
287         for(; x < size.width; x++ )
288             dptr[x] = val0;
289     }
290 }
291 
292 
// Left-shift applied to disparity values stored as 16-bit / 32-bit integers.
static const int DISPARITY_SHIFT_16S = 4;
static const int DISPARITY_SHIFT_32S = 8;

// Maps a disparity storage type to its shift. The primary template has no
// 'value' member, so only the specialized types below can be used.
template <typename T>
struct dispShiftTemplate
{
};

// 16-bit disparity storage.
template <>
struct dispShiftTemplate<short>
{
    enum { value = DISPARITY_SHIFT_16S };
};

// 32-bit disparity storage.
template <>
struct dispShiftTemplate<int>
{
    enum { value = DISPARITY_SHIFT_32S };
};
311 
// Combines an integer disparity v1 with a sub-pixel correction v2/d into a
// fixed-point value. Only the <short> and <int> specializations are defined.
// A zero denominator d yields no correction.
template <typename T>
inline T dispDescale(int /*v1*/, int /*v2*/, int /*d*/);

// 16-bit result: the 8-bit fixed-point value is rounded and reduced to
// 4 fractional bits.
template <>
inline short dispDescale<short>(int v1, int v2, int d)
{
    const int subpixel = (d != 0) ? v2*256/d : 0;
    const int fixed8 = v1*256 + subpixel;
    return (short)((fixed8 + 15) >> 4);
}

// 32-bit result: keeps all 8 fractional bits.
template <>
inline int dispDescale<int>(int v1, int v2, int d)
{
    const int subpixel = (d != 0) ? v2*256/d : 0;
    return (int)(v1*256 + subpixel); // no rounding term: the value is converted to float later
}
326 
327 
328 class BufferBM
329 {
330     static const int TABSZ = 256;
331 public:
332     std::vector<int*> sad;
333     std::vector<int*> hsad;
334     std::vector<int*> htext;
335     std::vector<uchar*> cbuf0;
336     std::vector<ushort*> sad_short;
337     std::vector<ushort*> hsad_short;
338     int *prefilter[2];
339     uchar tab[TABSZ];
340 private:
341     utils::BufferArea area;
342 
343 public:
BufferBM(size_t nstripes,size_t width,size_t height,const StereoBMParams & params)344     BufferBM(size_t nstripes, size_t width, size_t height, const StereoBMParams& params)
345         : sad(nstripes, NULL),
346         hsad(nstripes, NULL),
347         htext(nstripes, NULL),
348         cbuf0(nstripes, NULL),
349         sad_short(nstripes, NULL),
350         hsad_short(nstripes, NULL),
351         prefilter()
352     {
353         const int wsz = params.SADWindowSize;
354         const int ndisp = params.numDisparities;
355         const int ftzero = params.preFilterCap;
356         for (size_t i = 0; i < nstripes; ++i)
357         {
358             // 1D: [1][  ndisp  ][1]
359 #if CV_SIMD
360             if (params.useShorts())
361                 area.allocate(sad_short[i], ndisp + 2);
362             else
363 #endif
364                 area.allocate(sad[i], ndisp + 2);
365 
366             // 2D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ] * [ ndisp ]
367 #if CV_SIMD
368             if (params.useShorts())
369                 area.allocate(hsad_short[i], (height + wsz + 2) * ndisp);
370             else
371 #endif
372                 area.allocate(hsad[i], (height + wsz + 2) * ndisp);
373 
374             // 1D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ]
375             area.allocate(htext[i], (height + wsz + 2));
376 
377             // 3D: [ wsz/2 + 1 ][   height   ][ wsz/2 + 1 ] * [ ndisp ] * [ wsz/2 + 1 ][ wsz/2 + 1 ]
378             area.allocate(cbuf0[i], ((height + wsz + 2) * ndisp * (wsz + 2) + 256));
379         }
380         if (params.useNormPrefilter())
381         {
382             for (size_t i = 0; i < 2; ++i)
383                 area.allocate(prefilter[i], width + params.preFilterSize + 2);
384         }
385         area.commit();
386 
387         // static table
388         for (int x = 0; x < TABSZ; x++)
389             tab[x] = (uchar)std::abs(x - ftzero);
390     }
391 };
392 
393 #if CV_SIMD
394 template <typename dType>
findStereoCorrespondenceBM_SIMD(const Mat & left,const Mat & right,Mat & disp,Mat & cost,const StereoBMParams & state,int _dy0,int _dy1,const BufferBM & bufX,size_t bufNum)395 static void findStereoCorrespondenceBM_SIMD( const Mat& left, const Mat& right,
396                                             Mat& disp, Mat& cost, const StereoBMParams& state,
397                                             int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
398 {
399     int x, y, d;
400     int wsz = state.SADWindowSize, wsz2 = wsz/2;
401     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
402     int ndisp = state.numDisparities;
403     int mindisp = state.minDisparity;
404     int lofs = MAX(ndisp - 1 + mindisp, 0);
405     int rofs = -MIN(ndisp - 1 + mindisp, 0);
406     int width = left.cols, height = left.rows;
407     int width1 = width - rofs - ndisp + 1;
408     int textureThreshold = state.textureThreshold;
409     int uniquenessRatio = state.uniquenessRatio;
410     const int disp_shift = dispShiftTemplate<dType>::value;
411     dType FILTERED = (dType)((mindisp - 1) << disp_shift);
412 
413     ushort *hsad, *hsad_sub;
414     uchar *cbuf;
415     const uchar* lptr0 = left.ptr() + lofs;
416     const uchar* rptr0 = right.ptr() + rofs;
417     const uchar *lptr, *lptr_sub, *rptr;
418     dType* dptr = disp.ptr<dType>();
419     int sstep = (int)left.step;
420     int dstep = (int)(disp.step/sizeof(dptr[0]));
421     int cstep = (height + dy0 + dy1)*ndisp;
422     short costbuf = 0;
423     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
424     const uchar * tab = bufX.tab;
425     short v_seq[v_int16::nlanes];
426     for (short i = 0; i < v_int16::nlanes; ++i)
427         v_seq[i] = i;
428 
429     ushort *sad = bufX.sad_short[bufNum] + 1;
430     ushort *hsad0 = bufX.hsad_short[bufNum] + (wsz2 + 1) * ndisp;
431     int *htext = bufX.htext[bufNum] + (wsz2 + 1);
432     uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;
433 
434     // initialize buffers
435     memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
436     memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
437     memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));
438 
439     for( x = -wsz2-1; x < wsz2; x++ )
440     {
441         hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
442         lptr = lptr0 + MIN(MAX(x, -lofs), width-lofs-1) - dy0*sstep;
443         rptr = rptr0 + MIN(MAX(x, -rofs), width-rofs-ndisp) - dy0*sstep;
444 
445         for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
446         {
447             int lval = lptr[0];
448             v_uint8 lv = vx_setall_u8((uchar)lval);
449             for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
450             {
451                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
452                 v_store(cbuf + d, diff);
453                 v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
454                 v_store(hsad + d + v_uint16::nlanes, vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff));
455             }
456             if( d <= ndisp - v_uint16::nlanes )
457             {
458                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
459                 v_store_low(cbuf + d, diff);
460                 v_store(hsad + d, vx_load(hsad + d) + v_expand_low(diff));
461                 d += v_uint16::nlanes;
462             }
463             for( ; d < ndisp; d++ )
464             {
465                 int diff = abs(lval - rptr[d]);
466                 cbuf[d] = (uchar)diff;
467                 hsad[d] += (ushort)diff;
468             }
469             htext[y] += tab[lval];
470         }
471     }
472 
473     // initialize the left and right borders of the disparity map
474     for( y = 0; y < height; y++ )
475     {
476         for( x = 0; x < lofs; x++ )
477             dptr[y*dstep + x] = FILTERED;
478         for( x = lofs + width1; x < width; x++ )
479             dptr[y*dstep + x] = FILTERED;
480     }
481     dptr += lofs;
482 
483     for( x = 0; x < width1; x++, dptr++ )
484     {
485         short* costptr = cost.data ? cost.ptr<short>() + lofs + x : &costbuf;
486         int x0 = x - wsz2 - 1, x1 = x + wsz2;
487         const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
488         cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
489         hsad = hsad0 - dy0*ndisp;
490         lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
491         lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
492         rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;
493 
494         for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
495             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
496         {
497             int lval = lptr[0];
498             v_uint8 lv = vx_setall_u8((uchar)lval);
499             for( d = 0; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
500             {
501                 v_uint8 diff = v_absdiff(lv, vx_load(rptr + d));
502                 v_int8 cbs = v_reinterpret_as_s8(vx_load(cbuf_sub + d));
503                 v_store(cbuf + d, diff);
504                 v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - v_expand_low(cbs)));
505                 v_store(hsad + d + v_uint16::nlanes, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d + v_uint16::nlanes) + v_expand_high(diff)) - v_expand_high(cbs)));
506             }
507             if( d <= ndisp - v_uint16::nlanes)
508             {
509                 v_uint8 diff = v_absdiff(lv, vx_load_low(rptr + d));
510                 v_store_low(cbuf + d, diff);
511                 v_store(hsad + d, v_reinterpret_as_u16(v_reinterpret_as_s16(vx_load(hsad + d) + v_expand_low(diff)) - vx_load_expand((schar*)cbuf_sub + d)));
512                 d += v_uint16::nlanes;
513             }
514             for( ; d < ndisp; d++ )
515             {
516                 int diff = abs(lval - rptr[d]);
517                 cbuf[d] = (uchar)diff;
518                 hsad[d] = hsad[d] + (ushort)diff - cbuf_sub[d];
519             }
520             htext[y] += tab[lval] - tab[lptr_sub[0]];
521         }
522 
523         // fill borders
524         for( y = dy1; y <= wsz2; y++ )
525             htext[height+y] = htext[height+dy1-1];
526         for( y = -wsz2-1; y < -dy0; y++ )
527             htext[y] = htext[-dy0];
528 
529         // initialize sums
530         for( d = 0; d < ndisp; d++ )
531             sad[d] = (ushort)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
532 
533         hsad = hsad0 + (1 - dy0)*ndisp;
534         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
535         {
536             for( d = 0; d <= ndisp-2*v_uint16::nlanes; d += 2*v_uint16::nlanes )
537             {
538                 v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
539                 v_store(sad + d + v_uint16::nlanes, vx_load(sad + d + v_uint16::nlanes) + vx_load(hsad + d + v_uint16::nlanes));
540             }
541             if( d <= ndisp-v_uint16::nlanes )
542             {
543                 v_store(sad + d, vx_load(sad + d) + vx_load(hsad + d));
544                 d += v_uint16::nlanes;
545             }
546             if( d <= ndisp-v_uint16::nlanes/2 )
547             {
548                 v_store_low(sad + d, vx_load_low(sad + d) + vx_load_low(hsad + d));
549                 d += v_uint16::nlanes/2;
550             }
551             for( ; d < ndisp; d++ )
552                 sad[d] = sad[d] + hsad[d];
553         }
554         int tsum = 0;
555         for( y = -wsz2-1; y < wsz2; y++ )
556             tsum += htext[y];
557 
558         // finally, start the real processing
559         for( y = 0; y < height; y++ )
560         {
561             int minsad = INT_MAX, mind = -1;
562             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
563             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
564             v_int16 minsad8 = vx_setall_s16(SHRT_MAX);
565             v_int16 mind8 = vx_setall_s16(0);
566 
567             for( d = 0; d <= ndisp - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
568             {
569                 v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
570                 v_store(sad + d, v_reinterpret_as_u16(sad8));
571                 mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
572                 minsad8 = v_min(minsad8, sad8);
573 
574                 sad8 = v_reinterpret_as_s16(vx_load(hsad + d + v_int16::nlanes)) - v_reinterpret_as_s16(vx_load(hsad_sub + d + v_int16::nlanes)) + v_reinterpret_as_s16(vx_load(sad + d + v_int16::nlanes));
575                 v_store(sad + d + v_int16::nlanes, v_reinterpret_as_u16(sad8));
576                 mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d+v_int16::nlanes));
577                 minsad8 = v_min(minsad8, sad8);
578             }
579             if( d <= ndisp - v_int16::nlanes )
580             {
581                 v_int16 sad8 = v_reinterpret_as_s16(vx_load(hsad + d)) - v_reinterpret_as_s16(vx_load(hsad_sub + d)) + v_reinterpret_as_s16(vx_load(sad + d));
582                 v_store(sad + d, v_reinterpret_as_u16(sad8));
583                 mind8 = v_max(mind8, (minsad8 > sad8) & vx_setall_s16((short)d));
584                 minsad8 = v_min(minsad8, sad8);
585                 d += v_int16::nlanes;
586             }
587             minsad = v_reduce_min(minsad8);
588             v_int16 v_mask = (vx_setall_s16((short)minsad) == minsad8);
589             mind = v_reduce_min(((mind8+vx_load(v_seq)) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
590             for( ; d < ndisp; d++ )
591             {
592                 int sad8 = (int)(hsad[d]) - hsad_sub[d] + sad[d];
593                 sad[d] = (ushort)sad8;
594                 if(minsad > sad8)
595                 {
596                     mind = d;
597                     minsad = sad8;
598                 }
599             }
600 
601             tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
602             if( tsum < textureThreshold )
603             {
604                 dptr[y*dstep] = FILTERED;
605                 continue;
606             }
607 
608             if( uniquenessRatio > 0 )
609             {
610                 int thresh = minsad + (minsad * uniquenessRatio/100);
611                 v_int32 thresh4 = vx_setall_s32(thresh + 1);
612                 v_int32 d1 = vx_setall_s32(mind-1), d2 = vx_setall_s32(mind+1);
613                 v_int32 dd_4 = vx_setall_s32(v_int32::nlanes);
614                 v_int32 d4 = vx_load_expand(v_seq);
615 
616                 for( d = 0; d <= ndisp - v_int16::nlanes; d += v_int16::nlanes )
617                 {
618                     v_int32 sad4_l, sad4_h;
619                     v_expand(v_reinterpret_as_s16(vx_load(sad + d)), sad4_l, sad4_h);
620                     if( v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))) )
621                         break;
622                     d4 += dd_4;
623                     if( v_check_any((thresh4 > sad4_h) & ((d1 > d4) | (d4 > d2))) )
624                         break;
625                     d4 += dd_4;
626                 }
627                 if( d <= ndisp - v_int16::nlanes )
628                 {
629                     dptr[y*dstep] = FILTERED;
630                     continue;
631                 }
632                 if( d <= ndisp - v_int32::nlanes )
633                 {
634                     v_int32 sad4_l = vx_load_expand((short*)sad + d);
635                     if (v_check_any((thresh4 > sad4_l) & ((d1 > d4) | (d4 > d2))))
636                     {
637                         dptr[y*dstep] = FILTERED;
638                         continue;
639                     }
640                     d += v_int16::nlanes;
641                 }
642                 for( ; d < ndisp; d++ )
643                 {
644                     if( (thresh + 1) > sad[d] && ((mind - 1) > d || d > (mind + 1)) )
645                         break;
646                 }
647                 if( d < ndisp )
648                 {
649                     dptr[y*dstep] = FILTERED;
650                     continue;
651                 }
652             }
653 
654             if( 0 < mind && mind < ndisp - 1 )
655             {
656                 int p = sad[mind+1], n = sad[mind-1];
657                 d = p + n - 2*sad[mind] + std::abs(p - n);
658                 dptr[y*dstep] = dispDescale<dType>(ndisp - mind - 1 + mindisp, p-n, d);
659             }
660             else
661                 dptr[y*dstep] = dispDescale<dType>(ndisp - mind - 1 + mindisp, 0, 0);
662             costptr[y*coststep] = sad[mind];
663         }
664     }
665 }
666 #endif
667 
668 template <typename mType>
669 static void
findStereoCorrespondenceBM(const Mat & left,const Mat & right,Mat & disp,Mat & cost,const StereoBMParams & state,int _dy0,int _dy1,const BufferBM & bufX,size_t bufNum)670 findStereoCorrespondenceBM( const Mat& left, const Mat& right,
671                             Mat& disp, Mat& cost, const StereoBMParams& state,
672                             int _dy0, int _dy1, const BufferBM & bufX, size_t bufNum )
673 {
674 
675     int x, y, d;
676     int wsz = state.SADWindowSize, wsz2 = wsz/2;
677     int dy0 = MIN(_dy0, wsz2+1), dy1 = MIN(_dy1, wsz2+1);
678     int ndisp = state.numDisparities;
679     int mindisp = state.minDisparity;
680     int lofs = MAX(ndisp - 1 + mindisp, 0);
681     int rofs = -MIN(ndisp - 1 + mindisp, 0);
682     int width = left.cols, height = left.rows;
683     int width1 = width - rofs - ndisp + 1;
684     int textureThreshold = state.textureThreshold;
685     int uniquenessRatio = state.uniquenessRatio;
686     const int disp_shift = dispShiftTemplate<mType>::value;
687     mType FILTERED = (mType)((mindisp - 1) << disp_shift);
688 
689     int *hsad, *hsad_sub;
690     uchar *cbuf;
691     const uchar* lptr0 = left.ptr() + lofs;
692     const uchar* rptr0 = right.ptr() + rofs;
693     const uchar *lptr, *lptr_sub, *rptr;
694     mType* dptr = disp.ptr<mType>();
695     int sstep = (int)left.step;
696     int dstep = (int)(disp.step/sizeof(dptr[0]));
697     int cstep = (height+dy0+dy1)*ndisp;
698     int costbuf = 0;
699     int coststep = cost.data ? (int)(cost.step/sizeof(costbuf)) : 0;
700     const uchar * tab = bufX.tab;
701 
702 #if CV_SIMD
703     int v_seq[v_int32::nlanes];
704     for (int i = 0; i < v_int32::nlanes; ++i)
705         v_seq[i] = i;
706     v_int32 d0_4 = vx_load(v_seq), dd_4 = vx_setall_s32(v_int32::nlanes);
707 #endif
708 
709     int *sad = bufX.sad[bufNum] + 1;
710     int *hsad0 = bufX.hsad[bufNum] + (wsz2 + 1) * ndisp;
711     int *htext = bufX.htext[bufNum] + (wsz2 + 1);
712     uchar *cbuf0 = bufX.cbuf0[bufNum] + (wsz2 + 1) * ndisp;
713 
714     // initialize buffers
715     memset(sad - 1, 0, (ndisp + 2) * sizeof(sad[0]));
716     memset(hsad0 - dy0 * ndisp, 0, (height + wsz + 2) * ndisp * sizeof(hsad[0]));
717     memset(htext - dy0, 0, (height + wsz + 2) * sizeof(htext[0]));
718 
719     for( x = -wsz2-1; x < wsz2; x++ )
720     {
721         hsad = hsad0 - dy0*ndisp; cbuf = cbuf0 + (x + wsz2 + 1)*cstep - dy0*ndisp;
722         lptr = lptr0 + std::min(std::max(x, -lofs), width-lofs-1) - dy0*sstep;
723         rptr = rptr0 + std::min(std::max(x, -rofs), width-rofs-ndisp) - dy0*sstep;
724         for( y = -dy0; y < height + dy1; y++, hsad += ndisp, cbuf += ndisp, lptr += sstep, rptr += sstep )
725         {
726             int lval = lptr[0];
727             d = 0;
728 #if CV_SIMD
729             {
730                 v_uint8 lv = vx_setall_u8((uchar)lval);
731 
732                 for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
733                 {
734                     v_uint8 rv = vx_load(rptr + d);
735                     v_int32 hsad_0 = vx_load(hsad + d);
736                     v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
737                     v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
738                     v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
739                     v_uint8 diff = v_absdiff(lv, rv);
740                     v_store(cbuf + d, diff);
741 
742                     v_uint16 diff0, diff1;
743                     v_uint32 diff00, diff01, diff10, diff11;
744                     v_expand(diff, diff0, diff1);
745                     v_expand(diff0, diff00, diff01);
746                     v_expand(diff1, diff10, diff11);
747 
748                     hsad_0 += v_reinterpret_as_s32(diff00);
749                     hsad_1 += v_reinterpret_as_s32(diff01);
750                     hsad_2 += v_reinterpret_as_s32(diff10);
751                     hsad_3 += v_reinterpret_as_s32(diff11);
752 
753                     v_store(hsad + d, hsad_0);
754                     v_store(hsad + d + v_int32::nlanes, hsad_1);
755                     v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
756                     v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
757                 }
758             }
759 #endif
760             for( ; d < ndisp; d++ )
761             {
762                 int diff = std::abs(lval - rptr[d]);
763                 cbuf[d] = (uchar)diff;
764                 hsad[d] = (int)(hsad[d] + diff);
765             }
766             htext[y] += tab[lval];
767         }
768     }
769 
770     // initialize the left and right borders of the disparity map
771     for( y = 0; y < height; y++ )
772     {
773         for( x = 0; x < lofs; x++ )
774             dptr[y*dstep + x] = FILTERED;
775         for( x = lofs + width1; x < width; x++ )
776             dptr[y*dstep + x] = FILTERED;
777     }
778     dptr += lofs;
779 
780     for( x = 0; x < width1; x++, dptr++ )
781     {
782         int* costptr = cost.data ? cost.ptr<int>() + lofs + x : &costbuf;
783         int x0 = x - wsz2 - 1, x1 = x + wsz2;
784         const uchar* cbuf_sub = cbuf0 + ((x0 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
785         cbuf = cbuf0 + ((x1 + wsz2 + 1) % (wsz + 1))*cstep - dy0*ndisp;
786         hsad = hsad0 - dy0*ndisp;
787         lptr_sub = lptr0 + MIN(MAX(x0, -lofs), width-1-lofs) - dy0*sstep;
788         lptr = lptr0 + MIN(MAX(x1, -lofs), width-1-lofs) - dy0*sstep;
789         rptr = rptr0 + MIN(MAX(x1, -rofs), width-ndisp-rofs) - dy0*sstep;
790 
791         for( y = -dy0; y < height + dy1; y++, cbuf += ndisp, cbuf_sub += ndisp,
792             hsad += ndisp, lptr += sstep, lptr_sub += sstep, rptr += sstep )
793         {
794             int lval = lptr[0];
795             d = 0;
796 #if CV_SIMD
797             {
798                 v_uint8 lv = vx_setall_u8((uchar)lval);
799                 for( ; d <= ndisp - v_uint8::nlanes; d += v_uint8::nlanes )
800                 {
801                     v_uint8 rv = vx_load(rptr + d);
802                     v_int32 hsad_0 = vx_load(hsad + d);
803                     v_int32 hsad_1 = vx_load(hsad + d + v_int32::nlanes);
804                     v_int32 hsad_2 = vx_load(hsad + d + 2*v_int32::nlanes);
805                     v_int32 hsad_3 = vx_load(hsad + d + 3*v_int32::nlanes);
806                     v_uint8 cbs = vx_load(cbuf_sub + d);
807                     v_uint8 diff = v_absdiff(lv, rv);
808                     v_store(cbuf + d, diff);
809 
810                     v_uint16 diff0, diff1, cbs0, cbs1;
811                     v_int32 diff00, diff01, diff10, diff11, cbs00, cbs01, cbs10, cbs11;
812                     v_expand(diff, diff0, diff1);
813                     v_expand(cbs, cbs0, cbs1);
814                     v_expand(v_reinterpret_as_s16(diff0), diff00, diff01);
815                     v_expand(v_reinterpret_as_s16(diff1), diff10, diff11);
816                     v_expand(v_reinterpret_as_s16(cbs0), cbs00, cbs01);
817                     v_expand(v_reinterpret_as_s16(cbs1), cbs10, cbs11);
818 
819                     v_int32 diff_0 = diff00 - cbs00;
820                     v_int32 diff_1 = diff01 - cbs01;
821                     v_int32 diff_2 = diff10 - cbs10;
822                     v_int32 diff_3 = diff11 - cbs11;
823                     hsad_0 += diff_0;
824                     hsad_1 += diff_1;
825                     hsad_2 += diff_2;
826                     hsad_3 += diff_3;
827 
828                     v_store(hsad + d, hsad_0);
829                     v_store(hsad + d + v_int32::nlanes, hsad_1);
830                     v_store(hsad + d + 2*v_int32::nlanes, hsad_2);
831                     v_store(hsad + d + 3*v_int32::nlanes, hsad_3);
832                 }
833             }
834 #endif
835             for( ; d < ndisp; d++ )
836             {
837                 int diff = std::abs(lval - rptr[d]);
838                 cbuf[d] = (uchar)diff;
839                 hsad[d] = hsad[d] + diff - cbuf_sub[d];
840             }
841             htext[y] += tab[lval] - tab[lptr_sub[0]];
842         }
843 
844         // fill borders
845         for( y = dy1; y <= wsz2; y++ )
846             htext[height+y] = htext[height+dy1-1];
847         for( y = -wsz2-1; y < -dy0; y++ )
848             htext[y] = htext[-dy0];
849 
850         // initialize sums
851         for( d = 0; d < ndisp; d++ )
852             sad[d] = (int)(hsad0[d-ndisp*dy0]*(wsz2 + 2 - dy0));
853 
854         hsad = hsad0 + (1 - dy0)*ndisp;
855         for( y = 1 - dy0; y < wsz2; y++, hsad += ndisp )
856         {
857             d = 0;
858 #if CV_SIMD
859             {
860                 for( d = 0; d <= ndisp-2*v_int32::nlanes; d += 2*v_int32::nlanes )
861                 {
862                     v_int32 s0 = vx_load(sad + d);
863                     v_int32 s1 = vx_load(sad + d + v_int32::nlanes);
864                     v_int32 t0 = vx_load(hsad + d);
865                     v_int32 t1 = vx_load(hsad + d + v_int32::nlanes);
866                     s0 += t0;
867                     s1 += t1;
868                     v_store(sad + d, s0);
869                     v_store(sad + d + v_int32::nlanes, s1);
870                 }
871             }
872 #endif
873             for( ; d < ndisp; d++ )
874                 sad[d] = (int)(sad[d] + hsad[d]);
875         }
876         int tsum = 0;
877         for( y = -wsz2-1; y < wsz2; y++ )
878             tsum += htext[y];
879 
880         // finally, start the real processing
881         for( y = 0; y < height; y++ )
882         {
883             int minsad = INT_MAX, mind = -1;
884             hsad = hsad0 + MIN(y + wsz2, height+dy1-1)*ndisp;
885             hsad_sub = hsad0 + MAX(y - wsz2 - 1, -dy0)*ndisp;
886             d = 0;
887 #if CV_SIMD
888             {
889                 v_int32 minsad4 = vx_setall_s32(INT_MAX);
890                 v_int32 mind4 = vx_setall_s32(0), d4 = d0_4;
891 
892                 for( ; d <= ndisp - 2*v_int32::nlanes; d += 2*v_int32::nlanes )
893                 {
894                     v_int32 sad4 = vx_load(sad + d) + vx_load(hsad + d) - vx_load(hsad_sub + d);
895                     v_store(sad + d, sad4);
896                     mind4 = v_select(minsad4 > sad4, d4, mind4);
897                     minsad4 = v_min(minsad4, sad4);
898                     d4 += dd_4;
899 
900                     sad4 = vx_load(sad + d + v_int32::nlanes) + vx_load(hsad + d + v_int32::nlanes) - vx_load(hsad_sub + d + v_int32::nlanes);
901                     v_store(sad + d + v_int32::nlanes, sad4);
902                     mind4 = v_select(minsad4 > sad4, d4, mind4);
903                     minsad4 = v_min(minsad4, sad4);
904                     d4 += dd_4;
905                 }
906 
907                 int CV_DECL_ALIGNED(CV_SIMD_WIDTH) minsad_buf[v_int32::nlanes], mind_buf[v_int32::nlanes];
908                 v_store(minsad_buf, minsad4);
909                 v_store(mind_buf, mind4);
910                 for (int i = 0; i < v_int32::nlanes; ++i)
911                     if(minsad_buf[i] < minsad || (minsad == minsad_buf[i] && mind_buf[i] < mind)) { minsad = minsad_buf[i]; mind = mind_buf[i]; }
912             }
913 #endif
914             for( ; d < ndisp; d++ )
915             {
916                 int currsad = sad[d] + hsad[d] - hsad_sub[d];
917                 sad[d] = currsad;
918                 if( currsad < minsad )
919                 {
920                     minsad = currsad;
921                     mind = d;
922                 }
923             }
924 
925             tsum += htext[y + wsz2] - htext[y - wsz2 - 1];
926             if( tsum < textureThreshold )
927             {
928                 dptr[y*dstep] = FILTERED;
929                 continue;
930             }
931 
932             if( uniquenessRatio > 0 )
933             {
934                 int thresh = minsad + (minsad * uniquenessRatio/100);
935                 for( d = 0; d < ndisp; d++ )
936                 {
937                     if( (d < mind-1 || d > mind+1) && sad[d] <= thresh)
938                         break;
939                 }
940                 if( d < ndisp )
941                 {
942                     dptr[y*dstep] = FILTERED;
943                     continue;
944                 }
945             }
946 
947             {
948                 sad[-1] = sad[1];
949                 sad[ndisp] = sad[ndisp-2];
950                 int p = sad[mind+1], n = sad[mind-1];
951                 d = p + n - 2*sad[mind] + std::abs(p - n);
952                 dptr[y*dstep] = dispDescale<mType>(ndisp - mind - 1 + mindisp, p-n, d);
953 
954                 costptr[y*coststep] = sad[mind];
955             }
956         }
957     }
958 }
959 
960 #ifdef HAVE_OPENCL
ocl_prefiltering(InputArray left0,InputArray right0,OutputArray left,OutputArray right,StereoBMParams * state)961 static bool ocl_prefiltering(InputArray left0, InputArray right0, OutputArray left, OutputArray right, StereoBMParams* state)
962 {
963     if (state->useNormPrefilter())
964     {
965         if(!ocl_prefilter_norm( left0, left, state->preFilterSize, state->preFilterCap))
966             return false;
967         if(!ocl_prefilter_norm( right0, right, state->preFilterSize, state->preFilterCap))
968             return false;
969     }
970     else
971     {
972         if(!ocl_prefilter_xsobel( left0, left, state->preFilterCap ))
973             return false;
974         if(!ocl_prefilter_xsobel( right0, right, state->preFilterCap))
975             return false;
976     }
977     return true;
978 }
979 #endif
980 
981 struct PrefilterInvoker : public ParallelLoopBody
982 {
PrefilterInvokercv::PrefilterInvoker983     PrefilterInvoker(const Mat& left0, const Mat& right0, Mat& left, Mat& right,
984                      const BufferBM &bufX_, const StereoBMParams &state_)
985         : bufX(bufX_), state(state_)
986     {
987         imgs0[0] = &left0; imgs0[1] = &right0;
988         imgs[0] = &left; imgs[1] = &right;
989     }
990 
operator ()cv::PrefilterInvoker991     void operator()(const Range& range) const CV_OVERRIDE
992     {
993         for( int i = range.start; i < range.end; i++ )
994         {
995             if (state.useNormPrefilter())
996                 prefilterNorm( *imgs0[i], *imgs[i], state.preFilterSize, state.preFilterCap, bufX.prefilter[i] );
997             else
998                 prefilterXSobel( *imgs0[i], *imgs[i], state.preFilterCap );
999         }
1000     }
1001 
1002     const Mat* imgs0[2];
1003     Mat* imgs[2];
1004     const BufferBM &bufX;
1005     const StereoBMParams &state;
1006 };
1007 
1008 #ifdef HAVE_OPENCL
ocl_stereobm(InputArray _left,InputArray _right,OutputArray _disp,StereoBMParams * state)1009 static bool ocl_stereobm( InputArray _left, InputArray _right,
1010                        OutputArray _disp, StereoBMParams* state)
1011 {
1012     int ndisp = state->numDisparities;
1013     int mindisp = state->minDisparity;
1014     int wsz = state->SADWindowSize;
1015     int wsz2 = wsz/2;
1016 
1017     ocl::Device devDef = ocl::Device::getDefault();
1018     int sizeX = devDef.isIntel() ? 32 : std::max(11, 27 - devDef.maxComputeUnits()),
1019         sizeY = sizeX - 1,
1020         N = ndisp * 2;
1021 
1022     cv::String opt = cv::format("-D DEFINE_KERNEL_STEREOBM -D MIN_DISP=%d -D NUM_DISP=%d"
1023                                 " -D BLOCK_SIZE_X=%d -D BLOCK_SIZE_Y=%d -D WSZ=%d",
1024                                 mindisp, ndisp,
1025                                 sizeX, sizeY, wsz);
1026     ocl::Kernel k("stereoBM", ocl::calib3d::stereobm_oclsrc, opt);
1027     if(k.empty())
1028         return false;
1029 
1030     UMat left = _left.getUMat(), right = _right.getUMat();
1031     int cols = left.cols, rows = left.rows;
1032 
1033     _disp.create(_left.size(), CV_16S);
1034     _disp.setTo((mindisp - 1) << 4);
1035     Rect roi = Rect(Point(wsz2 + mindisp + ndisp - 1, wsz2), Point(cols-wsz2-mindisp, rows-wsz2) );
1036     UMat disp = (_disp.getUMat())(roi);
1037 
1038     int globalX = (disp.cols + sizeX - 1) / sizeX,
1039         globalY = (disp.rows + sizeY - 1) / sizeY;
1040     size_t globalThreads[3] = {(size_t)N, (size_t)globalX, (size_t)globalY};
1041     size_t localThreads[3]  = {(size_t)N, 1, 1};
1042 
1043     int idx = 0;
1044     idx = k.set(idx, ocl::KernelArg::PtrReadOnly(left));
1045     idx = k.set(idx, ocl::KernelArg::PtrReadOnly(right));
1046     idx = k.set(idx, ocl::KernelArg::WriteOnlyNoSize(disp));
1047     idx = k.set(idx, rows);
1048     idx = k.set(idx, cols);
1049     idx = k.set(idx, state->textureThreshold);
1050     idx = k.set(idx, state->uniquenessRatio);
1051     return k.run(3, globalThreads, localThreads, false);
1052 }
1053 #endif
1054 
1055 struct FindStereoCorrespInvoker : public ParallelLoopBody
1056 {
FindStereoCorrespInvokercv::FindStereoCorrespInvoker1057     FindStereoCorrespInvoker( const Mat& _left, const Mat& _right,
1058                              Mat& _disp, const StereoBMParams &_state,
1059                              int _nstripes,
1060                              Rect _validDisparityRect,
1061                              Mat& _cost, const BufferBM & buf_ )
1062         : state(_state), buf(buf_)
1063     {
1064         CV_Assert( _disp.type() == CV_16S || _disp.type() == CV_32S );
1065         left = &_left; right = &_right;
1066         disp = &_disp;
1067         nstripes = _nstripes;
1068         validDisparityRect = _validDisparityRect;
1069         cost = &_cost;
1070     }
1071 
operator ()cv::FindStereoCorrespInvoker1072     void operator()(const Range& range) const CV_OVERRIDE
1073     {
1074         int cols = left->cols, rows = left->rows;
1075         int _row0 = std::min(cvRound(range.start * rows / nstripes), rows);
1076         int _row1 = std::min(cvRound(range.end * rows / nstripes), rows);
1077 
1078         int dispShift = disp->type() == CV_16S ? DISPARITY_SHIFT_16S :
1079                                                  DISPARITY_SHIFT_32S;
1080         int FILTERED = (state.minDisparity - 1) << dispShift;
1081 
1082         Rect roi = validDisparityRect & Rect(0, _row0, cols, _row1 - _row0);
1083         if( roi.height == 0 )
1084             return;
1085         int row0 = roi.y;
1086         int row1 = roi.y + roi.height;
1087 
1088         Mat part;
1089         if( row0 > _row0 )
1090         {
1091             part = disp->rowRange(_row0, row0);
1092             part = Scalar::all(FILTERED);
1093         }
1094         if( _row1 > row1 )
1095         {
1096             part = disp->rowRange(row1, _row1);
1097             part = Scalar::all(FILTERED);
1098         }
1099 
1100         Mat left_i = left->rowRange(row0, row1);
1101         Mat right_i = right->rowRange(row0, row1);
1102         Mat disp_i = disp->rowRange(row0, row1);
1103         Mat cost_i = state.disp12MaxDiff >= 0 ? cost->rowRange(row0, row1) : Mat();
1104 
1105 #if CV_SIMD
1106         if (state.useShorts())
1107         {
1108             if( disp_i.type() == CV_16S)
1109                 findStereoCorrespondenceBM_SIMD<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
1110             else
1111                 findStereoCorrespondenceBM_SIMD<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start);
1112         }
1113         else
1114 #endif
1115         {
1116             if( disp_i.type() == CV_16S )
1117                 findStereoCorrespondenceBM<short>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
1118             else
1119                 findStereoCorrespondenceBM<int>( left_i, right_i, disp_i, cost_i, state, row0, rows - row1, buf, range.start );
1120         }
1121 
1122         if( state.disp12MaxDiff >= 0 )
1123             validateDisparity( disp_i, cost_i, state.minDisparity, state.numDisparities, state.disp12MaxDiff );
1124 
1125         if( roi.x > 0 )
1126         {
1127             part = disp_i.colRange(0, roi.x);
1128             part = Scalar::all(FILTERED);
1129         }
1130         if( roi.x + roi.width < cols )
1131         {
1132             part = disp_i.colRange(roi.x + roi.width, cols);
1133             part = Scalar::all(FILTERED);
1134         }
1135     }
1136 
1137 protected:
1138     const Mat *left, *right;
1139     Mat* disp, *cost;
1140     const StereoBMParams &state;
1141 
1142     int nstripes;
1143     Rect validDisparityRect;
1144     const BufferBM & buf;
1145 };
1146 
1147 class StereoBMImpl CV_FINAL : public StereoBM
1148 {
1149 public:
StereoBMImpl()1150     StereoBMImpl()
1151         : params()
1152     {
1153         // nothing
1154     }
1155 
StereoBMImpl(int _numDisparities,int _SADWindowSize)1156     StereoBMImpl( int _numDisparities, int _SADWindowSize )
1157         : params(_numDisparities, _SADWindowSize)
1158     {
1159         // nothing
1160     }
1161 
compute(InputArray leftarr,InputArray rightarr,OutputArray disparr)1162     void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
1163     {
1164         CV_INSTRUMENT_REGION();
1165 
1166         int dtype = disparr.fixedType() ? disparr.type() : params.dispType;
1167         Size leftsize = leftarr.size();
1168 
1169         if (leftarr.size() != rightarr.size())
1170             CV_Error( Error::StsUnmatchedSizes, "All the images must have the same size" );
1171 
1172         if (leftarr.type() != CV_8UC1 || rightarr.type() != CV_8UC1)
1173             CV_Error( Error::StsUnsupportedFormat, "Both input images must have CV_8UC1" );
1174 
1175         if (dtype != CV_16SC1 && dtype != CV_32FC1)
1176             CV_Error( Error::StsUnsupportedFormat, "Disparity image must have CV_16SC1 or CV_32FC1 format" );
1177 
1178         if( params.preFilterType != PREFILTER_NORMALIZED_RESPONSE &&
1179             params.preFilterType != PREFILTER_XSOBEL )
1180             CV_Error( Error::StsOutOfRange, "preFilterType must be = CV_STEREO_BM_NORMALIZED_RESPONSE" );
1181 
1182         if( params.preFilterSize < 5 || params.preFilterSize > 255 || params.preFilterSize % 2 == 0 )
1183             CV_Error( Error::StsOutOfRange, "preFilterSize must be odd and be within 5..255" );
1184 
1185         if( params.preFilterCap < 1 || params.preFilterCap > 63 )
1186             CV_Error( Error::StsOutOfRange, "preFilterCap must be within 1..63" );
1187 
1188         if( params.SADWindowSize < 5 || params.SADWindowSize > 255 || params.SADWindowSize % 2 == 0 ||
1189             params.SADWindowSize >= std::min(leftsize.width, leftsize.height) )
1190             CV_Error( Error::StsOutOfRange, "SADWindowSize must be odd, be within 5..255 and be not larger than image width or height" );
1191 
1192         if( params.numDisparities <= 0 || params.numDisparities % 16 != 0 )
1193             CV_Error( Error::StsOutOfRange, "numDisparities must be positive and divisible by 16" );
1194 
1195         if( params.textureThreshold < 0 )
1196             CV_Error( Error::StsOutOfRange, "texture threshold must be non-negative" );
1197 
1198         if( params.uniquenessRatio < 0 )
1199             CV_Error( Error::StsOutOfRange, "uniqueness ratio must be non-negative" );
1200 
1201         int disp_shift;
1202         if (dtype == CV_16SC1)
1203             disp_shift = DISPARITY_SHIFT_16S;
1204         else
1205             disp_shift = DISPARITY_SHIFT_32S;
1206 
1207         int FILTERED = (params.minDisparity - 1) << disp_shift;
1208 
1209 #ifdef HAVE_OPENCL
1210         if(ocl::isOpenCLActivated() && disparr.isUMat() && params.textureThreshold == 0)
1211         {
1212             UMat left, right;
1213             if(ocl_prefiltering(leftarr, rightarr, left, right, &params))
1214             {
1215                 if(ocl_stereobm(left, right, disparr, &params))
1216                 {
1217                     disp_shift = DISPARITY_SHIFT_16S;
1218                     FILTERED = (params.minDisparity - 1) << disp_shift;
1219 
1220                     if (params.useFilterSpeckles())
1221                         filterSpeckles(disparr.getMat(), FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
1222                     if (dtype == CV_32F)
1223                         disparr.getUMat().convertTo(disparr, CV_32FC1, 1./(1 << disp_shift), 0);
1224                     CV_IMPL_ADD(CV_IMPL_OCL);
1225                     return;
1226                 }
1227             }
1228         }
1229 #endif
1230 
1231         Mat left0 = leftarr.getMat(), right0 = rightarr.getMat();
1232         disparr.create(left0.size(), dtype);
1233         Mat disp0 = disparr.getMat();
1234 
1235         preFilteredImg0.create( left0.size(), CV_8U );
1236         preFilteredImg1.create( left0.size(), CV_8U );
1237         cost.create( left0.size(), CV_16S );
1238 
1239         Mat left = preFilteredImg0, right = preFilteredImg1;
1240 
1241         int mindisp = params.minDisparity;
1242         int ndisp = params.numDisparities;
1243 
1244         int width = left0.cols;
1245         int height = left0.rows;
1246         int lofs = std::max(ndisp - 1 + mindisp, 0);
1247         int rofs = -std::min(ndisp - 1 + mindisp, 0);
1248         int width1 = width - rofs - ndisp + 1;
1249 
1250         if( lofs >= width || rofs >= width || width1 < 1 )
1251         {
1252             disp0 = Scalar::all( FILTERED * ( disp0.type() < CV_32F ? 1 : 1./(1 << disp_shift) ) );
1253             return;
1254         }
1255 
1256         Mat disp = disp0;
1257         if( dtype == CV_32F )
1258         {
1259             dispbuf.create(disp0.size(), CV_32S);
1260             disp = dispbuf;
1261         }
1262 
1263         {
1264             const double SAD_overhead_coeff = 10.0;
1265             const double N0 = 8000000 / (params.useShorts() ? 1 : 4);  // approx tbb's min number instructions reasonable for one thread
1266             const double maxStripeSize = std::min(
1267                 std::max(
1268                     N0 / (width * ndisp),
1269                     (params.SADWindowSize-1) * SAD_overhead_coeff
1270                 ),
1271                 (double)height
1272             );
1273             const int nstripes = cvCeil(height / maxStripeSize);
1274             BufferBM localBuf(nstripes, width, height, params);
1275 
1276             // Prefiltering
1277             parallel_for_(Range(0, 2), PrefilterInvoker(left0, right0, left, right, localBuf, params), 1);
1278 
1279 
1280             Rect validDisparityRect(0, 0, width, height), R1 = params.roi1, R2 = params.roi2;
1281             validDisparityRect = getValidDisparityROI(!R1.empty() ? R1 : validDisparityRect,
1282                                                       !R2.empty() ? R2 : validDisparityRect,
1283                                                       params.minDisparity, params.numDisparities,
1284                                                       params.SADWindowSize);
1285 
1286             FindStereoCorrespInvoker invoker(left, right, disp, params, nstripes, validDisparityRect, cost, localBuf);
1287             parallel_for_(Range(0, nstripes), invoker);
1288 
1289             if (params.useFilterSpeckles())
1290             {
1291                 slidingSumBuf.create( 1, width * height * (sizeof(Point_<short>) + sizeof(int) + sizeof(uchar)), CV_8U );
1292                 filterSpeckles(disp, FILTERED, params.speckleWindowSize, params.speckleRange, slidingSumBuf);
1293             }
1294 
1295         }
1296 
1297         if (disp0.data != disp.data)
1298             disp.convertTo(disp0, disp0.type(), 1./(1 << disp_shift), 0);
1299     }
1300 
getMinDisparity() const1301     int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
setMinDisparity(int minDisparity)1302     void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }
1303 
getNumDisparities() const1304     int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
setNumDisparities(int numDisparities)1305     void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }
1306 
getBlockSize() const1307     int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
setBlockSize(int blockSize)1308     void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }
1309 
getSpeckleWindowSize() const1310     int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
setSpeckleWindowSize(int speckleWindowSize)1311     void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }
1312 
getSpeckleRange() const1313     int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
setSpeckleRange(int speckleRange)1314     void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }
1315 
getDisp12MaxDiff() const1316     int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
setDisp12MaxDiff(int disp12MaxDiff)1317     void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }
1318 
getPreFilterType() const1319     int getPreFilterType() const CV_OVERRIDE { return params.preFilterType; }
setPreFilterType(int preFilterType)1320     void setPreFilterType(int preFilterType) CV_OVERRIDE { params.preFilterType = preFilterType; }
1321 
getPreFilterSize() const1322     int getPreFilterSize() const CV_OVERRIDE { return params.preFilterSize; }
setPreFilterSize(int preFilterSize)1323     void setPreFilterSize(int preFilterSize) CV_OVERRIDE { params.preFilterSize = preFilterSize; }
1324 
getPreFilterCap() const1325     int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
setPreFilterCap(int preFilterCap)1326     void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }
1327 
getTextureThreshold() const1328     int getTextureThreshold() const CV_OVERRIDE { return params.textureThreshold; }
setTextureThreshold(int textureThreshold)1329     void setTextureThreshold(int textureThreshold) CV_OVERRIDE { params.textureThreshold = textureThreshold; }
1330 
getUniquenessRatio() const1331     int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
setUniquenessRatio(int uniquenessRatio)1332     void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }
1333 
getSmallerBlockSize() const1334     int getSmallerBlockSize() const CV_OVERRIDE { return 0; }
setSmallerBlockSize(int)1335     void setSmallerBlockSize(int) CV_OVERRIDE {}
1336 
getROI1() const1337     Rect getROI1() const CV_OVERRIDE { return params.roi1; }
setROI1(Rect roi1)1338     void setROI1(Rect roi1) CV_OVERRIDE { params.roi1 = roi1; }
1339 
getROI2() const1340     Rect getROI2() const CV_OVERRIDE { return params.roi2; }
setROI2(Rect roi2)1341     void setROI2(Rect roi2) CV_OVERRIDE { params.roi2 = roi2; }
1342 
write(FileStorage & fs) const1343     void write(FileStorage& fs) const CV_OVERRIDE
1344     {
1345         writeFormat(fs);
1346         fs << "name" << name_
1347         << "minDisparity" << params.minDisparity
1348         << "numDisparities" << params.numDisparities
1349         << "blockSize" << params.SADWindowSize
1350         << "speckleWindowSize" << params.speckleWindowSize
1351         << "speckleRange" << params.speckleRange
1352         << "disp12MaxDiff" << params.disp12MaxDiff
1353         << "preFilterType" << params.preFilterType
1354         << "preFilterSize" << params.preFilterSize
1355         << "preFilterCap" << params.preFilterCap
1356         << "textureThreshold" << params.textureThreshold
1357         << "uniquenessRatio" << params.uniquenessRatio;
1358     }
1359 
read(const FileNode & fn)1360     void read(const FileNode& fn) CV_OVERRIDE
1361     {
1362         FileNode n = fn["name"];
1363         CV_Assert( n.isString() && String(n) == name_ );
1364         params.minDisparity = (int)fn["minDisparity"];
1365         params.numDisparities = (int)fn["numDisparities"];
1366         params.SADWindowSize = (int)fn["blockSize"];
1367         params.speckleWindowSize = (int)fn["speckleWindowSize"];
1368         params.speckleRange = (int)fn["speckleRange"];
1369         params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
1370         params.preFilterType = (int)fn["preFilterType"];
1371         params.preFilterSize = (int)fn["preFilterSize"];
1372         params.preFilterCap = (int)fn["preFilterCap"];
1373         params.textureThreshold = (int)fn["textureThreshold"];
1374         params.uniquenessRatio = (int)fn["uniquenessRatio"];
1375         params.roi1 = params.roi2 = Rect();
1376     }
1377 
1378     StereoBMParams params;
1379     Mat preFilteredImg0, preFilteredImg1, cost, dispbuf;
1380     Mat slidingSumBuf;
1381 
1382     static const char* name_;
1383 };
1384 
// Algorithm identifier: written under the "name" key by StereoBMImpl::write()
// and required to match by StereoBMImpl::read().
const char* StereoBMImpl::name_ = "StereoMatcher.BM";
1386 
create(int _numDisparities,int _SADWindowSize)1387 Ptr<StereoBM> StereoBM::create(int _numDisparities, int _SADWindowSize)
1388 {
1389     return makePtr<StereoBMImpl>(_numDisparities, _SADWindowSize);
1390 }
1391 
1392 }
1393 
1394 /* End of file. */
1395