1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 //  IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 //  By downloading, copying, installing or using the software you agree to this license.
6 //  If you do not agree to this license, do not download, install,
7 //  copy or use the software.
8 //
9 //
10 //                           License Agreement
11 //                For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 //   * Redistribution's of source code must retain the above copyright notice,
22 //     this list of conditions and the following disclaimer.
23 //
24 //   * Redistribution's in binary form must reproduce the above copyright notice,
25 //     this list of conditions and the following disclaimer in the documentation
26 //     and/or other materials provided with the distribution.
27 //
28 //   * The name of the copyright holders may not be used to endorse or promote products
29 //     derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43 
44 /*
45  This is a variation of
46  "Stereo Processing by Semiglobal Matching and Mutual Information"
47  by Heiko Hirschmuller.
48 
49  We match blocks rather than individual pixels, thus the algorithm is called
50  SGBM (Semi-global block matching)
51  */
52 
53 #include "precomp.hpp"
54 #include <limits.h>
55 #include "opencv2/core/hal/intrin.hpp"
56 #include "opencv2/core/utils/buffer_area.private.hpp"
57 
58 namespace cv
59 {
60 
61 typedef uchar PixType;
62 typedef short CostType;
63 typedef short DispType;
64 
65 // NR - the number of directions. the loop on x that computes Lr assumes that NR == 8.
66 // if you change NR, please, modify the loop as well.
67 enum { NR = 8, NR2 = NR/2 };
68 
69 
70 struct StereoSGBMParams
71 {
StereoSGBMParamscv::StereoSGBMParams72     StereoSGBMParams()
73     {
74         minDisparity = numDisparities = 0;
75         SADWindowSize = 0;
76         P1 = P2 = 0;
77         disp12MaxDiff = 0;
78         preFilterCap = 0;
79         uniquenessRatio = 0;
80         speckleWindowSize = 0;
81         speckleRange = 0;
82         mode = StereoSGBM::MODE_SGBM;
83     }
84 
StereoSGBMParamscv::StereoSGBMParams85     StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
86                       int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
87                       int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
88                       int _mode )
89     {
90         minDisparity = _minDisparity;
91         numDisparities = _numDisparities;
92         SADWindowSize = _SADWindowSize;
93         P1 = _P1;
94         P2 = _P2;
95         disp12MaxDiff = _disp12MaxDiff;
96         preFilterCap = _preFilterCap;
97         uniquenessRatio = _uniquenessRatio;
98         speckleWindowSize = _speckleWindowSize;
99         speckleRange = _speckleRange;
100         mode = _mode;
101     }
102 
isFullDPcv::StereoSGBMParams103     inline bool isFullDP() const
104     {
105         return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4;
106     }
calcSADWindowSizecv::StereoSGBMParams107     inline Size calcSADWindowSize() const
108     {
109         const int dim = SADWindowSize > 0 ? SADWindowSize : 5;
110         return Size(dim, dim);
111     }
112 
113     int minDisparity;
114     int numDisparities;
115     int SADWindowSize;
116     int preFilterCap;
117     int uniquenessRatio;
118     int P1;
119     int P2;
120     int speckleWindowSize;
121     int speckleRange;
122     int disp12MaxDiff;
123     int mode;
124 };
125 
#if CV_SIMD
#if CV_SIMD_WIDTH == 16
// Lane indices 0..nlanes-1 as a v_int16 (128-bit registers: 8 lanes).
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
#elif CV_SIMD_WIDTH == 32
// Lane indices 0..nlanes-1 as a v_int16 (256-bit registers: 16 lanes).
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); }
#elif CV_SIMD_WIDTH == 64
// Lane indices 0..nlanes-1 as a v_int16 (512-bit registers: 32 lanes).
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); }
#else
// Generic fallback: materialize the 0..nlanes-1 sequence once in memory.
struct vseq_s16
{
    short data[v_int16::nlanes];
    vseq_s16()
    {
        for (int i = 0; i < v_int16::nlanes; i++)
            data[i] = i;
    }
};
static inline v_int16 vx_setseq_s16()
{
    static vseq_s16 vseq;
    return vx_load(vseq.data);
}
#endif
// Additional reduce operation: horizontal arg-min.
// Returns the minimum lane value of 'val' and the smallest index
// (pos + lane number) among the lanes that attain that minimum.
static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
{
    min_val = v_reduce_min(val);
    const v_int16 at_min = (vx_setall_s16(min_val) == val);
    // Lanes not at the minimum are masked to SHRT_MAX so they lose the reduction.
    const v_int16 candidates = ((pos + vx_setseq_s16()) & at_min)
                             | (vx_setall_s16(SHRT_MAX) & ~at_min);
    min_pos = v_reduce_min(candidates);
}
#endif
160 
static const int DEFAULT_RIGHT_BORDER = -1;

/*
 For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 and for each disparity minD<=d<maxD the function
 computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 row1[x] and row2[x-d]. The subpixel algorithm from
 "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 is used, hence the suffix BT.

 the temporary buffer should contain width2*2 elements

 tab is a clipping lookup table (see BufferSGBM::getClipTab) applied to the
 x-derivative responses; xrange_min/xrange_max restrict the computed x span so
 that several workers can process disjoint pieces of the same row in parallel.
 */
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                            int minD, int maxD, CostType* cost,
                            PixType* buffer, const PixType* tab,
                            int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{
    int x, c, width = img1.cols, cn = img1.channels();
    // x-range in img1 for which every disparity in [minD, maxD) stays inside img2.
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    // D: disparity count aligned up to the SIMD lane count, so each pixel's cost
    // slice can be processed with aligned full-vector loads/stores.
    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
    //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
    //That is needs of parallel algorithm
    xrange_min = (xrange_min < 0) ? 0: xrange_min;
    xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;
    maxX1 = minX1 + xrange_max;
    minX1 += xrange_min;
    width1 = maxX1 - minX1;
    // Matching x-range in img2 that can be referenced from [minX1, maxX1) over all disparities.
    int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
    int width2 = maxX2 - minX2;
    const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
    // Scratch layout: buffer[0, width2*2) holds per-pixel min/max envelopes of img2
    // (filled later); prow1/prow2 hold cn derivative planes followed by cn intensity
    // planes, each 'width' entries wide. prow2 planes are mirrored in x (index width-1-x)
    // so that the disparity loop below can scan them with a simple +d offset.
    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;

    // Seed the left/right border samples of every plane with the "zero difference" value.
    for( c = 0; c < cn*2; c++ )
    {
        prow1[width*c] = prow1[width*c + width-1] =
        prow2[width*c] = prow2[width*c + width-1] = tab[0];
    }

    // Byte offsets of the previous (n*) and next (s*) image rows, clamped at borders.
    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;

    // Common x-range covering both images' needs, with one pixel of margin for the
    // central-difference derivative, clamped to [1, width-1).
    int minX_cmn = std::min(minX1,minX2)-1;
    int maxX_cmn = std::max(maxX1,maxX2)+1;
    minX_cmn = std::max(minX_cmn, 1);
    maxX_cmn = std::min(maxX_cmn, width - 1);
    if( cn == 1 )
    {
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            // Horizontal derivative smoothed over 3 rows (Sobel-like), clipped via tab[].
            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];

            // Raw intensities go into the second plane.
            prow1[x+width] = row1[x];
            prow2[width-1-x+width] = row2[x];
        }
    }
    else
    {
        // 3-channel case: one derivative plane and one intensity plane per channel.
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];

            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];

            prow1[x+width*3] = row1[x*3];
            prow1[x+width*4] = row1[x*3+1];
            prow1[x+width*5] = row1[x*3+2];

            prow2[width-1-x+width*3] = row2[x*3];
            prow2[width-1-x+width*4] = row2[x*3+1];
            prow2[width-1-x+width*5] = row2[x*3+2];
        }
    }

    // Costs are accumulated over all planes below, so clear the target span first.
    memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );

    // Shift base pointers so that in-loop indexing (buffer[width-x-1+d], cost[x*D+d])
    // needs no further offset arithmetic.
    buffer -= width-maxX2;
    cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop

    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
        // Derivative planes (c < cn) contribute at full weight; raw-intensity planes
        // are down-weighted by a factor of 4 (>> 2).
        int diff_scale = c < cn ? 0 : 2;

        // precompute
        //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
        //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
        //   to process values from [minX2, maxX2) we should check memory location (width - 1 - maxX2, width - 1 - minX2]
        //   so iterate through [width - maxX2, width - minX2)
        for( x = width-maxX2; x < width-minX2; x++ )
        {
            int v = prow2[x];
            int vl = x > 0 ? (v + prow2[x-1])/2 : v;
            int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
            int v0 = std::min(vl, vr); v0 = std::min(v0, v);
            int v1 = std::max(vl, vr); v1 = std::max(v1, v);
            buffer[x] = (PixType)v0;
            buffer[x + width2] = (PixType)v1;
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // Birchfield-Tomasi envelope of the left pixel: min/max of the pixel and
            // its two half-sample interpolations.
            int u = prow1[x];
            int ul = x > 0 ? (u + prow1[x-1])/2 : u;
            int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
            int u1 = std::max(ul, ur); u1 = std::max(u1, u);

            int d = minD;
        #if CV_SIMD
            // Vectorized BT dissimilarity: 2*v_int16::nlanes disparities per iteration
            // (costs are 16-bit but pixel data is 8-bit, hence the expand below).
            v_uint8 _u  = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
            v_uint8 _u1 = vx_setall_u8((uchar)u1);

            for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
            {
                v_uint8 _v  = vx_load(prow2  + width-x-1 + d);
                v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
                v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
                // Unsigned saturating subtraction implements max(0, a-b) per lane.
                v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
                v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
                v_uint8 diff = v_min(c0, c1);

                v_int16 _c0 = vx_load_aligned(cost + x*D + d);
                v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);

                v_uint16 diff1,diff2;
                v_expand(diff,diff1,diff2);
                v_store_aligned(cost + x*D + d,                   _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
                v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
            }
        #endif
            // Scalar tail (and full loop when SIMD is disabled).
            for( ; d < maxD; d++ )
            {
                int v = prow2[width-x-1 + d];
                int v0 = buffer[width-x-1 + d];
                int v1 = buffer[width-x-1 + d + width2];
                // Symmetric BT dissimilarity: distance of each pixel to the other's envelope.
                int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
            }
        }
    }
}
308 
309 
310 
// Owns and lays out all scratch memory used by the SGBM pipeline:
// the pixel cost buffer C, the aggregated cost buffer S, a ring buffer of
// horizontal sums, per-direction dynamic-programming rows (Lr / minLr),
// reverse-check buffers and the derivative clipping table.
class BufferSGBM
{
private:
    size_t width1;     // number of columns for which disparity is computed
    size_t Da;         // disparity count aligned to the SIMD lane count
    size_t Dlra;       // Da + extra lanes: room for the d=-1 and d=D sentinel costs
    size_t costWidth;  // width1 * Da — cost entries per image row
    size_t costHeight; // rows of C/S kept: whole image in full-DP modes, else 1
    size_t hsumRows;   // ring-buffer height for horizontal sums (SAD window height + 2)
    bool fullDP;       // true for MODE_HH / MODE_HH4
    uchar dirs;        // border slots per side in the Lr rows (NR, or 1 in MODE_HH4)
    uchar dirs2;       // directions stored per pixel (NR2, or 1 in MODE_HH4)
    static const size_t TAB_OFS = 256*4; // zero offset inside clipTab (see getClipTab)

public:
    CostType* Cbuf;      // pixel-difference costs C(x,d)
    CostType* Sbuf;      // costs summed over directions S(x,d)
    CostType* hsumBuf;   // ring buffer of horizontal SAD-window sums
    CostType* pixDiff;   // single-row BT cost output of calcPixelCostBT
    CostType* disp2cost; // best cost per right-image column (for the LR check)
    DispType* disp2ptr;  // reverse disparity map (right-to-left)
    PixType* tempBuf;    // scratch rows for calcPixelCostBT
    std::vector<CostType*> Lr;    // two rows (current/previous) of per-direction costs
    std::vector<CostType*> minLr; // per-pixel minima of the Lr rows
    PixType * clipTab;   // clipping lookup table for derivative responses

private:
    utils::BufferArea area; // allocates all of the above as one aligned arena

public:
    // Computes the layout from the image/parameter sizes and commits one
    // BufferArea allocation covering every buffer.
    BufferSGBM(size_t width1_,
               size_t Da_,
               size_t Dlra_,
               size_t cn,
               size_t width,
               size_t height,
               const StereoSGBMParams &params)
        : width1(width1_),
        Da(Da_),
        Dlra(Dlra_),
        Cbuf(NULL),
        Sbuf(NULL),
        hsumBuf(NULL),
        pixDiff(NULL),
        disp2cost(NULL),
        disp2ptr(NULL),
        tempBuf(NULL),
        Lr(2, (CostType*)NULL),
        minLr(2, (CostType*)NULL),
        clipTab(NULL)
    {
        const size_t TAB_SIZE = 256 + TAB_OFS*2;
        fullDP = params.isFullDP();
        costWidth = width1 * Da;
        costHeight = fullDP ? height : 1;
        hsumRows = params.calcSADWindowSize().height + 2;
        dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
        dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
        // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
        // we keep pixel difference cost (C) and the summary cost over NR directions (S).
        // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
        area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions
        area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH);
        area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
        area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
        area.allocate(disp2cost,    width, CV_SIMD_WIDTH);
        area.allocate(disp2ptr,     width, CV_SIMD_WIDTH);
        area.allocate(tempBuf,      width * (4 * cn + 2), CV_SIMD_WIDTH);
        // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
        // for 8-way dynamic programming we need the current row and
        // the previous row, i.e. 2 rows in total
        for (size_t i = 0; i < 2; ++i)
        {
            // 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
            area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
            // 1D: [ NR ][ w1 * NR2 ][ NR ]
            area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
        }
        area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
        area.commit();

        // init clipTab: clamp (i - TAB_OFS) to [-ftzero, ftzero], then bias by
        // ftzero so the stored values are non-negative uchars.
        const int ftzero = std::max(params.preFilterCap, 15) | 1;
        for(int i = 0; i < (int)TAB_SIZE; i++ )
            clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
    }
    // Zero-centered view of the clip table: getClipTab()[v] is valid for
    // v in [-TAB_OFS, TAB_OFS + 255].
    inline const PixType * getClipTab() const
    {
        return clipTab + TAB_OFS;
    }
    // Fill the whole C buffer with 'val' (used to pre-add P2 once).
    inline void initCBuf(CostType val) const
    {
        for (size_t i = 0; i < costWidth * costHeight; ++i)
            Cbuf[i] = val;
    }
    // Zero both Lr/minLr rows, either entirely or only for [range.start, range.end).
    inline void clearLr(const Range & range = Range::all()) const
    {
            for (uchar i = 0; i < 2; ++i)
            {
                if (range == Range::all())
                {
                    memset(Lr[i],    0, calcLrCount() * Dlra * sizeof(CostType));
                    memset(minLr[i], 0, calcLrCount()        * sizeof(CostType));
                }
                else
                {
                    memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
                    memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
                }
            }
    }
    // Entries per Lr/minLr row: width1 pixels times dirs2 directions,
    // plus 'dirs' border slots on each side for out-of-range x accesses.
    inline size_t calcLrCount() const
    {
        return width1 * dirs2 + 2 * dirs;
    }
    // Swap the roles of the current and previous Lr/minLr rows.
    inline void swapLr()
    {
        std::swap(Lr[0], Lr[1]);
        std::swap(minLr[0], minLr[1]);
    }
    // Horizontal-sum row for image row 'row' (ring-buffered over hsumRows rows).
    inline CostType * getHSumBuf(int row) const
    {
        return hsumBuf + (row % hsumRows) * costWidth;
    }
    // C row for image row 'row'; a single shared row when not in full-DP mode.
    inline CostType * getCBuf(int row) const
    {
        CV_Assert(row >= 0);
        return Cbuf + (!fullDP ? 0 : (row * costWidth));
    }
    // S row for image row 'row'; a single shared row when not in full-DP mode.
    inline CostType * getSBuf(int row) const
    {
        CV_Assert(row >= 0);
        return Sbuf + (!fullDP ? 0 : (row * costWidth));
    }
    // Zero the S row for 'row', either entirely or only for the pixels in 'range'.
    inline void clearSBuf(int row, const Range & range = Range::all()) const
    {
        if (range == Range::all())
            memset(getSBuf(row), 0, costWidth * sizeof(CostType));
        else
            memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
    }

    // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
    // and will occasionally use negative indices with the arrays
    // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
    inline CostType * getLr(uchar id, int idx, uchar shift = 0) const
    {
        CV_Assert(id < 2);
        const size_t fixed_offset = dirs * Dlra;
        return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
    }
    // minLr counterpart of getLr: one CostType per (pixel, direction).
    inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const
    {
        CV_Assert(id < 2);
        const size_t fixed_offset = dirs;
        return minLr[id] + fixed_offset + (idx * dirs2 + shift);
    }
};
469 
470 /*
471  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
472  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
473  minD <= d < maxD.
474  disp2full is the reverse disparity map, that is:
475  disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
476 
477  note that disp1buf will have the same size as the roi and
478  disp2full will have the same size as img1 (or img2).
479  On exit disp2buf is not the final disparity, it is an intermediate result that becomes
480  final after all the tiles are processed.
481 
482  the disparity in disp1buf is written with sub-pixel accuracy
483  (4 fractional bits, see StereoSGBM::DISP_SCALE),
484  using quadratic interpolation, while the disparity in disp2buf
485  is written as is, without interpolation.
486 
487  disp2cost also has the same size as img1 (or img2).
488  It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
489  */
computeDisparitySGBM(const Mat & img1,const Mat & img2,Mat & disp1,const StereoSGBMParams & params)490 static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
491                                  Mat& disp1, const StereoSGBMParams& params )
492 {
493     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
494     const int DISP_SCALE = (1 << DISP_SHIFT);
495     const CostType MAX_COST = SHRT_MAX;
496 
497     int minD = params.minDisparity, maxD = minD + params.numDisparities;
498     int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
499     int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
500     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
501     int k, width = disp1.cols, height = disp1.rows;
502     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
503     const int D = params.numDisparities;
504     int width1 = maxX1 - minX1;
505     int Da = (int)alignSize(D, v_int16::nlanes);
506     int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
507     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
508     int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
509     int npasses = params.isFullDP() ? 2 : 1;
510 
511     if( minX1 >= maxX1 )
512     {
513         disp1 = Scalar::all(INVALID_DISP_SCALED);
514         return;
515     }
516 
517     BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
518     mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
519 
520     for( int pass = 1; pass <= npasses; pass++ )
521     {
522         int x1, y1, x2, y2, dx, dy;
523 
524         if( pass == 1 )
525         {
526             y1 = 0; y2 = height; dy = 1;
527             x1 = 0; x2 = width1; dx = 1;
528         }
529         else
530         {
531             y1 = height-1; y2 = -1; dy = -1;
532             x1 = width1-1; x2 = -1; dx = -1;
533         }
534 
535         uchar lrID = 0;
536         mem.clearLr();
537 
538         for( int y = y1; y != y2; y += dy )
539         {
540             int x, d;
541             DispType* disp1ptr = disp1.ptr<DispType>(y);
542             CostType* const C = mem.getCBuf(y);
543             CostType* const S = mem.getSBuf(y);
544 
545             if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
546             {
547                 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
548 
549                 for( k = dy1; k <= dy2; k++ )
550                 {
551                     CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
552 
553                     if( k < height )
554                     {
555                         calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );
556 
557                         memset(hsumAdd, 0, Da*sizeof(CostType));
558 #if CV_SIMD
559                         v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
560                         for( d = 0; d < Da; d += v_int16::nlanes )
561                         {
562                             v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
563                             for( x = Da; x <= SW2*Da; x += Da )
564                                 v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
565                             v_store_aligned(hsumAdd + d, v_hsumAdd);
566                         }
567 #else
568                         for (d = 0; d < D; d++)
569                         {
570                             hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
571                             for( x = Da; x <= SW2*Da; x += Da )
572                                 hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
573                         }
574 #endif
575 
576                         if( y > 0 )
577                         {
578                             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
579                             const CostType* Cprev =  mem.getCBuf(y - 1);
580 
581 #if CV_SIMD
582                             for (d = 0; d < Da; d += v_int16::nlanes)
583                                 v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
584 #else
585                             for (d = 0; d < D; d++)
586                                 C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
587 #endif
588 
589                             for( x = Da; x < width1*Da; x += Da )
590                             {
591                                 const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
592                                 const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
593 #if CV_SIMD
594                                 for( d = 0; d < Da; d += v_int16::nlanes )
595                                 {
596                                     v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
597                                     v_store_aligned(hsumAdd + x + d, hv);
598                                     v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
599                                 }
600 #else
601                                 for( d = 0; d < D; d++ )
602                                 {
603                                     int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
604                                     C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
605                                 }
606 #endif
607                             }
608                         }
609                         else
610                         {
611 #if CV_SIMD
612                             v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
613                             for (d = 0; d < Da; d += v_int16::nlanes)
614                                 v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
615 #else
616                             int scale = k == 0 ? SH2 + 1 : 1;
617                             for (d = 0; d < D; d++)
618                                 C[d] = (CostType)(C[d] + hsumAdd[d] * scale);
619 #endif
620                             for( x = Da; x < width1*Da; x += Da )
621                             {
622                                 const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
623                                 const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
624 
625 #if CV_SIMD
626                                 for (d = 0; d < Da; d += v_int16::nlanes)
627                                 {
628                                     v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
629                                     v_store_aligned(hsumAdd + x + d, hv);
630                                     v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
631                                 }
632 #else
633                                 for( d = 0; d < D; d++ )
634                                 {
635                                     CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
636                                     hsumAdd[x + d] = hv;
637                                     C[x + d] = (CostType)(C[x + d] + hv * scale);
638                                 }
639 #endif
640                             }
641                         }
642                     }
643                     else
644                     {
645                         if( y > 0 )
646                         {
647                             const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
648                             const CostType* Cprev = mem.getCBuf(y - 1);
649 #if CV_SIMD
650                             for (x = 0; x < width1*Da; x += v_int16::nlanes)
651                                 v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
652 #else
653                             for (x = 0; x < width1*Da; x++)
654                                 C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
655 #endif
656                         }
657                         else
658                         {
659 #if CV_SIMD
660                             for (x = 0; x < width1*Da; x += v_int16::nlanes)
661                                 v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
662 #else
663                             for (x = 0; x < width1*Da; x++)
664                                 C[x] = (CostType)(C[x] + hsumAdd[x]);
665 #endif
666                         }
667                     }
668 
669                 }
670 
671                 // also, clear the S buffer
672                 mem.clearSBuf(y);
673             }
674 
675             /*
676              [formula 13 in the paper]
677              compute L_r(p, d) = C(p, d) +
678              min(L_r(p-r, d),
679              L_r(p-r, d-1) + P1,
680              L_r(p-r, d+1) + P1,
681              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
682              where p = (x,y), r is one of the directions.
683              we process all the directions at once:
684              0: r=(-dx, 0)
685              1: r=(-1, -dy)
686              2: r=(0, -dy)
687              3: r=(1, -dy)   !!!Note that only directions 0 to 3 are processed
688              4: r=(-2, -dy)
689              5: r=(-1, -dy*2)
690              6: r=(1, -dy*2)
691              7: r=(2, -dy)
692              */
693 
694             for( x = x1; x != x2; x += dx )
695             {
696                 int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
697                 int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
698                 int delta2 = P2 + *mem.getMinLr(1 - lrID, x,     2);
699                 int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);
700 
701                 CostType* Lr_p0 = mem.getLr(lrID, x - dx);
702                 CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
703                 CostType* Lr_p2 = mem.getLr(1 - lrID, x,     2);
704                 CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);
705 
706                 Lr_p0[-1] = Lr_p0[D] = MAX_COST;
707                 Lr_p1[-1] = Lr_p1[D] = MAX_COST;
708                 Lr_p2[-1] = Lr_p2[D] = MAX_COST;
709                 Lr_p3[-1] = Lr_p3[D] = MAX_COST;
710 
711                 CostType* Lr_p = mem.getLr(lrID, x);
712                 const CostType* Cp = C + x*Da;
713                 CostType* Sp = S + x*Da;
714 
715                 CostType* minL = mem.getMinLr(lrID, x);
716                 d = 0;
717 #if CV_SIMD
718                 v_int16 _P1 = vx_setall_s16((short)P1);
719 
720                 v_int16 _delta0 = vx_setall_s16((short)delta0);
721                 v_int16 _delta1 = vx_setall_s16((short)delta1);
722                 v_int16 _delta2 = vx_setall_s16((short)delta2);
723                 v_int16 _delta3 = vx_setall_s16((short)delta3);
724                 v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
725                 v_int16 _minL1 = vx_setall_s16((short)MAX_COST);
726                 v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
727                 v_int16 _minL3 = vx_setall_s16((short)MAX_COST);
728 
729                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
730                 {
731                     v_int16 Cpd = vx_load_aligned(Cp + d);
732                     v_int16 Spd = vx_load_aligned(Sp + d);
733                     v_int16 L;
734 
735                     L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
736                     v_store_aligned(Lr_p + d, L);
737                     _minL0 = v_min(_minL0, L);
738                     Spd += L;
739 
740                     L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
741                     v_store_aligned(Lr_p + d + Dlra, L);
742                     _minL1 = v_min(_minL1, L);
743                     Spd += L;
744 
745                     L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
746                     v_store_aligned(Lr_p + d + Dlra*2, L);
747                     _minL2 = v_min(_minL2, L);
748                     Spd += L;
749 
750                     L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
751                     v_store_aligned(Lr_p + d + Dlra*3, L);
752                     _minL3 = v_min(_minL3, L);
753                     Spd += L;
754 
755                     v_store_aligned(Sp + d, Spd);
756                 }
757 
758 #if CV_SIMD_WIDTH > 32
759                 minL[0] = v_reduce_min(_minL0);
760                 minL[1] = v_reduce_min(_minL1);
761                 minL[2] = v_reduce_min(_minL2);
762                 minL[3] = v_reduce_min(_minL3);
763 #else
764                 // Get minimum for L0-L3
765                 v_int16 t0, t1, t2, t3;
766                 v_zip(_minL0, _minL2, t0, t2);
767                 v_zip(_minL1, _minL3, t1, t3);
768                 v_zip(v_min(t0, t2), v_min(t1, t3), t0, t1);
769                 t0 = v_min(t0, t1);
770                 t0 = v_min(t0, v_rotate_right<4>(t0));
771 #if CV_SIMD_WIDTH == 32
772                 CostType buf[v_int16::nlanes];
773                 v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
774                 minL[0] = buf[0];
775                 minL[1] = buf[1];
776                 minL[2] = buf[2];
777                 minL[3] = buf[3];
778 #else
779                 v_store_low(minL, t0);
780 #endif
781 #endif
782 #else
783                 minL[0] = MAX_COST;
784                 minL[1] = MAX_COST;
785                 minL[2] = MAX_COST;
786                 minL[3] = MAX_COST;
787 #endif
788                 for( ; d < D; d++ )
789                 {
790                     int Cpd = Cp[d], L;
791                     int Spd = Sp[d];
792 
793                     L = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;
794                     Lr_p[d] = (CostType)L;
795                     minL[0] = std::min(minL[0], (CostType)L);
796                     Spd += L;
797 
798                     L = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1;
799                     Lr_p[d + Dlra] = (CostType)L;
800                     minL[1] = std::min(minL[1], (CostType)L);
801                     Spd += L;
802 
803                     L = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2;
804                     Lr_p[d + Dlra*2] = (CostType)L;
805                     minL[2] = std::min(minL[2], (CostType)L);
806                     Spd += L;
807 
808                     L = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3;
809                     Lr_p[d + Dlra*3] = (CostType)L;
810                     minL[3] = std::min(minL[3], (CostType)L);
811                     Spd += L;
812 
813                     Sp[d] = saturate_cast<CostType>(Spd);
814                 }
815             }
816 
817             if( pass == npasses )
818             {
819                 x = 0;
820 #if CV_SIMD
821                 v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
822                 v_int16 v_max_cost = vx_setall_s16(MAX_COST);
823                 for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
824                 {
825                     v_store(disp1ptr + x, v_inv_dist);
826                     v_store(mem.disp2ptr + x, v_inv_dist);
827                     v_store(mem.disp2cost + x, v_max_cost);
828                 }
829 #endif
830                 for( ; x < width; x++ )
831                 {
832                     disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
833                     mem.disp2cost[x] = MAX_COST;
834                 }
835 
836                 for( x = width1 - 1; x >= 0; x-- )
837                 {
838                     CostType* Sp = S + x*Da;
839                     CostType minS = MAX_COST;
840                     short bestDisp = -1;
841 
842                     if( npasses == 1 )
843                     {
844                         CostType* Lr_p0 = mem.getLr(lrID, x + 1);
845                         Lr_p0[-1] = Lr_p0[D] = MAX_COST;
846                         CostType* Lr_p = mem.getLr(lrID, x);
847 
848                         const CostType* Cp = C + x*Da;
849 
850                         d = 0;
851                         int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
852                         int minL0 = MAX_COST;
853 #if CV_SIMD
854                         v_int16 _P1 = vx_setall_s16((short)P1);
855                         v_int16 _delta0 = vx_setall_s16((short)delta0);
856 
857                         v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
858                         v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
859                         for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
860                         {
861                             v_int16 Cpd = vx_load_aligned(Cp + d);
862                             v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
863 
864                             v_store_aligned(Lr_p + d, L0);
865                             _minL0 = v_min(_minL0, L0);
866                             L0 += vx_load_aligned(Sp + d);
867                             v_store_aligned(Sp + d, L0);
868 
869                             _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
870                             _minS = v_min(_minS, L0);
871                         }
872                         minL0 = (CostType)v_reduce_min(_minL0);
873                         min_pos(_minS, _bestDisp, minS, bestDisp);
874 #endif
875                         for( ; d < D; d++ )
876                         {
877                             int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;
878 
879                             Lr_p[d] = (CostType)L0;
880                             minL0 = std::min(minL0, L0);
881 
882                             CostType Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
883                             if( Sval < minS )
884                             {
885                                 minS = Sval;
886                                 bestDisp = (short)d;
887                             }
888                         }
889                         *mem.getMinLr(lrID, x) = (CostType)minL0;
890                     }
891                     else
892                     {
893                         d = 0;
894 #if CV_SIMD
895                         v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
896                         for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
897                         {
898                             v_int16 L0 = vx_load_aligned(Sp + d);
899                             _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
900                             _minS = v_min( L0, _minS );
901                         }
902                         min_pos(_minS, _bestDisp, minS, bestDisp);
903 #endif
904                         for( ; d < D; d++ )
905                         {
906                             int Sval = Sp[d];
907                             if( Sval < minS )
908                             {
909                                 minS = (CostType)Sval;
910                                 bestDisp = (short)d;
911                             }
912                         }
913                     }
914 
915                     for( d = 0; d < D; d++ )
916                     {
917                         if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
918                             break;
919                     }
920                     if( d < D )
921                         continue;
922                     d = bestDisp;
923                     int _x2 = x + minX1 - d - minD;
924                     if( mem.disp2cost[_x2] > minS )
925                     {
926                         mem.disp2cost[_x2] = (CostType)minS;
927                         mem.disp2ptr[_x2] = (DispType)(d + minD);
928                     }
929 
930                     if( 0 < d && d < D-1 )
931                     {
932                         // do subpixel quadratic interpolation:
933                         //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
934                         //   then find minimum of the parabola.
935                         int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
936                         d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
937                     }
938                     else
939                         d *= DISP_SCALE;
940                     disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
941                 }
942 
943                 for( x = minX1; x < maxX1; x++ )
944                 {
945                     // we round the computed disparity both towards -inf and +inf and check
946                     // if either of the corresponding disparities in disp2 is consistent.
947                     // This is to give the computed disparity a chance to look valid if it is.
948                     int d1 = disp1ptr[x];
949                     if( d1 == INVALID_DISP_SCALED )
950                         continue;
951                     int _d = d1 >> DISP_SHIFT;
952                     int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
953                     int _x = x - _d, x_ = x - d_;
954                     if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
955                        0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
956                         disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
957                 }
958             }
959 
960             lrID = 1 - lrID; // now shift the cyclic buffers
961         }
962     }
963 }
964 
965 ////////////////////////////////////////////////////////////////////////////////////////////
966 struct CalcVerticalSums: public ParallelLoopBody
967 {
CalcVerticalSumscv::CalcVerticalSums968     CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
969         : img1(_img1), img2(_img2), mem(mem_)
970     {
971         minD = params.minDisparity;
972         maxD = minD + params.numDisparities;
973         SW2 = SH2 = params.calcSADWindowSize().height/2;
974         P1 = params.P1 > 0 ? params.P1 : 2;
975         P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
976         height = img1.rows;
977         width = img1.cols;
978         int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
979         D = maxD - minD;
980         Da = (int)alignSize(D, v_int16::nlanes);
981         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
982         width1 = maxX1 - minX1;
983         D = params.numDisparities;
984         Da = (int)alignSize(D, v_int16::nlanes);
985     }
986 
operator ()cv::CalcVerticalSums987     void operator()(const Range& range) const CV_OVERRIDE
988     {
989         const CostType MAX_COST = SHRT_MAX;
990         const int npasses = 2;
991         const int x1 = range.start, x2 = range.end;
992         int k;
993 
994         CostType* pixDiff = 0;
995         PixType* tempBuf = 0;
996         utils::BufferArea aux_area;
997         aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH);
998         aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH);
999         aux_area.commit();
1000 
1001         // Simplification of index calculation
1002         if (x1 > SW2)
1003             pixDiff -= (x1 - SW2) * Da;
1004 
1005         for( int pass = 1; pass <= npasses; pass++ )
1006         {
1007             int y1, y2, dy;
1008 
1009             if( pass == 1 )
1010             {
1011                 y1 = 0; y2 = height; dy = 1;
1012             }
1013             else
1014             {
1015                 y1 = height-1; y2 = -1; dy = -1;
1016             }
1017 
1018             uchar lrID = 0;
1019             mem.clearLr(range);
1020 
1021             for( int y = y1; y != y2; y += dy )
1022             {
1023                 int x, d;
1024                 CostType* C = mem.getCBuf(y);
1025                 CostType* S = mem.getSBuf(y);
1026 
1027                 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
1028                 {
1029                     int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
1030 
1031                     for( k = dy1; k <= dy2; k++ )
1032                     {
1033                         CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
1034 
1035                         if( k < height )
1036                         {
1037                             calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2);
1038 
1039                             memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
1040                             for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
1041                             {
1042                                 int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
1043 #if CV_SIMD
1044                                 for( d = 0; d < Da; d += v_int16::nlanes )
1045                                     v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
1046 #else
1047                                 for( d = 0; d < D; d++ )
1048                                     hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
1049 #endif
1050                             }
1051 
1052                             if( y > 0 )
1053                             {
1054                                 const CostType* hsumSub =  mem.getHSumBuf(std::max(y - SH2 - 1, 0));
1055                                 const CostType* Cprev = mem.getCBuf(y - 1);
1056 #if CV_SIMD
1057                                 for( d = 0; d < Da; d += v_int16::nlanes )
1058                                     v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
1059 #else
1060                                 for( d = 0; d < D; d++ )
1061                                     C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
1062 #endif
1063                                 for( x = (x1+1)*Da; x < x2*Da; x += Da )
1064                                 {
1065                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1066                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1067 
1068 #if CV_SIMD
1069                                     for( d = 0; d < Da; d += v_int16::nlanes )
1070                                     {
1071                                         v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
1072                                         v_store_aligned(hsumAdd + x + d, hv);
1073                                         v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
1074                                     }
1075 #else
1076                                     for( d = 0; d < D; d++ )
1077                                     {
1078                                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1079                                         C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
1080                                     }
1081 #endif
1082                                 }
1083                             }
1084                             else
1085                             {
1086 #if CV_SIMD
1087                                 v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
1088                                 for (d = 0; d < Da; d += v_int16::nlanes)
1089                                     v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
1090 #else
1091                                 int scale = k == 0 ? SH2 + 1 : 1;
1092                                 for (d = 0; d < D; d++)
1093                                     C[x1*Da + d] = (CostType)(C[x1*Da + d] + hsumAdd[x1*Da + d] * scale);
1094 #endif
1095                                 for( x = (x1+1)*Da; x < x2*Da; x += Da )
1096                                 {
1097                                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1098                                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1099 #if CV_SIMD
1100                                     for (d = 0; d < Da; d += v_int16::nlanes)
1101                                     {
1102                                         v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
1103                                         v_store_aligned(hsumAdd + x + d, hv);
1104                                         v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
1105                                     }
1106 #else
1107                                     for( d = 0; d < D; d++ )
1108                                     {
1109                                         CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1110                                         hsumAdd[x + d] = hv;
1111                                         C[x + d] = (CostType)(C[x + d] + hv * scale);
1112                                     }
1113 #endif
1114                                 }
1115                             }
1116                         }
1117                         else
1118                         {
1119 /*                            if (y > 0)
1120                             {
1121                                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
1122                                 const CostType* Cprev = mem.getCBuf(y - 1);
1123 
1124 #if CV_SIMD
1125                                 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
1126                                     v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
1127 #else
1128                                 for( x = x1*Da; x < x2*Da; x++ )
1129                                     C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
1130 #endif
1131                             }
1132                             else*/
1133                             if(y == 0)
1134                             {
1135 #if CV_SIMD
1136                                 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
1137                                     v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
1138 #else
1139                                 for( x = x1*Da; x < x2*Da; x++ )
1140                                     C[x] = (CostType)(C[x] + hsumAdd[x]);
1141 #endif
1142                             }
1143                         }
1144                     }
1145                     mem.clearSBuf(y, range);
1146                 }
1147 
1148 //              [formula 13 in the paper]
1149 //              compute L_r(p, d) = C(p, d) +
1150 //              min(L_r(p-r, d),
1151 //              L_r(p-r, d-1) + P1,
1152 //              L_r(p-r, d+1) + P1,
1153 //              min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
1154 //              where p = (x,y), r is one of the directions.
1155 //              we process one directions on first pass and other on second:
1156 //              r=(0, dy), where dy=1 on first pass and dy=-1 on second
1157 
1158                 for( x = x1; x != x2; x++ )
1159                 {
1160                     int delta = P2 + *mem.getMinLr(1 - lrID, x);
1161                     CostType* Lr_ppr = mem.getLr(1 - lrID, x);
1162 
1163                     Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
1164 
1165                     CostType* Lr_p = mem.getLr(lrID, x);
1166                     const CostType* Cp = C + x*Da;
1167                     CostType* Sp = S + x*Da;
1168 
1169                     CostType& minL = *(mem.getMinLr(lrID, x));
1170                     d = 0;
1171 #if CV_SIMD
1172                     v_int16 _P1 = vx_setall_s16((short)P1);
1173 
1174                     v_int16 _delta = vx_setall_s16((short)delta);
1175                     v_int16 _minL = vx_setall_s16((short)MAX_COST);
1176 
1177                     for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
1178                     {
1179                         v_int16 Cpd = vx_load_aligned(Cp + d);
1180                         v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1181                         v_store_aligned(Lr_p + d, L);
1182                         _minL = v_min(_minL, L);
1183                         v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
1184                     }
1185                     minL = v_reduce_min(_minL);
1186 #else
1187                     minL = MAX_COST;
1188 #endif
1189                     for( ; d < D; d++ )
1190                     {
1191                         int Cpd = Cp[d], L;
1192 
1193                         L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1194 
1195                         Lr_p[d] = (CostType)L;
1196                         minL = std::min(minL, (CostType)L);
1197 
1198                         Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1199                     }
1200                 }
1201                 lrID = 1 - lrID; // now shift the cyclic buffers
1202             }
1203         }
1204     }
1205     const Mat& img1;
1206     const Mat& img2;
1207     const BufferSGBM & mem;
1208     int minD;
1209     int maxD;
1210     int D, Da, Dlra;
1211     int SH2;
1212     int SW2;
1213     int width;
1214     int width1;
1215     int height;
1216     int P1;
1217     int P2;
1218 };
1219 
1220 struct CalcHorizontalSums: public ParallelLoopBody
1221 {
CalcHorizontalSumscv::CalcHorizontalSums1222     CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
1223         : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_)
1224     {
1225         minD = params.minDisparity;
1226         maxD = minD + params.numDisparities;
1227         P1 = params.P1 > 0 ? params.P1 : 2;
1228         P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1229         uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
1230         disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
1231         height = img1.rows;
1232         width = img1.cols;
1233         minX1 = std::max(maxD, 0);
1234         maxX1 = width + std::min(minD, 0);
1235         INVALID_DISP = minD - 1;
1236         INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1237         D = maxD - minD;
1238         Da = (int)alignSize(D, v_int16::nlanes);
1239         Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
1240         width1 = maxX1 - minX1;
1241     }
1242 
    void operator()(const Range& range) const CV_OVERRIDE
1244     {
1245         int y1 = range.start, y2 = range.end;
1246 
1247         const size_t LrSize = 2 * (1 + Dlra + 1);
1248 
1249         CostType * Lr = 0;
1250         CostType * disp2cost = 0;
1251         DispType * disp2ptr = 0;
1252         utils::BufferArea aux_area;
1253         aux_area.allocate(Lr, LrSize);
1254         aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
1255         aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
1256         aux_area.commit();
1257 
1258         CostType minLr;
1259 
1260         for( int y = y1; y != y2; y++)
1261         {
1262             int x, d;
1263             DispType* disp1ptr = disp1.ptr<DispType>(y);
1264             CostType* C = mem.getCBuf(y);
1265             CostType* S = mem.getSBuf(y);
1266 
1267             x = 0;
1268 #if CV_SIMD
1269             v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
1270             v_int16 v_max_cost = vx_setall_s16(MAX_COST);
1271             for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1272             {
1273                 v_store(disp1ptr + x, v_inv_dist);
1274                 v_store(disp2ptr + x, v_inv_dist);
1275                 v_store(disp2cost + x, v_max_cost);
1276             }
1277 #endif
1278             for( ; x < width; x++ )
1279             {
1280                 disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
1281                 disp2cost[x] = MAX_COST;
1282             }
1283 
1284             // clear buffers
1285             aux_area.zeroFill(Lr);
1286             Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
1287 
1288             minLr = 0;
1289 //          [formula 13 in the paper]
1290 //          compute L_r(p, d) = C(p, d) +
1291 //          min(L_r(p-r, d),
1292 //          L_r(p-r, d-1) + P1,
1293 //          L_r(p-r, d+1) + P1,
1294 //          min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
1295 //          where p = (x,y), r is one of the directions.
1296 //          we process all the directions at once:
1297 //          we process one directions on first pass and other on second:
1298 //          r=(dx, 0), where dx=1 on first pass and dx=-1 on second
1299             for( x = 0; x != width1; x++)
1300             {
1301                 int delta = minLr + P2;
1302                 CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
1303                 CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
1304                 const CostType* Cp = C + x*Da;
1305                 CostType* Sp = S + x*Da;
1306 
1307                 d = 0;
1308 #if CV_SIMD
1309                 v_int16 _P1 = vx_setall_s16((short)P1);
1310 
1311                 v_int16 _delta = vx_setall_s16((short)delta);
1312                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
1313 
1314                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
1315                 {
1316                     v_int16 Cpd = vx_load_aligned(Cp + d);
1317                     v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1318                     v_store(Lr_p + d, L);
1319                     _minL = v_min(_minL, L);
1320                     v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
1321                 }
1322                 minLr = v_reduce_min(_minL);
1323 #else
1324                 minLr = MAX_COST;
1325 #endif
1326                 for( ; d < D; d++ )
1327                 {
1328                     int Cpd = Cp[d], L;
1329                     L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1330                     Lr_p[d] = (CostType)L;
1331                     minLr = std::min(minLr, (CostType)L);
1332                     Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1333                 }
1334             }
1335 
1336             aux_area.zeroFill(Lr);
1337             Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
1338 
1339             minLr = 0;
1340 
1341             for( x = width1-1; x != -1; x--)
1342             {
1343                 int delta = minLr + P2;
1344                 CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
1345                 CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
1346                 const CostType* Cp = C + x*Da;
1347                 CostType* Sp = S + x*Da;
1348                 CostType minS = MAX_COST;
1349                 short bestDisp = -1;
1350                 minLr = MAX_COST;
1351 
1352                 d = 0;
1353 #if CV_SIMD
1354                 v_int16 _P1 = vx_setall_s16((short)P1);
1355                 v_int16 _delta = vx_setall_s16((short)delta);
1356 
1357                 v_int16 _minL = vx_setall_s16((short)MAX_COST);
1358                 v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
1359                 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
1360                 {
1361                     v_int16 Cpd = vx_load_aligned(Cp + d);
1362                     v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1363                     v_store(Lr_p + d, L);
1364                     _minL = v_min(_minL, L);
1365                     L += vx_load_aligned(Sp + d);
1366                     v_store_aligned(Sp + d, L);
1367 
1368                     _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
1369                     _minS = v_min( L, _minS );
1370                 }
1371                 minLr = v_reduce_min(_minL);
1372 
1373                 min_pos(_minS, _bestDisp, minS, bestDisp);
1374 #endif
1375                 for( ; d < D; d++ )
1376                 {
1377                     int Cpd = Cp[d], L;
1378 
1379                     L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1380 
1381                     Lr_p[d] = (CostType)L;
1382                     minLr = std::min(minLr, (CostType)L);
1383 
1384                     Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1385                     if( Sp[d] < minS )
1386                     {
1387                         minS = Sp[d];
1388                         bestDisp = (short)d;
1389                     }
1390                 }
1391 
1392                 //Some postprocessing procedures and saving
1393                 for( d = 0; d < D; d++ )
1394                 {
1395                     if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
1396                         break;
1397                 }
1398                 if( d < D )
1399                     continue;
1400                 d = bestDisp;
1401                 int _x2 = x + minX1 - d - minD;
1402                 if( disp2cost[_x2] > minS )
1403                 {
1404                     disp2cost[_x2] = (CostType)minS;
1405                     disp2ptr[_x2] = (DispType)(d + minD);
1406                 }
1407 
1408                 if( 0 < d && d < D-1 )
1409                 {
1410                     // do subpixel quadratic interpolation:
1411                     //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1412                     //   then find minimum of the parabola.
1413                     int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
1414                     d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
1415                 }
1416                 else
1417                     d *= DISP_SCALE;
1418                 disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
1419             }
1420             //Left-right check sanity procedure
1421             for( x = minX1; x < maxX1; x++ )
1422             {
1423                 // we round the computed disparity both towards -inf and +inf and check
1424                 // if either of the corresponding disparities in disp2 is consistent.
1425                 // This is to give the computed disparity a chance to look valid if it is.
1426                 int d1 = disp1ptr[x];
1427                 if( d1 == INVALID_DISP_SCALED )
1428                     continue;
1429                 int _d = d1 >> DISP_SHIFT;
1430                 int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
1431                 int _x = x - _d, x_ = x - d_;
1432                 if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
1433                    0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
1434                     disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
1435             }
1436         }
1437     }
1438 
1439     static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
1440     static const int DISP_SCALE = (1 << DISP_SHIFT);
1441     static const CostType MAX_COST = SHRT_MAX;
1442     const Mat& img1;
1443     const Mat& img2;
1444     Mat& disp1;
1445     const BufferSGBM & mem;
1446     int minD;
1447     int maxD;
1448     int D, Da, Dlra;
1449     int width;
1450     int width1;
1451     int height;
1452     int P1;
1453     int P2;
1454     int minX1;
1455     int maxX1;
1456     int INVALID_DISP;
1457     int INVALID_DISP_SCALED;
1458     int uniquenessRatio;
1459     int disp12MaxDiff;
1460 };
1461 /*
1462  computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
1463  that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
1464  minD <= d < maxD.
1465 
 note that disp1buf will have the same size as the roi.
 On exit disp1buf is not the final disparity; it is an intermediate result that becomes
 final after all the tiles are processed.
1469 
1470  the disparity in disp1buf is written with sub-pixel accuracy
1471  (4 fractional bits, see StereoSGBM::DISP_SCALE),
1472  using quadratic interpolation, while the disparity in disp2buf
1473  is written as is, without interpolation.
1474  */
computeDisparitySGBM_HH4(const Mat & img1,const Mat & img2,Mat & disp1,const StereoSGBMParams & params)1475 static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
1476                                  Mat& disp1, const StereoSGBMParams& params)
1477 {
1478     const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
1479     const int DISP_SCALE = (1 << DISP_SHIFT);
1480     int minD = params.minDisparity, maxD = minD + params.numDisparities;
1481     Size SADWindowSize;
1482     SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
1483     int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1484     int width = disp1.cols, height = disp1.rows;
1485     int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
1486     int width1 = maxX1 - minX1;
1487     int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
1488     int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
1489     int INVALID_DISP = minD - 1;
1490     int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1491 
1492     if( minX1 >= maxX1 )
1493     {
1494         disp1 = Scalar::all(INVALID_DISP_SCALED);
1495         return;
1496     }
1497 
1498     BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
1499     mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
1500 
1501     parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8);
1502     parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8);
1503 }
1504 
1505 //////////////////////////////////////////////////////////////////////////////////////////////////////
1506 
1507 class BufferSGBM3Way
1508 {
1509 private:
1510     size_t hsumCols;
1511     size_t hsumRows;
1512 public:
1513     CostType *curCostVolumeLine;
1514     CostType *hsumBuf;
1515     CostType *pixDiff;
1516     PixType *tmpBuf;
1517     CostType *horPassCostVolume;
1518     CostType *vertPassCostVolume;
1519     CostType *vertPassMin;
1520     CostType *rightPassBuf;
1521     CostType *disp2CostBuf;
1522     short *disp2Buf;
1523 private:
1524     utils::BufferArea area;
1525 public:
BufferSGBM3Way(int width1,int width,int num_ch,int Da,int SH2,int P2)1526     BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) :
1527         curCostVolumeLine(0),
1528         hsumBuf(0),
1529         pixDiff(0),
1530         tmpBuf(0),
1531         horPassCostVolume(0),
1532         vertPassCostVolume(0),
1533         vertPassMin(0),
1534         rightPassBuf(0),
1535         disp2CostBuf(0),
1536         disp2Buf(0)
1537     {
1538         hsumCols = width1 * Da;
1539         hsumRows = SH2*2 + 2;
1540         area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH);
1541         area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
1542         area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH);
1543         area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH);
1544         area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
1545         area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
1546         area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH);
1547         area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH);
1548         area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH);
1549         area.allocate(disp2Buf, width, CV_SIMD_WIDTH);
1550         area.commit();
1551         area.zeroFill();
1552         for(size_t i = 0; i < hsumCols; i++)
1553             curCostVolumeLine[i] = (CostType)P2;
1554     }
clearRightPassBuf()1555     inline void clearRightPassBuf()
1556     {
1557         area.zeroFill(rightPassBuf);
1558     }
getHSumBuf(int x) const1559     CostType *getHSumBuf(int x) const
1560     {
1561         return hsumBuf + (x % hsumRows) * hsumCols;
1562     }
1563 };
1564 
// Parallel body running the 3-way (left + top + right path) SGBM aggregation
// over horizontal stripes of the rectified stereo pair; each Range item
// processed by operator() is one stripe.
struct SGBM3WayMainLoop : public ParallelLoopBody
{
    const Mat *img1, *img2;   // rectified input pair (left, right)
    Mat* dst_disp;            // per-stripe output disparity maps

    int stripe_sz;            // nominal stripe height, in rows
    int stripe_overlap;       // extra rows read above each stripe (except the first)

    int width,height;
    int minD, maxD, D, Da;    // disparity range; Da = D aligned up to the SIMD width
    int minX1, maxX1, width1; // matchable x-range in img1 and its length

    int SW2, SH2;             // half of the matching window size (SADWindowSize/2)
    int P1, P2;               // SGM smoothness penalties, clamped so that P2 > P1
    int uniquenessRatio, disp12MaxDiff;

    int TAB_OFS;              // zero offset into the clipTab lookup table

    utils::BufferArea aux_area;
    PixType* clipTab;         // precomputed clipping LUT for the raw matching cost
#if CV_SIMD
    short idx_row[v_int16::nlanes]; // lane indices 0..nlanes-1 (disparity tracking in SIMD)
#endif
    SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
    void operator () (const Range& range) const CV_OVERRIDE;
    // impl<true> is used when D == Da (no ragged tail in the disparity loop)
    template<bool x_nlanes> void impl(const Range& range) const;

private:
    // builds the raw (block-matching) cost volume line for image row y
    void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const;

    // left-to-right pass (into horPassCostVolume) fused with the in-place
    // top-to-bottom pass (into vertPassCostVolume) for column x
    template<bool x_nlanes>
    void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
                                int x,
                                CostType &leftMinCost) const;

    // in-place right-to-left pass, summation of the three directions into
    // horPassCostVolume, and winner-takes-all search for the best disparity
    template<bool x_nlanes>
    void accumulateCostsRight(const BufferSGBM3Way &mem,
                              int x,
                              CostType &rightMinCost,
                              short &optimal_disp,
                              CostType &min_cost) const;
};
1607 
SGBM3WayMainLoop(const Mat & _img1,const Mat & _img2,Mat * _dst_disp,const StereoSGBMParams & params,int _stripe_sz,int _stripe_overlap)1608 SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
1609                                    const Mat& _img2,
1610                                    Mat* _dst_disp,
1611                                    const StereoSGBMParams& params,
1612                                    int _stripe_sz,
1613                                    int _stripe_overlap)
1614     : img1(&_img1),
1615     img2(&_img2),
1616     dst_disp(_dst_disp),
1617     stripe_sz(_stripe_sz),
1618     stripe_overlap(_stripe_overlap),
1619     clipTab(0)
1620 {
1621     // precompute a lookup table for the raw matching cost computation:
1622     TAB_OFS = 256*4;
1623     const int TAB_SIZE = 256 + TAB_OFS*2;
1624     aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
1625     aux_area.commit();
1626     const int ftzero = std::max(params.preFilterCap, 15) | 1;
1627     for(int k = 0; k < TAB_SIZE; k++ )
1628         clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1629 
1630     width = img1->cols; height = img1->rows;
1631     minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
1632     minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
1633     Da = (int)alignSize(D, v_int16::nlanes);
1634 
1635     SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
1636 
1637     P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1638     uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
1639     disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
1640 
1641 #if CV_SIMD
1642     for(short i = 0; i < v_int16::nlanes; ++i)
1643         idx_row[i] = i;
1644 #endif
1645 }
1646 
1647 // performing block matching and building raw cost-volume for the current row
getRawMatchingCost(const BufferSGBM3Way & mem,int y,int src_start_idx) const1648 void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
1649 {
1650     CostType* C = mem.curCostVolumeLine;
1651     CostType* pixDiff = mem.pixDiff;
1652     PixType* tmpBuf = mem.tmpBuf;
1653     int x, d;
1654     int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
1655 
1656     for(int k = dy1; k <= dy2; k++ )
1657     {
1658         CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
1659         if( k < height )
1660         {
1661             calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
1662 
1663 #if CV_SIMD
1664             v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
1665             for (d = 0; d < Da; d += v_int16::nlanes)
1666             {
1667                 v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
1668                 for (x = Da; x <= SW2 * Da; x += Da)
1669                     hsA += vx_load_aligned(pixDiff + x + d);
1670                 v_store_aligned(hsumAdd + d, hsA);
1671             }
1672 #else
1673             for (d = 0; d < D; d++)
1674             {
1675                 CostType hsA = (CostType)(pixDiff[d] * (SW2 + 1));
1676                 for (x = Da; x <= SW2 * Da; x += Da)
1677                     hsA += pixDiff[x + d];
1678                 hsumAdd[d] = hsA;
1679             }
1680 #endif
1681             if( y > src_start_idx )
1682             {
1683                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
1684 
1685 #if CV_SIMD
1686                 for (d = 0; d < Da; d += v_int16::nlanes)
1687                     v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
1688 #else
1689                 for (d = 0; d < D; d++)
1690                     C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
1691 #endif
1692 
1693                 for( x = Da; x < width1*Da; x += Da )
1694                 {
1695                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1696                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1697 #if CV_SIMD
1698                     v_int16 hv_reg;
1699                     for( d = 0; d < Da; d+=v_int16::nlanes )
1700                     {
1701                         hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
1702                         v_store_aligned(hsumAdd+x+d,hv_reg);
1703                         v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
1704                     }
1705 #else
1706                     for( d = 0; d < D; d++ )
1707                     {
1708                         int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1709                         C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
1710                     }
1711 #endif
1712                 }
1713             }
1714             else
1715             {
1716 #if CV_SIMD
1717                 v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
1718                 for (d = 0; d < Da; d += v_int16::nlanes)
1719                     v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
1720 #else
1721                 int scale = k == src_start_idx ? SH2 + 1 : 1;
1722                 for (d = 0; d < D; d++)
1723                     C[d] = (CostType)(C[d] + hsumAdd[d] * scale);
1724 #endif
1725                 for( x = Da; x < width1*Da; x += Da )
1726                 {
1727                     const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1728                     const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1729 #if CV_SIMD
1730                     for (d = 0; d < Da; d += v_int16::nlanes)
1731                     {
1732                         v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
1733                         v_store_aligned(hsumAdd + x + d, hv);
1734                         v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
1735                     }
1736 #else
1737                     for (d = 0; d < D; d++)
1738                     {
1739                         CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1740                         hsumAdd[x + d] = hv;
1741                         C[x + d] = (CostType)(C[x + d] + hv * scale);
1742                     }
1743 #endif
1744                 }
1745             }
1746         }
1747         else
1748         {
1749             if( y > src_start_idx )
1750             {
1751                 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
1752 #if CV_SIMD
1753                 for( x = 0; x < width1*Da; x += v_int16::nlanes)
1754                     v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
1755 #else
1756                 for( x = 0; x < width1*Da; x++ )
1757                     C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
1758 #endif
1759             }
1760             else
1761             {
1762 #if CV_SIMD
1763                 for( x = 0; x < width1*Da; x += v_int16::nlanes)
1764                     v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
1765 #else
1766                 for( x = 0; x < width1*Da; x++ )
1767                     C[x] = (CostType)(C[x] + hsumAdd[x]);
1768 #endif
1769             }
1770         }
1771     }
1772 }
1773 
// performing SGM cost accumulation from left to right (result is stored in leftBuf) and
// in-place cost accumulation from top to bottom (result is stored in topBuf)
//
// Both directions apply the SGM recursion (see L1813/L1830):
//   L(d) = cost(d) + min(Lp(d), Lp(d-1)+P1, Lp(d+1)+P1, minLp+P2) - (minLp+P2)
// where Lp is the previous pixel's accumulated cost along the path and minLp its
// minimum over all disparities (carried in leftMinCost / mem.vertPassMin[x/Da]).
template<bool x_nlanes>
void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
{
    CostType *leftBuf = mem.horPassCostVolume + x;          // output of the left->right pass
    CostType *leftBuf_prev = mem.horPassCostVolume + x - Da; // previous column (read-only here)
    CostType *topBuf = mem.vertPassCostVolume + x;          // updated in place (top->bottom pass)
    CostType *costs = mem.curCostVolumeLine - Da + x;       // raw costs for this column
    CostType& topMinCost = mem.vertPassMin[x/Da];
    int i = 0;
#if CV_SIMD
    v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

    v_int16 leftMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
    v_int16 leftMinCost_new_reg = vx_setall_s16(SHRT_MAX);
    v_int16 src0_leftBuf        = vx_setall_s16(SHRT_MAX);
    v_int16 src1_leftBuf        = vx_load_aligned(leftBuf_prev);

    v_int16 topMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
    v_int16 topMinCost_new_reg = vx_setall_s16(SHRT_MAX);
    v_int16 src0_topBuf        = vx_setall_s16(SHRT_MAX);
    v_int16 src1_topBuf        = vx_load_aligned(topBuf);

    v_int16 src2;
    v_int16 src_shifted_left,src_shifted_right;
    v_int16 res;

    // vectorized main part: src0/src1/src2 hold the previous, current and next
    // disparity blocks so the d-1 and d+1 neighbours can be formed with rotates
    for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
    {
        //process leftBuf:
        //lookahead load:
        src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);

        //get shifted versions of the current block and add P1:
        src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
        src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

        // process and save current block:
        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
        leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
        v_store_aligned(leftBuf+i, res);

        //update src buffers:
        src0_leftBuf = src1_leftBuf;
        src1_leftBuf = src2;

        //process topBuf:
        //lookahead load:
        src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);

        //get shifted versions of the current block and add P1:
        src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
        src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

        // process and save current block:
        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
        topMinCost_new_reg = v_min(topMinCost_new_reg,res);
        v_store_aligned(topBuf+i, res);

        //update src buffers:
        src0_topBuf = src1_topBuf;
        src1_topBuf = src2;
    }

    // a bit different processing for the last cycle of the loop:
    // when D == Da (x_nlanes) the tail is exactly one full vector, with a
    // SHRT_MAX sentinel shifted in for the out-of-range d = D neighbour
    if(x_nlanes)
    {
        src2 = vx_setall_s16(SHRT_MAX);
        //process leftBuf:
        src_shifted_left  = v_rotate_left<1>  (src1_leftBuf,src0_leftBuf);
        src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2        );

        res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
        leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
        v_store_aligned(leftBuf+Da-v_int16::nlanes, res);

        //process topBuf:
        src_shifted_left  = v_rotate_left<1>  (src1_topBuf,src0_topBuf);
        src_shifted_right = v_rotate_right<1> (src1_topBuf,src2       );

        res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
        topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
        v_store_aligned(topBuf+Da-v_int16::nlanes, res);
    }
    else
    {
        // D != Da: finish the ragged tail with the scalar loop below.
        // NOTE(review): topBuf[i-1] was already overwritten by the SIMD loop,
        // so the scalar seed reads the updated value, not the original one —
        // presumably an accepted approximation at the seam; confirm upstream.
        CostType leftMinCost_new = v_reduce_min(leftMinCost_new_reg);
        CostType topMinCost_new  = v_reduce_min(topMinCost_new_reg);
        CostType leftBuf_prev_i_minus_1 = i > 0 ? leftBuf_prev[i-1] : SHRT_MAX;
        CostType topBuf_i_minus_1       = i > 0 ? topBuf[i-1] : SHRT_MAX;
#else
    { // scalar-only build: this brace pairs with the one closing the function's
      // main block below, mirroring the SIMD 'else' branch above
        CostType leftMinCost_new = SHRT_MAX;
        CostType topMinCost_new  = SHRT_MAX;
        CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
        CostType topBuf_i_minus_1       = SHRT_MAX;
#endif
        int leftMinCost_P2  = leftMinCost + P2;
        int topMinCost_P2   = topMinCost  + P2;
        CostType tmp;
        // scalar recursion; 'tmp' preserves the pre-update topBuf[i] needed as
        // the d-1 neighbour of the next iteration (topBuf is updated in place)
        for(;i<D-1;i++)
        {
            leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
            leftBuf_prev_i_minus_1 = leftBuf_prev[i];
            leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);

            tmp = topBuf[i];
            topBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
            topBuf_i_minus_1 = tmp;
            topMinCost_new  = std::min(topMinCost_new,topBuf[i]);
        }

        // last disparity: no d+1 neighbour
        leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
        leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);

        topBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
        topMinCost  = std::min(topMinCost_new,topBuf[D-1]);
    }
}
1894 
// performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
// summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
// optimal disparity value with minimum accumulated cost
//
// Same SGM recursion as accumulateCostsLeftTop; additionally the three
// directional costs are summed into leftBuf and the minimum total cost and its
// disparity index are returned through min_cost / optimal_disp.
template<bool x_nlanes>
void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
                                            CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
{
    CostType* costs = mem.curCostVolumeLine - Da + x;  // raw costs for this column
    CostType* rightBuf = mem.rightPassBuf;             // single column, updated in place
    CostType* topBuf = mem.vertPassCostVolume + x;     // top->bottom pass result (read)
    CostType* leftBuf = mem.horPassCostVolume + x;     // left pass result; receives the 3-way sum

    int i = 0;
#if CV_SIMD
    v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

    v_int16 rightMinCostP2_reg   = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
    v_int16 rightMinCost_new_reg = vx_setall_s16(SHRT_MAX);
    v_int16 src0_rightBuf        = vx_setall_s16(SHRT_MAX);
    v_int16 src1_rightBuf        = vx_load(rightBuf);

    v_int16 src2;
    v_int16 src_shifted_left,src_shifted_right;
    v_int16 res;

    // running minimum of the summed cost and the disparity index where it occurs
    v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
    v_int16 min_sum_pos_reg  = vx_setall_s16(0);

    // vectorized main part: src0/src1/src2 hold the previous, current and next
    // disparity blocks so the d-1 and d+1 neighbours can be formed with rotates
    for(;i<Da-v_int16::nlanes;i+=v_int16::nlanes)
    {
        //lookahead load:
        src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);

        //get shifted versions of the current block and add P1:
        src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
        src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

        // process and save current block:
        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
        v_store_aligned(rightBuf+i, res);

        // compute and save total cost:
        res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
        v_store_aligned(leftBuf+i, res);

        // track disparity value with the minimum cost:
        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));

        //update src:
        src0_rightBuf    = src1_rightBuf;
        src1_rightBuf    = src2;
    }

    // a bit different processing for the last cycle of the loop:
    // when D == Da (x_nlanes) the tail is exactly one full vector, with a
    // SHRT_MAX sentinel shifted in for the out-of-range d = D neighbour
    if(x_nlanes)
    {
        src2 = vx_setall_s16(SHRT_MAX);
        src_shifted_left  = v_rotate_left<1>  (src1_rightBuf,src0_rightBuf);
        src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2         );

        res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
        v_store_aligned(rightBuf+D-v_int16::nlanes, res);

        res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
        v_store_aligned(leftBuf+D-v_int16::nlanes, res);

        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)D-v_int16::nlanes) - min_sum_pos_reg));
        min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
    }
    else
    {
        // D != Da: finish the ragged tail with the scalar loop below.
        CostType rightMinCost_new = v_reduce_min(rightMinCost_new_reg);
        // NOTE(review): this seed reads rightBuf[i] (the not-yet-updated value
        // at the current index) rather than the original rightBuf[i-1], which
        // the SIMD loop has already overwritten — looks like a deliberate
        // stand-in at the seam, but confirm against upstream before changing.
        CostType rightBuf_i_minus_1 = i > 0 ? rightBuf[i] : SHRT_MAX;
        min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
#else
    { // scalar-only build: this brace pairs with the one closing the function's
      // main block below, mirroring the SIMD 'else' branch above
        CostType rightMinCost_new = SHRT_MAX;
        CostType rightBuf_i_minus_1 = SHRT_MAX;
        min_cost = SHRT_MAX;
#endif
        int rightMinCost_P2  = rightMinCost + P2;
        CostType tmp;
        // scalar recursion; 'tmp' preserves the pre-update rightBuf[i] needed
        // as the d-1 neighbour of the next iteration (in-place update)
        for(;i<D-1;i++)
        {
            tmp = rightBuf[i];
            rightBuf[i]  = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
            rightBuf_i_minus_1 = tmp;
            rightMinCost_new  = std::min(rightMinCost_new,rightBuf[i]);
            leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
            if(leftBuf[i]<min_cost)
            {
                optimal_disp = (short)i;
                min_cost = leftBuf[i];
            }
        }

        // last disparity: no d+1 neighbour
        rightBuf[D-1]  = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
        rightMinCost  = std::min(rightMinCost_new,rightBuf[D-1]);
        leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
        if(leftBuf[D-1]<min_cost)
        {
            optimal_disp = (short)D-1;
            min_cost = leftBuf[D-1];
        }
    }
}
2005 
2006 void SGBM3WayMainLoop::operator () (const Range& range) const
2007 {
2008     if (D == Da) impl<true>(range);
2009     else impl<false>(range);
2010 }
2011 
2012 template<bool x_nlanes>
2013 void SGBM3WayMainLoop::impl(const Range& range) const
2014 {
2015     // force separate processing of stripes:
2016     if(range.end>range.start+1)
2017     {
2018         for(int n=range.start;n<range.end;n++)
2019             (*this)(Range(n,n+1));
2020         return;
2021     }
2022 
2023     const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
2024     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
2025 
2026     // setting up the ranges:
2027     int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
2028     int src_end_idx   = std::min(range.end   * stripe_sz, height);
2029 
2030     int dst_offset;
2031     if(range.start==0)
2032         dst_offset=stripe_overlap;
2033     else
2034         dst_offset=0;
2035 
2036     Mat cur_disp   = dst_disp[range.start];
2037     cur_disp = Scalar(INVALID_DISP_SCALED);
2038 
2039     BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
2040     CostType *horPassCostVolume = mem.horPassCostVolume;
2041     // start real processing:
2042     for(int y=src_start_idx;y<src_end_idx;y++)
2043     {
2044         getRawMatchingCost(mem, y, src_start_idx);
2045 
2046         short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));
2047 
2048         // initialize the auxiliary buffers for the pseudo left-right consistency check:
2049         for(int x=0;x<width;x++)
2050         {
2051             mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
2052             mem.disp2CostBuf[x] = SHRT_MAX;
2053         }
2054         CostType prev_min, min_cost;
2055         int d;
2056         short best_d;
2057         d = best_d = 0;
2058 
2059         // forward pass
2060         prev_min=0;
2061         for (int x=Da;x<(1+width1)*Da;x+=Da)
2062             accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);
2063 
2064         //backward pass
2065         mem.clearRightPassBuf();
2066         prev_min=0;
2067         for (int x=width1*Da;x>=Da;x-=Da)
2068         {
2069             accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);
2070 
2071             if(uniquenessRatio>0)
2072             {
2073                 d = 0;
2074 #if CV_SIMD
2075                 horPassCostVolume+=x;
2076                 int thresh = (100*min_cost)/(100-uniquenessRatio);
2077                 v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
2078                 v_int16 d1 = vx_setall_s16((short)(best_d-1));
2079                 v_int16 d2 = vx_setall_s16((short)(best_d+1));
2080                 v_int16 eight_reg = vx_setall_s16(v_int16::nlanes);
2081                 v_int16 cur_d = vx_load(idx_row);
2082                 v_int16 mask;
2083 
2084                 for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
2085                 {
2086                     mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
2087                     cur_d = cur_d+eight_reg;
2088                     if( v_check_any(mask) )
2089                         break;
2090                     mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
2091                     cur_d = cur_d+eight_reg;
2092                     if( v_check_any(mask) )
2093                         break;
2094                 }
2095                 if( d <= D - 2*v_int16::nlanes )
2096                 {
2097                     horPassCostVolume-=x;
2098                     continue;
2099                 }
2100                 if( d <= D - v_int16::nlanes )
2101                 {
2102                     if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
2103                     {
2104                         horPassCostVolume-=x;
2105                         continue;
2106                     }
2107                     d+=v_int16::nlanes;
2108                 }
2109                 horPassCostVolume-=x;
2110 #endif
2111                 for( ; d < D; d++ )
2112                 {
2113                     if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
2114                         break;
2115                 }
2116                 if( d < D )
2117                     continue;
2118             }
2119             d = best_d;
2120 
2121             int _x2 = x/Da - 1 + minX1 - d - minD;
2122             if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
2123             {
2124                 mem.disp2CostBuf[_x2] = min_cost;
2125                 mem.disp2Buf[_x2] = (short)(d + minD);
2126             }
2127 
2128             if( 0 < d && d < D-1 )
2129             {
2130                 // do subpixel quadratic interpolation:
2131                 //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
2132                 //   then find minimum of the parabola.
2133                 int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
2134                 d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
2135             }
2136             else
2137                 d *= DISP_SCALE;
2138 
2139             disp_row[(x/Da)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
2140         }
2141 
2142         for(int x = minX1; x < maxX1; x++ )
2143         {
2144             // pseudo LRC consistency check using only one disparity map;
2145             // pixels with difference more than disp12MaxDiff are invalidated
2146             int d1 = disp_row[x];
2147             if( d1 == INVALID_DISP_SCALED )
2148                 continue;
2149             int _d = d1 >> StereoMatcher::DISP_SHIFT;
2150             int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
2151             int _x = x - _d, x_ = x - d_;
2152             if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
2153                 0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
2154                 disp_row[x] = (short)INVALID_DISP_SCALED;
2155         }
2156     }
2157 }
2158 
2159 template <uchar nstripes>
2160 static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
2161 {
2162     // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
2163     int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
2164     int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
2165     Mat dst_disp[nstripes];
2166     for(int i=0;i<nstripes;i++)
2167         dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
2168 
2169     parallel_for_(
2170         Range(0,nstripes),
2171         SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
2172     );
2173 
2174     //assemble disp1 from dst_disp:
2175     short* dst_row;
2176     short* src_row;
2177     for(int i=0;i<disp1.rows;i++)
2178     {
2179         dst_row = (short*)disp1.ptr(i);
2180         src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
2181         memcpy(dst_row,src_row,disp1.cols*sizeof(short));
2182     }
2183 }
2184 
// Concrete implementation behind the StereoSGBM public interface.  All tuning
// parameters live in a StereoSGBMParams value; `compute` dispatches to one of
// the three SGBM variants based on the configured mode.
class StereoSGBMImpl CV_FINAL : public StereoSGBM
{
public:
    // Default-constructs with the StereoSGBMParams defaults.
    StereoSGBMImpl()
        : params()
    {
        // nothing
    }

    // Forwards every tuning knob into the parameter pack; see
    // StereoSGBM::create for the meaning of the individual arguments.
    StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
                    int _mode )
        : params(_minDisparity, _numDisparities, _SADWindowSize,
                 _P1, _P2, _disp12MaxDiff, _preFilterCap,
                 _uniquenessRatio, _speckleWindowSize, _speckleRange,
                 _mode)
    {
        // nothing
    }

    // Computes a CV_16S fixed-point disparity map (scaled by
    // StereoMatcher::DISP_SCALE) for a rectified 8-bit image pair, then
    // post-processes it with a 3x3 median blur and an optional speckle filter.
    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION();

        Mat left = leftarr.getMat(), right = rightarr.getMat();
        // both views must match in size/type and be 8-bit
        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
                   left.depth() == CV_8U );

        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();

        if(params.mode==MODE_SGBM_3WAY)
            // the number of stripes is fixed, disregarding the number of threads/processors
            // to make the results fully reproducible
            computeDisparity3WaySGBM<4>( left, right, disp, params );
        else if(params.mode==MODE_HH4)
            computeDisparitySGBM_HH4( left, right, disp, params );
        else
            computeDisparitySGBM( left, right, disp, params );

        // light post-filtering to remove isolated outliers
        medianBlur(disp, disp, 3);

        // speckle filtering invalidates small blobs by writing the
        // scaled "minDisparity - 1" sentinel into them
        if( params.speckleWindowSize > 0 )
            filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
                           StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    }

    // Trivial accessors for the algorithm parameters; see the StereoSGBM
    // documentation for the semantics of each field.
    int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
    void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }

    int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
    void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }

    int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
    void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }

    int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
    void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }

    int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
    void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }

    int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
    void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }

    int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
    void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }

    int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
    void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }

    int getP1() const CV_OVERRIDE { return params.P1; }
    void setP1(int P1) CV_OVERRIDE { params.P1 = P1; }

    int getP2() const CV_OVERRIDE { return params.P2; }
    void setP2(int P2) CV_OVERRIDE { params.P2 = P2; }

    int getMode() const CV_OVERRIDE { return params.mode; }
    void setMode(int mode) CV_OVERRIDE { params.mode = mode; }

    // Serializes the matcher name and all parameters to a FileStorage.
    void write(FileStorage& fs) const CV_OVERRIDE
    {
        writeFormat(fs);
        fs << "name" << name_
        << "minDisparity" << params.minDisparity
        << "numDisparities" << params.numDisparities
        << "blockSize" << params.SADWindowSize
        << "speckleWindowSize" << params.speckleWindowSize
        << "speckleRange" << params.speckleRange
        << "disp12MaxDiff" << params.disp12MaxDiff
        << "preFilterCap" << params.preFilterCap
        << "uniquenessRatio" << params.uniquenessRatio
        << "P1" << params.P1
        << "P2" << params.P2
        << "mode" << params.mode;
    }

    // Restores the parameters previously written by write(); asserts that the
    // stored algorithm name matches this class.
    void read(const FileNode& fn) CV_OVERRIDE
    {
        FileNode n = fn["name"];
        CV_Assert( n.isString() && String(n) == name_ );
        params.minDisparity = (int)fn["minDisparity"];
        params.numDisparities = (int)fn["numDisparities"];
        params.SADWindowSize = (int)fn["blockSize"];
        params.speckleWindowSize = (int)fn["speckleWindowSize"];
        params.speckleRange = (int)fn["speckleRange"];
        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
        params.preFilterCap = (int)fn["preFilterCap"];
        params.uniquenessRatio = (int)fn["uniquenessRatio"];
        params.P1 = (int)fn["P1"];
        params.P2 = (int)fn["P2"];
        params.mode = (int)fn["mode"];
    }

    // Algorithm parameters used by compute().
    StereoSGBMParams params;
    // Scratch buffer reused across filterSpeckles calls.
    // NOTE(review): shared mutable state makes concurrent compute() calls on
    // one instance unsafe -- confirm callers serialize access.
    Mat buffer;

    // Serialization name; see write()/read().
    static const char* name_;
};
2305 
// Algorithm identifier stored by StereoSGBMImpl::write and checked by read.
const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
2307 
2308 
2309 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
2310                                  int P1, int P2, int disp12MaxDiff,
2311                                  int preFilterCap, int uniquenessRatio,
2312                                  int speckleWindowSize, int speckleRange,
2313                                  int mode)
2314 {
2315     return Ptr<StereoSGBM>(
2316         new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
2317                            P1, P2, disp12MaxDiff,
2318                            preFilterCap, uniquenessRatio,
2319                            speckleWindowSize, speckleRange,
2320                            mode));
2321 }
2322 
2323 Rect getValidDisparityROI( Rect roi1, Rect roi2,
2324                           int minDisparity,
2325                           int numberOfDisparities,
2326                           int SADWindowSize )
2327 {
2328     int SW2 = SADWindowSize/2;
2329     int maxD = minDisparity + numberOfDisparities - 1;
2330 
2331     int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
2332     int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width) - SW2;
2333     int ymin = std::max(roi1.y, roi2.y) + SW2;
2334     int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
2335 
2336     Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
2337 
2338     return r.width > 0 && r.height > 0 ? r : Rect();
2339 }
2340 
// Compact pixel coordinate used for the flood-fill wavefront below.
typedef cv::Point_<short> Point2s;

// Removes small connected "speckle" regions from a disparity image, in place.
//
// Two 4-neighbors belong to the same region when both are valid (!= newVal)
// and their values differ by at most maxDiff.  Regions of at most
// maxSpeckleSize pixels are overwritten with newVal.
//
// @param img            CV_8UC1 or CV_16SC1 disparity image (T = uchar/short).
// @param newVal         value marking invalid pixels; also used to erase speckles.
// @param maxSpeckleSize largest region size still treated as a speckle.
// @param maxDiff        max absolute neighbor difference within one region.
// @param _buf           scratch buffer, grown on demand and reusable across calls.
template <typename T>
void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
{
    using namespace cv;

    int width = img.cols, height = img.rows, npixels = width*height;
    // NOTE(review): the product is computed in int before widening to size_t,
    // which could overflow for extremely large images -- TODO confirm.
    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
        _buf.reserveBuffer(bufSize);

    // Carve the scratch buffer into three arrays:
    //   labels - per-pixel region label (0 = unlabeled)
    //   wbuf   - wavefront stack for the flood fill
    //   rtype  - per-label flag: 1 = small (speckle) region, 0 = large region
    uchar* buf = _buf.ptr();
    int i, j, dstep = (int)(img.step/sizeof(T));
    int* labels = (int*)buf;
    buf += npixels*sizeof(labels[0]);
    Point2s* wbuf = (Point2s*)buf;
    buf += npixels*sizeof(wbuf[0]);
    uchar* rtype = (uchar*)buf;
    int curlabel = 0;

    // clear out label assignments
    memset(labels, 0, npixels*sizeof(labels[0]));

    for( i = 0; i < height; i++ )
    {
        T* ds = img.ptr<T>(i);
        int* ls = labels + width*i;

        for( j = 0; j < width; j++ )
        {
            if( ds[j] != newVal )   // not a bad disparity
            {
                if( ls[j] )     // has a label, check for bad label
                {
                    if( rtype[ls[j]] ) // small region, zero out disparity
                        ds[j] = (T)newVal;
                }
                // no label, assign and propagate
                else
                {
                    Point2s* ws = wbuf; // initialize wavefront
                    Point2s p((short)j, (short)i);  // current pixel
                    curlabel++; // next label
                    int count = 0;  // current region size
                    ls[j] = curlabel;

                    // wavefront propagation: depth-first flood fill using
                    // wbuf as an explicit stack of pending pixels
                    while( ws >= wbuf ) // wavefront not empty
                    {
                        count++;
                        // put neighbors onto wavefront
                        T* dpp = &img.at<T>(p.y, p.x);
                        T dp = *dpp;
                        int* lpp = labels + width*p.y + p.x;

                        // below neighbor
                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
                        {
                            lpp[+width] = curlabel;
                            *ws++ = Point2s(p.x, p.y+1);
                        }

                        // above neighbor
                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
                        {
                            lpp[-width] = curlabel;
                            *ws++ = Point2s(p.x, p.y-1);
                        }

                        // right neighbor
                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
                        {
                            lpp[+1] = curlabel;
                            *ws++ = Point2s(p.x+1, p.y);
                        }

                        // left neighbor
                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
                        {
                            lpp[-1] = curlabel;
                            *ws++ = Point2s(p.x-1, p.y);
                        }

                        // pop most recent and propagate
                        // NB: could try least recent, maybe better convergence
                        // NOTE(review): on the final iteration ws == wbuf, so this
                        // reads the slot just below wbuf (still inside _buf); the
                        // value is never used because the loop condition then fails.
                        p = *--ws;
                    }

                    // assign label type
                    if( count <= maxSpeckleSize )   // speckle region
                    {
                        rtype[ls[j]] = 1;   // small region label
                        ds[j] = (T)newVal;
                    }
                    else
                        rtype[ls[j]] = 0;   // large region label
                }
            }
        }
    }
}
2439 
#ifdef HAVE_IPP
// Intel IPP fast path for filterSpeckles.  Returns true when IPP processed the
// image (single-channel 8u/16s only); false tells the caller to fall back to
// the generic filterSpecklesImpl.
static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff, Mat &buffer)
{
#if IPP_VERSION_X100 >= 810
    CV_INSTRUMENT_REGION_IPP();

    IppDataType dataType = ippiGetDataType(img.depth());
    IppiSize    size     = ippiSize(img.size());
    int         bufferSize;

    // ippiMarkSpeckles handles single-channel 8u/16s images only
    if(img.channels() != 1)
        return false;

    if(dataType != ipp8u && dataType != ipp16s)
        return false;

    if(ippiMarkSpecklesGetBufferSize(size, dataType, 1, &bufferSize) < 0)
        return false;

    // grow the scratch buffer only when the current one is too small
    if(bufferSize && (buffer.empty() || (int)(buffer.step*buffer.rows) < bufferSize))
        buffer.create(1, (int)bufferSize, CV_8U);

    // in-place speckle marking; a non-negative IPP status means success
    switch(dataType)
    {
    case ipp8u:  return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_8u_C1IR, img.ptr<Ipp8u>(), (int)img.step, size, (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
    case ipp16s: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_16s_C1IR, img.ptr<Ipp16s>(), (int)img.step, size, (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
    default:     return false;
    }
#else
    CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff); CV_UNUSED(buffer);
    return false;
#endif
}
#endif
2474 
2475 }
2476 
2477 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
2478                          double _maxDiff, InputOutputArray __buf )
2479 {
2480     CV_INSTRUMENT_REGION();
2481 
2482     Mat img = _img.getMat();
2483     int type = img.type();
2484     Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
2485     CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
2486 
2487     int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
2488 
2489     CV_IPP_RUN_FAST(ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff, _buf));
2490 
2491     if (type == CV_8UC1)
2492         filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
2493     else
2494         filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
2495 }
2496 
2497 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
2498                             int numberOfDisparities, int disp12MaxDiff )
2499 {
2500     CV_INSTRUMENT_REGION();
2501 
2502     Mat disp = _disp.getMat(), cost = _cost.getMat();
2503     int cols = disp.cols, rows = disp.rows;
2504     int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
2505     int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
2506     AutoBuffer<int> _disp2buf(cols*2);
2507     int* disp2buf = _disp2buf.data();
2508     int* disp2cost = disp2buf + cols;
2509     const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
2510     int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
2511     int costType = cost.type();
2512 
2513     disp12MaxDiff *= DISP_SCALE;
2514 
2515     CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
2516               (costType == CV_16S || costType == CV_32S) &&
2517               disp.size() == cost.size() );
2518 
2519     for( int y = 0; y < rows; y++ )
2520     {
2521         short* dptr = disp.ptr<short>(y);
2522 
2523         for( x = 0; x < cols; x++ )
2524         {
2525             disp2buf[x] = INVALID_DISP_SCALED;
2526             disp2cost[x] = INT_MAX;
2527         }
2528 
2529         if( costType == CV_16S )
2530         {
2531             const short* cptr = cost.ptr<short>(y);
2532 
2533             for( x = minX1; x < maxX1; x++ )
2534             {
2535                 int d = dptr[x], c = cptr[x];
2536 
2537                 if( d == INVALID_DISP_SCALED )
2538                     continue;
2539 
2540                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
2541 
2542                 if( disp2cost[x2] > c )
2543                 {
2544                     disp2cost[x2] = c;
2545                     disp2buf[x2] = d;
2546                 }
2547             }
2548         }
2549         else
2550         {
2551             const int* cptr = cost.ptr<int>(y);
2552 
2553             for( x = minX1; x < maxX1; x++ )
2554             {
2555                 int d = dptr[x], c = cptr[x];
2556 
2557                 if( d == INVALID_DISP_SCALED )
2558                     continue;
2559 
2560                 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
2561 
2562                 if( disp2cost[x2] > c )
2563                 {
2564                     disp2cost[x2] = c;
2565                     disp2buf[x2] = d;
2566                 }
2567             }
2568         }
2569 
2570         for( x = minX1; x < maxX1; x++ )
2571         {
2572             // we round the computed disparity both towards -inf and +inf and check
2573             // if either of the corresponding disparities in disp2 is consistent.
2574             // This is to give the computed disparity a chance to look valid if it is.
2575             int d = dptr[x];
2576             if( d == INVALID_DISP_SCALED )
2577                 continue;
2578             int d0 = d >> DISP_SHIFT;
2579             int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
2580             int x0 = x - d0, x1 = x - d1;
2581             if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
2582                 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
2583                 dptr[x] = (short)INVALID_DISP_SCALED;
2584         }
2585     }
2586 }
2587