1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Copyright (C) 2013, OpenCV Foundation, all rights reserved.
16 // Third party copyrights are property of their respective owners.
17 //
18 // Redistribution and use in source and binary forms, with or without modification,
19 // are permitted provided that the following conditions are met:
20 //
21 // * Redistribution's of source code must retain the above copyright notice,
22 // this list of conditions and the following disclaimer.
23 //
24 // * Redistribution's in binary form must reproduce the above copyright notice,
25 // this list of conditions and the following disclaimer in the documentation
26 // and/or other materials provided with the distribution.
27 //
28 // * The name of the copyright holders may not be used to endorse or promote products
29 // derived from this software without specific prior written permission.
30 //
31 // This software is provided by the copyright holders and contributors "as is" and
32 // any express or implied warranties, including, but not limited to, the implied
33 // warranties of merchantability and fitness for a particular purpose are disclaimed.
34 // In no event shall the Intel Corporation or contributors be liable for any direct,
35 // indirect, incidental, special, exemplary, or consequential damages
36 // (including, but not limited to, procurement of substitute goods or services;
37 // loss of use, data, or profits; or business interruption) however caused
38 // and on any theory of liability, whether in contract, strict liability,
39 // or tort (including negligence or otherwise) arising in any way out of
40 // the use of this software, even if advised of the possibility of such damage.
41 //
42 //M*/
43
44 /*
45 This is a variation of
46 "Stereo Processing by Semiglobal Matching and Mutual Information"
47 by Heiko Hirschmuller.
48
49 We match blocks rather than individual pixels, thus the algorithm is called
50 SGBM (Semi-global block matching)
51 */
52
53 #include "precomp.hpp"
54 #include <limits.h>
55 #include "opencv2/core/hal/intrin.hpp"
56 #include "opencv2/core/utils/buffer_area.private.hpp"
57
58 namespace cv
59 {
60
61 typedef uchar PixType;
62 typedef short CostType;
63 typedef short DispType;
64
65 // NR - the number of directions. the loop on x that computes Lr assumes that NR == 8.
66 // if you change NR, please, modify the loop as well.
67 enum { NR = 8, NR2 = NR/2 };
68
69
70 struct StereoSGBMParams
71 {
StereoSGBMParamscv::StereoSGBMParams72 StereoSGBMParams()
73 {
74 minDisparity = numDisparities = 0;
75 SADWindowSize = 0;
76 P1 = P2 = 0;
77 disp12MaxDiff = 0;
78 preFilterCap = 0;
79 uniquenessRatio = 0;
80 speckleWindowSize = 0;
81 speckleRange = 0;
82 mode = StereoSGBM::MODE_SGBM;
83 }
84
StereoSGBMParamscv::StereoSGBMParams85 StereoSGBMParams( int _minDisparity, int _numDisparities, int _SADWindowSize,
86 int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
87 int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
88 int _mode )
89 {
90 minDisparity = _minDisparity;
91 numDisparities = _numDisparities;
92 SADWindowSize = _SADWindowSize;
93 P1 = _P1;
94 P2 = _P2;
95 disp12MaxDiff = _disp12MaxDiff;
96 preFilterCap = _preFilterCap;
97 uniquenessRatio = _uniquenessRatio;
98 speckleWindowSize = _speckleWindowSize;
99 speckleRange = _speckleRange;
100 mode = _mode;
101 }
102
isFullDPcv::StereoSGBMParams103 inline bool isFullDP() const
104 {
105 return mode == StereoSGBM::MODE_HH || mode == StereoSGBM::MODE_HH4;
106 }
calcSADWindowSizecv::StereoSGBMParams107 inline Size calcSADWindowSize() const
108 {
109 const int dim = SADWindowSize > 0 ? SADWindowSize : 5;
110 return Size(dim, dim);
111 }
112
113 int minDisparity;
114 int numDisparities;
115 int SADWindowSize;
116 int preFilterCap;
117 int uniquenessRatio;
118 int P1;
119 int P2;
120 int speckleWindowSize;
121 int speckleRange;
122 int disp12MaxDiff;
123 int mode;
124 };
125
#if CV_SIMD
// vx_setseq_s16() returns a vector whose lanes hold their own indices
// {0, 1, 2, ...}. It is specialized per SIMD width so the sequence is a
// plain constructor call where the lane count is known at compile time.
#if CV_SIMD_WIDTH == 16
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7); }
#elif CV_SIMD_WIDTH == 32
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); }
#elif CV_SIMD_WIDTH == 64
static inline v_int16 vx_setseq_s16()
{ return v_int16(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31); }
#else
// Generic fallback for other SIMD widths: fill a static table with
// 0..nlanes-1 once and load it as a vector.
struct vseq_s16
{
    short data[v_int16::nlanes];
    vseq_s16()
    {
        for (int i = 0; i < v_int16::nlanes; i++)
            data[i] = i;
    }
};
static inline v_int16 vx_setseq_s16()
{
    static vseq_s16 vseq;
    return vx_load(vseq.data);
}
#endif
// define some additional reduce operations:
// min_pos: horizontal reduction producing both the minimum lane value and the
// smallest (pos + lane index) at which that minimum occurs.
static inline void min_pos(const v_int16& val, const v_int16& pos, short &min_val, short &min_pos)
{
    min_val = v_reduce_min(val);
    // Lanes not equal to the minimum are masked to SHRT_MAX so they can never
    // win the subsequent position reduction.
    v_int16 v_mask = (vx_setall_s16(min_val) == val);
    min_pos = v_reduce_min(((pos+vx_setseq_s16()) & v_mask) | (vx_setall_s16(SHRT_MAX) & ~v_mask));
}
#endif
160
// Sentinel for xrange_max meaning "use the full computable width".
static const int DEFAULT_RIGHT_BORDER = -1;

/*
 For each pixel row1[x], max(maxD, 0) <= minX <= x < maxX <= width - max(0, -minD),
 and for each disparity minD<=d<maxD the function
 computes the cost (cost[(x-minX)*(maxD - minD) + (d - minD)]), depending on the difference between
 row1[x] and row2[x-d]. The subpixel algorithm from
 "Depth Discontinuities by Pixel-to-Pixel Stereo" by Stan Birchfield and C. Tomasi
 is used, hence the suffix BT.

 the temporary buffer should contain width2*2 elements

 xrange_min/xrange_max restrict the computation to a sub-range of the line so
 the caller can split a row between parallel workers.
 */
static void calcPixelCostBT( const Mat& img1, const Mat& img2, int y,
                             int minD, int maxD, CostType* cost,
                             PixType* buffer, const PixType* tab,
                             int xrange_min = 0, int xrange_max = DEFAULT_RIGHT_BORDER )
{
    int x, c, width = img1.cols, cn = img1.channels();
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    // D is the disparity range rounded up to a whole number of SIMD lanes.
    int D = (int)alignSize(maxD - minD, v_int16::nlanes), width1 = maxX1 - minX1;
    //This minX1 & maxX2 correction is defining which part of calculatable line must be calculated
    //That is needs of parallel algorithm
    xrange_min = (xrange_min < 0) ? 0: xrange_min;
    xrange_max = (xrange_max == DEFAULT_RIGHT_BORDER) || (xrange_max > width1) ? width1 : xrange_max;
    maxX1 = minX1 + xrange_max;
    minX1 += xrange_min;
    width1 = maxX1 - minX1;
    // [minX2, maxX2) is the range of right-image columns that can match
    // anything in [minX1, maxX1) for some disparity in [minD, maxD).
    int minX2 = std::max(minX1 - maxD, 0), maxX2 = std::min(maxX1 - minD, width);
    int width2 = maxX2 - minX2;
    const PixType *row1 = img1.ptr<PixType>(y), *row2 = img2.ptr<PixType>(y);
    // Layout of the preprocessed rows: the first width2*2 bytes of 'buffer'
    // hold the v0/v1 min/max interpolants; prow1/prow2 hold, per channel,
    // a Sobel-filtered plane followed by the raw intensity plane.
    PixType *prow1 = buffer + width2*2, *prow2 = prow1 + width*cn*2;

    // Initialize the row borders with the neutral clip-table value.
    for( c = 0; c < cn*2; c++ )
    {
        prow1[width*c] = prow1[width*c + width-1] =
        prow2[width*c] = prow2[width*c + width-1] = tab[0];
    }

    // Step offsets to the previous (n*) and next (s*) image rows, clamped at
    // the top/bottom image borders.
    int n1 = y > 0 ? -(int)img1.step : 0, s1 = y < img1.rows-1 ? (int)img1.step : 0;
    int n2 = y > 0 ? -(int)img2.step : 0, s2 = y < img2.rows-1 ? (int)img2.step : 0;

    int minX_cmn = std::min(minX1,minX2)-1;
    int maxX_cmn = std::max(maxX1,maxX2)+1;
    minX_cmn = std::max(minX_cmn, 1);
    maxX_cmn = std::min(maxX_cmn, width - 1);
    if( cn == 1 )
    {
        // Grayscale: one x-Sobel plane plus one raw-intensity plane per image.
        // Note the right image is stored mirrored (index width-1-x) so that
        // later prow2[width-1-x+d] addresses row2[x-d].
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x+1] - row1[x-1])*2 + row1[x+n1+1] - row1[x+n1-1] + row1[x+s1+1] - row1[x+s1-1]];
            prow2[width-1-x] = tab[(row2[x+1] - row2[x-1])*2 + row2[x+n2+1] - row2[x+n2-1] + row2[x+s2+1] - row2[x+s2-1]];

            prow1[x+width] = row1[x];
            prow2[width-1-x+width] = row2[x];
        }
    }
    else
    {
        // 3-channel: three Sobel planes and three raw planes per image,
        // interleaved source pixels are at row[x*3 + channel].
        for( x = minX_cmn; x < maxX_cmn; x++ )
        {
            prow1[x] = tab[(row1[x*3+3] - row1[x*3-3])*2 + row1[x*3+n1+3] - row1[x*3+n1-3] + row1[x*3+s1+3] - row1[x*3+s1-3]];
            prow1[x+width] = tab[(row1[x*3+4] - row1[x*3-2])*2 + row1[x*3+n1+4] - row1[x*3+n1-2] + row1[x*3+s1+4] - row1[x*3+s1-2]];
            prow1[x+width*2] = tab[(row1[x*3+5] - row1[x*3-1])*2 + row1[x*3+n1+5] - row1[x*3+n1-1] + row1[x*3+s1+5] - row1[x*3+s1-1]];

            prow2[width-1-x] = tab[(row2[x*3+3] - row2[x*3-3])*2 + row2[x*3+n2+3] - row2[x*3+n2-3] + row2[x*3+s2+3] - row2[x*3+s2-3]];
            prow2[width-1-x+width] = tab[(row2[x*3+4] - row2[x*3-2])*2 + row2[x*3+n2+4] - row2[x*3+n2-2] + row2[x*3+s2+4] - row2[x*3+s2-2]];
            prow2[width-1-x+width*2] = tab[(row2[x*3+5] - row2[x*3-1])*2 + row2[x*3+n2+5] - row2[x*3+n2-1] + row2[x*3+s2+5] - row2[x*3+s2-1]];

            prow1[x+width*3] = row1[x*3];
            prow1[x+width*4] = row1[x*3+1];
            prow1[x+width*5] = row1[x*3+2];

            prow2[width-1-x+width*3] = row2[x*3];
            prow2[width-1-x+width*4] = row2[x*3+1];
            prow2[width-1-x+width*5] = row2[x*3+2];
        }
    }

    // Costs are accumulated over channels below, so clear them first.
    memset( cost + xrange_min*D, 0, width1*D*sizeof(cost[0]) );

    // Rebase the pointers so the loop indices can be used directly:
    // buffer[x] for x in [width-maxX2, width-minX2), cost[x*D + d].
    buffer -= width-maxX2;
    cost -= (minX1-xrange_min)*D + minD; // simplify the cost indices inside the loop

    // Process each plane (Sobel planes first, then raw intensity planes).
    for( c = 0; c < cn*2; c++, prow1 += width, prow2 += width )
    {
        // Raw-intensity planes contribute with 1/4 weight (>> 2).
        int diff_scale = c < cn ? 0 : 2;

        // precompute
        //   v0 = min(row2[x-1/2], row2[x], row2[x+1/2]) and
        //   v1 = max(row2[x-1/2], row2[x], row2[x+1/2]) and
        // to process values from [minX2, maxX2) we should check memory location (width - 1 - maxX2, width - 1 - minX2]
        // so iterate through [width - maxX2, width - minX2)
        for( x = width-maxX2; x < width-minX2; x++ )
        {
            int v = prow2[x];
            int vl = x > 0 ? (v + prow2[x-1])/2 : v;
            int vr = x < width-1 ? (v + prow2[x+1])/2 : v;
            int v0 = std::min(vl, vr); v0 = std::min(v0, v);
            int v1 = std::max(vl, vr); v1 = std::max(v1, v);
            buffer[x] = (PixType)v0;
            buffer[x + width2] = (PixType)v1;
        }

        for( x = minX1; x < maxX1; x++ )
        {
            // Birchfield-Tomasi interpolants for the left pixel.
            int u = prow1[x];
            int ul = x > 0 ? (u + prow1[x-1])/2 : u;
            int ur = x < width-1 ? (u + prow1[x+1])/2 : u;
            int u0 = std::min(ul, ur); u0 = std::min(u0, u);
            int u1 = std::max(ul, ur); u1 = std::max(u1, u);

            int d = minD;
        #if CV_SIMD
            v_uint8 _u  = vx_setall_u8((uchar)u), _u0 = vx_setall_u8((uchar)u0);
            v_uint8 _u1 = vx_setall_u8((uchar)u1);

            // Vectorized BT cost: because prow2/buffer are mirrored, the
            // candidates for disparities d..d+lanes-1 are contiguous at
            // offset width-x-1+d.
            for( ; d <= maxD - 2*v_int16::nlanes; d += 2*v_int16::nlanes )
            {
                v_uint8 _v  = vx_load(prow2  + width-x-1 + d);
                v_uint8 _v0 = vx_load(buffer + width-x-1 + d);
                v_uint8 _v1 = vx_load(buffer + width-x-1 + d + width2);
                v_uint8 c0 = v_max(_u - _v1, _v0 - _u);
                v_uint8 c1 = v_max(_v - _u1, _u0 - _v);
                v_uint8 diff = v_min(c0, c1);

                v_int16 _c0 = vx_load_aligned(cost + x*D + d);
                v_int16 _c1 = vx_load_aligned(cost + x*D + d + v_int16::nlanes);

                v_uint16 diff1,diff2;
                v_expand(diff,diff1,diff2);
                v_store_aligned(cost + x*D + d,                   _c0 + v_reinterpret_as_s16(diff1 >> diff_scale));
                v_store_aligned(cost + x*D + d + v_int16::nlanes, _c1 + v_reinterpret_as_s16(diff2 >> diff_scale));
            }
        #endif
            // Scalar tail (and full scalar path when SIMD is disabled).
            for( ; d < maxD; d++ )
            {
                int v = prow2[width-x-1 + d];
                int v0 = buffer[width-x-1 + d];
                int v1 = buffer[width-x-1 + d + width2];
                int c0 = std::max(0, u - v1); c0 = std::max(c0, v0 - u);
                int c1 = std::max(0, v - u1); c1 = std::max(c1, u0 - v);

                cost[x*D + d] = (CostType)(cost[x*D+d] + (std::min(c0, c1) >> diff_scale));
            }
        }
    }
}
308
309
310
// BufferSGBM owns every scratch buffer used by the SGBM pipeline, allocated
// as one aligned region via utils::BufferArea. It also provides the pointer
// arithmetic for the bordered Lr/minLr dynamic-programming rows.
class BufferSGBM
{
private:
    size_t width1;      // number of matchable columns (maxX1 - minX1)
    size_t Da;          // disparity range aligned to SIMD lane count
    size_t Dlra;        // Da + extra lanes for the d=-1 / d=D sentinels
    size_t costWidth;   // width1 * Da: one row of the cost volume
    size_t costHeight;  // full image height in full-DP modes, otherwise 1
    size_t hsumRows;    // ring-buffer depth for horizontal sums (SAD height + 2)
    bool fullDP;        // whether whole-image C/S volumes are kept
    uchar dirs;         // number of aggregation directions (1 for MODE_HH4, else NR)
    uchar dirs2;        // directions processed per pass (1 for MODE_HH4, else NR2)
    static const size_t TAB_OFS = 256*4;  // clip-table offset so negative diffs index safely

public:
    CostType* Cbuf;       // matching cost volume C(x,d)
    CostType* Sbuf;       // aggregated cost volume S(x,d)
    CostType* hsumBuf;    // ring buffer of horizontal SAD sums
    CostType* pixDiff;    // per-row BT pixel costs
    CostType* disp2cost;  // best cost per right-image column (LR check)
    DispType* disp2ptr;   // reverse (right-to-left) disparity map
    PixType* tempBuf;     // scratch for calcPixelCostBT
    std::vector<CostType*> Lr;     // two rows of partial path costs L_r(x,d)
    std::vector<CostType*> minLr;  // two rows of min_k L_r(x,k)
    PixType * clipTab;    // precomputed clipping table for the Sobel responses

private:
    utils::BufferArea area;

public:
    BufferSGBM(size_t width1_,
               size_t Da_,
               size_t Dlra_,
               size_t cn,
               size_t width,
               size_t height,
               const StereoSGBMParams &params)
        : width1(width1_),
        Da(Da_),
        Dlra(Dlra_),
        Cbuf(NULL),
        Sbuf(NULL),
        hsumBuf(NULL),
        pixDiff(NULL),
        disp2cost(NULL),
        disp2ptr(NULL),
        tempBuf(NULL),
        Lr(2, (CostType*)NULL),
        minLr(2, (CostType*)NULL),
        clipTab(NULL)
    {
        const size_t TAB_SIZE = 256 + TAB_OFS*2;
        fullDP = params.isFullDP();
        costWidth = width1 * Da;
        costHeight = fullDP ? height : 1;
        hsumRows = params.calcSADWindowSize().height + 2;
        dirs = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR;
        dirs2 = params.mode == StereoSGBM::MODE_HH4 ? 1 : NR2;
        // for each possible stereo match (img1(x,y) <=> img2(x-d,y))
        // we keep pixel difference cost (C) and the summary cost over NR directions (S).
        // we also keep all the partial costs for the previous line L_r(x,d) and also min_k L_r(x, k)
        area.allocate(Cbuf, costWidth * costHeight, CV_SIMD_WIDTH); // summary cost over different (nDirs) directions
        area.allocate(Sbuf, costWidth * costHeight, CV_SIMD_WIDTH);
        area.allocate(hsumBuf, costWidth * hsumRows, CV_SIMD_WIDTH);
        area.allocate(pixDiff, costWidth, CV_SIMD_WIDTH);
        area.allocate(disp2cost, width, CV_SIMD_WIDTH);
        area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
        area.allocate(tempBuf, width * (4 * cn + 2), CV_SIMD_WIDTH);
        // the number of L_r(.,.) and min_k L_r(.,.) lines in the buffer:
        // for 8-way dynamic programming we need the current row and
        // the previous row, i.e. 2 rows in total
        for (size_t i = 0; i < 2; ++i)
        {
            // 2D: [ NR ][ w1 * NR2 ][ NR ] * [ Dlra ]
            area.allocate(Lr[i], calcLrCount() * Dlra, CV_SIMD_WIDTH);
            // 1D: [ NR ][ w1 * NR2 ][ NR ]
            area.allocate(minLr[i], calcLrCount(), CV_SIMD_WIDTH);
        }
        area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
        area.commit();

        // init clipTab: clamp (i - TAB_OFS) to [-ftzero, ftzero], then shift
        // by +ftzero so the table holds non-negative PixType values.
        const int ftzero = std::max(params.preFilterCap, 15) | 1;
        for(int i = 0; i < (int)TAB_SIZE; i++ )
            clipTab[i] = (PixType)(std::min(std::max(i - (int)TAB_OFS, -ftzero), ftzero) + ftzero);
    }
    // Centered view of the clip table: getClipTab()[v] is valid for negative v.
    inline const PixType * getClipTab() const
    {
        return clipTab + TAB_OFS;
    }
    // Fill the whole C volume with 'val' (typically P2; see computeDisparitySGBM).
    inline void initCBuf(CostType val) const
    {
        for (size_t i = 0; i < costWidth * costHeight; ++i)
            Cbuf[i] = val;
    }
    // Zero the Lr/minLr rows, either fully or for a sub-range of columns.
    inline void clearLr(const Range & range = Range::all()) const
    {
        for (uchar i = 0; i < 2; ++i)
        {
            if (range == Range::all())
            {
                memset(Lr[i], 0, calcLrCount() * Dlra * sizeof(CostType));
                memset(minLr[i], 0, calcLrCount() * sizeof(CostType));
            }
            else
            {
                memset(getLr(i, range.start), 0, range.size() * sizeof(CostType) * Dlra);
                memset(getMinLr(i, range.start), 0, range.size() * sizeof(CostType));
            }
        }
    }
    // Number of Lr/minLr entries per row, including the left/right borders.
    inline size_t calcLrCount() const
    {
        return width1 * dirs2 + 2 * dirs;
    }
    // Swap the "current" and "previous" Lr/minLr rows between scanlines.
    inline void swapLr()
    {
        std::swap(Lr[0], Lr[1]);
        std::swap(minLr[0], minLr[1]);
    }
    // Horizontal-sum ring buffer row for image row 'row'.
    inline CostType * getHSumBuf(int row) const
    {
        return hsumBuf + (row % hsumRows) * costWidth;
    }
    // C row for image row 'row' (single shared row unless full-DP).
    inline CostType * getCBuf(int row) const
    {
        CV_Assert(row >= 0);
        return Cbuf + (!fullDP ? 0 : (row * costWidth));
    }
    // S row for image row 'row' (single shared row unless full-DP).
    inline CostType * getSBuf(int row) const
    {
        CV_Assert(row >= 0);
        return Sbuf + (!fullDP ? 0 : (row * costWidth));
    }
    // Zero the S row, either fully or for a sub-range of columns.
    inline void clearSBuf(int row, const Range & range = Range::all()) const
    {
        if (range == Range::all())
            memset(getSBuf(row), 0, costWidth * sizeof(CostType));
        else
            memset(getSBuf(row) + range.start * Da, 0, range.size() * Da * sizeof(CostType));
    }

    // shift Lr[k] and minLr[k] pointers, because we allocated them with the borders,
    // and will occasionally use negative indices with the arrays
    // we need to shift Lr[k] pointers by 1, to give the space for d=-1.
    inline CostType * getLr(uchar id, int idx, uchar shift = 0) const
    {
        CV_Assert(id < 2);
        const size_t fixed_offset = dirs * Dlra;
        return Lr[id] + fixed_offset + (idx * (int)dirs2 + (int)shift) * (int)Dlra;
    }
    inline CostType * getMinLr(uchar id, int idx, uchar shift = 0) const
    {
        CV_Assert(id < 2);
        const size_t fixed_offset = dirs;
        return minLr[id] + fixed_offset + (idx * dirs2 + shift);
    }
};
469
470 /*
471 computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
472 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
473 minD <= d < maxD.
474 disp2full is the reverse disparity map, that is:
475 disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)
476
477 note that disp1buf will have the same size as the roi and
478 disp2full will have the same size as img1 (or img2).
479 On exit disp2buf is not the final disparity, it is an intermediate result that becomes
480 final after all the tiles are processed.
481
482 the disparity in disp1buf is written with sub-pixel accuracy
483 (4 fractional bits, see StereoSGBM::DISP_SCALE),
484 using quadratic interpolation, while the disparity in disp2buf
485 is written as is, without interpolation.
486
487 disp2cost also has the same size as img1 (or img2).
488 It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
489 */
/*
 computes disparity for "roi" in img1 w.r.t. img2 and write it to disp1buf.
 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 minD <= d < maxD.
 disp2full is the reverse disparity map, that is:
 disp2full(x+roi.x,y+roi.y)=d means that img2(x+roi.x, y+roi.y) ~ img1(x+roi.x+d, y+roi.y)

 note that disp1buf will have the same size as the roi and
 disp2full will have the same size as img1 (or img2).
 On exit disp2buf is not the final disparity, it is an intermediate result that becomes
 final after all the tiles are processed.

 the disparity in disp1buf is written with sub-pixel accuracy
 (4 fractional bits, see StereoSGBM::DISP_SCALE),
 using quadratic interpolation, while the disparity in disp2buf
 is written as is, without interpolation.

 disp2cost also has the same size as img1 (or img2).
 It contains the minimum current cost, used to find the best disparity, corresponding to the minimal cost.
 */
static void computeDisparitySGBM( const Mat& img1, const Mat& img2,
                                  Mat& disp1, const StereoSGBMParams& params )
{
    const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
    const int DISP_SCALE = (1 << DISP_SHIFT);
    const CostType MAX_COST = SHRT_MAX;

    int minD = params.minDisparity, maxD = minD + params.numDisparities;
    int uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
    int disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
    // Smoothness penalties: P1 for |dd|=1, P2 (>= P1+1) for larger jumps.
    int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
    int k, width = disp1.cols, height = disp1.rows;
    int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
    const int D = params.numDisparities;
    int width1 = maxX1 - minX1;
    int Da = (int)alignSize(D, v_int16::nlanes);
    int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
    int SW2 = params.calcSADWindowSize().width/2, SH2 = params.calcSADWindowSize().height/2;
    // Full-DP modes make a second, reverse-direction pass over the image.
    int npasses = params.isFullDP() ? 2 : 1;

    // No matchable columns at all: everything is invalid.
    if( minX1 >= maxX1 )
    {
        disp1 = Scalar::all(INVALID_DISP_SCALED);
        return;
    }

    BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
    mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops

    for( int pass = 1; pass <= npasses; pass++ )
    {
        int x1, y1, x2, y2, dx, dy;

        // Pass 1 sweeps top-left to bottom-right; pass 2 (full-DP only)
        // sweeps in the opposite direction to cover the remaining paths.
        if( pass == 1 )
        {
            y1 = 0; y2 = height; dy = 1;
            x1 = 0; x2 = width1; dx = 1;
        }
        else
        {
            y1 = height-1; y2 = -1; dy = -1;
            x1 = width1-1; x2 = -1; dx = -1;
        }

        uchar lrID = 0;
        mem.clearLr();

        for( int y = y1; y != y2; y += dy )
        {
            int x, d;
            DispType* disp1ptr = disp1.ptr<DispType>(y);
            CostType* const C = mem.getCBuf(y);
            CostType* const S = mem.getSBuf(y);

            if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
            {
                // For y==0 the whole initial window [0, SH2] is accumulated;
                // afterwards only the newly entering row y+SH2 is added.
                int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;

                for( k = dy1; k <= dy2; k++ )
                {
                    CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));

                    if( k < height )
                    {
                        calcPixelCostBT( img1, img2, k, minD, maxD, mem.pixDiff, mem.tempBuf, mem.getClipTab() );

                        // Horizontal box sum at x=0: the left border pixel is
                        // replicated SW2+1 times, then SW2 interior columns added.
                        memset(hsumAdd, 0, Da*sizeof(CostType));
#if CV_SIMD
                        v_int16 h_scale = vx_setall_s16((short)SW2 + 1);
                        for( d = 0; d < Da; d += v_int16::nlanes )
                        {
                            v_int16 v_hsumAdd = vx_load_aligned(mem.pixDiff + d) * h_scale;
                            for( x = Da; x <= SW2*Da; x += Da )
                                v_hsumAdd += vx_load_aligned(mem.pixDiff + x + d);
                            v_store_aligned(hsumAdd + d, v_hsumAdd);
                        }
#else
                        for (d = 0; d < D; d++)
                        {
                            hsumAdd[d] = (CostType)(mem.pixDiff[d] * (SW2 + 1));
                            for( x = Da; x <= SW2*Da; x += Da )
                                hsumAdd[d] = (CostType)(hsumAdd[d] + mem.pixDiff[x + d]);
                        }
#endif

                        if( y > 0 )
                        {
                            // Sliding vertical window: C(y) = C(y-1) + new row - leaving row.
                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                            const CostType* Cprev = mem.getCBuf(y - 1);

#if CV_SIMD
                            for (d = 0; d < Da; d += v_int16::nlanes)
                                v_store_aligned(C + d, vx_load_aligned(Cprev + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
#else
                            for (d = 0; d < D; d++)
                                C[d] = (CostType)(Cprev[d] + hsumAdd[d] - hsumSub[d]);
#endif

                            // Sliding horizontal window for the remaining columns.
                            for( x = Da; x < width1*Da; x += Da )
                            {
                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);
#if CV_SIMD
                                for( d = 0; d < Da; d += v_int16::nlanes )
                                {
                                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
                                    v_store_aligned(hsumAdd + x + d, hv);
                                    v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
                                }
#else
                                for( d = 0; d < D; d++ )
                                {
                                    int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
                                    C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
                                }
#endif
                            }
                        }
                        else
                        {
                            // First image row: accumulate the initial vertical window.
                            // The top border row (k==0) is replicated SH2+1 times.
#if CV_SIMD
                            v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
                            for (d = 0; d < Da; d += v_int16::nlanes)
                                v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
#else
                            int scale = k == 0 ? SH2 + 1 : 1;
                            for (d = 0; d < D; d++)
                                C[d] = (CostType)(C[d] + hsumAdd[d] * scale);
#endif
                            for( x = Da; x < width1*Da; x += Da )
                            {
                                const CostType* pixAdd = mem.pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
                                const CostType* pixSub = mem.pixDiff + std::max(x - (SW2+1)*Da, 0);

#if CV_SIMD
                                for (d = 0; d < Da; d += v_int16::nlanes)
                                {
                                    v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
                                    v_store_aligned(hsumAdd + x + d, hv);
                                    v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
                                }
#else
                                for( d = 0; d < D; d++ )
                                {
                                    CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
                                    hsumAdd[x + d] = hv;
                                    C[x + d] = (CostType)(C[x + d] + hv * scale);
                                }
#endif
                            }
                        }
                    }
                    else
                    {
                        // Below the last image row: reuse the bottom hsum row
                        // (border replication) to keep the sliding sums valid.
                        if( y > 0 )
                        {
                            const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
                            const CostType* Cprev = mem.getCBuf(y - 1);
#if CV_SIMD
                            for (x = 0; x < width1*Da; x += v_int16::nlanes)
                                v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
#else
                            for (x = 0; x < width1*Da; x++)
                                C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
#endif
                        }
                        else
                        {
#if CV_SIMD
                            for (x = 0; x < width1*Da; x += v_int16::nlanes)
                                v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
#else
                            for (x = 0; x < width1*Da; x++)
                                C[x] = (CostType)(C[x] + hsumAdd[x]);
#endif
                        }
                    }

                }

                // also, clear the S buffer
                mem.clearSBuf(y);
            }

            /*
             [formula 13 in the paper]
             compute L_r(p, d) = C(p, d) +
             min(L_r(p-r, d),
             L_r(p-r, d-1) + P1,
             L_r(p-r, d+1) + P1,
             min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
             where p = (x,y), r is one of the directions.
             we process all the directions at once:
             0: r=(-dx, 0)
             1: r=(-1, -dy)
             2: r=(0, -dy)
             3: r=(1, -dy)   !!!Note that only directions 0 to 3 are processed
             4: r=(-2, -dy)
             5: r=(-1, -dy*2)
             6: r=(1, -dy*2)
             7: r=(2, -dy)
             */

            for( x = x1; x != x2; x += dx )
            {
                // delta_i = P2 + min_k L_ri(p-ri, k): the large-jump term of formula 13.
                int delta0 = P2 + *mem.getMinLr(lrID, x - dx);
                int delta1 = P2 + *mem.getMinLr(1 - lrID, x - 1, 1);
                int delta2 = P2 + *mem.getMinLr(1 - lrID, x, 2);
                int delta3 = P2 + *mem.getMinLr(1 - lrID, x + 1, 3);

                CostType* Lr_p0 = mem.getLr(lrID, x - dx);
                CostType* Lr_p1 = mem.getLr(1 - lrID, x - 1, 1);
                CostType* Lr_p2 = mem.getLr(1 - lrID, x, 2);
                CostType* Lr_p3 = mem.getLr(1 - lrID, x + 1, 3);

                // Sentinels so the d-1/d+1 accesses at the disparity range
                // borders never win the minimum.
                Lr_p0[-1] = Lr_p0[D] = MAX_COST;
                Lr_p1[-1] = Lr_p1[D] = MAX_COST;
                Lr_p2[-1] = Lr_p2[D] = MAX_COST;
                Lr_p3[-1] = Lr_p3[D] = MAX_COST;

                CostType* Lr_p = mem.getLr(lrID, x);
                const CostType* Cp = C + x*Da;
                CostType* Sp = S + x*Da;

                CostType* minL = mem.getMinLr(lrID, x);
                d = 0;
#if CV_SIMD
                v_int16 _P1 = vx_setall_s16((short)P1);

                v_int16 _delta0 = vx_setall_s16((short)delta0);
                v_int16 _delta1 = vx_setall_s16((short)delta1);
                v_int16 _delta2 = vx_setall_s16((short)delta2);
                v_int16 _delta3 = vx_setall_s16((short)delta3);
                v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
                v_int16 _minL1 = vx_setall_s16((short)MAX_COST);
                v_int16 _minL2 = vx_setall_s16((short)MAX_COST);
                v_int16 _minL3 = vx_setall_s16((short)MAX_COST);

                // All four directions are updated in one sweep over d, and the
                // per-direction results are accumulated into S.
                for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
                {
                    v_int16 Cpd = vx_load_aligned(Cp + d);
                    v_int16 Spd = vx_load_aligned(Sp + d);
                    v_int16 L;

                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;
                    v_store_aligned(Lr_p + d, L);
                    _minL0 = v_min(_minL0, L);
                    Spd += L;

                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p1 + d), vx_load(Lr_p1 + d - 1) + _P1), vx_load(Lr_p1 + d + 1) + _P1), _delta1) - _delta1 + Cpd;
                    v_store_aligned(Lr_p + d + Dlra, L);
                    _minL1 = v_min(_minL1, L);
                    Spd += L;

                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p2 + d), vx_load(Lr_p2 + d - 1) + _P1), vx_load(Lr_p2 + d + 1) + _P1), _delta2) - _delta2 + Cpd;
                    v_store_aligned(Lr_p + d + Dlra*2, L);
                    _minL2 = v_min(_minL2, L);
                    Spd += L;

                    L = v_min(v_min(v_min(vx_load_aligned(Lr_p3 + d), vx_load(Lr_p3 + d - 1) + _P1), vx_load(Lr_p3 + d + 1) + _P1), _delta3) - _delta3 + Cpd;
                    v_store_aligned(Lr_p + d + Dlra*3, L);
                    _minL3 = v_min(_minL3, L);
                    Spd += L;

                    v_store_aligned(Sp + d, Spd);
                }

#if CV_SIMD_WIDTH > 32
                minL[0] = v_reduce_min(_minL0);
                minL[1] = v_reduce_min(_minL1);
                minL[2] = v_reduce_min(_minL2);
                minL[3] = v_reduce_min(_minL3);
#else
                // Get minimum for L0-L3
                // (interleave + pairwise min reduces all four vectors at once,
                // leaving the four minima in the first four lanes of t0)
                v_int16 t0, t1, t2, t3;
                v_zip(_minL0, _minL2, t0, t2);
                v_zip(_minL1, _minL3, t1, t3);
                v_zip(v_min(t0, t2), v_min(t1, t3), t0, t1);
                t0 = v_min(t0, t1);
                t0 = v_min(t0, v_rotate_right<4>(t0));
#if CV_SIMD_WIDTH == 32
                CostType buf[v_int16::nlanes];
                v_store_low(buf, v_min(t0, v_rotate_right<8>(t0)));
                minL[0] = buf[0];
                minL[1] = buf[1];
                minL[2] = buf[2];
                minL[3] = buf[3];
#else
                v_store_low(minL, t0);
#endif
#endif
#else
                minL[0] = MAX_COST;
                minL[1] = MAX_COST;
                minL[2] = MAX_COST;
                minL[3] = MAX_COST;
#endif
                // Scalar tail (and full scalar path when SIMD is disabled).
                for( ; d < D; d++ )
                {
                    int Cpd = Cp[d], L;
                    int Spd = Sp[d];

                    L = Cpd + std::min((int)Lr_p0[d], std::min(Lr_p0[d - 1] + P1, std::min(Lr_p0[d + 1] + P1, delta0))) - delta0;
                    Lr_p[d] = (CostType)L;
                    minL[0] = std::min(minL[0], (CostType)L);
                    Spd += L;

                    L = Cpd + std::min((int)Lr_p1[d], std::min(Lr_p1[d - 1] + P1, std::min(Lr_p1[d + 1] + P1, delta1))) - delta1;
                    Lr_p[d + Dlra] = (CostType)L;
                    minL[1] = std::min(minL[1], (CostType)L);
                    Spd += L;

                    L = Cpd + std::min((int)Lr_p2[d], std::min(Lr_p2[d - 1] + P1, std::min(Lr_p2[d + 1] + P1, delta2))) - delta2;
                    Lr_p[d + Dlra*2] = (CostType)L;
                    minL[2] = std::min(minL[2], (CostType)L);
                    Spd += L;

                    L = Cpd + std::min((int)Lr_p3[d], std::min(Lr_p3[d - 1] + P1, std::min(Lr_p3[d + 1] + P1, delta3))) - delta3;
                    Lr_p[d + Dlra*3] = (CostType)L;
                    minL[3] = std::min(minL[3], (CostType)L);
                    Spd += L;

                    Sp[d] = saturate_cast<CostType>(Spd);
                }
            }

            if( pass == npasses )
            {
                // Final pass: pick the best disparity per pixel and run the
                // uniqueness + left/right consistency checks.
                x = 0;
#if CV_SIMD
                v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
                v_int16 v_max_cost = vx_setall_s16(MAX_COST);
                for( ; x <= width - v_int16::nlanes; x += v_int16::nlanes )
                {
                    v_store(disp1ptr + x, v_inv_dist);
                    v_store(mem.disp2ptr + x, v_inv_dist);
                    v_store(mem.disp2cost + x, v_max_cost);
                }
#endif
                for( ; x < width; x++ )
                {
                    disp1ptr[x] = mem.disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
                    mem.disp2cost[x] = MAX_COST;
                }

                // Right-to-left so the single-pass mode can fold in direction
                // r=(1,0) while scanning.
                for( x = width1 - 1; x >= 0; x-- )
                {
                    CostType* Sp = S + x*Da;
                    CostType minS = MAX_COST;
                    short bestDisp = -1;

                    if( npasses == 1 )
                    {
                        // Single-pass mode: aggregate the right-to-left
                        // direction on the fly and add it into S.
                        CostType* Lr_p0 = mem.getLr(lrID, x + 1);
                        Lr_p0[-1] = Lr_p0[D] = MAX_COST;
                        CostType* Lr_p = mem.getLr(lrID, x);

                        const CostType* Cp = C + x*Da;

                        d = 0;
                        int delta0 = P2 + *mem.getMinLr(lrID, x + 1);
                        int minL0 = MAX_COST;
#if CV_SIMD
                        v_int16 _P1 = vx_setall_s16((short)P1);
                        v_int16 _delta0 = vx_setall_s16((short)delta0);

                        v_int16 _minL0 = vx_setall_s16((short)MAX_COST);
                        v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
                        for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
                        {
                            v_int16 Cpd = vx_load_aligned(Cp + d);
                            v_int16 L0 = v_min(v_min(v_min(vx_load_aligned(Lr_p0 + d), vx_load(Lr_p0 + d - 1) + _P1), vx_load(Lr_p0 + d + 1) + _P1), _delta0) - _delta0 + Cpd;

                            v_store_aligned(Lr_p + d, L0);
                            _minL0 = v_min(_minL0, L0);
                            L0 += vx_load_aligned(Sp + d);
                            v_store_aligned(Sp + d, L0);

                            // Track the block base d of the current minimum;
                            // min_pos() resolves the exact lane afterwards.
                            _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
                            _minS = v_min(_minS, L0);
                        }
                        minL0 = (CostType)v_reduce_min(_minL0);
                        min_pos(_minS, _bestDisp, minS, bestDisp);
#endif
                        for( ; d < D; d++ )
                        {
                            int L0 = Cp[d] + std::min((int)Lr_p0[d], std::min(Lr_p0[d-1] + P1, std::min(Lr_p0[d+1] + P1, delta0))) - delta0;

                            Lr_p[d] = (CostType)L0;
                            minL0 = std::min(minL0, L0);

                            CostType Sval = Sp[d] = saturate_cast<CostType>(Sp[d] + L0);
                            if( Sval < minS )
                            {
                                minS = Sval;
                                bestDisp = (short)d;
                            }
                        }
                        *mem.getMinLr(lrID, x) = (CostType)minL0;
                    }
                    else
                    {
                        // Two-pass mode: S is fully aggregated, just find its minimum.
                        d = 0;
#if CV_SIMD
                        v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
                        for( ; d <= D - v_int16::nlanes; d+= v_int16::nlanes )
                        {
                            v_int16 L0 = vx_load_aligned(Sp + d);
                            _bestDisp = v_select(_minS > L0, vx_setall_s16((short)d), _bestDisp);
                            _minS = v_min( L0, _minS );
                        }
                        min_pos(_minS, _bestDisp, minS, bestDisp);
#endif
                        for( ; d < D; d++ )
                        {
                            int Sval = Sp[d];
                            if( Sval < minS )
                            {
                                minS = (CostType)Sval;
                                bestDisp = (short)d;
                            }
                        }
                    }

                    // Uniqueness check: reject the pixel if some non-adjacent
                    // disparity is within uniquenessRatio percent of the best.
                    for( d = 0; d < D; d++ )
                    {
                        if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
                            break;
                    }
                    if( d < D )
                        continue;
                    d = bestDisp;
                    // Update the reverse (right-image) disparity map used by
                    // the left/right consistency check below.
                    int _x2 = x + minX1 - d - minD;
                    if( mem.disp2cost[_x2] > minS )
                    {
                        mem.disp2cost[_x2] = (CostType)minS;
                        mem.disp2ptr[_x2] = (DispType)(d + minD);
                    }

                    if( 0 < d && d < D-1 )
                    {
                        // do subpixel quadratic interpolation:
                        //   fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                        //   then find minimum of the parabola.
                        int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
                        d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
                    }
                    else
                        d *= DISP_SCALE;
                    disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
                }

                // Left/right consistency check.
                for( x = minX1; x < maxX1; x++ )
                {
                    // we round the computed disparity both towards -inf and +inf and check
                    // if either of the corresponding disparities in disp2 is consistent.
                    // This is to give the computed disparity a chance to look valid if it is.
                    int d1 = disp1ptr[x];
                    if( d1 == INVALID_DISP_SCALED )
                        continue;
                    int _d = d1 >> DISP_SHIFT;
                    int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
                    int _x = x - _d, x_ = x - d_;
                    if( 0 <= _x && _x < width && mem.disp2ptr[_x] >= minD && std::abs(mem.disp2ptr[_x] - _d) > disp12MaxDiff &&
                        0 <= x_ && x_ < width && mem.disp2ptr[x_] >= minD && std::abs(mem.disp2ptr[x_] - d_) > disp12MaxDiff )
                        disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
                }
            }

            lrID = 1 - lrID; // now shift the cyclic buffers
        }
    }
}
964
965 ////////////////////////////////////////////////////////////////////////////////////////////
966 struct CalcVerticalSums: public ParallelLoopBody
967 {
CalcVerticalSumscv::CalcVerticalSums968 CalcVerticalSums(const Mat& _img1, const Mat& _img2, const StereoSGBMParams& params, const BufferSGBM &mem_)
969 : img1(_img1), img2(_img2), mem(mem_)
970 {
971 minD = params.minDisparity;
972 maxD = minD + params.numDisparities;
973 SW2 = SH2 = params.calcSADWindowSize().height/2;
974 P1 = params.P1 > 0 ? params.P1 : 2;
975 P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
976 height = img1.rows;
977 width = img1.cols;
978 int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
979 D = maxD - minD;
980 Da = (int)alignSize(D, v_int16::nlanes);
981 Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
982 width1 = maxX1 - minX1;
983 D = params.numDisparities;
984 Da = (int)alignSize(D, v_int16::nlanes);
985 }
986
operator ()cv::CalcVerticalSums987 void operator()(const Range& range) const CV_OVERRIDE
988 {
989 const CostType MAX_COST = SHRT_MAX;
990 const int npasses = 2;
991 const int x1 = range.start, x2 = range.end;
992 int k;
993
994 CostType* pixDiff = 0;
995 PixType* tempBuf = 0;
996 utils::BufferArea aux_area;
997 aux_area.allocate(pixDiff, ((x2 - x1) + 2 * SW2) * Da, CV_SIMD_WIDTH);
998 aux_area.allocate(tempBuf, width * (4 * img1.channels() + 2) * sizeof(PixType), CV_SIMD_WIDTH);
999 aux_area.commit();
1000
1001 // Simplification of index calculation
1002 if (x1 > SW2)
1003 pixDiff -= (x1 - SW2) * Da;
1004
1005 for( int pass = 1; pass <= npasses; pass++ )
1006 {
1007 int y1, y2, dy;
1008
1009 if( pass == 1 )
1010 {
1011 y1 = 0; y2 = height; dy = 1;
1012 }
1013 else
1014 {
1015 y1 = height-1; y2 = -1; dy = -1;
1016 }
1017
1018 uchar lrID = 0;
1019 mem.clearLr(range);
1020
1021 for( int y = y1; y != y2; y += dy )
1022 {
1023 int x, d;
1024 CostType* C = mem.getCBuf(y);
1025 CostType* S = mem.getSBuf(y);
1026
1027 if( pass == 1 ) // compute C on the first pass, and reuse it on the second pass, if any.
1028 {
1029 int dy1 = y == 0 ? 0 : y + SH2, dy2 = y == 0 ? SH2 : dy1;
1030
1031 for( k = dy1; k <= dy2; k++ )
1032 {
1033 CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
1034
1035 if( k < height )
1036 {
1037 calcPixelCostBT( img1, img2, k, minD, maxD, pixDiff, tempBuf, mem.getClipTab(), x1 - SW2, x2 + SW2);
1038
1039 memset(hsumAdd + x1*Da, 0, Da*sizeof(CostType));
1040 for( x = (x1 - SW2)*Da; x <= (x1 + SW2)*Da; x += Da )
1041 {
1042 int xbord = x <= 0 ? 0 : (x > (width1 - 1)*Da ? (width1 - 1)*Da : x);
1043 #if CV_SIMD
1044 for( d = 0; d < Da; d += v_int16::nlanes )
1045 v_store_aligned(hsumAdd + x1*Da + d, vx_load_aligned(hsumAdd + x1*Da + d) + vx_load_aligned(pixDiff + xbord + d));
1046 #else
1047 for( d = 0; d < D; d++ )
1048 hsumAdd[x1*Da + d] = (CostType)(hsumAdd[x1*Da + d] + pixDiff[xbord + d]);
1049 #endif
1050 }
1051
1052 if( y > 0 )
1053 {
1054 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
1055 const CostType* Cprev = mem.getCBuf(y - 1);
1056 #if CV_SIMD
1057 for( d = 0; d < Da; d += v_int16::nlanes )
1058 v_store_aligned(C + x1*Da + d, vx_load_aligned(Cprev + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) - vx_load_aligned(hsumSub + x1*Da + d));
1059 #else
1060 for( d = 0; d < D; d++ )
1061 C[x1*Da + d] = (CostType)(Cprev[x1*Da + d] + hsumAdd[x1*Da + d] - hsumSub[x1*Da + d]);
1062 #endif
1063 for( x = (x1+1)*Da; x < x2*Da; x += Da )
1064 {
1065 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1066 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1067
1068 #if CV_SIMD
1069 for( d = 0; d < Da; d += v_int16::nlanes )
1070 {
1071 v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) - vx_load_aligned(pixSub + d) + vx_load_aligned(pixAdd + d);
1072 v_store_aligned(hsumAdd + x + d, hv);
1073 v_store_aligned(C + x + d, vx_load_aligned(Cprev + x + d) - vx_load_aligned(hsumSub + x + d) + hv);
1074 }
1075 #else
1076 for( d = 0; d < D; d++ )
1077 {
1078 int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1079 C[x + d] = (CostType)(Cprev[x + d] + hv - hsumSub[x + d]);
1080 }
1081 #endif
1082 }
1083 }
1084 else
1085 {
1086 #if CV_SIMD
1087 v_int16 v_scale = vx_setall_s16(k == 0 ? (short)SH2 + 1 : 1);
1088 for (d = 0; d < Da; d += v_int16::nlanes)
1089 v_store_aligned(C + x1*Da + d, vx_load_aligned(C + x1*Da + d) + vx_load_aligned(hsumAdd + x1*Da + d) * v_scale);
1090 #else
1091 int scale = k == 0 ? SH2 + 1 : 1;
1092 for (d = 0; d < D; d++)
1093 C[x1*Da + d] = (CostType)(C[x1*Da + d] + hsumAdd[x1*Da + d] * scale);
1094 #endif
1095 for( x = (x1+1)*Da; x < x2*Da; x += Da )
1096 {
1097 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1098 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1099 #if CV_SIMD
1100 for (d = 0; d < Da; d += v_int16::nlanes)
1101 {
1102 v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
1103 v_store_aligned(hsumAdd + x + d, hv);
1104 v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
1105 }
1106 #else
1107 for( d = 0; d < D; d++ )
1108 {
1109 CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1110 hsumAdd[x + d] = hv;
1111 C[x + d] = (CostType)(C[x + d] + hv * scale);
1112 }
1113 #endif
1114 }
1115 }
1116 }
1117 else
1118 {
1119 /* if (y > 0)
1120 {
1121 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, 0));
1122 const CostType* Cprev = mem.getCBuf(y - 1);
1123
1124 #if CV_SIMD
1125 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
1126 v_store_aligned(C + x, vx_load_aligned(Cprev + x) - vx_load_aligned(hsumSub + x) + vx_load_aligned(hsumAdd + x));
1127 #else
1128 for( x = x1*Da; x < x2*Da; x++ )
1129 C[x] = (CostType)(Cprev[x] + hsumAdd[x] - hsumSub[x]);
1130 #endif
1131 }
1132 else*/
1133 if(y == 0)
1134 {
1135 #if CV_SIMD
1136 for( x = x1*Da; x < x2*Da; x += v_int16::nlanes )
1137 v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
1138 #else
1139 for( x = x1*Da; x < x2*Da; x++ )
1140 C[x] = (CostType)(C[x] + hsumAdd[x]);
1141 #endif
1142 }
1143 }
1144 }
1145 mem.clearSBuf(y, range);
1146 }
1147
1148 // [formula 13 in the paper]
1149 // compute L_r(p, d) = C(p, d) +
1150 // min(L_r(p-r, d),
1151 // L_r(p-r, d-1) + P1,
1152 // L_r(p-r, d+1) + P1,
1153 // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
1154 // where p = (x,y), r is one of the directions.
1155 // we process one directions on first pass and other on second:
1156 // r=(0, dy), where dy=1 on first pass and dy=-1 on second
1157
1158 for( x = x1; x != x2; x++ )
1159 {
1160 int delta = P2 + *mem.getMinLr(1 - lrID, x);
1161 CostType* Lr_ppr = mem.getLr(1 - lrID, x);
1162
1163 Lr_ppr[-1] = Lr_ppr[D] = MAX_COST;
1164
1165 CostType* Lr_p = mem.getLr(lrID, x);
1166 const CostType* Cp = C + x*Da;
1167 CostType* Sp = S + x*Da;
1168
1169 CostType& minL = *(mem.getMinLr(lrID, x));
1170 d = 0;
1171 #if CV_SIMD
1172 v_int16 _P1 = vx_setall_s16((short)P1);
1173
1174 v_int16 _delta = vx_setall_s16((short)delta);
1175 v_int16 _minL = vx_setall_s16((short)MAX_COST);
1176
1177 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
1178 {
1179 v_int16 Cpd = vx_load_aligned(Cp + d);
1180 v_int16 L = v_min(v_min(v_min(vx_load_aligned(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1181 v_store_aligned(Lr_p + d, L);
1182 _minL = v_min(_minL, L);
1183 v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
1184 }
1185 minL = v_reduce_min(_minL);
1186 #else
1187 minL = MAX_COST;
1188 #endif
1189 for( ; d < D; d++ )
1190 {
1191 int Cpd = Cp[d], L;
1192
1193 L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1194
1195 Lr_p[d] = (CostType)L;
1196 minL = std::min(minL, (CostType)L);
1197
1198 Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1199 }
1200 }
1201 lrID = 1 - lrID; // now shift the cyclic buffers
1202 }
1203 }
1204 }
1205 const Mat& img1;
1206 const Mat& img2;
1207 const BufferSGBM & mem;
1208 int minD;
1209 int maxD;
1210 int D, Da, Dlra;
1211 int SH2;
1212 int SW2;
1213 int width;
1214 int width1;
1215 int height;
1216 int P1;
1217 int P2;
1218 };
1219
1220 struct CalcHorizontalSums: public ParallelLoopBody
1221 {
CalcHorizontalSumscv::CalcHorizontalSums1222 CalcHorizontalSums(const Mat& _img1, const Mat& _img2, Mat& _disp1, const StereoSGBMParams& params, const BufferSGBM &mem_)
1223 : img1(_img1), img2(_img2), disp1(_disp1), mem(mem_)
1224 {
1225 minD = params.minDisparity;
1226 maxD = minD + params.numDisparities;
1227 P1 = params.P1 > 0 ? params.P1 : 2;
1228 P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1229 uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
1230 disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
1231 height = img1.rows;
1232 width = img1.cols;
1233 minX1 = std::max(maxD, 0);
1234 maxX1 = width + std::min(minD, 0);
1235 INVALID_DISP = minD - 1;
1236 INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1237 D = maxD - minD;
1238 Da = (int)alignSize(D, v_int16::nlanes);
1239 Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
1240 width1 = maxX1 - minX1;
1241 }
1242
operator ()cv::CalcHorizontalSums1243 void operator()(const Range& range) const CV_OVERRIDE
1244 {
1245 int y1 = range.start, y2 = range.end;
1246
1247 const size_t LrSize = 2 * (1 + Dlra + 1);
1248
1249 CostType * Lr = 0;
1250 CostType * disp2cost = 0;
1251 DispType * disp2ptr = 0;
1252 utils::BufferArea aux_area;
1253 aux_area.allocate(Lr, LrSize);
1254 aux_area.allocate(disp2cost, width, CV_SIMD_WIDTH);
1255 aux_area.allocate(disp2ptr, width, CV_SIMD_WIDTH);
1256 aux_area.commit();
1257
1258 CostType minLr;
1259
1260 for( int y = y1; y != y2; y++)
1261 {
1262 int x, d;
1263 DispType* disp1ptr = disp1.ptr<DispType>(y);
1264 CostType* C = mem.getCBuf(y);
1265 CostType* S = mem.getSBuf(y);
1266
1267 x = 0;
1268 #if CV_SIMD
1269 v_int16 v_inv_dist = vx_setall_s16((DispType)INVALID_DISP_SCALED);
1270 v_int16 v_max_cost = vx_setall_s16(MAX_COST);
1271 for (; x <= width - v_int16::nlanes; x += v_int16::nlanes)
1272 {
1273 v_store(disp1ptr + x, v_inv_dist);
1274 v_store(disp2ptr + x, v_inv_dist);
1275 v_store(disp2cost + x, v_max_cost);
1276 }
1277 #endif
1278 for( ; x < width; x++ )
1279 {
1280 disp1ptr[x] = disp2ptr[x] = (DispType)INVALID_DISP_SCALED;
1281 disp2cost[x] = MAX_COST;
1282 }
1283
1284 // clear buffers
1285 aux_area.zeroFill(Lr);
1286 Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
1287
1288 minLr = 0;
1289 // [formula 13 in the paper]
1290 // compute L_r(p, d) = C(p, d) +
1291 // min(L_r(p-r, d),
1292 // L_r(p-r, d-1) + P1,
1293 // L_r(p-r, d+1) + P1,
1294 // min_k L_r(p-r, k) + P2) - min_k L_r(p-r, k)
1295 // where p = (x,y), r is one of the directions.
1296 // we process all the directions at once:
1297 // we process one directions on first pass and other on second:
1298 // r=(dx, 0), where dx=1 on first pass and dx=-1 on second
1299 for( x = 0; x != width1; x++)
1300 {
1301 int delta = minLr + P2;
1302 CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
1303 CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
1304 const CostType* Cp = C + x*Da;
1305 CostType* Sp = S + x*Da;
1306
1307 d = 0;
1308 #if CV_SIMD
1309 v_int16 _P1 = vx_setall_s16((short)P1);
1310
1311 v_int16 _delta = vx_setall_s16((short)delta);
1312 v_int16 _minL = vx_setall_s16((short)MAX_COST);
1313
1314 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes)
1315 {
1316 v_int16 Cpd = vx_load_aligned(Cp + d);
1317 v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1318 v_store(Lr_p + d, L);
1319 _minL = v_min(_minL, L);
1320 v_store_aligned(Sp + d, vx_load_aligned(Sp + d) + L);
1321 }
1322 minLr = v_reduce_min(_minL);
1323 #else
1324 minLr = MAX_COST;
1325 #endif
1326 for( ; d < D; d++ )
1327 {
1328 int Cpd = Cp[d], L;
1329 L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1330 Lr_p[d] = (CostType)L;
1331 minLr = std::min(minLr, (CostType)L);
1332 Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1333 }
1334 }
1335
1336 aux_area.zeroFill(Lr);
1337 Lr[0] = Lr[1 + D] = Lr[3 + Dlra - 1] = Lr[3 + Dlra + D] = MAX_COST;
1338
1339 minLr = 0;
1340
1341 for( x = width1-1; x != -1; x--)
1342 {
1343 int delta = minLr + P2;
1344 CostType* Lr_ppr = Lr + ((x&1)? 1 : 3 + Dlra);
1345 CostType* Lr_p = Lr + ((x&1)? 3 + Dlra : 1);
1346 const CostType* Cp = C + x*Da;
1347 CostType* Sp = S + x*Da;
1348 CostType minS = MAX_COST;
1349 short bestDisp = -1;
1350 minLr = MAX_COST;
1351
1352 d = 0;
1353 #if CV_SIMD
1354 v_int16 _P1 = vx_setall_s16((short)P1);
1355 v_int16 _delta = vx_setall_s16((short)delta);
1356
1357 v_int16 _minL = vx_setall_s16((short)MAX_COST);
1358 v_int16 _minS = vx_setall_s16(MAX_COST), _bestDisp = vx_setall_s16(-1);
1359 for( ; d <= D - v_int16::nlanes; d += v_int16::nlanes )
1360 {
1361 v_int16 Cpd = vx_load_aligned(Cp + d);
1362 v_int16 L = v_min(v_min(v_min(vx_load(Lr_ppr + d), vx_load(Lr_ppr + d - 1) + _P1), vx_load(Lr_ppr + d + 1) + _P1), _delta) - _delta + Cpd;
1363 v_store(Lr_p + d, L);
1364 _minL = v_min(_minL, L);
1365 L += vx_load_aligned(Sp + d);
1366 v_store_aligned(Sp + d, L);
1367
1368 _bestDisp = v_select(_minS > L, vx_setall_s16((short)d), _bestDisp);
1369 _minS = v_min( L, _minS );
1370 }
1371 minLr = v_reduce_min(_minL);
1372
1373 min_pos(_minS, _bestDisp, minS, bestDisp);
1374 #endif
1375 for( ; d < D; d++ )
1376 {
1377 int Cpd = Cp[d], L;
1378
1379 L = Cpd + std::min((int)Lr_ppr[d], std::min(Lr_ppr[d-1] + P1, std::min(Lr_ppr[d+1] + P1, delta))) - delta;
1380
1381 Lr_p[d] = (CostType)L;
1382 minLr = std::min(minLr, (CostType)L);
1383
1384 Sp[d] = saturate_cast<CostType>(Sp[d] + L);
1385 if( Sp[d] < minS )
1386 {
1387 minS = Sp[d];
1388 bestDisp = (short)d;
1389 }
1390 }
1391
1392 //Some postprocessing procedures and saving
1393 for( d = 0; d < D; d++ )
1394 {
1395 if( Sp[d]*(100 - uniquenessRatio) < minS*100 && std::abs(bestDisp - d) > 1 )
1396 break;
1397 }
1398 if( d < D )
1399 continue;
1400 d = bestDisp;
1401 int _x2 = x + minX1 - d - minD;
1402 if( disp2cost[_x2] > minS )
1403 {
1404 disp2cost[_x2] = (CostType)minS;
1405 disp2ptr[_x2] = (DispType)(d + minD);
1406 }
1407
1408 if( 0 < d && d < D-1 )
1409 {
1410 // do subpixel quadratic interpolation:
1411 // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
1412 // then find minimum of the parabola.
1413 int denom2 = std::max(Sp[d-1] + Sp[d+1] - 2*Sp[d], 1);
1414 d = d*DISP_SCALE + ((Sp[d-1] - Sp[d+1])*DISP_SCALE + denom2)/(denom2*2);
1415 }
1416 else
1417 d *= DISP_SCALE;
1418 disp1ptr[x + minX1] = (DispType)(d + minD*DISP_SCALE);
1419 }
1420 //Left-right check sanity procedure
1421 for( x = minX1; x < maxX1; x++ )
1422 {
1423 // we round the computed disparity both towards -inf and +inf and check
1424 // if either of the corresponding disparities in disp2 is consistent.
1425 // This is to give the computed disparity a chance to look valid if it is.
1426 int d1 = disp1ptr[x];
1427 if( d1 == INVALID_DISP_SCALED )
1428 continue;
1429 int _d = d1 >> DISP_SHIFT;
1430 int d_ = (d1 + DISP_SCALE-1) >> DISP_SHIFT;
1431 int _x = x - _d, x_ = x - d_;
1432 if( 0 <= _x && _x < width && disp2ptr[_x] >= minD && std::abs(disp2ptr[_x] - _d) > disp12MaxDiff &&
1433 0 <= x_ && x_ < width && disp2ptr[x_] >= minD && std::abs(disp2ptr[x_] - d_) > disp12MaxDiff )
1434 disp1ptr[x] = (DispType)INVALID_DISP_SCALED;
1435 }
1436 }
1437 }
1438
1439 static const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
1440 static const int DISP_SCALE = (1 << DISP_SHIFT);
1441 static const CostType MAX_COST = SHRT_MAX;
1442 const Mat& img1;
1443 const Mat& img2;
1444 Mat& disp1;
1445 const BufferSGBM & mem;
1446 int minD;
1447 int maxD;
1448 int D, Da, Dlra;
1449 int width;
1450 int width1;
1451 int height;
1452 int P1;
1453 int P2;
1454 int minX1;
1455 int maxX1;
1456 int INVALID_DISP;
1457 int INVALID_DISP_SCALED;
1458 int uniquenessRatio;
1459 int disp12MaxDiff;
1460 };
/*
 computes disparity for "roi" in img1 w.r.t. img2 and writes it to disp1buf.
 that is, disp1buf(x, y)=d means that img1(x+roi.x, y+roi.y) ~ img2(x+roi.x-d, y+roi.y).
 minD <= d < maxD.

 note that disp1buf will have the same size as the roi.
 On exit disp1buf is not the final disparity; it is an intermediate result that becomes
 final after all the tiles are processed.

 the disparity in disp1buf is written with sub-pixel accuracy
 (4 fractional bits, see StereoSGBM::DISP_SCALE),
 using quadratic interpolation, while the disparity in disp2buf
 is written as is, without interpolation.
*/
computeDisparitySGBM_HH4(const Mat & img1,const Mat & img2,Mat & disp1,const StereoSGBMParams & params)1475 static void computeDisparitySGBM_HH4( const Mat& img1, const Mat& img2,
1476 Mat& disp1, const StereoSGBMParams& params)
1477 {
1478 const int DISP_SHIFT = StereoMatcher::DISP_SHIFT;
1479 const int DISP_SCALE = (1 << DISP_SHIFT);
1480 int minD = params.minDisparity, maxD = minD + params.numDisparities;
1481 Size SADWindowSize;
1482 SADWindowSize.width = SADWindowSize.height = params.SADWindowSize > 0 ? params.SADWindowSize : 5;
1483 int P1 = params.P1 > 0 ? params.P1 : 2, P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1484 int width = disp1.cols, height = disp1.rows;
1485 int minX1 = std::max(maxD, 0), maxX1 = width + std::min(minD, 0);
1486 int width1 = maxX1 - minX1;
1487 int Da = (int)alignSize(params.numDisparities, v_int16::nlanes);
1488 int Dlra = Da + v_int16::nlanes;//Additional memory is necessary to store disparity values(MAX_COST) for d=-1 and d=D
1489 int INVALID_DISP = minD - 1;
1490 int INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
1491
1492 if( minX1 >= maxX1 )
1493 {
1494 disp1 = Scalar::all(INVALID_DISP_SCALED);
1495 return;
1496 }
1497
1498 BufferSGBM mem(width1, Da, Dlra, img1.channels(), width, height, params);
1499 mem.initCBuf((CostType)P2); // add P2 to every C(x,y). it saves a few operations in the inner loops
1500
1501 parallel_for_(Range(0,width1),CalcVerticalSums(img1, img2, params, mem),8);
1502 parallel_for_(Range(0,height),CalcHorizontalSums(img1, img2, disp1, params, mem),8);
1503 }
1504
1505 //////////////////////////////////////////////////////////////////////////////////////////////////////
1506
1507 class BufferSGBM3Way
1508 {
1509 private:
1510 size_t hsumCols;
1511 size_t hsumRows;
1512 public:
1513 CostType *curCostVolumeLine;
1514 CostType *hsumBuf;
1515 CostType *pixDiff;
1516 PixType *tmpBuf;
1517 CostType *horPassCostVolume;
1518 CostType *vertPassCostVolume;
1519 CostType *vertPassMin;
1520 CostType *rightPassBuf;
1521 CostType *disp2CostBuf;
1522 short *disp2Buf;
1523 private:
1524 utils::BufferArea area;
1525 public:
BufferSGBM3Way(int width1,int width,int num_ch,int Da,int SH2,int P2)1526 BufferSGBM3Way(int width1, int width, int num_ch, int Da, int SH2, int P2) :
1527 curCostVolumeLine(0),
1528 hsumBuf(0),
1529 pixDiff(0),
1530 tmpBuf(0),
1531 horPassCostVolume(0),
1532 vertPassCostVolume(0),
1533 vertPassMin(0),
1534 rightPassBuf(0),
1535 disp2CostBuf(0),
1536 disp2Buf(0)
1537 {
1538 hsumCols = width1 * Da;
1539 hsumRows = SH2*2 + 2;
1540 area.allocate(curCostVolumeLine, hsumCols, CV_SIMD_WIDTH);
1541 area.allocate(hsumBuf, hsumCols * hsumRows, CV_SIMD_WIDTH);
1542 area.allocate(pixDiff,hsumCols, CV_SIMD_WIDTH);
1543 area.allocate(tmpBuf, width * (4 * num_ch + 2), CV_SIMD_WIDTH);
1544 area.allocate(horPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
1545 area.allocate(vertPassCostVolume, (width1 + 2) * Da, CV_SIMD_WIDTH);
1546 area.allocate(vertPassMin, width1 + 2, CV_SIMD_WIDTH);
1547 area.allocate(rightPassBuf, Da, CV_SIMD_WIDTH);
1548 area.allocate(disp2CostBuf, width, CV_SIMD_WIDTH);
1549 area.allocate(disp2Buf, width, CV_SIMD_WIDTH);
1550 area.commit();
1551 area.zeroFill();
1552 for(size_t i = 0; i < hsumCols; i++)
1553 curCostVolumeLine[i] = (CostType)P2;
1554 }
clearRightPassBuf()1555 inline void clearRightPassBuf()
1556 {
1557 area.zeroFill(rightPassBuf);
1558 }
getHSumBuf(int x) const1559 CostType *getHSumBuf(int x) const
1560 {
1561 return hsumBuf + (x % hsumRows) * hsumCols;
1562 }
1563 };
1564
1565 struct SGBM3WayMainLoop : public ParallelLoopBody
1566 {
1567 const Mat *img1, *img2;
1568 Mat* dst_disp;
1569
1570 int stripe_sz;
1571 int stripe_overlap;
1572
1573 int width,height;
1574 int minD, maxD, D, Da;
1575 int minX1, maxX1, width1;
1576
1577 int SW2, SH2;
1578 int P1, P2;
1579 int uniquenessRatio, disp12MaxDiff;
1580
1581 int TAB_OFS;
1582
1583 utils::BufferArea aux_area;
1584 PixType* clipTab;
1585 #if CV_SIMD
1586 short idx_row[v_int16::nlanes];
1587 #endif
1588 SGBM3WayMainLoop(const Mat& _img1, const Mat& _img2, Mat* _dst_disp, const StereoSGBMParams& params, int stripe_size, int _stripe_overlap);
1589 void operator () (const Range& range) const CV_OVERRIDE;
1590 template<bool x_nlanes> void impl(const Range& range) const;
1591
1592 private:
1593 void getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const;
1594
1595 template<bool x_nlanes>
1596 void accumulateCostsLeftTop(const BufferSGBM3Way &mem,
1597 int x,
1598 CostType &leftMinCost) const;
1599
1600 template<bool x_nlanes>
1601 void accumulateCostsRight(const BufferSGBM3Way &mem,
1602 int x,
1603 CostType &rightMinCost,
1604 short &optimal_disp,
1605 CostType &min_cost) const;
1606 };
1607
SGBM3WayMainLoop(const Mat & _img1,const Mat & _img2,Mat * _dst_disp,const StereoSGBMParams & params,int _stripe_sz,int _stripe_overlap)1608 SGBM3WayMainLoop::SGBM3WayMainLoop(const Mat& _img1,
1609 const Mat& _img2,
1610 Mat* _dst_disp,
1611 const StereoSGBMParams& params,
1612 int _stripe_sz,
1613 int _stripe_overlap)
1614 : img1(&_img1),
1615 img2(&_img2),
1616 dst_disp(_dst_disp),
1617 stripe_sz(_stripe_sz),
1618 stripe_overlap(_stripe_overlap),
1619 clipTab(0)
1620 {
1621 // precompute a lookup table for the raw matching cost computation:
1622 TAB_OFS = 256*4;
1623 const int TAB_SIZE = 256 + TAB_OFS*2;
1624 aux_area.allocate(clipTab, TAB_SIZE, CV_SIMD_WIDTH);
1625 aux_area.commit();
1626 const int ftzero = std::max(params.preFilterCap, 15) | 1;
1627 for(int k = 0; k < TAB_SIZE; k++ )
1628 clipTab[k] = (PixType)(std::min(std::max(k - TAB_OFS, -ftzero), ftzero) + ftzero);
1629
1630 width = img1->cols; height = img1->rows;
1631 minD = params.minDisparity; maxD = minD + params.numDisparities; D = maxD - minD;
1632 minX1 = std::max(maxD, 0); maxX1 = width + std::min(minD, 0); width1 = maxX1 - minX1;
1633 Da = (int)alignSize(D, v_int16::nlanes);
1634
1635 SW2 = SH2 = params.SADWindowSize > 0 ? params.SADWindowSize/2 : 1;
1636
1637 P1 = params.P1 > 0 ? params.P1 : 2; P2 = std::max(params.P2 > 0 ? params.P2 : 5, P1+1);
1638 uniquenessRatio = params.uniquenessRatio >= 0 ? params.uniquenessRatio : 10;
1639 disp12MaxDiff = params.disp12MaxDiff > 0 ? params.disp12MaxDiff : 1;
1640
1641 #if CV_SIMD
1642 for(short i = 0; i < v_int16::nlanes; ++i)
1643 idx_row[i] = i;
1644 #endif
1645 }
1646
1647 // performing block matching and building raw cost-volume for the current row
getRawMatchingCost(const BufferSGBM3Way & mem,int y,int src_start_idx) const1648 void SGBM3WayMainLoop::getRawMatchingCost(const BufferSGBM3Way &mem, int y, int src_start_idx) const
1649 {
1650 CostType* C = mem.curCostVolumeLine;
1651 CostType* pixDiff = mem.pixDiff;
1652 PixType* tmpBuf = mem.tmpBuf;
1653 int x, d;
1654 int dy1 = (y == src_start_idx) ? src_start_idx : y + SH2, dy2 = (y == src_start_idx) ? src_start_idx+SH2 : dy1;
1655
1656 for(int k = dy1; k <= dy2; k++ )
1657 {
1658 CostType* hsumAdd = mem.getHSumBuf(std::min(k, height-1));
1659 if( k < height )
1660 {
1661 calcPixelCostBT( *img1, *img2, k, minD, maxD, pixDiff, tmpBuf, clipTab + TAB_OFS );
1662
1663 #if CV_SIMD
1664 v_int16 sw2_1 = vx_setall_s16((short)SW2 + 1);
1665 for (d = 0; d < Da; d += v_int16::nlanes)
1666 {
1667 v_int16 hsA = vx_load_aligned(pixDiff + d) * sw2_1;
1668 for (x = Da; x <= SW2 * Da; x += Da)
1669 hsA += vx_load_aligned(pixDiff + x + d);
1670 v_store_aligned(hsumAdd + d, hsA);
1671 }
1672 #else
1673 for (d = 0; d < D; d++)
1674 {
1675 CostType hsA = (CostType)(pixDiff[d] * (SW2 + 1));
1676 for (x = Da; x <= SW2 * Da; x += Da)
1677 hsA += pixDiff[x + d];
1678 hsumAdd[d] = hsA;
1679 }
1680 #endif
1681 if( y > src_start_idx )
1682 {
1683 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
1684
1685 #if CV_SIMD
1686 for (d = 0; d < Da; d += v_int16::nlanes)
1687 v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) - vx_load_aligned(hsumSub + d));
1688 #else
1689 for (d = 0; d < D; d++)
1690 C[d] = (CostType)(C[d] + hsumAdd[d] - hsumSub[d]);
1691 #endif
1692
1693 for( x = Da; x < width1*Da; x += Da )
1694 {
1695 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1696 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1697 #if CV_SIMD
1698 v_int16 hv_reg;
1699 for( d = 0; d < Da; d+=v_int16::nlanes )
1700 {
1701 hv_reg = vx_load_aligned(hsumAdd+x-Da+d) + vx_load_aligned(pixAdd+d) - vx_load_aligned(pixSub+d);
1702 v_store_aligned(hsumAdd+x+d,hv_reg);
1703 v_store_aligned(C+x+d,vx_load_aligned(C+x+d)+hv_reg-vx_load_aligned(hsumSub+x+d));
1704 }
1705 #else
1706 for( d = 0; d < D; d++ )
1707 {
1708 int hv = hsumAdd[x + d] = (CostType)(hsumAdd[x - D + d] + pixAdd[d] - pixSub[d]);
1709 C[x + d] = (CostType)(C[x + d] + hv - hsumSub[x + d]);
1710 }
1711 #endif
1712 }
1713 }
1714 else
1715 {
1716 #if CV_SIMD
1717 v_int16 v_scale = vx_setall_s16(k == src_start_idx ? (short)SH2 + 1 : 1);
1718 for (d = 0; d < Da; d += v_int16::nlanes)
1719 v_store_aligned(C + d, vx_load_aligned(C + d) + vx_load_aligned(hsumAdd + d) * v_scale);
1720 #else
1721 int scale = k == src_start_idx ? SH2 + 1 : 1;
1722 for (d = 0; d < D; d++)
1723 C[d] = (CostType)(C[d] + hsumAdd[d] * scale);
1724 #endif
1725 for( x = Da; x < width1*Da; x += Da )
1726 {
1727 const CostType* pixAdd = pixDiff + std::min(x + SW2*Da, (width1-1)*Da);
1728 const CostType* pixSub = pixDiff + std::max(x - (SW2+1)*Da, 0);
1729 #if CV_SIMD
1730 for (d = 0; d < Da; d += v_int16::nlanes)
1731 {
1732 v_int16 hv = vx_load_aligned(hsumAdd + x - Da + d) + vx_load_aligned(pixAdd + d) - vx_load_aligned(pixSub + d);
1733 v_store_aligned(hsumAdd + x + d, hv);
1734 v_store_aligned(C + x + d, vx_load_aligned(C + x + d) + hv * v_scale);
1735 }
1736 #else
1737 for (d = 0; d < D; d++)
1738 {
1739 CostType hv = (CostType)(hsumAdd[x - Da + d] + pixAdd[d] - pixSub[d]);
1740 hsumAdd[x + d] = hv;
1741 C[x + d] = (CostType)(C[x + d] + hv * scale);
1742 }
1743 #endif
1744 }
1745 }
1746 }
1747 else
1748 {
1749 if( y > src_start_idx )
1750 {
1751 const CostType* hsumSub = mem.getHSumBuf(std::max(y - SH2 - 1, src_start_idx));
1752 #if CV_SIMD
1753 for( x = 0; x < width1*Da; x += v_int16::nlanes)
1754 v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x) - vx_load_aligned(hsumSub + x));
1755 #else
1756 for( x = 0; x < width1*Da; x++ )
1757 C[x] = (CostType)(C[x] + hsumAdd[x] - hsumSub[x]);
1758 #endif
1759 }
1760 else
1761 {
1762 #if CV_SIMD
1763 for( x = 0; x < width1*Da; x += v_int16::nlanes)
1764 v_store_aligned(C + x, vx_load_aligned(C + x) + vx_load_aligned(hsumAdd + x));
1765 #else
1766 for( x = 0; x < width1*Da; x++ )
1767 C[x] = (CostType)(C[x] + hsumAdd[x]);
1768 #endif
1769 }
1770 }
1771 }
1772 }
1773
1774 // performing SGM cost accumulation from left to right (result is stored in leftBuf) and
1775 // in-place cost accumulation from top to bottom (result is stored in topBuf)
1776 template<bool x_nlanes>
accumulateCostsLeftTop(const BufferSGBM3Way & mem,int x,CostType & leftMinCost) const1777 void SGBM3WayMainLoop::accumulateCostsLeftTop(const BufferSGBM3Way &mem, int x, CostType& leftMinCost) const
1778 {
1779 CostType *leftBuf = mem.horPassCostVolume + x;
1780 CostType *leftBuf_prev = mem.horPassCostVolume + x - Da;
1781 CostType *topBuf = mem.vertPassCostVolume + x;
1782 CostType *costs = mem.curCostVolumeLine - Da + x;
1783 CostType& topMinCost = mem.vertPassMin[x/Da];
1784 int i = 0;
1785 #if CV_SIMD
1786 v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));
1787
1788 v_int16 leftMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(leftMinCost+P2));
1789 v_int16 leftMinCost_new_reg = vx_setall_s16(SHRT_MAX);
1790 v_int16 src0_leftBuf = vx_setall_s16(SHRT_MAX);
1791 v_int16 src1_leftBuf = vx_load_aligned(leftBuf_prev);
1792
1793 v_int16 topMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(topMinCost+P2));
1794 v_int16 topMinCost_new_reg = vx_setall_s16(SHRT_MAX);
1795 v_int16 src0_topBuf = vx_setall_s16(SHRT_MAX);
1796 v_int16 src1_topBuf = vx_load_aligned(topBuf);
1797
1798 v_int16 src2;
1799 v_int16 src_shifted_left,src_shifted_right;
1800 v_int16 res;
1801
1802 for(;i<Da-v_int16::nlanes;i+= v_int16::nlanes)
1803 {
1804 //process leftBuf:
1805 //lookahead load:
1806 src2 = vx_load_aligned(leftBuf_prev+i+v_int16::nlanes);
1807
1808 //get shifted versions of the current block and add P1:
1809 src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
1810 src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
1811
1812 // process and save current block:
1813 res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1814 leftMinCost_new_reg = v_min(leftMinCost_new_reg,res);
1815 v_store_aligned(leftBuf+i, res);
1816
1817 //update src buffers:
1818 src0_leftBuf = src1_leftBuf;
1819 src1_leftBuf = src2;
1820
1821 //process topBuf:
1822 //lookahead load:
1823 src2 = vx_load_aligned(topBuf+i+v_int16::nlanes);
1824
1825 //get shifted versions of the current block and add P1:
1826 src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
1827 src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
1828
1829 // process and save current block:
1830 res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1831 topMinCost_new_reg = v_min(topMinCost_new_reg,res);
1832 v_store_aligned(topBuf+i, res);
1833
1834 //update src buffers:
1835 src0_topBuf = src1_topBuf;
1836 src1_topBuf = src2;
1837 }
1838
1839 // a bit different processing for the last cycle of the loop:
1840 if(x_nlanes)
1841 {
1842 src2 = vx_setall_s16(SHRT_MAX);
1843 //process leftBuf:
1844 src_shifted_left = v_rotate_left<1> (src1_leftBuf,src0_leftBuf);
1845 src_shifted_right = v_rotate_right<1> (src1_leftBuf,src2 );
1846
1847 res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_leftBuf,leftMinCostP2_reg))-leftMinCostP2_reg);
1848 leftMinCost = v_reduce_min(v_min(leftMinCost_new_reg,res));
1849 v_store_aligned(leftBuf+Da-v_int16::nlanes, res);
1850
1851 //process topBuf:
1852 src_shifted_left = v_rotate_left<1> (src1_topBuf,src0_topBuf);
1853 src_shifted_right = v_rotate_right<1> (src1_topBuf,src2 );
1854
1855 res = vx_load_aligned(costs+Da-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_topBuf,topMinCostP2_reg))-topMinCostP2_reg);
1856 topMinCost = v_reduce_min(v_min(topMinCost_new_reg,res));
1857 v_store_aligned(topBuf+Da-v_int16::nlanes, res);
1858 }
1859 else
1860 {
1861 CostType leftMinCost_new = v_reduce_min(leftMinCost_new_reg);
1862 CostType topMinCost_new = v_reduce_min(topMinCost_new_reg);
1863 CostType leftBuf_prev_i_minus_1 = i > 0 ? leftBuf_prev[i-1] : SHRT_MAX;
1864 CostType topBuf_i_minus_1 = i > 0 ? topBuf[i-1] : SHRT_MAX;
1865 #else
1866 {
1867 CostType leftMinCost_new = SHRT_MAX;
1868 CostType topMinCost_new = SHRT_MAX;
1869 CostType leftBuf_prev_i_minus_1 = SHRT_MAX;
1870 CostType topBuf_i_minus_1 = SHRT_MAX;
1871 #endif
1872 int leftMinCost_P2 = leftMinCost + P2;
1873 int topMinCost_P2 = topMinCost + P2;
1874 CostType tmp;
1875 for(;i<D-1;i++)
1876 {
1877 leftBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(leftBuf_prev_i_minus_1+P1,leftBuf_prev[i+1]+P1),std::min((int)leftBuf_prev[i],leftMinCost_P2))-leftMinCost_P2);
1878 leftBuf_prev_i_minus_1 = leftBuf_prev[i];
1879 leftMinCost_new = std::min(leftMinCost_new,leftBuf[i]);
1880
1881 tmp = topBuf[i];
1882 topBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(topBuf_i_minus_1+P1,topBuf[i+1]+P1),std::min((int)topBuf[i],topMinCost_P2))-topMinCost_P2);
1883 topBuf_i_minus_1 = tmp;
1884 topMinCost_new = std::min(topMinCost_new,topBuf[i]);
1885 }
1886
1887 leftBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(leftBuf_prev_i_minus_1+P1,std::min((int)leftBuf_prev[D-1],leftMinCost_P2))-leftMinCost_P2);
1888 leftMinCost = std::min(leftMinCost_new,leftBuf[D-1]);
1889
1890 topBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(topBuf_i_minus_1+P1,std::min((int)topBuf[D-1],topMinCost_P2))-topMinCost_P2);
1891 topMinCost = std::min(topMinCost_new,topBuf[D-1]);
1892 }
1893 }
1894
1895 // performing in-place SGM cost accumulation from right to left (the result is stored in rightBuf) and
1896 // summing rightBuf, topBuf, leftBuf together (the result is stored in leftBuf), as well as finding the
1897 // optimal disparity value with minimum accumulated cost
// Parameters:
//   mem          - shared per-stripe buffers (current cost-volume line and the
//                  three directional pass buffers)
//   x            - current column offset into the Da-aligned cost buffers
//   rightMinCost - in: minimum right-pass cost of the previously processed
//                  column; out: minimum right-pass cost of this column
//   optimal_disp - out: disparity index with the smallest summed cost
//   min_cost     - out: that smallest summed cost
template<bool x_nlanes>
void SGBM3WayMainLoop::accumulateCostsRight(const BufferSGBM3Way &mem, int x,
                                            CostType& rightMinCost, short& optimal_disp, CostType& min_cost) const
{
    CostType* costs = mem.curCostVolumeLine - Da + x;
    CostType* rightBuf = mem.rightPassBuf;
    CostType* topBuf = mem.vertPassCostVolume + x;
    CostType* leftBuf = mem.horPassCostVolume + x;

    int i = 0;
#if CV_SIMD
    v_int16 P1_reg = vx_setall_s16(cv::saturate_cast<CostType>(P1));

    // SGM recurrence, computed blockwise:
    //   L(d) = C(d) + min(L'(d-1)+P1, L'(d+1)+P1, L'(d), minL'+P2) - (minL'+P2)
    // src0/src1 hold the previous/current vector blocks so the d-1 / d+1
    // neighbours can be formed with single-lane register rotates.
    v_int16 rightMinCostP2_reg = vx_setall_s16(cv::saturate_cast<CostType>(rightMinCost+P2));
    v_int16 rightMinCost_new_reg = vx_setall_s16(SHRT_MAX);
    v_int16 src0_rightBuf = vx_setall_s16(SHRT_MAX);
    v_int16 src1_rightBuf = vx_load(rightBuf);

    v_int16 src2;
    v_int16 src_shifted_left,src_shifted_right;
    v_int16 res;

    // running per-lane minimum of the summed cost, and the block base index
    // (per lane) at which that minimum was observed:
    v_int16 min_sum_cost_reg = vx_setall_s16(SHRT_MAX);
    v_int16 min_sum_pos_reg = vx_setall_s16(0);

    for(;i<Da-v_int16::nlanes;i+=v_int16::nlanes)
    {
        //lookahead load:
        src2 = vx_load_aligned(rightBuf+i+v_int16::nlanes);

        //get shifted versions of the current block and add P1:
        src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
        src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );

        // process and save current block:
        res = vx_load_aligned(costs+i) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost_new_reg = v_min(rightMinCost_new_reg,res);
        v_store_aligned(rightBuf+i, res);

        // compute and save total cost (sum of the right, left and top passes);
        // the sum replaces the left-pass value in leftBuf in place:
        res = res + vx_load_aligned(leftBuf+i) + vx_load_aligned(topBuf+i);
        v_store_aligned(leftBuf+i, res);

        // track disparity value with the minimum cost:
        // lanes where a new minimum appeared get their recorded position
        // overwritten with the current base index i (mask & add = per-lane select)
        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)i) - min_sum_pos_reg));

        //update src:
        src0_rightBuf = src1_rightBuf;
        src1_rightBuf = src2;
    }

    // a bit different processing for the last cycle of the loop
    // (x_nlanes is true when D == Da, i.e. the disparity range is an exact
    // multiple of the vector width, so the tail block is fully SIMD):
    if(x_nlanes)
    {
        src2 = vx_setall_s16(SHRT_MAX);
        src_shifted_left = v_rotate_left<1> (src1_rightBuf,src0_rightBuf);
        src_shifted_right = v_rotate_right<1> (src1_rightBuf,src2 );

        res = vx_load_aligned(costs+D-v_int16::nlanes) + (v_min(v_min(src_shifted_left,src_shifted_right) + P1_reg,v_min(src1_rightBuf,rightMinCostP2_reg))-rightMinCostP2_reg);
        rightMinCost = v_reduce_min(v_min(rightMinCost_new_reg,res));
        v_store_aligned(rightBuf+D-v_int16::nlanes, res);

        res = res + vx_load_aligned(leftBuf+D-v_int16::nlanes) + vx_load_aligned(topBuf+D-v_int16::nlanes);
        v_store_aligned(leftBuf+D-v_int16::nlanes, res);

        min_sum_cost_reg = v_min(min_sum_cost_reg,res);
        min_sum_pos_reg = min_sum_pos_reg + ((min_sum_cost_reg == res) & (vx_setall_s16((short)D-v_int16::nlanes) - min_sum_pos_reg));
        min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
    }
    else
    {
        // D < Da: the remaining disparities are finished by the scalar loop below.
        CostType rightMinCost_new = v_reduce_min(rightMinCost_new_reg);
        // NOTE(review): this reads rightBuf[i] (the not-yet-updated value at
        // index i), while the left/top pass reads index i-1 of its source
        // buffer at the same point. Since rightBuf is updated in place, the
        // pre-update value at i-1 is no longer available here — presumably
        // intentional; verify against accumulateCostsLeftTop.
        CostType rightBuf_i_minus_1 = i > 0 ? rightBuf[i] : SHRT_MAX;
        min_pos(min_sum_cost_reg,min_sum_pos_reg, min_cost, optimal_disp);
#else
    {
        CostType rightMinCost_new = SHRT_MAX;
        CostType rightBuf_i_minus_1 = SHRT_MAX;
        min_cost = SHRT_MAX;
#endif
        // scalar tail: same recurrence as the SIMD loop above
        int rightMinCost_P2 = rightMinCost + P2;
        CostType tmp;
        for(;i<D-1;i++)
        {
            tmp = rightBuf[i];
            rightBuf[i] = cv::saturate_cast<CostType>(costs[i] + std::min(std::min(rightBuf_i_minus_1+P1,rightBuf[i+1]+P1),std::min((int)rightBuf[i],rightMinCost_P2))-rightMinCost_P2);
            rightBuf_i_minus_1 = tmp;
            rightMinCost_new = std::min(rightMinCost_new,rightBuf[i]);
            leftBuf[i] = cv::saturate_cast<CostType>((int)leftBuf[i]+rightBuf[i]+topBuf[i]);
            if(leftBuf[i]<min_cost)
            {
                optimal_disp = (short)i;
                min_cost = leftBuf[i];
            }
        }

        // last disparity has no d+1 neighbour:
        rightBuf[D-1] = cv::saturate_cast<CostType>(costs[D-1] + std::min(rightBuf_i_minus_1+P1,std::min((int)rightBuf[D-1],rightMinCost_P2))-rightMinCost_P2);
        rightMinCost = std::min(rightMinCost_new,rightBuf[D-1]);
        leftBuf[D-1] = cv::saturate_cast<CostType>((int)leftBuf[D-1]+rightBuf[D-1]+topBuf[D-1]);
        if(leftBuf[D-1]<min_cost)
        {
            optimal_disp = (short)D-1;
            min_cost = leftBuf[D-1];
        }
    }
}
2005
2006 void SGBM3WayMainLoop::operator () (const Range& range) const
2007 {
2008 if (D == Da) impl<true>(range);
2009 else impl<false>(range);
2010 }
2011
// Processes one horizontal stripe of the image pair: computes raw matching
// costs line by line, runs the three SGM passes (left, top, right), picks the
// per-pixel best disparity with a uniqueness check and subpixel refinement,
// and finally applies a pseudo left-right consistency check.
template<bool x_nlanes>
void SGBM3WayMainLoop::impl(const Range& range) const
{
    // force separate processing of stripes:
    if(range.end>range.start+1)
    {
        for(int n=range.start;n<range.end;n++)
            (*this)(Range(n,n+1));
        return;
    }

    const int DISP_SCALE = (1 << StereoMatcher::DISP_SHIFT);
    int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;

    // setting up the ranges; the stripe is extended upwards by stripe_overlap
    // rows so that the vertical SGM pass is warmed up before reaching the rows
    // that actually end up in the output:
    int src_start_idx = std::max(std::min(range.start * stripe_sz - stripe_overlap, height),0);
    int src_end_idx = std::min(range.end * stripe_sz, height);

    int dst_offset;
    if(range.start==0)
        dst_offset=stripe_overlap;
    else
        dst_offset=0;

    // pre-fill this stripe's output with the "invalid" marker:
    Mat cur_disp = dst_disp[range.start];
    cur_disp = Scalar(INVALID_DISP_SCALED);

    BufferSGBM3Way mem(width1, width, img1->channels(), Da, SH2, P2);
    CostType *horPassCostVolume = mem.horPassCostVolume;
    // start real processing:
    for(int y=src_start_idx;y<src_end_idx;y++)
    {
        getRawMatchingCost(mem, y, src_start_idx);

        short* disp_row = (short*)cur_disp.ptr(dst_offset+(y-src_start_idx));

        // initialize the auxiliary buffers for the pseudo left-right consistency check:
        for(int x=0;x<width;x++)
        {
            mem.disp2Buf[x] = (short)INVALID_DISP_SCALED;
            mem.disp2CostBuf[x] = SHRT_MAX;
        }
        CostType prev_min, min_cost;
        int d;
        short best_d;
        d = best_d = 0;

        // forward pass: accumulate the left and top SGM costs
        prev_min=0;
        for (int x=Da;x<(1+width1)*Da;x+=Da)
            accumulateCostsLeftTop<x_nlanes>(mem, x, prev_min);

        //backward pass: accumulate the right costs, sum the three passes and
        //pick the per-pixel best disparity on the fly
        mem.clearRightPassBuf();
        prev_min=0;
        for (int x=width1*Da;x>=Da;x-=Da)
        {
            accumulateCostsRight<x_nlanes>(mem, x, prev_min, best_d, min_cost);

            if(uniquenessRatio>0)
            {
                // uniqueness check: skip the pixel if some disparity outside
                // [best_d-1, best_d+1] has a cost within uniquenessRatio
                // percent of min_cost (pixel then stays INVALID)
                d = 0;
#if CV_SIMD
                horPassCostVolume+=x;
                int thresh = (100*min_cost)/(100-uniquenessRatio);
                v_int16 thresh_reg = vx_setall_s16((short)(thresh+1));
                v_int16 d1 = vx_setall_s16((short)(best_d-1));
                v_int16 d2 = vx_setall_s16((short)(best_d+1));
                v_int16 eight_reg = vx_setall_s16(v_int16::nlanes);
                v_int16 cur_d = vx_load(idx_row);
                v_int16 mask;

                // 2x unrolled vectorized scan for a competing disparity:
                for( ; d <= D - 2*v_int16::nlanes; d+=2*v_int16::nlanes )
                {
                    mask = (vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
                    cur_d = cur_d+eight_reg;
                    if( v_check_any(mask) )
                        break;
                    mask = (vx_load_aligned(horPassCostVolume + d + v_int16::nlanes) < thresh_reg) & ( (cur_d<d1) | (cur_d>d2) );
                    cur_d = cur_d+eight_reg;
                    if( v_check_any(mask) )
                        break;
                }
                if( d <= D - 2*v_int16::nlanes )
                {
                    // competing disparity found -- reject this pixel
                    horPassCostVolume-=x;
                    continue;
                }
                if( d <= D - v_int16::nlanes )
                {
                    if( v_check_any((vx_load_aligned(horPassCostVolume + d) < thresh_reg) & ((cur_d < d1) | (cur_d > d2))) )
                    {
                        horPassCostVolume-=x;
                        continue;
                    }
                    d+=v_int16::nlanes;
                }
                horPassCostVolume-=x;
#endif
                // scalar tail of the uniqueness scan:
                for( ; d < D; d++ )
                {
                    if( horPassCostVolume[x+d]*(100 - uniquenessRatio) < min_cost*100 && std::abs(d - best_d) > 1 )
                        break;
                }
                if( d < D )
                    continue;
            }
            d = best_d;

            // record the cheapest candidate per right-image column for the
            // pseudo left-right consistency check below:
            int _x2 = x/Da - 1 + minX1 - d - minD;
            if( _x2>=0 && _x2<width && mem.disp2CostBuf[_x2] > min_cost )
            {
                mem.disp2CostBuf[_x2] = min_cost;
                mem.disp2Buf[_x2] = (short)(d + minD);
            }

            if( 0 < d && d < D-1 )
            {
                // do subpixel quadratic interpolation:
                // fit parabola into (x1=d-1, y1=Sp[d-1]), (x2=d, y2=Sp[d]), (x3=d+1, y3=Sp[d+1])
                // then find minimum of the parabola.
                int denom2 = std::max(horPassCostVolume[x+d-1] + horPassCostVolume[x+d+1] - 2*horPassCostVolume[x+d], 1);
                d = d*DISP_SCALE + ((horPassCostVolume[x+d-1] - horPassCostVolume[x+d+1])*DISP_SCALE + denom2)/(denom2*2);
            }
            else
                d *= DISP_SCALE;

            disp_row[(x/Da)-1 + minX1] = (DispType)(d + minD*DISP_SCALE);
        }

        for(int x = minX1; x < maxX1; x++ )
        {
            // pseudo LRC consistency check using only one disparity map;
            // pixels with difference more than disp12MaxDiff are invalidated
            int d1 = disp_row[x];
            if( d1 == INVALID_DISP_SCALED )
                continue;
            // round the disparity both down (_d) and up (d_) and accept the
            // pixel if either rounding is consistent with disp2Buf:
            int _d = d1 >> StereoMatcher::DISP_SHIFT;
            int d_ = (d1 + DISP_SCALE-1) >> StereoMatcher::DISP_SHIFT;
            int _x = x - _d, x_ = x - d_;
            if( 0 <= _x && _x < width && mem.disp2Buf[_x] >= minD && std::abs(mem.disp2Buf[_x] - _d) > disp12MaxDiff &&
                0 <= x_ && x_ < width && mem.disp2Buf[x_] >= minD && std::abs(mem.disp2Buf[x_] - d_) > disp12MaxDiff )
                disp_row[x] = (short)INVALID_DISP_SCALED;
        }
    }
}
2158
2159 template <uchar nstripes>
2160 static void computeDisparity3WaySGBM(const Mat& img1, const Mat& img2, Mat& disp1, const StereoSGBMParams& params)
2161 {
2162 // allocate separate dst_disp arrays to avoid conflicts due to stripe overlap:
2163 int stripe_sz = (int)ceil(img1.rows/(double)nstripes);
2164 int stripe_overlap = (params.SADWindowSize/2+1) + (int)ceil(0.1*stripe_sz);
2165 Mat dst_disp[nstripes];
2166 for(int i=0;i<nstripes;i++)
2167 dst_disp[i].create(stripe_sz+stripe_overlap,img1.cols,CV_16S);
2168
2169 parallel_for_(
2170 Range(0,nstripes),
2171 SGBM3WayMainLoop(img1,img2,dst_disp,params,stripe_sz,stripe_overlap)
2172 );
2173
2174 //assemble disp1 from dst_disp:
2175 short* dst_row;
2176 short* src_row;
2177 for(int i=0;i<disp1.rows;i++)
2178 {
2179 dst_row = (short*)disp1.ptr(i);
2180 src_row = (short*)dst_disp[i/stripe_sz].ptr(stripe_overlap+i%stripe_sz);
2181 memcpy(dst_row,src_row,disp1.cols*sizeof(short));
2182 }
2183 }
2184
// Concrete implementation of the StereoSGBM algorithm interface. Holds the
// parameter set, dispatches compute() to one of the SGBM variants, and
// provides FileStorage (de)serialization.
class StereoSGBMImpl CV_FINAL : public StereoSGBM
{
public:
    // Default-construct with StereoSGBMParams' default values.
    StereoSGBMImpl()
        : params()
    {
        // nothing
    }

    // Construct with an explicit parameter set (see StereoSGBM::create).
    StereoSGBMImpl( int _minDisparity, int _numDisparities, int _SADWindowSize,
                    int _P1, int _P2, int _disp12MaxDiff, int _preFilterCap,
                    int _uniquenessRatio, int _speckleWindowSize, int _speckleRange,
                    int _mode )
        : params(_minDisparity, _numDisparities, _SADWindowSize,
                 _P1, _P2, _disp12MaxDiff, _preFilterCap,
                 _uniquenessRatio, _speckleWindowSize, _speckleRange,
                 _mode)
    {
        // nothing
    }

    // Computes a CV_16S fixed-point disparity map for a rectified 8-bit image
    // pair, dispatching on params.mode, then applies a 3x3 median filter and,
    // if enabled, speckle filtering.
    void compute( InputArray leftarr, InputArray rightarr, OutputArray disparr ) CV_OVERRIDE
    {
        CV_INSTRUMENT_REGION();

        Mat left = leftarr.getMat(), right = rightarr.getMat();
        CV_Assert( left.size() == right.size() && left.type() == right.type() &&
                   left.depth() == CV_8U );

        disparr.create( left.size(), CV_16S );
        Mat disp = disparr.getMat();

        if(params.mode==MODE_SGBM_3WAY)
            // the number of stripes is fixed, disregarding the number of threads/processors
            // to make the results fully reproducible
            computeDisparity3WaySGBM<4>( left, right, disp, params );
        else if(params.mode==MODE_HH4)
            computeDisparitySGBM_HH4( left, right, disp, params );
        else
            computeDisparitySGBM( left, right, disp, params );

        medianBlur(disp, disp, 3);

        if( params.speckleWindowSize > 0 )
            filterSpeckles(disp, (params.minDisparity - 1)*StereoMatcher::DISP_SCALE, params.speckleWindowSize,
                           StereoMatcher::DISP_SCALE*params.speckleRange, buffer);
    }

    // Trivial accessors mirroring the StereoSGBMParams fields:
    int getMinDisparity() const CV_OVERRIDE { return params.minDisparity; }
    void setMinDisparity(int minDisparity) CV_OVERRIDE { params.minDisparity = minDisparity; }

    int getNumDisparities() const CV_OVERRIDE { return params.numDisparities; }
    void setNumDisparities(int numDisparities) CV_OVERRIDE { params.numDisparities = numDisparities; }

    int getBlockSize() const CV_OVERRIDE { return params.SADWindowSize; }
    void setBlockSize(int blockSize) CV_OVERRIDE { params.SADWindowSize = blockSize; }

    int getSpeckleWindowSize() const CV_OVERRIDE { return params.speckleWindowSize; }
    void setSpeckleWindowSize(int speckleWindowSize) CV_OVERRIDE { params.speckleWindowSize = speckleWindowSize; }

    int getSpeckleRange() const CV_OVERRIDE { return params.speckleRange; }
    void setSpeckleRange(int speckleRange) CV_OVERRIDE { params.speckleRange = speckleRange; }

    int getDisp12MaxDiff() const CV_OVERRIDE { return params.disp12MaxDiff; }
    void setDisp12MaxDiff(int disp12MaxDiff) CV_OVERRIDE { params.disp12MaxDiff = disp12MaxDiff; }

    int getPreFilterCap() const CV_OVERRIDE { return params.preFilterCap; }
    void setPreFilterCap(int preFilterCap) CV_OVERRIDE { params.preFilterCap = preFilterCap; }

    int getUniquenessRatio() const CV_OVERRIDE { return params.uniquenessRatio; }
    void setUniquenessRatio(int uniquenessRatio) CV_OVERRIDE { params.uniquenessRatio = uniquenessRatio; }

    int getP1() const CV_OVERRIDE { return params.P1; }
    void setP1(int P1) CV_OVERRIDE { params.P1 = P1; }

    int getP2() const CV_OVERRIDE { return params.P2; }
    void setP2(int P2) CV_OVERRIDE { params.P2 = P2; }

    int getMode() const CV_OVERRIDE { return params.mode; }
    void setMode(int mode) CV_OVERRIDE { params.mode = mode; }

    // Serializes the algorithm name and all parameters.
    void write(FileStorage& fs) const CV_OVERRIDE
    {
        writeFormat(fs);
        fs << "name" << name_
        << "minDisparity" << params.minDisparity
        << "numDisparities" << params.numDisparities
        << "blockSize" << params.SADWindowSize
        << "speckleWindowSize" << params.speckleWindowSize
        << "speckleRange" << params.speckleRange
        << "disp12MaxDiff" << params.disp12MaxDiff
        << "preFilterCap" << params.preFilterCap
        << "uniquenessRatio" << params.uniquenessRatio
        << "P1" << params.P1
        << "P2" << params.P2
        << "mode" << params.mode;
    }

    // Restores parameters previously saved by write(); the stored algorithm
    // name must match name_.
    void read(const FileNode& fn) CV_OVERRIDE
    {
        FileNode n = fn["name"];
        CV_Assert( n.isString() && String(n) == name_ );
        params.minDisparity = (int)fn["minDisparity"];
        params.numDisparities = (int)fn["numDisparities"];
        params.SADWindowSize = (int)fn["blockSize"];
        params.speckleWindowSize = (int)fn["speckleWindowSize"];
        params.speckleRange = (int)fn["speckleRange"];
        params.disp12MaxDiff = (int)fn["disp12MaxDiff"];
        params.preFilterCap = (int)fn["preFilterCap"];
        params.uniquenessRatio = (int)fn["uniquenessRatio"];
        params.P1 = (int)fn["P1"];
        params.P2 = (int)fn["P2"];
        params.mode = (int)fn["mode"];
    }

    StereoSGBMParams params;  // algorithm parameters
    Mat buffer;               // scratch buffer reused by filterSpeckles across calls

    // algorithm name used by the serialization machinery above
    static const char* name_;
};
2305
// Algorithm identifier written to / checked against serialized parameter files.
const char* StereoSGBMImpl::name_ = "StereoMatcher.SGBM";
2307
2308
2309 Ptr<StereoSGBM> StereoSGBM::create(int minDisparity, int numDisparities, int SADWindowSize,
2310 int P1, int P2, int disp12MaxDiff,
2311 int preFilterCap, int uniquenessRatio,
2312 int speckleWindowSize, int speckleRange,
2313 int mode)
2314 {
2315 return Ptr<StereoSGBM>(
2316 new StereoSGBMImpl(minDisparity, numDisparities, SADWindowSize,
2317 P1, P2, disp12MaxDiff,
2318 preFilterCap, uniquenessRatio,
2319 speckleWindowSize, speckleRange,
2320 mode));
2321 }
2322
2323 Rect getValidDisparityROI( Rect roi1, Rect roi2,
2324 int minDisparity,
2325 int numberOfDisparities,
2326 int SADWindowSize )
2327 {
2328 int SW2 = SADWindowSize/2;
2329 int maxD = minDisparity + numberOfDisparities - 1;
2330
2331 int xmin = std::max(roi1.x, roi2.x + maxD) + SW2;
2332 int xmax = std::min(roi1.x + roi1.width, roi2.x + roi2.width) - SW2;
2333 int ymin = std::max(roi1.y, roi2.y) + SW2;
2334 int ymax = std::min(roi1.y + roi1.height, roi2.y + roi2.height) - SW2;
2335
2336 Rect r(xmin, ymin, xmax - xmin, ymax - ymin);
2337
2338 return r.width > 0 && r.height > 0 ? r : Rect();
2339 }
2340
// Compact 16-bit point type used for the speckle-filter wavefront stack.
typedef cv::Point_<short> Point2s;
2342
// Removes small connected components ("speckles") from a disparity map.
// A region grows over 4-connected pixels whose disparities differ by at most
// maxDiff; regions of at most maxSpeckleSize pixels are overwritten with
// newVal. Scratch memory (labels, wavefront stack, per-label region-type
// flags) lives in the caller-supplied _buf so repeated calls can reuse it.
template <typename T>
void filterSpecklesImpl(cv::Mat& img, int newVal, int maxSpeckleSize, int maxDiff, cv::Mat& _buf)
{
    using namespace cv;

    int width = img.cols, height = img.rows, npixels = width*height;
    size_t bufSize = npixels*(int)(sizeof(Point2s) + sizeof(int) + sizeof(uchar));
    if( !_buf.isContinuous() || _buf.empty() || _buf.cols*_buf.rows*_buf.elemSize() < bufSize )
        _buf.reserveBuffer(bufSize);

    // carve the scratch buffer into three consecutive arrays:
    uchar* buf = _buf.ptr();
    int i, j, dstep = (int)(img.step/sizeof(T));
    int* labels = (int*)buf;        // per-pixel region label (0 = unlabeled)
    buf += npixels*sizeof(labels[0]);
    Point2s* wbuf = (Point2s*)buf;  // wavefront stack for region growing
    buf += npixels*sizeof(wbuf[0]);
    uchar* rtype = (uchar*)buf;     // per-label flag: 1 = speckle (small) region
    int curlabel = 0;

    // clear out label assignments
    memset(labels, 0, npixels*sizeof(labels[0]));

    for( i = 0; i < height; i++ )
    {
        T* ds = img.ptr<T>(i);
        int* ls = labels + width*i;

        for( j = 0; j < width; j++ )
        {
            if( ds[j] != newVal ) // not a bad disparity
            {
                if( ls[j] ) // has a label, check for bad label
                {
                    if( rtype[ls[j]] ) // small region, zero out disparity
                        ds[j] = (T)newVal;
                }
                // no label, assign and propagate
                else
                {
                    Point2s* ws = wbuf; // initialize wavefront
                    Point2s p((short)j, (short)i); // current pixel
                    curlabel++; // next label
                    int count = 0; // current region size
                    ls[j] = curlabel;

                    // wavefront propagation
                    // NOTE(review): when the stack empties, the final `*--ws`
                    // dereferences wbuf-1 (still inside the shared scratch
                    // allocation, overlapping the labels array) before the
                    // loop condition terminates -- harmless garbage read, but
                    // subtle; the read value is never used.
                    while( ws >= wbuf ) // wavefront not empty
                    {
                        count++;
                        // put neighbors onto wavefront
                        T* dpp = &img.at<T>(p.y, p.x);
                        T dp = *dpp;
                        int* lpp = labels + width*p.y + p.x;

                        // below
                        if( p.y < height-1 && !lpp[+width] && dpp[+dstep] != newVal && std::abs(dp - dpp[+dstep]) <= maxDiff )
                        {
                            lpp[+width] = curlabel;
                            *ws++ = Point2s(p.x, p.y+1);
                        }

                        // above
                        if( p.y > 0 && !lpp[-width] && dpp[-dstep] != newVal && std::abs(dp - dpp[-dstep]) <= maxDiff )
                        {
                            lpp[-width] = curlabel;
                            *ws++ = Point2s(p.x, p.y-1);
                        }

                        // right
                        if( p.x < width-1 && !lpp[+1] && dpp[+1] != newVal && std::abs(dp - dpp[+1]) <= maxDiff )
                        {
                            lpp[+1] = curlabel;
                            *ws++ = Point2s(p.x+1, p.y);
                        }

                        // left
                        if( p.x > 0 && !lpp[-1] && dpp[-1] != newVal && std::abs(dp - dpp[-1]) <= maxDiff )
                        {
                            lpp[-1] = curlabel;
                            *ws++ = Point2s(p.x-1, p.y);
                        }

                        // pop most recent and propagate
                        // NB: could try least recent, maybe better convergence
                        p = *--ws;
                    }

                    // assign label type
                    if( count <= maxSpeckleSize ) // speckle region
                    {
                        rtype[ls[j]] = 1; // small region label
                        ds[j] = (T)newVal;
                    }
                    else
                        rtype[ls[j]] = 0; // large region label
                }
            }
        }
    }
}
2439
#ifdef HAVE_IPP
// IPP-accelerated speckle filtering. Returns false whenever the IPP path
// cannot handle the input (multi-channel image, unsupported depth, or IPP
// older than 8.1), so the caller falls back to the generic implementation.
static bool ipp_filterSpeckles(Mat &img, int maxSpeckleSize, int newVal, int maxDiff, Mat &buffer)
{
#if IPP_VERSION_X100 >= 810
    CV_INSTRUMENT_REGION_IPP();

    IppDataType dataType = ippiGetDataType(img.depth());
    IppiSize size = ippiSize(img.size());
    int bufferSize;

    // IPP path only supports single-channel 8u / 16s images:
    if(img.channels() != 1)
        return false;

    if(dataType != ipp8u && dataType != ipp16s)
        return false;

    // query the scratch size and grow the caller's buffer only when needed:
    if(ippiMarkSpecklesGetBufferSize(size, dataType, 1, &bufferSize) < 0)
        return false;

    if(bufferSize && (buffer.empty() || (int)(buffer.step*buffer.rows) < bufferSize))
        buffer.create(1, (int)bufferSize, CV_8U);

    switch(dataType)
    {
    case ipp8u:  return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_8u_C1IR, img.ptr<Ipp8u>(), (int)img.step, size, (Ipp8u)newVal, maxSpeckleSize, (Ipp8u)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
    case ipp16s: return CV_INSTRUMENT_FUN_IPP(ippiMarkSpeckles_16s_C1IR, img.ptr<Ipp16s>(), (int)img.step, size, (Ipp16s)newVal, maxSpeckleSize, (Ipp16s)maxDiff, ippiNormL1, buffer.ptr<Ipp8u>()) >= 0;
    default:     return false;
    }
#else
    CV_UNUSED(img); CV_UNUSED(maxSpeckleSize); CV_UNUSED(newVal); CV_UNUSED(maxDiff); CV_UNUSED(buffer);
    return false;
#endif
}
#endif
2474
2475 }
2476
2477 void cv::filterSpeckles( InputOutputArray _img, double _newval, int maxSpeckleSize,
2478 double _maxDiff, InputOutputArray __buf )
2479 {
2480 CV_INSTRUMENT_REGION();
2481
2482 Mat img = _img.getMat();
2483 int type = img.type();
2484 Mat temp, &_buf = __buf.needed() ? __buf.getMatRef() : temp;
2485 CV_Assert( type == CV_8UC1 || type == CV_16SC1 );
2486
2487 int newVal = cvRound(_newval), maxDiff = cvRound(_maxDiff);
2488
2489 CV_IPP_RUN_FAST(ipp_filterSpeckles(img, maxSpeckleSize, newVal, maxDiff, _buf));
2490
2491 if (type == CV_8UC1)
2492 filterSpecklesImpl<uchar>(img, newVal, maxSpeckleSize, maxDiff, _buf);
2493 else
2494 filterSpecklesImpl<short>(img, newVal, maxSpeckleSize, maxDiff, _buf);
2495 }
2496
2497 void cv::validateDisparity( InputOutputArray _disp, InputArray _cost, int minDisparity,
2498 int numberOfDisparities, int disp12MaxDiff )
2499 {
2500 CV_INSTRUMENT_REGION();
2501
2502 Mat disp = _disp.getMat(), cost = _cost.getMat();
2503 int cols = disp.cols, rows = disp.rows;
2504 int minD = minDisparity, maxD = minDisparity + numberOfDisparities;
2505 int x, minX1 = std::max(maxD, 0), maxX1 = cols + std::min(minD, 0);
2506 AutoBuffer<int> _disp2buf(cols*2);
2507 int* disp2buf = _disp2buf.data();
2508 int* disp2cost = disp2buf + cols;
2509 const int DISP_SHIFT = 4, DISP_SCALE = 1 << DISP_SHIFT;
2510 int INVALID_DISP = minD - 1, INVALID_DISP_SCALED = INVALID_DISP*DISP_SCALE;
2511 int costType = cost.type();
2512
2513 disp12MaxDiff *= DISP_SCALE;
2514
2515 CV_Assert( numberOfDisparities > 0 && disp.type() == CV_16S &&
2516 (costType == CV_16S || costType == CV_32S) &&
2517 disp.size() == cost.size() );
2518
2519 for( int y = 0; y < rows; y++ )
2520 {
2521 short* dptr = disp.ptr<short>(y);
2522
2523 for( x = 0; x < cols; x++ )
2524 {
2525 disp2buf[x] = INVALID_DISP_SCALED;
2526 disp2cost[x] = INT_MAX;
2527 }
2528
2529 if( costType == CV_16S )
2530 {
2531 const short* cptr = cost.ptr<short>(y);
2532
2533 for( x = minX1; x < maxX1; x++ )
2534 {
2535 int d = dptr[x], c = cptr[x];
2536
2537 if( d == INVALID_DISP_SCALED )
2538 continue;
2539
2540 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
2541
2542 if( disp2cost[x2] > c )
2543 {
2544 disp2cost[x2] = c;
2545 disp2buf[x2] = d;
2546 }
2547 }
2548 }
2549 else
2550 {
2551 const int* cptr = cost.ptr<int>(y);
2552
2553 for( x = minX1; x < maxX1; x++ )
2554 {
2555 int d = dptr[x], c = cptr[x];
2556
2557 if( d == INVALID_DISP_SCALED )
2558 continue;
2559
2560 int x2 = x - ((d + DISP_SCALE/2) >> DISP_SHIFT);
2561
2562 if( disp2cost[x2] > c )
2563 {
2564 disp2cost[x2] = c;
2565 disp2buf[x2] = d;
2566 }
2567 }
2568 }
2569
2570 for( x = minX1; x < maxX1; x++ )
2571 {
2572 // we round the computed disparity both towards -inf and +inf and check
2573 // if either of the corresponding disparities in disp2 is consistent.
2574 // This is to give the computed disparity a chance to look valid if it is.
2575 int d = dptr[x];
2576 if( d == INVALID_DISP_SCALED )
2577 continue;
2578 int d0 = d >> DISP_SHIFT;
2579 int d1 = (d + DISP_SCALE-1) >> DISP_SHIFT;
2580 int x0 = x - d0, x1 = x - d1;
2581 if( (0 <= x0 && x0 < cols && disp2buf[x0] > INVALID_DISP_SCALED && std::abs(disp2buf[x0] - d) > disp12MaxDiff) &&
2582 (0 <= x1 && x1 < cols && disp2buf[x1] > INVALID_DISP_SCALED && std::abs(disp2buf[x1] - d) > disp12MaxDiff) )
2583 dptr[x] = (short)INVALID_DISP_SCALED;
2584 }
2585 }
2586 }
2587