1 /*M///////////////////////////////////////////////////////////////////////////////////////
2 //
3 // IMPORTANT: READ BEFORE DOWNLOADING, COPYING, INSTALLING OR USING.
4 //
5 // By downloading, copying, installing or using the software you agree to this license.
6 // If you do not agree to this license, do not download, install,
7 // copy or use the software.
8 //
9 //
10 // License Agreement
11 // For Open Source Computer Vision Library
12 //
13 // Copyright (C) 2000-2008, Intel Corporation, all rights reserved.
14 // Copyright (C) 2009, Willow Garage Inc., all rights reserved.
15 // Third party copyrights are property of their respective owners.
16 //
17 // Redistribution and use in source and binary forms, with or without modification,
18 // are permitted provided that the following conditions are met:
19 //
20 // * Redistribution's of source code must retain the above copyright notice,
21 // this list of conditions and the following disclaimer.
22 //
23 // * Redistribution's in binary form must reproduce the above copyright notice,
24 // this list of conditions and the following disclaimer in the documentation
25 // and/or other materials provided with the distribution.
26 //
27 // * The name of the copyright holders may not be used to endorse or promote products
28 // derived from this software without specific prior written permission.
29 //
30 // This software is provided by the copyright holders and contributors "as is" and
31 // any express or implied warranties, including, but not limited to, the implied
32 // warranties of merchantability and fitness for a particular purpose are disclaimed.
33 // In no event shall the Intel Corporation or contributors be liable for any direct,
34 // indirect, incidental, special, exemplary, or consequential damages
35 // (including, but not limited to, procurement of substitute goods or services;
36 // loss of use, data, or profits; or business interruption) however caused
37 // and on any theory of liability, whether in contract, strict liability,
38 // or tort (including negligence or otherwise) arising in any way out of
39 // the use of this software, even if advised of the possibility of such damage.
40 //
41 //M*/
42
43 #include "precomp.hpp"
44 #include "opencv2/core/hal/intrin.hpp"
45 #include "opencl_kernels_video.hpp"
46
47 using namespace std;
48 #define EPS 0.001F
49 #define INF 1E+10F
50
51 namespace cv {
52
53 class DISOpticalFlowImpl CV_FINAL : public DISOpticalFlow
54 {
55 public:
56 DISOpticalFlowImpl();
57
58 void calc(InputArray I0, InputArray I1, InputOutputArray flow) CV_OVERRIDE;
59 void collectGarbage() CV_OVERRIDE;
60
61 protected: //!< algorithm parameters
62 int finest_scale, coarsest_scale;
63 int patch_size;
64 int patch_stride;
65 int grad_descent_iter;
66 int variational_refinement_iter;
67 float variational_refinement_alpha;
68 float variational_refinement_gamma;
69 float variational_refinement_delta;
70 bool use_mean_normalization;
71 bool use_spatial_propagation;
72
73 protected: //!< some auxiliary variables
74 int border_size;
75 int w, h; //!< flow buffer width and height on the current scale
76 int ws, hs; //!< sparse flow buffer width and height on the current scale
77
78 public:
79 int getFinestScale() const CV_OVERRIDE { return finest_scale; }
80 void setFinestScale(int val) CV_OVERRIDE { finest_scale = val; }
81 int getPatchSize() const CV_OVERRIDE { return patch_size; }
82 void setPatchSize(int val) CV_OVERRIDE { patch_size = val; }
83 int getPatchStride() const CV_OVERRIDE { return patch_stride; }
84 void setPatchStride(int val) CV_OVERRIDE { patch_stride = val; }
85 int getGradientDescentIterations() const CV_OVERRIDE { return grad_descent_iter; }
86 void setGradientDescentIterations(int val) CV_OVERRIDE { grad_descent_iter = val; }
87 int getVariationalRefinementIterations() const CV_OVERRIDE { return variational_refinement_iter; }
88 void setVariationalRefinementIterations(int val) CV_OVERRIDE { variational_refinement_iter = val; }
89 float getVariationalRefinementAlpha() const CV_OVERRIDE { return variational_refinement_alpha; }
90 void setVariationalRefinementAlpha(float val) CV_OVERRIDE { variational_refinement_alpha = val; }
91 float getVariationalRefinementDelta() const CV_OVERRIDE { return variational_refinement_delta; }
92 void setVariationalRefinementDelta(float val) CV_OVERRIDE { variational_refinement_delta = val; }
93 float getVariationalRefinementGamma() const CV_OVERRIDE { return variational_refinement_gamma; }
94 void setVariationalRefinementGamma(float val) CV_OVERRIDE { variational_refinement_gamma = val; }
95
96 bool getUseMeanNormalization() const CV_OVERRIDE { return use_mean_normalization; }
97 void setUseMeanNormalization(bool val) CV_OVERRIDE { use_mean_normalization = val; }
98 bool getUseSpatialPropagation() const CV_OVERRIDE { return use_spatial_propagation; }
99 void setUseSpatialPropagation(bool val) CV_OVERRIDE { use_spatial_propagation = val; }
100
101 protected: //!< internal buffers
102 vector<Mat_<uchar> > I0s; //!< Gaussian pyramid for the current frame
103 vector<Mat_<uchar> > I1s; //!< Gaussian pyramid for the next frame
104 vector<Mat_<uchar> > I1s_ext; //!< I1s with borders
105
106 vector<Mat_<short> > I0xs; //!< Gaussian pyramid for the x gradient of the current frame
107 vector<Mat_<short> > I0ys; //!< Gaussian pyramid for the y gradient of the current frame
108
109 vector<Mat_<float> > Ux; //!< x component of the flow vectors
110 vector<Mat_<float> > Uy; //!< y component of the flow vectors
111
112 vector<Mat_<float> > initial_Ux; //!< x component of the initial flow field, if one was passed as an input
113 vector<Mat_<float> > initial_Uy; //!< y component of the initial flow field, if one was passed as an input
114
115 Mat_<Vec2f> U; //!< a buffer for the merged flow
116
117 Mat_<float> Sx; //!< intermediate sparse flow representation (x component)
118 Mat_<float> Sy; //!< intermediate sparse flow representation (y component)
119
120 /* Structure tensor components: */
121 Mat_<float> I0xx_buf; //!< sum of squares of x gradient values
122 Mat_<float> I0yy_buf; //!< sum of squares of y gradient values
123 Mat_<float> I0xy_buf; //!< sum of x and y gradient products
124
125 /* Extra buffers that are useful if patch mean-normalization is used: */
126 Mat_<float> I0x_buf; //!< sum of x gradient values
127 Mat_<float> I0y_buf; //!< sum of y gradient values
128
129 /* Auxiliary buffers used in structure tensor computation: */
130 Mat_<float> I0xx_buf_aux;
131 Mat_<float> I0yy_buf_aux;
132 Mat_<float> I0xy_buf_aux;
133 Mat_<float> I0x_buf_aux;
134 Mat_<float> I0y_buf_aux;
135
136 vector<Ptr<VariationalRefinement> > variational_refinement_processors;
137
138 private: //!< private methods and parallel sections
139 void prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow);
140 void precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x, Mat &dst_I0y, Mat &I0x,
141 Mat &I0y);
142 int autoSelectCoarsestScale(int img_width);
143 void autoSelectPatchSizeAndScales(int img_width);
144
145 struct PatchInverseSearch_ParBody : public ParallelLoopBody
146 {
147 DISOpticalFlowImpl *dis;
148 int nstripes, stripe_sz;
149 int hs;
150 Mat *Sx, *Sy, *Ux, *Uy, *I0, *I1, *I0x, *I0y;
151 int num_iter, pyr_level;
152
153 PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _hs, Mat &dst_Sx, Mat &dst_Sy,
154 Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1, Mat &_I0x, Mat &_I0y, int _num_iter,
155 int _pyr_level);
156 void operator()(const Range &range) const CV_OVERRIDE;
157 };
158
159 struct Densification_ParBody : public ParallelLoopBody
160 {
161 DISOpticalFlowImpl *dis;
162 int nstripes, stripe_sz;
163 int h;
164 Mat *Ux, *Uy, *Sx, *Sy, *I0, *I1;
165
166 Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h, Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx,
167 Mat &src_Sy, Mat &_I0, Mat &_I1);
168 void operator()(const Range &range) const CV_OVERRIDE;
169 };
170
171 #ifdef HAVE_OPENCL
172 vector<UMat> u_I0s; //!< Gaussian pyramid for the current frame
173 vector<UMat> u_I1s; //!< Gaussian pyramid for the next frame
174 vector<UMat> u_I1s_ext; //!< I1s with borders
175
176 vector<UMat> u_I0xs; //!< Gaussian pyramid for the x gradient of the current frame
177 vector<UMat> u_I0ys; //!< Gaussian pyramid for the y gradient of the current frame
178
179 vector<UMat> u_U; //!< (x,y) component of the flow vectors (CV_32FC2)
180 vector<UMat> u_initial_U; //!< (x, y) components of the initial flow field, if one was passed as an input (CV_32FC2)
181
182 UMat u_S; //!< intermediate sparse flow representation (x,y components - CV_32FC2)
183
184 /* Structure tensor components: */
185 UMat u_I0xx_buf; //!< sum of squares of x gradient values
186 UMat u_I0yy_buf; //!< sum of squares of y gradient values
187 UMat u_I0xy_buf; //!< sum of x and y gradient products
188
189 /* Extra buffers that are useful if patch mean-normalization is used: */
190 UMat u_I0x_buf; //!< sum of x gradient values
191 UMat u_I0y_buf; //!< sum of y gradient values
192
193 /* Auxiliary buffers used in structure tensor computation: */
194 UMat u_I0xx_buf_aux;
195 UMat u_I0yy_buf_aux;
196 UMat u_I0xy_buf_aux;
197 UMat u_I0x_buf_aux;
198 UMat u_I0y_buf_aux;
199
200 bool ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
201 UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y);
202 void ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow);
203 bool ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow);
204 bool ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1);
205 bool ocl_PatchInverseSearch(UMat &src_U,
206 UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int pyr_level);
207 #endif
208 };
209
210 DISOpticalFlowImpl::DISOpticalFlowImpl()
211 {
212 CV_INSTRUMENT_REGION();
213
214 finest_scale = 2;
215 patch_size = 8;
216 patch_stride = 4;
217 grad_descent_iter = 16;
218 variational_refinement_iter = 5;
219 variational_refinement_alpha = 20.f;
220 variational_refinement_gamma = 10.f;
221 variational_refinement_delta = 5.f;
222
223 border_size = 16;
224 use_mean_normalization = true;
225 use_spatial_propagation = true;
226 coarsest_scale = 10;
227
228 /* Use separate variational refinement instances for different scales to avoid repeated memory allocation: */
229 int max_possible_scales = 10;
230 ws = hs = w = h = 0;
231 for (int i = 0; i < max_possible_scales; i++)
232 variational_refinement_processors.push_back(VariationalRefinement::create());
233 }
234
235 void DISOpticalFlowImpl::prepareBuffers(Mat &I0, Mat &I1, Mat &flow, bool use_flow)
236 {
237 CV_INSTRUMENT_REGION();
238
239 I0s.resize(coarsest_scale + 1);
240 I1s.resize(coarsest_scale + 1);
241 I1s_ext.resize(coarsest_scale + 1);
242 I0xs.resize(coarsest_scale + 1);
243 I0ys.resize(coarsest_scale + 1);
244 Ux.resize(coarsest_scale + 1);
245 Uy.resize(coarsest_scale + 1);
246
247 Mat flow_uv[2];
248 if (use_flow)
249 {
250 split(flow, flow_uv);
251 initial_Ux.resize(coarsest_scale + 1);
252 initial_Uy.resize(coarsest_scale + 1);
253 }
254
255 int fraction = 1;
256 int cur_rows = 0, cur_cols = 0;
257
258 for (int i = 0; i <= coarsest_scale; i++)
259 {
260 /* Avoid initializing the pyramid levels finer than the finest scale, as they won't be used anyway */
261 if (i == finest_scale)
262 {
263 cur_rows = I0.rows / fraction;
264 cur_cols = I0.cols / fraction;
265 I0s[i].create(cur_rows, cur_cols);
266 resize(I0, I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
267 I1s[i].create(cur_rows, cur_cols);
268 resize(I1, I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
269
270 /* These buffers are reused in each scale so we initialize them once on the finest scale: */
271 Sx.create(cur_rows / patch_stride, cur_cols / patch_stride);
272 Sy.create(cur_rows / patch_stride, cur_cols / patch_stride);
273 I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
274 I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
275 I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
276 I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
277 I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride);
278
279 I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride);
280 I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride);
281 I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride);
282 I0x_buf_aux.create(cur_rows, cur_cols / patch_stride);
283 I0y_buf_aux.create(cur_rows, cur_cols / patch_stride);
284
285 U.create(cur_rows, cur_cols);
286 }
287 else if (i > finest_scale)
288 {
289 cur_rows = I0s[i - 1].rows / 2;
290 cur_cols = I0s[i - 1].cols / 2;
291 I0s[i].create(cur_rows, cur_cols);
292 resize(I0s[i - 1], I0s[i], I0s[i].size(), 0.0, 0.0, INTER_AREA);
293 I1s[i].create(cur_rows, cur_cols);
294 resize(I1s[i - 1], I1s[i], I1s[i].size(), 0.0, 0.0, INTER_AREA);
295 }
296
297 if (i >= finest_scale)
298 {
299 I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size);
300 copyMakeBorder(I1s[i], I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
301 I0xs[i].create(cur_rows, cur_cols);
302 I0ys[i].create(cur_rows, cur_cols);
303 spatialGradient(I0s[i], I0xs[i], I0ys[i]);
304 Ux[i].create(cur_rows, cur_cols);
305 Uy[i].create(cur_rows, cur_cols);
306 variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
307 variational_refinement_processors[i]->setDelta(variational_refinement_delta);
308 variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
309 variational_refinement_processors[i]->setSorIterations(5);
310 variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
311
312 if (use_flow)
313 {
314 resize(flow_uv[0], initial_Ux[i], Size(cur_cols, cur_rows));
315 initial_Ux[i] /= fraction;
316 resize(flow_uv[1], initial_Uy[i], Size(cur_cols, cur_rows));
317 initial_Uy[i] /= fraction;
318 }
319 }
320
321 fraction *= 2;
322 }
323 }
324
325 /* This function computes the structure tensor elements (local sums of I0x^2, I0x*I0y and I0y^2).
326 * A simple box filter is not used here because we need to compute these sums on a sparse grid
327 * and store them densely in the output buffers.
328 */
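/* A rough sketch of what is accumulated for each patch P (using the notation of the code below):
 *
 *     T = | sum_P I0x*I0x   sum_P I0x*I0y |      g = ( sum_P I0x , sum_P I0y )
 *         | sum_P I0x*I0y   sum_P I0y*I0y |
 *
 * T is the 2x2 structure tensor that is later inverted in the patch inverse search, while g is only
 * needed by the mean-normalized patch cost. Both passes below keep running sums over a window of
 * patch_size elements and write them out only at positions that are multiples of patch_stride.
 */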
329 void DISOpticalFlowImpl::precomputeStructureTensor(Mat &dst_I0xx, Mat &dst_I0yy, Mat &dst_I0xy, Mat &dst_I0x,
330 Mat &dst_I0y, Mat &I0x, Mat &I0y)
331 {
332 CV_INSTRUMENT_REGION();
333
334 float *I0xx_ptr = dst_I0xx.ptr<float>();
335 float *I0yy_ptr = dst_I0yy.ptr<float>();
336 float *I0xy_ptr = dst_I0xy.ptr<float>();
337 float *I0x_ptr = dst_I0x.ptr<float>();
338 float *I0y_ptr = dst_I0y.ptr<float>();
339
340 float *I0xx_aux_ptr = I0xx_buf_aux.ptr<float>();
341 float *I0yy_aux_ptr = I0yy_buf_aux.ptr<float>();
342 float *I0xy_aux_ptr = I0xy_buf_aux.ptr<float>();
343 float *I0x_aux_ptr = I0x_buf_aux.ptr<float>();
344 float *I0y_aux_ptr = I0y_buf_aux.ptr<float>();
345
346 /* Separable box filter: horizontal pass */
347 for (int i = 0; i < h; i++)
348 {
349 float sum_xx = 0.0f, sum_yy = 0.0f, sum_xy = 0.0f, sum_x = 0.0f, sum_y = 0.0f;
350 short *x_row = I0x.ptr<short>(i);
351 short *y_row = I0y.ptr<short>(i);
352 for (int j = 0; j < patch_size; j++)
353 {
354 sum_xx += x_row[j] * x_row[j];
355 sum_yy += y_row[j] * y_row[j];
356 sum_xy += x_row[j] * y_row[j];
357 sum_x += x_row[j];
358 sum_y += y_row[j];
359 }
360 I0xx_aux_ptr[i * ws] = sum_xx;
361 I0yy_aux_ptr[i * ws] = sum_yy;
362 I0xy_aux_ptr[i * ws] = sum_xy;
363 I0x_aux_ptr[i * ws] = sum_x;
364 I0y_aux_ptr[i * ws] = sum_y;
365 int js = 1;
366 for (int j = patch_size; j < w; j++)
367 {
368 sum_xx += (x_row[j] * x_row[j] - x_row[j - patch_size] * x_row[j - patch_size]);
369 sum_yy += (y_row[j] * y_row[j] - y_row[j - patch_size] * y_row[j - patch_size]);
370 sum_xy += (x_row[j] * y_row[j] - x_row[j - patch_size] * y_row[j - patch_size]);
371 sum_x += (x_row[j] - x_row[j - patch_size]);
372 sum_y += (y_row[j] - y_row[j - patch_size]);
373 if ((j - patch_size + 1) % patch_stride == 0)
374 {
375 I0xx_aux_ptr[i * ws + js] = sum_xx;
376 I0yy_aux_ptr[i * ws + js] = sum_yy;
377 I0xy_aux_ptr[i * ws + js] = sum_xy;
378 I0x_aux_ptr[i * ws + js] = sum_x;
379 I0y_aux_ptr[i * ws + js] = sum_y;
380 js++;
381 }
382 }
383 }
384
385 AutoBuffer<float> sum_xx(ws), sum_yy(ws), sum_xy(ws), sum_x(ws), sum_y(ws);
386 for (int j = 0; j < ws; j++)
387 {
388 sum_xx[j] = 0.0f;
389 sum_yy[j] = 0.0f;
390 sum_xy[j] = 0.0f;
391 sum_x[j] = 0.0f;
392 sum_y[j] = 0.0f;
393 }
394
395 /* Separable box filter: vertical pass */
396 for (int i = 0; i < patch_size; i++)
397 for (int j = 0; j < ws; j++)
398 {
399 sum_xx[j] += I0xx_aux_ptr[i * ws + j];
400 sum_yy[j] += I0yy_aux_ptr[i * ws + j];
401 sum_xy[j] += I0xy_aux_ptr[i * ws + j];
402 sum_x[j] += I0x_aux_ptr[i * ws + j];
403 sum_y[j] += I0y_aux_ptr[i * ws + j];
404 }
405 for (int j = 0; j < ws; j++)
406 {
407 I0xx_ptr[j] = sum_xx[j];
408 I0yy_ptr[j] = sum_yy[j];
409 I0xy_ptr[j] = sum_xy[j];
410 I0x_ptr[j] = sum_x[j];
411 I0y_ptr[j] = sum_y[j];
412 }
413 int is = 1;
414 for (int i = patch_size; i < h; i++)
415 {
416 for (int j = 0; j < ws; j++)
417 {
418 sum_xx[j] += (I0xx_aux_ptr[i * ws + j] - I0xx_aux_ptr[(i - patch_size) * ws + j]);
419 sum_yy[j] += (I0yy_aux_ptr[i * ws + j] - I0yy_aux_ptr[(i - patch_size) * ws + j]);
420 sum_xy[j] += (I0xy_aux_ptr[i * ws + j] - I0xy_aux_ptr[(i - patch_size) * ws + j]);
421 sum_x[j] += (I0x_aux_ptr[i * ws + j] - I0x_aux_ptr[(i - patch_size) * ws + j]);
422 sum_y[j] += (I0y_aux_ptr[i * ws + j] - I0y_aux_ptr[(i - patch_size) * ws + j]);
423 }
424 if ((i - patch_size + 1) % patch_stride == 0)
425 {
426 for (int j = 0; j < ws; j++)
427 {
428 I0xx_ptr[is * ws + j] = sum_xx[j];
429 I0yy_ptr[is * ws + j] = sum_yy[j];
430 I0xy_ptr[is * ws + j] = sum_xy[j];
431 I0x_ptr[is * ws + j] = sum_x[j];
432 I0y_ptr[is * ws + j] = sum_y[j];
433 }
434 is++;
435 }
436 }
437 }
438
439 int DISOpticalFlowImpl::autoSelectCoarsestScale(int img_width)
440 {
441 const int fratio = 5;
442 return std::max(0, (int)std::floor(log2((2.0f*(float)img_width) / ((float)fratio * (float)patch_size))));
443 }
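/* Informal reading of the formula above (an interpretation, not a separate guarantee): with fratio = 5
 * this selects the deepest level at which the downscaled image width is still roughly between fratio/2
 * and fratio patch widths (about 2.5-5 patches for patch_size = 8), so a single patch at the coarsest
 * scale can cover very large displacements.
 */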
444
445 void DISOpticalFlowImpl::autoSelectPatchSizeAndScales(int img_width)
446 {
447 switch (finest_scale)
448 {
449 case 1:
450 patch_size = 8;
451 coarsest_scale = autoSelectCoarsestScale(img_width);
452 finest_scale = std::max(coarsest_scale-2, 0);
453 break;
454
455 case 3:
456 patch_size = 12;
457 coarsest_scale = autoSelectCoarsestScale(img_width);
458 finest_scale = std::max(coarsest_scale-4, 0);
459 break;
460
461 case 4:
462 patch_size = 12;
463 coarsest_scale = autoSelectCoarsestScale(img_width);
464 finest_scale = std::max(coarsest_scale-5, 0);
465 break;
466
467 // default case, fall-through.
468 case 2:
469 default:
470 patch_size = 8;
471 coarsest_scale = autoSelectCoarsestScale(img_width);
472 finest_scale = std::max(coarsest_scale-2, 0);
473 break;
474 }
475 }
476
477 DISOpticalFlowImpl::PatchInverseSearch_ParBody::PatchInverseSearch_ParBody(DISOpticalFlowImpl &_dis, int _nstripes,
478 int _hs, Mat &dst_Sx, Mat &dst_Sy,
479 Mat &src_Ux, Mat &src_Uy, Mat &_I0, Mat &_I1,
480 Mat &_I0x, Mat &_I0y, int _num_iter,
481 int _pyr_level)
482 : dis(&_dis), nstripes(_nstripes), hs(_hs), Sx(&dst_Sx), Sy(&dst_Sy), Ux(&src_Ux), Uy(&src_Uy), I0(&_I0), I1(&_I1),
483 I0x(&_I0x), I0y(&_I0y), num_iter(_num_iter), pyr_level(_pyr_level)
484 {
485 stripe_sz = (int)ceil(hs / (double)nstripes);
486 }
487
488 /////////////////////////////////////////////* Patch processing functions */////////////////////////////////////////////
489
490 /* Some auxiliary macros */
491 #define HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION \
492 v_float32x4 w00v = v_setall_f32(w00); \
493 v_float32x4 w01v = v_setall_f32(w01); \
494 v_float32x4 w10v = v_setall_f32(w10); \
495 v_float32x4 w11v = v_setall_f32(w11); \
496 \
497 v_uint16x8 I0_row_8, I1_row_8, I1_row_shifted_8, I1_row_next_8, I1_row_next_shifted_8, tmp; \
498 v_uint32x4 I0_row_4_left, I1_row_4_left, I1_row_shifted_4_left, I1_row_next_4_left, I1_row_next_shifted_4_left; \
499 v_uint32x4 I0_row_4_right, I1_row_4_right, I1_row_shifted_4_right, I1_row_next_4_right, \
500 I1_row_next_shifted_4_right; \
501 v_float32x4 I_diff_left, I_diff_right; \
502 \
503 /* Preload and expand the first row of I1: */ \
504 I1_row_8 = v_load_expand(I1_ptr); \
505 I1_row_shifted_8 = v_load_expand(I1_ptr + 1); \
506 v_expand(I1_row_8, I1_row_4_left, I1_row_4_right); \
507 v_expand(I1_row_shifted_8, I1_row_shifted_4_left, I1_row_shifted_4_right); \
508 I1_ptr += I1_stride;
509
510 #define HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION \
511 /* Load the next row of I1: */ \
512 I1_row_next_8 = v_load_expand(I1_ptr); \
513 I1_row_next_shifted_8 = v_load_expand(I1_ptr + 1); \
514 /* Separate the left and right halves: */ \
515 v_expand(I1_row_next_8, I1_row_next_4_left, I1_row_next_4_right); \
516 v_expand(I1_row_next_shifted_8, I1_row_next_shifted_4_left, I1_row_next_shifted_4_right); \
517 \
518 /* Load current row of I0: */ \
519 I0_row_8 = v_load_expand(I0_ptr); \
520 v_expand(I0_row_8, I0_row_4_left, I0_row_4_right); \
521 \
522 /* Compute diffs between I0 and bilinearly interpolated I1: */ \
523 I_diff_left = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_left)) + \
524 w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_left)) + \
525 w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_left)) + \
526 w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_left)) - \
527 v_cvt_f32(v_reinterpret_as_s32(I0_row_4_left)); \
528 I_diff_right = w00v * v_cvt_f32(v_reinterpret_as_s32(I1_row_4_right)) + \
529 w01v * v_cvt_f32(v_reinterpret_as_s32(I1_row_shifted_4_right)) + \
530 w10v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_4_right)) + \
531 w11v * v_cvt_f32(v_reinterpret_as_s32(I1_row_next_shifted_4_right)) - \
532 v_cvt_f32(v_reinterpret_as_s32(I0_row_4_right));
533
534 #define HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW \
535 I0_ptr += I0_stride; \
536 I1_ptr += I1_stride; \
537 \
538 I1_row_4_left = I1_row_next_4_left; \
539 I1_row_4_right = I1_row_next_4_right; \
540 I1_row_shifted_4_left = I1_row_next_shifted_4_left; \
541 I1_row_shifted_4_right = I1_row_next_shifted_4_right;
542
543 /* This function essentially performs one iteration of gradient descent when finding the most similar patch in I1 for a
544 * given one in I0. It assumes that I0_ptr and I1_ptr already point to the corresponding patches and w00, w01, w10, w11
545 * are precomputed bilinear interpolation weights. It returns the SSD (sum of squared differences) between these patches
546 * and computes the values (dst_dUx, dst_dUy) that are used in the flow vector update. HAL acceleration is implemented
547 * only for the default patch size (8x8). Everything is processed in floats as using fixed-point approximations harms
548 * the quality significantly.
549 */
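/* One inverse-search (Gauss-Newton) step in the caller then looks roughly like this, with diff being
 * the bilinearly interpolated patch difference I1(x + U) - I0(x):
 *
 *     dUx = sum_patch diff * I0x,  dUy = sum_patch diff * I0y   (computed by this function)
 *     (dx, dy) = H^-1 * (dUx, dUy)                              (H - precomputed 2x2 structure tensor)
 *     (Ux, Uy) -= (dx, dy)
 */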
550 inline float processPatch(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr, short *I0y_ptr,
551 int I0_stride, int I1_stride, float w00, float w01, float w10, float w11, int patch_sz)
552 {
553 float SSD = 0.0f;
554 #if CV_SIMD128
555 if (patch_sz == 8)
556 {
557 /* Variables to accumulate the sums */
558 v_float32x4 Ux_vec = v_setall_f32(0);
559 v_float32x4 Uy_vec = v_setall_f32(0);
560 v_float32x4 SSD_vec = v_setall_f32(0);
561
562 v_int16x8 I0x_row, I0y_row;
563 v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
564
565 HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
566 for (int row = 0; row < 8; row++)
567 {
568 HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
569 I0x_row = v_load(I0x_ptr);
570 v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
571 I0y_row = v_load(I0y_ptr);
572 v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
573
574 /* Update the sums: */
575 Ux_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
576 Uy_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
577 SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
578
579 I0x_ptr += I0_stride;
580 I0y_ptr += I0_stride;
581 HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
582 }
583
584 /* Final reduce operations: */
585 dst_dUx = v_reduce_sum(Ux_vec);
586 dst_dUy = v_reduce_sum(Uy_vec);
587 SSD = v_reduce_sum(SSD_vec);
588 }
589 else
590 #endif
591 {
592 dst_dUx = 0.0f;
593 dst_dUy = 0.0f;
594 float diff;
595 for (int i = 0; i < patch_sz; i++)
596 for (int j = 0; j < patch_sz; j++)
597 {
598 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
599 w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
600 I0_ptr[i * I0_stride + j];
601
602 SSD += diff * diff;
603 dst_dUx += diff * I0x_ptr[i * I0_stride + j];
604 dst_dUy += diff * I0y_ptr[i * I0_stride + j];
605 }
606 }
607 return SSD;
608 }
609
610 /* Same as processPatch, but with patch mean normalization, which improves robustness under changing
611 * lighting conditions
612 */
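/* The returned cost here is sum(diff^2) - (sum(diff))^2 / n, i.e. the SSD after the mean brightness
 * difference of the patch has been subtracted, and the gradient terms are corrected accordingly:
 * dst_dUx = sum(diff*I0x) - sum(diff) * x_grad_sum / n (and likewise for y).
 */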
613 inline float processPatchMeanNorm(float &dst_dUx, float &dst_dUy, uchar *I0_ptr, uchar *I1_ptr, short *I0x_ptr,
614 short *I0y_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
615 float w11, int patch_sz, float x_grad_sum, float y_grad_sum)
616 {
617 float sum_diff = 0.0, sum_diff_sq = 0.0;
618 float sum_I0x_mul = 0.0, sum_I0y_mul = 0.0;
619 float n = (float)patch_sz * patch_sz;
620
621 #if CV_SIMD128
622 if (patch_sz == 8)
623 {
624 /* Variables to accumulate the sums */
625 v_float32x4 sum_I0x_mul_vec = v_setall_f32(0);
626 v_float32x4 sum_I0y_mul_vec = v_setall_f32(0);
627 v_float32x4 sum_diff_vec = v_setall_f32(0);
628 v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
629
630 v_int16x8 I0x_row, I0y_row;
631 v_int32x4 I0x_row_4_left, I0x_row_4_right, I0y_row_4_left, I0y_row_4_right;
632
633 HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
634 for (int row = 0; row < 8; row++)
635 {
636 HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
637 I0x_row = v_load(I0x_ptr);
638 v_expand(I0x_row, I0x_row_4_left, I0x_row_4_right);
639 I0y_row = v_load(I0y_ptr);
640 v_expand(I0y_row, I0y_row_4_left, I0y_row_4_right);
641
642 /* Update the sums: */
643 sum_I0x_mul_vec += I_diff_left * v_cvt_f32(I0x_row_4_left) + I_diff_right * v_cvt_f32(I0x_row_4_right);
644 sum_I0y_mul_vec += I_diff_left * v_cvt_f32(I0y_row_4_left) + I_diff_right * v_cvt_f32(I0y_row_4_right);
645 sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
646 sum_diff_vec += I_diff_left + I_diff_right;
647
648 I0x_ptr += I0_stride;
649 I0y_ptr += I0_stride;
650 HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
651 }
652
653 /* Final reduce operations: */
654 sum_I0x_mul = v_reduce_sum(sum_I0x_mul_vec);
655 sum_I0y_mul = v_reduce_sum(sum_I0y_mul_vec);
656 sum_diff = v_reduce_sum(sum_diff_vec);
657 sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
658 }
659 else
660 #endif
661 {
662 float diff;
663 for (int i = 0; i < patch_sz; i++)
664 for (int j = 0; j < patch_sz; j++)
665 {
666 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
667 w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
668 I0_ptr[i * I0_stride + j];
669
670 sum_diff += diff;
671 sum_diff_sq += diff * diff;
672
673 sum_I0x_mul += diff * I0x_ptr[i * I0_stride + j];
674 sum_I0y_mul += diff * I0y_ptr[i * I0_stride + j];
675 }
676 }
677 dst_dUx = sum_I0x_mul - sum_diff * x_grad_sum / n;
678 dst_dUy = sum_I0y_mul - sum_diff * y_grad_sum / n;
679 return sum_diff_sq - sum_diff * sum_diff / n;
680 }
681
682 /* Similar to processPatch, but computes only the sum of squared differences (SSD) between the patches */
683 inline float computeSSD(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01, float w10,
684 float w11, int patch_sz)
685 {
686 float SSD = 0.0f;
687 #if CV_SIMD128
688 if (patch_sz == 8)
689 {
690 v_float32x4 SSD_vec = v_setall_f32(0);
691 HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
692 for (int row = 0; row < 8; row++)
693 {
694 HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
695 SSD_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
696 HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
697 }
698 SSD = v_reduce_sum(SSD_vec);
699 }
700 else
701 #endif
702 {
703 float diff;
704 for (int i = 0; i < patch_sz; i++)
705 for (int j = 0; j < patch_sz; j++)
706 {
707 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
708 w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
709 I0_ptr[i * I0_stride + j];
710 SSD += diff * diff;
711 }
712 }
713 return SSD;
714 }
715
716 /* Same as computeSSD, but with patch mean normalization */
717 inline float computeSSDMeanNorm(uchar *I0_ptr, uchar *I1_ptr, int I0_stride, int I1_stride, float w00, float w01,
718 float w10, float w11, int patch_sz)
719 {
720 float sum_diff = 0.0f, sum_diff_sq = 0.0f;
721 float n = (float)patch_sz * patch_sz;
722 #if CV_SIMD128
723 if (patch_sz == 8)
724 {
725 v_float32x4 sum_diff_vec = v_setall_f32(0);
726 v_float32x4 sum_diff_sq_vec = v_setall_f32(0);
727 HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION;
728 for (int row = 0; row < 8; row++)
729 {
730 HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION;
731 sum_diff_sq_vec += I_diff_left * I_diff_left + I_diff_right * I_diff_right;
732 sum_diff_vec += I_diff_left + I_diff_right;
733 HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW;
734 }
735 sum_diff = v_reduce_sum(sum_diff_vec);
736 sum_diff_sq = v_reduce_sum(sum_diff_sq_vec);
737 }
738 else
739 {
740 #endif
741 float diff;
742 for (int i = 0; i < patch_sz; i++)
743 for (int j = 0; j < patch_sz; j++)
744 {
745 diff = w00 * I1_ptr[i * I1_stride + j] + w01 * I1_ptr[i * I1_stride + j + 1] +
746 w10 * I1_ptr[(i + 1) * I1_stride + j] + w11 * I1_ptr[(i + 1) * I1_stride + j + 1] -
747 I0_ptr[i * I0_stride + j];
748
749 sum_diff += diff;
750 sum_diff_sq += diff * diff;
751 }
752 #if CV_SIMD128
753 }
754 #endif
755 return sum_diff_sq - sum_diff * sum_diff / n;
756 }
757
758 #undef HAL_INIT_BILINEAR_8x8_PATCH_EXTRACTION
759 #undef HAL_PROCESS_BILINEAR_8x8_PATCH_EXTRACTION
760 #undef HAL_BILINEAR_8x8_PATCH_EXTRACTION_NEXT_ROW
761 ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
762
763 void DISOpticalFlowImpl::PatchInverseSearch_ParBody::operator()(const Range &range) const
764 {
765 CV_INSTRUMENT_REGION();
766
767 // force separate processing of stripes if we are using spatial propagation:
768 if (dis->use_spatial_propagation && range.end > range.start + 1)
769 {
770 for (int n = range.start; n < range.end; n++)
771 (*this)(Range(n, n + 1));
772 return;
773 }
774 int psz = dis->patch_size;
775 int psz2 = psz / 2;
776 int w_ext = dis->w + 2 * dis->border_size; //!< width of I1_ext
777 int bsz = dis->border_size;
778
779 /* Input dense flow */
780 float *Ux_ptr = Ux->ptr<float>();
781 float *Uy_ptr = Uy->ptr<float>();
782
783 /* Output sparse flow */
784 float *Sx_ptr = Sx->ptr<float>();
785 float *Sy_ptr = Sy->ptr<float>();
786
787 uchar *I0_ptr = I0->ptr<uchar>();
788 uchar *I1_ptr = I1->ptr<uchar>();
789 short *I0x_ptr = I0x->ptr<short>();
790 short *I0y_ptr = I0y->ptr<short>();
791
792 /* Precomputed structure tensor */
793 float *xx_ptr = dis->I0xx_buf.ptr<float>();
794 float *yy_ptr = dis->I0yy_buf.ptr<float>();
795 float *xy_ptr = dis->I0xy_buf.ptr<float>();
796 /* And extra buffers for mean-normalization: */
797 float *x_ptr = dis->I0x_buf.ptr<float>();
798 float *y_ptr = dis->I0y_buf.ptr<float>();
799
800 bool use_temporal_candidates = false;
801 float *initial_Ux_ptr = NULL, *initial_Uy_ptr = NULL;
802 if (!dis->initial_Ux.empty())
803 {
804 initial_Ux_ptr = dis->initial_Ux[pyr_level].ptr<float>();
805 initial_Uy_ptr = dis->initial_Uy[pyr_level].ptr<float>();
806 use_temporal_candidates = true;
807 }
808
809 int i, j, dir;
810 int start_is, end_is, start_js, end_js;
811 int start_i, start_j;
812 float i_lower_limit = bsz - psz + 1.0f;
813 float i_upper_limit = bsz + dis->h - 1.0f;
814 float j_lower_limit = bsz - psz + 1.0f;
815 float j_upper_limit = bsz + dis->w - 1.0f;
816 float dUx, dUy, i_I1, j_I1, w00, w01, w10, w11, dx, dy;
817
818 #define INIT_BILINEAR_WEIGHTS(Ux, Uy) \
819 i_I1 = min(max(i + Uy + bsz, i_lower_limit), i_upper_limit); \
820 j_I1 = min(max(j + Ux + bsz, j_lower_limit), j_upper_limit); \
821 { \
822 float di = i_I1 - floor(i_I1); \
823 float dj = j_I1 - floor(j_I1); \
824 w11 = di * dj; \
825 w10 = di * (1 - dj); \
826 w01 = (1 - di) * dj; \
827 w00 = (1 - di) * (1 - dj); \
828 }
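/* w00..w11 above are the standard bilinear interpolation weights for the (clamped) subpixel target
 * position (i + Uy + bsz, j + Ux + bsz) inside the border-padded I1; w00 weights the top-left integer
 * neighbor and w11 the bottom-right one, matching how they are applied in computeSSD/processPatch.
 */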
829
830 #define COMPUTE_SSD(dst, Ux, Uy) \
831 INIT_BILINEAR_WEIGHTS(Ux, Uy); \
832 if (dis->use_mean_normalization) \
833 dst = computeSSDMeanNorm(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, \
834 w01, w10, w11, psz); \
835 else \
836 dst = computeSSD(I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1, dis->w, w_ext, w00, w01, \
837 w10, w11, psz);
838
839 int num_inner_iter = (int)floor(dis->grad_descent_iter / (float)num_iter);
840 for (int iter = 0; iter < num_iter; iter++)
841 {
842 if (iter % 2 == 0)
843 {
844 dir = 1;
845 start_is = min(range.start * stripe_sz, hs);
846 end_is = min(range.end * stripe_sz, hs);
847 start_js = 0;
848 end_js = dis->ws;
849 start_i = start_is * dis->patch_stride;
850 start_j = 0;
851 }
852 else
853 {
854 dir = -1;
855 start_is = min(range.end * stripe_sz, hs) - 1;
856 end_is = min(range.start * stripe_sz, hs) - 1;
857 start_js = dis->ws - 1;
858 end_js = -1;
859 start_i = start_is * dis->patch_stride;
860 start_j = (dis->ws - 1) * dis->patch_stride;
861 }
862
863 i = start_i;
864 for (int is = start_is; dir * is < dir * end_is; is += dir)
865 {
866 j = start_j;
867 for (int js = start_js; dir * js < dir * end_js; js += dir)
868 {
869 if (iter == 0)
870 {
871 /* Using the result from the previous pyramid level as the very first approximation: */
872 Sx_ptr[is * dis->ws + js] = Ux_ptr[(i + psz2) * dis->w + j + psz2];
873 Sy_ptr[is * dis->ws + js] = Uy_ptr[(i + psz2) * dis->w + j + psz2];
874 }
875
876 float min_SSD = INF, cur_SSD;
877 if (use_temporal_candidates || dis->use_spatial_propagation)
878 {
879 COMPUTE_SSD(min_SSD, Sx_ptr[is * dis->ws + js], Sy_ptr[is * dis->ws + js]);
880 }
881
882 if (use_temporal_candidates)
883 {
884 /* Try temporal candidates (vectors from the initial flow field that was passed to the function) */
885 COMPUTE_SSD(cur_SSD, initial_Ux_ptr[(i + psz2) * dis->w + j + psz2],
886 initial_Uy_ptr[(i + psz2) * dis->w + j + psz2]);
887 if (cur_SSD < min_SSD)
888 {
889 min_SSD = cur_SSD;
890 Sx_ptr[is * dis->ws + js] = initial_Ux_ptr[(i + psz2) * dis->w + j + psz2];
891 Sy_ptr[is * dis->ws + js] = initial_Uy_ptr[(i + psz2) * dis->w + j + psz2];
892 }
893 }
894
895 if (dis->use_spatial_propagation)
896 {
897 /* Try spatial candidates: */
898 if (dir * js > dir * start_js)
899 {
900 COMPUTE_SSD(cur_SSD, Sx_ptr[is * dis->ws + js - dir], Sy_ptr[is * dis->ws + js - dir]);
901 if (cur_SSD < min_SSD)
902 {
903 min_SSD = cur_SSD;
904 Sx_ptr[is * dis->ws + js] = Sx_ptr[is * dis->ws + js - dir];
905 Sy_ptr[is * dis->ws + js] = Sy_ptr[is * dis->ws + js - dir];
906 }
907 }
908 /* Flow vectors won't actually propagate across different stripes, which is the reason for keeping
909 * the number of stripes constant. It works well enough in practice and doesn't introduce any
910 * visible seams.
911 */
912 if (dir * is > dir * start_is)
913 {
914 COMPUTE_SSD(cur_SSD, Sx_ptr[(is - dir) * dis->ws + js], Sy_ptr[(is - dir) * dis->ws + js]);
915 if (cur_SSD < min_SSD)
916 {
917 min_SSD = cur_SSD;
918 Sx_ptr[is * dis->ws + js] = Sx_ptr[(is - dir) * dis->ws + js];
919 Sy_ptr[is * dis->ws + js] = Sy_ptr[(is - dir) * dis->ws + js];
920 }
921 }
922 }
923
924 /* Use the best candidate as a starting point for the gradient descent: */
925 float cur_Ux = Sx_ptr[is * dis->ws + js];
926 float cur_Uy = Sy_ptr[is * dis->ws + js];
927
928 /* Computing the inverse of the structure tensor: */
929 float detH = xx_ptr[is * dis->ws + js] * yy_ptr[is * dis->ws + js] -
930 xy_ptr[is * dis->ws + js] * xy_ptr[is * dis->ws + js];
931 if (abs(detH) < EPS)
932 detH = EPS;
933 float invH11 = yy_ptr[is * dis->ws + js] / detH;
934 float invH12 = -xy_ptr[is * dis->ws + js] / detH;
935 float invH22 = xx_ptr[is * dis->ws + js] / detH;
936 float prev_SSD = INF, SSD;
937 float x_grad_sum = x_ptr[is * dis->ws + js];
938 float y_grad_sum = y_ptr[is * dis->ws + js];
939
940 for (int t = 0; t < num_inner_iter; t++)
941 {
942 INIT_BILINEAR_WEIGHTS(cur_Ux, cur_Uy);
943 if (dis->use_mean_normalization)
944 SSD = processPatchMeanNorm(dUx, dUy,
945 I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
946 I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
947 dis->w, w_ext, w00, w01, w10, w11, psz,
948 x_grad_sum, y_grad_sum);
949 else
950 SSD = processPatch(dUx, dUy,
951 I0_ptr + i * dis->w + j, I1_ptr + (int)i_I1 * w_ext + (int)j_I1,
952 I0x_ptr + i * dis->w + j, I0y_ptr + i * dis->w + j,
953 dis->w, w_ext, w00, w01, w10, w11, psz);
954
955 dx = invH11 * dUx + invH12 * dUy;
956 dy = invH12 * dUx + invH22 * dUy;
957 cur_Ux -= dx;
958 cur_Uy -= dy;
959
960 /* Break when patch distance stops decreasing */
961 if (SSD >= prev_SSD)
962 break;
963 prev_SSD = SSD;
964 }
965
966 /* If gradient descent converged to a flow vector that is very far from the initial approximation
967 * (more than patch size) then we don't use it. Noticeably improves the robustness.
968 */
969 if (norm(Vec2f(cur_Ux - Sx_ptr[is * dis->ws + js], cur_Uy - Sy_ptr[is * dis->ws + js])) <= psz)
970 {
971 Sx_ptr[is * dis->ws + js] = cur_Ux;
972 Sy_ptr[is * dis->ws + js] = cur_Uy;
973 }
974 j += dir * dis->patch_stride;
975 }
976 i += dir * dis->patch_stride;
977 }
978 }
979 #undef INIT_BILINEAR_WEIGHTS
980 #undef COMPUTE_SSD
981 }
982
983 DISOpticalFlowImpl::Densification_ParBody::Densification_ParBody(DISOpticalFlowImpl &_dis, int _nstripes, int _h,
984 Mat &dst_Ux, Mat &dst_Uy, Mat &src_Sx, Mat &src_Sy,
985 Mat &_I0, Mat &_I1)
986 : dis(&_dis), nstripes(_nstripes), h(_h), Ux(&dst_Ux), Uy(&dst_Uy), Sx(&src_Sx), Sy(&src_Sy), I0(&_I0), I1(&_I1)
987 {
988 stripe_sz = (int)ceil(h / (double)nstripes);
989 }
990
991 /* This function transforms a sparse optical flow field obtained by PatchInverseSearch (which computes flow values
992 * on a sparse grid defined by patch_stride) into a dense optical flow field by weighted averaging of values from the
993 * overlapping patches.
994 */
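/* For every dense location (i, j) the result is a weighted average over all sparse patches p whose
 * support contains it:
 *
 *     U(i, j) = sum_p w_p * S_p / sum_p w_p,   w_p = 1 / max(1, |I1((i, j) + S_p) - I0(i, j)|)
 *
 * with I1 sampled bilinearly, so patches whose flow vector yields a small photometric error at this
 * pixel receive a larger weight.
 */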
995 void DISOpticalFlowImpl::Densification_ParBody::operator()(const Range &range) const
996 {
997 CV_INSTRUMENT_REGION();
998
999 int start_i = min(range.start * stripe_sz, h);
1000 int end_i = min(range.end * stripe_sz, h);
1001
1002 /* Input sparse flow */
1003 float *Sx_ptr = Sx->ptr<float>();
1004 float *Sy_ptr = Sy->ptr<float>();
1005
1006 /* Output dense flow */
1007 float *Ux_ptr = Ux->ptr<float>();
1008 float *Uy_ptr = Uy->ptr<float>();
1009
1010 uchar *I0_ptr = I0->ptr<uchar>();
1011 uchar *I1_ptr = I1->ptr<uchar>();
1012
1013 int psz = dis->patch_size;
1014 int pstr = dis->patch_stride;
1015 int i_l, i_u;
1016 int j_l, j_u;
1017 float i_m, j_m, diff;
1018
1019 /* These values define the set of sparse grid locations that contain patches overlapping with the current dense flow
1020 * location */
1021 int start_is, end_is;
1022 int start_js, end_js;
1023
1024 /* Some helper macros for updating this set of sparse grid locations */
1025 #define UPDATE_SPARSE_I_COORDINATES \
1026 if (i % pstr == 0 && i + psz <= h) \
1027 end_is++; \
1028 if (i - psz >= 0 && (i - psz) % pstr == 0 && start_is < end_is) \
1029 start_is++;
1030
1031 #define UPDATE_SPARSE_J_COORDINATES \
1032 if (j % pstr == 0 && j + psz <= dis->w) \
1033 end_js++; \
1034 if (j - psz >= 0 && (j - psz) % pstr == 0 && start_js < end_js) \
1035 start_js++;
1036
1037 start_is = 0;
1038 end_is = -1;
1039 for (int i = 0; i < start_i; i++)
1040 {
1041 UPDATE_SPARSE_I_COORDINATES;
1042 }
1043 for (int i = start_i; i < end_i; i++)
1044 {
1045 UPDATE_SPARSE_I_COORDINATES;
1046 start_js = 0;
1047 end_js = -1;
1048 for (int j = 0; j < dis->w; j++)
1049 {
1050 UPDATE_SPARSE_J_COORDINATES;
1051 float coef, sum_coef = 0.0f;
1052 float sum_Ux = 0.0f;
1053 float sum_Uy = 0.0f;
1054
1055 /* Iterate through all the patches that overlap the current location (i,j) */
1056 for (int is = start_is; is <= end_is; is++)
1057 for (int js = start_js; js <= end_js; js++)
1058 {
1059 j_m = min(max(j + Sx_ptr[is * dis->ws + js], 0.0f), dis->w - 1.0f - EPS);
1060 i_m = min(max(i + Sy_ptr[is * dis->ws + js], 0.0f), dis->h - 1.0f - EPS);
1061 j_l = (int)j_m;
1062 j_u = j_l + 1;
1063 i_l = (int)i_m;
1064 i_u = i_l + 1;
1065 diff = (j_m - j_l) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_u] +
1066 (j_u - j_m) * (i_m - i_l) * I1_ptr[i_u * dis->w + j_l] +
1067 (j_m - j_l) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_u] +
1068 (j_u - j_m) * (i_u - i_m) * I1_ptr[i_l * dis->w + j_l] - I0_ptr[i * dis->w + j];
1069 coef = 1 / max(1.0f, abs(diff));
1070 sum_Ux += coef * Sx_ptr[is * dis->ws + js];
1071 sum_Uy += coef * Sy_ptr[is * dis->ws + js];
1072 sum_coef += coef;
1073 }
1074 CV_DbgAssert(sum_coef != 0);
1075 Ux_ptr[i * dis->w + j] = sum_Ux / sum_coef;
1076 Uy_ptr[i * dis->w + j] = sum_Uy / sum_coef;
1077 }
1078 }
1079 #undef UPDATE_SPARSE_I_COORDINATES
1080 #undef UPDATE_SPARSE_J_COORDINATES
1081 }
1082
1083 #ifdef HAVE_OPENCL
1084 bool DISOpticalFlowImpl::ocl_PatchInverseSearch(UMat &src_U,
1085 UMat &I0, UMat &I1, UMat &I0x, UMat &I0y, int num_iter, int /*pyr_level*/)
1086 {
1087 CV_INSTRUMENT_REGION();
1088 CV_INSTRUMENT_REGION_OPENCL();
1089
1090 size_t globalSize[] = {(size_t)ws, (size_t)hs};
1091 size_t localSize[] = {16, 16};
1092 int num_inner_iter = (int)floor(grad_descent_iter / (float)num_iter);
1093
1094 String subgroups_build_options;
1095 if (ocl::Device::getDefault().isExtensionSupported("cl_khr_subgroups"))
1096 subgroups_build_options = " -DCV_USE_SUBGROUPS=1";
1097
1098 String build_options = cv::format(
1099 "-DDIS_BORDER_SIZE=%d -DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1100 border_size, patch_size, patch_stride
1101 ) + subgroups_build_options;
1102
1103 #if 0 // OpenCL debug
1104 u_Sx = Scalar::all(0);
1105 u_Sy = Scalar::all(0);
1106 #endif
1107
1108 CV_Assert(num_iter == 2);
1109 for (int iter = 0; iter < num_iter; iter++)
1110 {
1111 if (iter == 0)
1112 {
1113 ocl::Kernel k1("dis_patch_inverse_search_fwd_1", ocl::video::dis_flow_oclsrc, build_options);
1114 size_t global_sz[] = {(size_t)hs * 8};
1115 size_t local_sz[] = {8};
1116
1117 k1.args(
1118 ocl::KernelArg::PtrReadOnly(src_U),
1119 ocl::KernelArg::PtrReadOnly(I0),
1120 ocl::KernelArg::PtrReadOnly(I1),
1121 (int)w, (int)h, (int)ws, (int)hs,
1122 ocl::KernelArg::PtrWriteOnly(u_S)
1123 );
1124 if (!k1.run(1, global_sz, local_sz, false))
1125 return false;
1126
1127 ocl::Kernel k2("dis_patch_inverse_search_fwd_2", ocl::video::dis_flow_oclsrc, build_options);
1128
1129 k2.args(
1130 ocl::KernelArg::PtrReadOnly(src_U),
1131 ocl::KernelArg::PtrReadOnly(I0),
1132 ocl::KernelArg::PtrReadOnly(I1),
1133 ocl::KernelArg::PtrReadOnly(I0x),
1134 ocl::KernelArg::PtrReadOnly(I0y),
1135 ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
1136 ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
1137 ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
1138 ocl::KernelArg::PtrReadOnly(u_I0x_buf),
1139 ocl::KernelArg::PtrReadOnly(u_I0y_buf),
1140 (int)w, (int)h, (int)ws, (int)hs,
1141 (int)num_inner_iter,
1142 ocl::KernelArg::PtrReadWrite(u_S)
1143 );
1144 if (!k2.run(2, globalSize, localSize, false))
1145 return false;
1146 }
1147 else
1148 {
1149 ocl::Kernel k3("dis_patch_inverse_search_bwd_1", ocl::video::dis_flow_oclsrc, build_options);
1150 size_t global_sz[] = {(size_t)hs * 8};
1151 size_t local_sz[] = {8};
1152
1153 k3.args(
1154 ocl::KernelArg::PtrReadOnly(I0),
1155 ocl::KernelArg::PtrReadOnly(I1),
1156 (int)w, (int)h, (int)ws, (int)hs,
1157 ocl::KernelArg::PtrReadWrite(u_S)
1158 );
1159 if (!k3.run(1, global_sz, local_sz, false))
1160 return false;
1161
1162 ocl::Kernel k4("dis_patch_inverse_search_bwd_2", ocl::video::dis_flow_oclsrc, build_options);
1163
1164 k4.args(
1165 ocl::KernelArg::PtrReadOnly(I0),
1166 ocl::KernelArg::PtrReadOnly(I1),
1167 ocl::KernelArg::PtrReadOnly(I0x),
1168 ocl::KernelArg::PtrReadOnly(I0y),
1169 ocl::KernelArg::PtrReadOnly(u_I0xx_buf),
1170 ocl::KernelArg::PtrReadOnly(u_I0yy_buf),
1171 ocl::KernelArg::PtrReadOnly(u_I0xy_buf),
1172 ocl::KernelArg::PtrReadOnly(u_I0x_buf),
1173 ocl::KernelArg::PtrReadOnly(u_I0y_buf),
1174 (int)w, (int)h,(int)ws, (int)hs,
1175 (int)num_inner_iter,
1176 ocl::KernelArg::PtrReadWrite(u_S)
1177 );
1178 if (!k4.run(2, globalSize, localSize, false))
1179 return false;
1180 }
1181 }
1182 return true;
1183 }
1184
1185 bool DISOpticalFlowImpl::ocl_Densification(UMat &dst_U, UMat &src_S, UMat &_I0, UMat &_I1)
1186 {
1187 CV_INSTRUMENT_REGION();
1188 CV_INSTRUMENT_REGION_OPENCL();
1189
1190 size_t globalSize[] = {(size_t)w, (size_t)h};
1191 size_t localSize[] = {16, 16};
1192
1193 String build_options = cv::format(
1194 "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1195 patch_size, patch_stride
1196 );
1197
1198 ocl::Kernel kernel("dis_densification", ocl::video::dis_flow_oclsrc, build_options);
1199 kernel.args(
1200 ocl::KernelArg::PtrReadOnly(src_S),
1201 ocl::KernelArg::PtrReadOnly(_I0),
1202 ocl::KernelArg::PtrReadOnly(_I1),
1203 (int)w, (int)h, (int)ws,
1204 ocl::KernelArg::PtrWriteOnly(dst_U)
1205 );
1206 return kernel.run(2, globalSize, localSize, false);
1207 }
1208
1209 void DISOpticalFlowImpl::ocl_prepareBuffers(UMat &I0, UMat &I1, InputArray flow, bool use_flow)
1210 {
1211 CV_INSTRUMENT_REGION();
1212 // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
1213
1214 u_I0s.resize(coarsest_scale + 1);
1215 u_I1s.resize(coarsest_scale + 1);
1216 u_I1s_ext.resize(coarsest_scale + 1);
1217 u_I0xs.resize(coarsest_scale + 1);
1218 u_I0ys.resize(coarsest_scale + 1);
1219 u_U.resize(coarsest_scale + 1);
1220
1221 if (use_flow)
1222 {
1223 u_initial_U.resize(coarsest_scale + 1);
1224 }
1225
1226 int fraction = 1;
1227 int cur_rows = 0, cur_cols = 0;
1228
1229 for (int i = 0; i <= coarsest_scale; i++)
1230 {
1231 CV_TRACE_REGION("coarsest_scale_iteration");
1232 /* Avoid initializing the pyramid levels finer than the finest scale, as they won't be used anyway */
1233 if (i == finest_scale)
1234 {
1235 cur_rows = I0.rows / fraction;
1236 cur_cols = I0.cols / fraction;
1237 u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
1238 resize(I0, u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
1239 u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
1240 resize(I1, u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
1241
1242 /* These buffers are reused in each scale so we initialize them once on the finest scale: */
1243 u_S.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC2);
1244 u_I0xx_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1245 u_I0yy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1246 u_I0xy_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1247 u_I0x_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1248 u_I0y_buf.create(cur_rows / patch_stride, cur_cols / patch_stride, CV_32FC1);
1249
1250 u_I0xx_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1251 u_I0yy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1252 u_I0xy_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1253 u_I0x_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1254 u_I0y_buf_aux.create(cur_rows, cur_cols / patch_stride, CV_32FC1);
1255 }
1256 else if (i > finest_scale)
1257 {
1258 cur_rows = u_I0s[i - 1].rows / 2;
1259 cur_cols = u_I0s[i - 1].cols / 2;
1260 u_I0s[i].create(cur_rows, cur_cols, CV_8UC1);
1261 resize(u_I0s[i - 1], u_I0s[i], u_I0s[i].size(), 0.0, 0.0, INTER_AREA);
1262 u_I1s[i].create(cur_rows, cur_cols, CV_8UC1);
1263 resize(u_I1s[i - 1], u_I1s[i], u_I1s[i].size(), 0.0, 0.0, INTER_AREA);
1264 }
1265
1266 if (i >= finest_scale)
1267 {
1268 u_I1s_ext[i].create(cur_rows + 2 * border_size, cur_cols + 2 * border_size, CV_8UC1);
1269 copyMakeBorder(u_I1s[i], u_I1s_ext[i], border_size, border_size, border_size, border_size, BORDER_REPLICATE);
1270 u_I0xs[i].create(cur_rows, cur_cols, CV_16SC1);
1271 u_I0ys[i].create(cur_rows, cur_cols, CV_16SC1);
1272 spatialGradient(u_I0s[i], u_I0xs[i], u_I0ys[i]);
1273 u_U[i].create(cur_rows, cur_cols, CV_32FC2);
1274 variational_refinement_processors[i]->setAlpha(variational_refinement_alpha);
1275 variational_refinement_processors[i]->setDelta(variational_refinement_delta);
1276 variational_refinement_processors[i]->setGamma(variational_refinement_gamma);
1277 variational_refinement_processors[i]->setSorIterations(5);
1278 variational_refinement_processors[i]->setFixedPointIterations(variational_refinement_iter);
1279
1280 if (use_flow)
1281 {
1282 UMat resized_flow;
1283 resize(flow, resized_flow, Size(cur_cols, cur_rows));
1284 float scale = 1.0f / fraction;
1285 resized_flow.convertTo(u_initial_U[i], CV_32FC2, scale, 0.0f);
1286 }
1287 }
1288
1289 fraction *= 2;
1290 }
1291 }
1292
1293 bool DISOpticalFlowImpl::ocl_precomputeStructureTensor(UMat &dst_I0xx, UMat &dst_I0yy, UMat &dst_I0xy,
1294 UMat &dst_I0x, UMat &dst_I0y, UMat &I0x, UMat &I0y)
1295 {
1296 CV_INSTRUMENT_REGION();
1297 CV_INSTRUMENT_REGION_OPENCL();
1298
1299 size_t globalSizeX[] = {(size_t)h};
1300 size_t localSizeX[] = {16};
1301
1302 #if 0 // OpenCL debug
1303 u_I0xx_buf_aux = Scalar::all(0);
1304 u_I0yy_buf_aux = Scalar::all(0);
1305 u_I0xy_buf_aux = Scalar::all(0);
1306 u_I0x_buf_aux = Scalar::all(0);
1307 u_I0y_buf_aux = Scalar::all(0);
1308 dst_I0xx = Scalar::all(0);
1309 dst_I0yy = Scalar::all(0);
1310 dst_I0xy = Scalar::all(0);
1311 dst_I0x = Scalar::all(0);
1312 dst_I0y = Scalar::all(0);
1313 #endif
1314
1315 String build_options = cv::format(
1316 "-DDIS_PATCH_SIZE=%d -DDIS_PATCH_STRIDE=%d",
1317 patch_size, patch_stride
1318 );
1319
1320 ocl::Kernel kernelX("dis_precomputeStructureTensor_hor", ocl::video::dis_flow_oclsrc, build_options);
1321 kernelX.args(
1322 ocl::KernelArg::PtrReadOnly(I0x),
1323 ocl::KernelArg::PtrReadOnly(I0y),
1324 (int)w, (int)h, (int)ws,
1325 ocl::KernelArg::PtrWriteOnly(u_I0xx_buf_aux),
1326 ocl::KernelArg::PtrWriteOnly(u_I0yy_buf_aux),
1327 ocl::KernelArg::PtrWriteOnly(u_I0xy_buf_aux),
1328 ocl::KernelArg::PtrWriteOnly(u_I0x_buf_aux),
1329 ocl::KernelArg::PtrWriteOnly(u_I0y_buf_aux)
1330 );
1331 if (!kernelX.run(1, globalSizeX, localSizeX, false))
1332 return false;
1333
1334 size_t globalSizeY[] = {(size_t)ws};
1335 size_t localSizeY[] = {16};
1336
1337 ocl::Kernel kernelY("dis_precomputeStructureTensor_ver", ocl::video::dis_flow_oclsrc, build_options);
1338 kernelY.args(
1339 ocl::KernelArg::PtrReadOnly(u_I0xx_buf_aux),
1340 ocl::KernelArg::PtrReadOnly(u_I0yy_buf_aux),
1341 ocl::KernelArg::PtrReadOnly(u_I0xy_buf_aux),
1342 ocl::KernelArg::PtrReadOnly(u_I0x_buf_aux),
1343 ocl::KernelArg::PtrReadOnly(u_I0y_buf_aux),
1344 (int)w, (int)h, (int)ws,
1345 ocl::KernelArg::PtrWriteOnly(dst_I0xx),
1346 ocl::KernelArg::PtrWriteOnly(dst_I0yy),
1347 ocl::KernelArg::PtrWriteOnly(dst_I0xy),
1348 ocl::KernelArg::PtrWriteOnly(dst_I0x),
1349 ocl::KernelArg::PtrWriteOnly(dst_I0y)
1350 );
1351 return kernelY.run(1, globalSizeY, localSizeY, false);
1352 }
1353
1354 bool DISOpticalFlowImpl::ocl_calc(InputArray I0, InputArray I1, InputOutputArray flow)
1355 {
1356 CV_INSTRUMENT_REGION();
1357 // not pure OpenCV code: CV_INSTRUMENT_REGION_OPENCL();
1358
1359 UMat I0Mat = I0.getUMat();
1360 UMat I1Mat = I1.getUMat();
1361 bool use_input_flow = false;
1362 if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
1363 use_input_flow = true;
1364 coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* The original code searches for a maximal movement of width/4 */
1365 (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level whose size is still greater than or equal to the patch size */
1366
1367 if (coarsest_scale<0)
1368 CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");
1369
1370 if (coarsest_scale<finest_scale)
1371 {
1372 // choose the finest level based on coarsest level.
1373 // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
1374 int original_img_width = I0.size().width;
1375 autoSelectPatchSizeAndScales(original_img_width);
1376 }
1377
1378 ocl_prepareBuffers(I0Mat, I1Mat, flow, use_input_flow);
1379 u_U[coarsest_scale].setTo(0.0f);
1380
1381 for (int i = coarsest_scale; i >= finest_scale; i--)
1382 {
1383 CV_TRACE_REGION("coarsest_scale_iteration");
1384 w = u_I0s[i].cols;
1385 h = u_I0s[i].rows;
1386 ws = 1 + (w - patch_size) / patch_stride;
1387 hs = 1 + (h - patch_size) / patch_stride;
1388
1389 if (!ocl_precomputeStructureTensor(u_I0xx_buf, u_I0yy_buf, u_I0xy_buf,
1390 u_I0x_buf, u_I0y_buf, u_I0xs[i], u_I0ys[i]))
1391 return false;
1392
1393 if (!ocl_PatchInverseSearch(u_U[i], u_I0s[i], u_I1s_ext[i], u_I0xs[i], u_I0ys[i], 2, i))
1394 return false;
1395
1396 if (!ocl_Densification(u_U[i], u_S, u_I0s[i], u_I1s[i]))
1397 return false;
1398
1399 if (variational_refinement_iter > 0)
1400 {
1401 std::vector<Mat> U_channels;
1402 split(u_U[i], U_channels); CV_Assert(U_channels.size() == 2);
1403 variational_refinement_processors[i]->calcUV(u_I0s[i], u_I1s[i],
1404 U_channels[0], U_channels[1]);
1405 merge(U_channels, u_U[i]);
1406 }
1407
1408 if (i > finest_scale)
1409 {
1410 UMat resized;
1411 resize(u_U[i], resized, u_U[i - 1].size());
1412 multiply(resized, 2, u_U[i - 1]);
1413 }
1414 }
1415
1416 UMat resized_flow;
1417 resize(u_U[finest_scale], resized_flow, I1Mat.size());
1418 multiply(resized_flow, 1 << finest_scale, flow);
1419
1420 return true;
1421 }
1422 #endif
1423
1424 void DISOpticalFlowImpl::calc(InputArray I0, InputArray I1, InputOutputArray flow)
1425 {
1426 CV_INSTRUMENT_REGION();
1427
1428 CV_Assert(!I0.empty() && I0.depth() == CV_8U && I0.channels() == 1);
1429 CV_Assert(!I1.empty() && I1.depth() == CV_8U && I1.channels() == 1);
1430 CV_Assert(I0.sameSize(I1));
1431 CV_Assert(I0.isContinuous());
1432 CV_Assert(I1.isContinuous());
1433
1434 CV_OCL_RUN(flow.isUMat() &&
1435 (patch_size == 8) && (use_spatial_propagation == true),
1436 ocl_calc(I0, I1, flow));
1437
1438 Mat I0Mat = I0.getMat();
1439 Mat I1Mat = I1.getMat();
1440 bool use_input_flow = false;
1441 if (flow.sameSize(I0) && flow.depth() == CV_32F && flow.channels() == 2)
1442 use_input_flow = true;
1443 else
1444 flow.create(I1Mat.size(), CV_32FC2);
1445 Mat flowMat = flow.getMat();
1446 coarsest_scale = min((int)(log(max(I0Mat.cols, I0Mat.rows) / (4.0 * patch_size)) / log(2.0) + 0.5), /* The original code searches for a maximal movement of width/4 */
1447 (int)(log(min(I0Mat.cols, I0Mat.rows) / patch_size) / log(2.0))); /* Deepest pyramid level whose size is still greater than or equal to the patch size */
1448
1449 if (coarsest_scale<0)
1450 CV_Error(cv::Error::StsBadSize, "The input image must have either width or height >= 12");
1451
1452 if (coarsest_scale<finest_scale)
1453 {
1454 // choose the finest level based on coarsest level.
1455 // Refs: https://github.com/tikroeger/OF_DIS/blob/2c9f2a674f3128d3a41c10e41cc9f3a35bb1b523/run_dense.cpp#L239
1456 int original_img_width = I0.size().width;
1457 autoSelectPatchSizeAndScales(original_img_width);
1458 }
1459
1460 int num_stripes = getNumThreads();
1461
1462 prepareBuffers(I0Mat, I1Mat, flowMat, use_input_flow);
1463 Ux[coarsest_scale].setTo(0.0f);
1464 Uy[coarsest_scale].setTo(0.0f);
1465
1466 for (int i = coarsest_scale; i >= finest_scale; i--)
1467 {
1468 CV_TRACE_REGION("coarsest_scale_iteration");
1469 w = I0s[i].cols;
1470 h = I0s[i].rows;
1471 ws = 1 + (w - patch_size) / patch_stride;
1472 hs = 1 + (h - patch_size) / patch_stride;
1473
1474 precomputeStructureTensor(I0xx_buf, I0yy_buf, I0xy_buf, I0x_buf, I0y_buf, I0xs[i], I0ys[i]);
1475 if (use_spatial_propagation)
1476 {
1477 /* Use a fixed number of stripes regardless of the number of threads to make the inverse search
1478 * with spatial propagation reproducible
1479 */
1480 parallel_for_(Range(0, 8), PatchInverseSearch_ParBody(*this, 8, hs, Sx, Sy, Ux[i], Uy[i], I0s[i],
1481 I1s_ext[i], I0xs[i], I0ys[i], 2, i));
1482 }
1483 else
1484 {
1485 parallel_for_(Range(0, num_stripes),
1486 PatchInverseSearch_ParBody(*this, num_stripes, hs, Sx, Sy, Ux[i], Uy[i], I0s[i], I1s_ext[i],
1487 I0xs[i], I0ys[i], 1, i));
1488 }
1489
1490 parallel_for_(Range(0, num_stripes),
1491 Densification_ParBody(*this, num_stripes, I0s[i].rows, Ux[i], Uy[i], Sx, Sy, I0s[i], I1s[i]));
1492 if (variational_refinement_iter > 0)
1493 variational_refinement_processors[i]->calcUV(I0s[i], I1s[i], Ux[i], Uy[i]);
1494
1495 if (i > finest_scale)
1496 {
1497 resize(Ux[i], Ux[i - 1], Ux[i - 1].size());
1498 resize(Uy[i], Uy[i - 1], Uy[i - 1].size());
1499 Ux[i - 1] *= 2;
1500 Uy[i - 1] *= 2;
1501 }
1502 }
1503 Mat uxy[] = {Ux[finest_scale], Uy[finest_scale]};
1504 merge(uxy, 2, U);
1505 resize(U, flowMat, flowMat.size());
1506 flowMat *= 1 << finest_scale;
1507 }
1508
1509 void DISOpticalFlowImpl::collectGarbage()
1510 {
1511 CV_INSTRUMENT_REGION();
1512
1513 I0s.clear();
1514 I1s.clear();
1515 I1s_ext.clear();
1516 I0xs.clear();
1517 I0ys.clear();
1518 Ux.clear();
1519 Uy.clear();
1520 U.release();
1521 Sx.release();
1522 Sy.release();
1523 I0xx_buf.release();
1524 I0yy_buf.release();
1525 I0xy_buf.release();
1526 I0xx_buf_aux.release();
1527 I0yy_buf_aux.release();
1528 I0xy_buf_aux.release();
1529
1530 #ifdef HAVE_OPENCL
1531 u_I0s.clear();
1532 u_I1s.clear();
1533 u_I1s_ext.clear();
1534 u_I0xs.clear();
1535 u_I0ys.clear();
1536 u_U.clear();
1537 u_S.release();
1538 u_I0xx_buf.release();
1539 u_I0yy_buf.release();
1540 u_I0xy_buf.release();
1541 u_I0xx_buf_aux.release();
1542 u_I0yy_buf_aux.release();
1543 u_I0xy_buf_aux.release();
1544 #endif
1545
1546 for (int i = finest_scale; i <= coarsest_scale; i++)
1547 variational_refinement_processors[i]->collectGarbage();
1548 variational_refinement_processors.clear();
1549 }
1550
1551 Ptr<DISOpticalFlow> DISOpticalFlow::create(int preset)
1552 {
1553 CV_INSTRUMENT_REGION();
1554
1555 Ptr<DISOpticalFlow> dis = makePtr<DISOpticalFlowImpl>();
1556 dis->setPatchSize(8);
1557 if (preset == DISOpticalFlow::PRESET_ULTRAFAST)
1558 {
1559 dis->setFinestScale(2);
1560 dis->setPatchStride(4);
1561 dis->setGradientDescentIterations(12);
1562 dis->setVariationalRefinementIterations(0);
1563 }
1564 else if (preset == DISOpticalFlow::PRESET_FAST)
1565 {
1566 dis->setFinestScale(2);
1567 dis->setPatchStride(4);
1568 dis->setGradientDescentIterations(16);
1569 dis->setVariationalRefinementIterations(5);
1570 }
1571 else if (preset == DISOpticalFlow::PRESET_MEDIUM)
1572 {
1573 dis->setFinestScale(1);
1574 dis->setPatchStride(3);
1575 dis->setGradientDescentIterations(25);
1576 dis->setVariationalRefinementIterations(5);
1577 }
1578
1579 return dis;
1580 }
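/* A minimal usage sketch (illustrative only; `prev_gray` and `next_gray` are hypothetical CV_8UC1,
 * continuous, equally sized frames):
 *
 *     Ptr<DISOpticalFlow> dis = DISOpticalFlow::create(DISOpticalFlow::PRESET_MEDIUM);
 *     Mat flow;
 *     dis->calc(prev_gray, next_gray, flow); // flow becomes a CV_32FC2 field of the same size
 *
 * If `flow` is already a CV_32FC2 matrix of the input size, calc() additionally uses it to seed the
 * patch inverse search (see use_input_flow above).
 */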
1581
1582
1583 } // namespace
1584